Andrew's git - notes.git/blob

4c89dd596bf579f9ddbea537a1039c46443e3641
   1\documentclass[a4paper]{article}
   2\usepackage[a4paper, margin=2cm]{geometry}
   3\usepackage{array}
   4\usepackage{amsmath}
   5\usepackage{amssymb}
   6\usepackage{tcolorbox}
   7\usepackage{fancyhdr}
   8\usepackage{pgfplots}
   9\usepackage{tabularx}
  10\usepackage{keystroke}
  11\usepackage{listings}
  12\usepackage{xcolor} % used only to show the phantomed stuff
  13\definecolor{cas}{HTML}{e6f0fe}
  14\usepackage{mathtools}
  15\pgfplotsset{compat=1.16}
  16
  17\pagestyle{fancy}
  18\fancyhead[LO,LE]{Unit 4 Specialist --- Statistics}
  19\fancyhead[CO,CE]{Andrew Lorimer}
  20
  21\setlength\parindent{0pt}
  22
  23\begin{document}
  24
  25  \title{Statistics}
  26  \author{}
  27  \date{}
  28  \maketitle
  29
  30  \section{Linear combinations of random variables}
  31
  32  \subsection*{Continuous random variables}
  33
  34  A continuous random variable \(X\) has a pdf \(f\) such that:
  35
  36  \begin{enumerate}
  37    \item \(f(x) \ge 0 \quad \forall x \)
  38    \item \(\int^\infty_{-\infty} f(x) \> dx = 1\)
  39  \end{enumerate}
  40
  41  \[ \Pr(X \le c) = \int^c_{-\infty} f(x) \> dx \]
  42
  43  \subsubsection*{Linear functions \(X \rightarrow aX+b\)}
  44
  45  \begin{align*}
  46    \Pr(Y \le y) &= \Pr(aX+b \le y) \\
  47    &= \Pr\left(X \le \dfrac{y-b}{a}\right) \qquad \text{(for \(a > 0\))} \\
  48    &= \int^{\frac{y-b}{a}}_{-\infty} f(x) \> dx
  49  \end{align*}
  50
  51  \begin{align*}
  52    \textbf{Mean:} && \operatorname{E}(aX+b) & = a\operatorname{E}(X)+b \\
  53    \textbf{Variance:} && \operatorname{Var}(aX+b) &= a^2 \operatorname{Var}(X) \\
  54  \end{align*}
  55
  56  \subsection*{Linear combination of two random variables}
  57
  58  \begin{align*}
  59    \textbf{Mean:} && \operatorname{E}(aX+bY) & = a\operatorname{E}(X)+b\operatorname{E}(Y) \\
  60    \textbf{Variance:} && \operatorname{Var}(aX+bY) &= a^2 \operatorname{Var}(X) + b^2 \operatorname{Var}(Y) \tag{if \(X\) and \(Y\) are independent}\\
  61  \end{align*}
  62
  63  \section{Sample mean}
  64
  65  Approximation of the \textbf{population mean} determined experimentally.
  66
  67  \[ \overline{x} = \dfrac{\sum x}{n} \]
  68
  69  where \(n\) is the size of the sample (number of sample points) and \(x\) is the value of a sample point
  70
  71  \begin{tcolorbox}[colframe=cas!75!black, title=On CAS]
  72
  73  \begin{enumerate}
  74    \item Spreadsheet
  75    \item In cell A1: \verb;mean(randNorm(sd, mean, sample size));
  76    \item Edit \(\rightarrow\) Fill \(\rightarrow\) Fill Range
  77    \item Input range as A1:An where \(n\) is the number of samples
  78    \item Graph \(\rightarrow\) Histogram
  79  \end{enumerate}
  80  \end{tcolorbox}
  81
  82  \subsubsection*{Sample size of \(n\)}
  83
  84  \[ \overline{X} = \sum_{i=1}^n \frac{x_i}{n} = \dfrac{\sum x}{n} \]
  85
  86  Sample mean is distributed with mean \(\mu\) and sd \(\frac{\sigma}{\sqrt{n}}\) (approaches these values for increasing sample size \(n\)).
  87
  88  For a new distribution with mean of \(n\) trials, \(\operatorname{E}(X^\prime) = \operatorname{E}(X), \quad \operatorname{sd}(X^\prime) = \dfrac{\operatorname{sd}(X)}{\sqrt{n}}\)
  89
  90  \begin{tcolorbox}[colframe=cas!75!black, title=On CAS]
  91  
  92    \begin{itemize}
  93      \item Spreadsheet \(\rightarrow\) Catalog \(\rightarrow\) \verb;randNorm(sd, mean, n); where \verb;n; is the number of samples. Show histogram with Histogram key in top left
  94      \item To calculate parameters of a dataset: Calc \(\rightarrow\) One-variable
  95    \end{itemize}
  96  \end{tcolorbox}
  97  
  98  \section{Normal distributions}
  99
 100  mean = mode = median
 101
 102  \[ Z = \frac{X - \mu}{\sigma} \]
 103
 104  Normal distributions must have area (total prob.) of 1 \(\implies \int^\infty_{-\infty} f(x) \> dx = 1\)
 105\pgfmathdeclarefunction{gauss}{2}{%
 106  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}
 107}
 108
 109\begin{tikzpicture}
 110  \pgfplotsset{set layers}
 111\begin{axis}[every axis plot post/.append style={
 112  mark=none,domain=-3:3,samples=50,smooth}, 
 113  axis x line=bottom, 
 114  axis y line=left,
 115  enlargelimits=upper,
 116  x=\textwidth/10,
 117  ytick={0.55},
 118  yticklabels={\(\frac{1}{\sigma \sqrt{2\pi}}\)}, 
 119  xtick={-2,-1,0,1,2},
 120  x tick label style = {font=\footnotesize},
 121  xticklabels={\((\mu-2\sigma)\), \((\mu-\sigma)\), \(\mu\), \((\mu+\sigma)\), \((\mu+2\sigma)\)},
 122  xlabel={\(x\)},
 123  every axis x label/.style={at={(current axis.right of origin)},anchor=north west},
 124  every axis y label/.style={at={(axis description cs:-0.02,0.2)}, anchor=south west, rotate=90},
 125  ylabel={\(f(x)\)}]
 126  \addplot {gauss(0,0.75)};
 127\end{axis}
 128\begin{axis}[every axis plot post/.append style={
 129  mark=none,domain=-3:3,samples=50,smooth}, 
 130  axis x line=bottom, 
 131  enlargelimits=upper,
 132  x=\textwidth/10,
 133  xtick={-2,-1,0,1,2},
 134  axis x line shift=30pt,
 135  hide y axis,
 136  x tick label style = {font=\footnotesize},
 137  xlabel={\(Z\)},
 138  every axis x label/.style={at={(axis description cs:1,-0.25)},anchor=south west}]
 139  \addplot {gauss(0,0.75)};
 140\end{axis}
 141\end{tikzpicture}
 142
 143  \section{Central limit theorem}
 144
 145  If \(X\) is a random variable with mean \(\mu\) and sd \(\sigma\) (whatever its distribution), then for a sufficiently large sample size \(n\) the distribution of the sample mean \(\overline{X}\) is approximately normal with mean \(\operatorname{E}(\overline{X}) = \mu\) and \(\operatorname{sd}(\overline{X}) = \frac{\sigma}{\sqrt{n}}\).
 146
 147  \section{Confidence intervals}
 148
 149  \begin{itemize}
 150    \item \textbf{Point estimate:} single-valued estimate of the population mean from the value of the sample mean \(\overline{x}\)
 151    \item \textbf{Interval estimate:} confidence interval for population mean \(\mu\)
 152  \end{itemize}
 153
 154  \subsection{95\% confidence interval}
 155
 156  The 95\% confidence interval for a population mean \(\mu\) is given by
 157
 158  \[ \overline{x} \pm 1.96 \dfrac{\sigma}{\sqrt{n}} \]
 159
 160  where: \\
 161  \(\overline{x}\) is the sample mean \\
 162  \(\sigma\) is the population sd \\
 163  \(n\) is the sample size from which \(\overline{x}\) was calculated
 164
 165  Always express \(z\) as +ve. Express confidence \textit{interval} as ordered pair.
 166
 167  \colorbox{cas}{\textbf{On CAS}}
 168
 169  Menu \(\rightarrow\) Stats \(\rightarrow\) Calc \(\rightarrow\) Interval \\
 170  Set Type = One-Sample Z Int, Variable
 171
 172  \subsection*{Interpretation of confidence intervals}
 173
 174  95\% confidence interval \(\implies\) 95\% of the confidence intervals calculated from repeated samples will contain the population mean \(\mu\).
 175
 176  \subsection*{Margin of error}
 177
 178  For 95\% confidence interval for \(\mu\), margin of error \(M\) is:
 179
 180  \begin{align*}
 181    M &= 1.96 \times \dfrac{\sigma}{\sqrt{n}} \\
 182    \implies n &= \left( \dfrac{1.96 \sigma}{M} \right)^2
 183  \end{align*}
 184
 185  \subsection*{General case}
 186
 187  A confidence interval of \(C\)\% for a mean \(\mu\) is given by
 188
 189  \[ \mu \in \left( \overline{x} - k \dfrac{\sigma}{\sqrt{n}},\; \overline{x} + k \dfrac{\sigma}{\sqrt{n}} \right) \quad \text{ where } k \text{ is such that } \Pr(-k < Z < k) = \frac{C}{100} \]
 190
 191  \subsection*{Confidence interval for multiple trials}
 192
 193  For a set of \(n\) independent 95\% confidence intervals (one per sample), there is a \(0.95^n\) chance that all \(n\) intervals contain the population mean \(\mu\).
 194
 195  \section{Hypothesis testing}
 196
 197  Note hypotheses are always expressed in terms of population parameters
 198
 199  \subsection*{Null hypothesis \(H_0\)}
 200
 201  Sample drawn from population has same mean as control population, and any difference can be explained by sample variations.
 202
 203  \subsection*{Alternative hypothesis \(H_1\)}
 204
 205  Amount of variation from control is significant, despite standard sample variations.
 206
 207  \subsection*{\(p\)-value}
 208
 209  Probability of observing a value of the sample statistic at least as extreme as the one observed, assuming the null hypothesis is true.
 210
 211  % table of p-values for strength of evidence
 212
 213  \subsection*{Distribution of sample mean}
 214
 215  If \(X \sim \operatorname{N}(\mu, \sigma)\), then distribution of sample mean \(\overline{X}\) is also normal with \(\overline{X} \sim \operatorname{N}\left(\mu, \frac{\sigma}{\sqrt{n}}\right)\).
 216
 217  \subsection*{Statistical significance}
 218
 219  Significance level is denoted by \(\alpha\).
 220
 221  If \(p<\alpha\), null hypothesis is \textbf{rejected} \\
 222  If \(p \ge \alpha\), null hypothesis is \textbf{not rejected} (insufficient evidence against it)
 223
 224  \subsection*{\(z\)-test}
 225
 226  Hypothesis test for a mean of a sample drawn from a normally distributed population with a known standard deviation.
 227
 228  \subsubsection*{\colorbox{cas}{\textbf{On CAS:}}}
 229  
 230  Menu \(\rightarrow\) Statistics \(\rightarrow\) Calc \(\rightarrow\) Test. \\
 231  Select \textit{One-Sample Z-Test} and \textit{Variable}, then input:
 232  \begin{itemize}
 233    \item \(\mu\) condition --- same operator as \(H_1\)
 234    \item \(\mu_0\) --- population mean assumed under the null hypothesis
 235    \item \(\sigma\) --- population standard deviation (null hypothesis)
 236    \item \(\overline{x}\) --- sample mean
 237    \item \(n\) --- sample size
 238  \end{itemize}
 239
 240\end{document}