% Unit 4 Specialist notes on statistics: preamble, title and the opening
% sections on linear combinations of random variables and the sample mean.
\documentclass[a4paper]{article}

% --- Packages (load order kept as in the original) ---
\usepackage[a4paper, margin=2cm]{geometry} % A4 page with 2cm margins
\usepackage{array}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tcolorbox}                     % "On CAS" boxes
\usepackage{fancyhdr}                      % running page headers
\usepackage{pgfplots}                      % normal-curve figure
\usepackage{tabularx}
% NOTE(review): no keystroke or listings commands appear in the visible
% source -- confirm before removing either package.
\usepackage{keystroke}
\usepackage{listings}
\usepackage{xcolor}                        % colour support for `cas' below
\definecolor{cas}{HTML}{e6f0fe}            % highlight colour for CAS instructions
\usepackage{mathtools}
\pgfplotsset{compat=1.16}

% --- Page style ---
\pagestyle{fancy}
\fancyhead[LO,LE]{Unit 4 Specialist --- Statistics}
\fancyhead[CO,CE]{Andrew Lorimer}

% No first-line indentation; paragraphs are separated by blank lines only.
\setlength{\parindent}{0pt}

\begin{document}

\title{Statistics}
\author{}
\date{}
\maketitle

\section{Linear combinations of random variables}

\subsection*{Continuous random variables}

A continuous random variable \(X\) has a pdf \(f\) such that:

\begin{enumerate}
  \item \(f(x) \ge 0\) for all \(x\)
  \item \(\int^\infty_{-\infty} f(x) \> dx = 1\)
\end{enumerate}
%
\[ \Pr(X \le c) = \int^c_{-\infty} f(x) \> dx \]

\subsubsection*{Linear functions \(X \rightarrow aX+b\)}

% The inequality below keeps its direction only when a is positive.
For \(Y = aX + b\) with \(a > 0\):
\begin{align*}
  \Pr(Y \le y) &= \Pr(aX+b \le y) \\
               &= \Pr\left(X \le \dfrac{y-b}{a}\right) \\
               &= \int^{\frac{y-b}{a}}_{-\infty} f(x) \> dx
\end{align*}
% No trailing \\ before \end{align*}: it would add an empty row.
\begin{align*}
  \textbf{Mean:}     && \operatorname{E}(aX+b)   &= a\operatorname{E}(X)+b \\
  \textbf{Variance:} && \operatorname{Var}(aX+b) &= a^2 \operatorname{Var}(X)
\end{align*}

\subsection*{Linear combination of two random variables}

\begin{align*}
  \textbf{Mean:}     && \operatorname{E}(aX+bY)   &= a\operatorname{E}(X)+b\operatorname{E}(Y) \\
  \textbf{Variance:} && \operatorname{Var}(aX+bY) &= a^2 \operatorname{Var}(X) + b^2 \operatorname{Var}(Y) \tag{if \(X\) and \(Y\) are independent}
\end{align*}

\section{Sample mean}

Approximation of the \textbf{population mean} determined experimentally.
\[ \overline{x} = \dfrac{\sum x}{n} \]

where \(n\) is the size of the sample (number of sample points) and \(x\) is the value of a sample point.

\begin{tcolorbox}[colframe=cas!75!black, title=On CAS]

  \begin{enumerate}
    \item Spreadsheet
    \item In cell A1: \verb;mean(randNorm(sd, mean, sample size));
    \item Edit \(\rightarrow\) Fill \(\rightarrow\) Fill Range
    \item Input range as A1:A\(n\) where \(n\) is the number of samples
    \item Graph \(\rightarrow\) Histogram
  \end{enumerate}
\end{tcolorbox}

\subsubsection*{Sample size of \(n\)}

\[ \overline{X} = \sum_{i=1}^n \frac{x_i}{n} = \dfrac{\sum x}{n} \]

Sample mean is distributed with mean \(\mu\) and sd \(\frac{\sigma}{\sqrt{n}}\) (approaches these values for increasing sample size \(n\)).

For a new distribution with mean of \(n\) trials, \(\operatorname{E}(X^\prime) = \operatorname{E}(X), \quad \operatorname{sd}(X^\prime) = \dfrac{\operatorname{sd}(X)}{\sqrt{n}}\).

\begin{tcolorbox}[colframe=cas!75!black, title=On CAS]

  \begin{itemize}
    \item Spreadsheet \(\rightarrow\) Catalog \(\rightarrow\) \verb;randNorm(sd, mean, n); where \verb;n; is the number of samples. Show histogram with Histogram key in top left.
    \item To calculate parameters of a dataset: Calc \(\rightarrow\) One-variable
  \end{itemize}
\end{tcolorbox}

\section{Normal distributions}

mean = mode = median

\[ Z = \frac{X - \mu}{\sigma} \]

Normal distributions must have area (total probability)
of 1 \(\implies \int^\infty_{-\infty} f(x) \> dx = 1\)

% gauss(mean, sd): normal density with the given mean (#1) and standard
% deviation (#2), evaluated at the current plot variable x.
\pgfmathdeclarefunction{gauss}{2}{%
  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}%
}

% Normal curve drawn on two overlaid axes that share the same x scale:
% the upper axis is labelled in terms of mu and sigma, the lower (shifted)
% axis shows the matching standard scores Z.
% The curve uses sd 1 so that the ticks at +-1 and +-2 really sit one and two
% standard deviations from the mean, and the peak is at 1/sqrt(2*pi) ~ 0.3989.
\begin{tikzpicture}
  \pgfplotsset{set layers}
  % Upper axis: x in original units, y is the density f(x).
  \begin{axis}[
      every axis plot post/.append style={
        mark=none, domain=-3:3, samples=50, smooth},
      axis x line=bottom,
      axis y line=left,
      enlargelimits=upper,
      x=\textwidth/10,
      ytick={0.3989},
      yticklabels={\(\frac{1}{\sigma \sqrt{2\pi}}\)},
      xtick={-2,-1,0,1,2},
      x tick label style={font=\footnotesize},
      xticklabels={\((\mu-2\sigma)\), \((\mu-\sigma)\), \(\mu\), \((\mu+\sigma)\), \((\mu+2\sigma)\)},
      xlabel={\(x\)},
      every axis x label/.style={at={(current axis.right of origin)}, anchor=north west},
      every axis y label/.style={at={(axis description cs:-0.02,0.2)}, anchor=south west, rotate=90},
      ylabel={\(f(x)\)},
    ]
    \addplot {gauss(0,1)};
  \end{axis}
  % Lower axis: same curve, x axis line shifted down and labelled as Z.
  \begin{axis}[
      every axis plot post/.append style={
        mark=none, domain=-3:3, samples=50, smooth},
      axis x line=bottom,
      enlargelimits=upper,
      x=\textwidth/10,
      xtick={-2,-1,0,1,2},
      axis x line shift=30pt,
      hide y axis,
      x tick label style={font=\footnotesize},
      xlabel={\(Z\)},
      every axis x label/.style={at={(axis description cs:1,-0.25)}, anchor=south west},
    ]
    \addplot {gauss(0,1)};
  \end{axis}
\end{tikzpicture}

\section{Central limit theorem}

If \(X\) is a random variable with mean \(\mu\) and sd \(\sigma\), then with an adequate sample size \(n\) the distribution of the sample mean \(\overline{X}\) is approximately normal with mean \(\operatorname{E}(\overline{X}) = \mu\) and \(\operatorname{sd}(\overline{X}) = \frac{\sigma}{\sqrt{n}}\).
\section{Confidence intervals}

\begin{itemize}
  \item \textbf{Point estimate:} single-valued estimate of the population mean from the value of the sample mean \(\overline{x}\)
  \item \textbf{Interval estimate:} confidence interval for population mean \(\mu\)
\end{itemize}

\subsection{95\% confidence interval}

The 95\% confidence interval for a population mean \(\mu\) is given by
\[ \overline{x} \pm 1.96 \dfrac{\sigma}{\sqrt{n}} \]
where: \\
\(\overline{x}\) is the sample mean \\
\(\sigma\) is the population sd \\
\(n\) is the sample size from which \(\overline{x}\) was calculated

Always express \(z\) as positive. Express confidence \emph{interval} as an ordered pair.

\colorbox{cas}{\textbf{On CAS}}

Menu \(\rightarrow\) Stats \(\rightarrow\) Calc \(\rightarrow\) Interval \\
Set Type = One-Sample Z Int, Variable

\subsection*{Interpretation of confidence intervals}

95\% confidence interval \(\implies\) 95\% of the confidence intervals constructed from repeated samples will contain the population mean \(\mu\).

\subsection*{Margin of error}

For 95\% confidence interval for \(\mu\), margin of error \(M\) is:
\begin{align*}
  M &= 1.96 \times \dfrac{\sigma}{\sqrt{n}} \\
  \implies n &= \left( \dfrac{1.96 \sigma}{M} \right)^2
\end{align*}

\subsection*{General case}

A confidence interval of \(C\)\% for a mean \(\mu\) is given by
\[ \mu \in \left( \overline{x} - k \dfrac{\sigma}{\sqrt{n}},\ \overline{x} + k \dfrac{\sigma}{\sqrt{n}} \right) \quad \text{ where } k \text{ is such that } \Pr(-k < Z < k) = \frac{C}{100} \]

\subsection*{Confidence interval for multiple trials}

For a set of \(n\) independent 95\% confidence intervals (samples), there is a \(0.95^n\) chance that all \(n\) intervals contain the population mean \(\mu\).
\section{Hypothesis testing}

Note hypotheses are always expressed in terms of population parameters.

\subsection*{Null hypothesis \(H_0\)}

Sample drawn from population has same mean as control population, and any difference can be explained by sample variations.

\subsection*{Alternative hypothesis \(H_1\)}

Amount of variation from control is significant, despite standard sample variations.

\subsection*{\(p\)-value}

Probability of observing a value of the sample statistic at least as extreme as the one observed, assuming the null hypothesis is true.

% table of p-values for strength of evidence

\subsection*{Distribution of sample mean}

If \(X \sim \operatorname{N}(\mu, \sigma)\), then distribution of sample mean \(\overline{X}\) is also normal with \(\overline{X} \sim \operatorname{N}\left(\mu, \frac{\sigma}{\sqrt{n}}\right)\).

\subsection*{Statistical significance}

Significance level is denoted by \(\alpha\).

If \(p<\alpha\), null hypothesis is \textbf{rejected} \\
If \(p>\alpha\), null hypothesis is \textbf{not rejected} (insufficient evidence against \(H_0\))

\subsection*{\(z\)-test}

Hypothesis test for a mean of a sample drawn from a normally distributed population with a known standard deviation.

\subsubsection*{\colorbox{cas}{\textbf{On CAS:}}}

Menu \(\rightarrow\) Statistics \(\rightarrow\) Calc \(\rightarrow\) Test. \\
Select \textit{One-Sample Z-Test} and \textit{Variable}, then input:
\begin{itemize}
  \item \(\mu\) condition --- same operator as \(H_1\)
  \item \(\mu_0\) --- expected sample mean (null hypothesis)
  \item \(\sigma\) --- standard deviation (null hypothesis)
  \item \(\overline{x}\) --- sample mean
  \item \(n\) --- sample size
\end{itemize}

\end{document}