% Unit 4 Specialist --- Statistics: summary notes.
\documentclass[a4paper]{article}

% Packages (original load order kept).
\usepackage[a4paper, margin=2cm]{geometry}
\usepackage{array}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tcolorbox}
\usepackage{fancyhdr}
\usepackage{pgfplots}
\usepackage{tabularx}
\usepackage{keystroke}
\usepackage{listings}
\usepackage{xcolor} % provides \definecolor and \colorbox for the CAS highlight colour
\definecolor{cas}{HTML}{e6f0fe}
\usepackage{mathtools}
\pgfplotsset{compat=1.16}

% Running header on every page.
\pagestyle{fancy}
\fancyhead[LO,LE]{Unit 4 Specialist --- Statistics}
\fancyhead[CO,CE]{Andrew Lorimer}

\setlength\parindent{0pt}

\begin{document}

\title{Statistics}
\author{}
\date{}
\maketitle

\section{Linear combinations of random variables}

\subsection*{Continuous random variables}

A continuous random variable \(X\) has a pdf \(f\) such that:

\begin{enumerate}
  \item \(f(x) \ge 0 \quad \forall x\)
  \item \(\int^\infty_{-\infty} f(x) \> dx = 1\)
\end{enumerate}

\[ \Pr(X \le c) = \int^c_{-\infty} f(x) \> dx \]

\subsubsection*{Linear functions \(X \rightarrow aX+b\)}

% Dividing by a keeps the direction of the inequality only when a > 0.
For \(Y = aX + b\) with \(a > 0\):
\begin{align*}
  \Pr(Y \le y) &= \Pr(aX+b \le y) \\
               &= \Pr\left(X \le \dfrac{y-b}{a}\right) \\
               &= \int^{\frac{y-b}{a}}_{-\infty} f(x) \> dx
\end{align*}

% No trailing \\ on the last row: it would add an empty row to the display.
\begin{align*}
  \textbf{Mean:}     && \operatorname{E}(aX+b)   &= a\operatorname{E}(X)+b \\
  \textbf{Variance:} && \operatorname{Var}(aX+b) &= a^2 \operatorname{Var}(X)
\end{align*}

\subsection*{Linear combination of two random variables}

\begin{align*}
  \textbf{Mean:}     && \operatorname{E}(aX+bY)   &= a\operatorname{E}(X)+b\operatorname{E}(Y) \\
  \textbf{Variance:} && \operatorname{Var}(aX+bY) &= a^2 \operatorname{Var}(X) + b^2 \operatorname{Var}(Y) \tag{if \(X\) and \(Y\) are independent}
\end{align*}

\section{Sample mean}

Approximation of the \textbf{population mean} determined experimentally.
\[ \overline{x} = \dfrac{\sum x}{n} \]

where \(n\) is the size of the sample (number of sample points) and \(x\) is the value of a sample point.

\begin{tcolorbox}[colframe=cas!75!black, title=On CAS]

  \begin{enumerate}
    \item Spreadsheet
    \item In cell A1: \verb;mean(randNorm(sd, mean, sample size));
    \item Edit \(\rightarrow\) Fill \(\rightarrow\) Fill Range
    \item Input range as A1:A\(n\) where \(n\) is the number of samples
    \item Graph \(\rightarrow\) Histogram
  \end{enumerate}
\end{tcolorbox}

\subsubsection*{Sample size of \(n\)}

\[ \overline{X} = \sum_{i=1}^n \frac{x_i}{n} = \dfrac{\sum x}{n} \]

Sample mean is distributed with mean \(\mu\) and sd \(\frac{\sigma}{\sqrt{n}}\) (approaches these values for increasing sample size \(n\)).

For a new distribution with mean of \(n\) trials, \(\operatorname{E}(X^\prime) = \operatorname{E}(X), \quad \operatorname{sd}(X^\prime) = \dfrac{\operatorname{sd}(X)}{\sqrt{n}}\).

\begin{tcolorbox}[colframe=cas!75!black, title=On CAS]

  \begin{itemize}
    \item Spreadsheet \(\rightarrow\) Catalog \(\rightarrow\) \verb;randNorm(sd, mean, n); where \verb;n; is the number of samples. Show histogram with Histogram key in top left.
    \item To calculate parameters of a dataset: Calc \(\rightarrow\) One-variable
  \end{itemize}
\end{tcolorbox}

\section{Normal distributions}

mean = mode = median

\[ Z = \frac{X - \mu}{\sigma} \]

Normal distributions must have area (total probability)
of 1 \(\implies \int^\infty_{-\infty} f(x) \> dx = 1\)
% gauss(<mean>, <sd>): normal pdf with the given mean and sd,
% evaluated at the plot variable x.
\pgfmathdeclarefunction{gauss}{2}{%
  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}%
}

\begin{center}
  \begin{tikzpicture}
    \pgfplotsset{set layers}
    % Upper axis: the density against x, with ticks labelled in units of sigma.
    % The curve is drawn with sd 1 so that the ticks at -2..2 really sit at
    % mu - 2 sigma .. mu + 2 sigma, and the y tick 0.399 = 1/sqrt(2*pi) is the peak.
    \begin{axis}[
        every axis plot post/.append style={mark=none, domain=-3:3, samples=50, smooth},
        axis x line=bottom,
        axis y line=left,
        enlargelimits=upper,
        x=\textwidth/10,
        ytick={0.399},
        yticklabels={\(\frac{1}{\sigma \sqrt{2\pi}}\)},
        xtick={-2,-1,0,1,2},
        x tick label style={font=\footnotesize},
        xticklabels={\((\mu-2\sigma)\), \((\mu-\sigma)\), \(\mu\), \((\mu+\sigma)\), \((\mu+2\sigma)\)},
        xlabel={\(x\)},
        every axis x label/.style={at={(current axis.right of origin)}, anchor=north west},
        every axis y label/.style={at={(axis description cs:-0.02,0.2)}, anchor=south west, rotate=90},
        % X is continuous, so the vertical axis is the density f(x), not Pr(X = x).
        ylabel={\(f(x)\)}]
      \addplot {gauss(0,1)};
    \end{axis}
    % Lower axis: the same curve redrawn with a hidden y axis, so that only a
    % second x axis (shifted down by 30pt) appears, showing the standardised Z scale.
    \begin{axis}[
        every axis plot post/.append style={mark=none, domain=-3:3, samples=50, smooth},
        axis x line=bottom,
        enlargelimits=upper,
        x=\textwidth/10,
        xtick={-2,-1,0,1,2},
        axis x line shift=30pt,
        hide y axis,
        x tick label style={font=\footnotesize},
        xlabel={\(Z\)},
        every axis x label/.style={at={(axis description cs:1,-0.25)}, anchor=south west}]
      \addplot {gauss(0,1)};
    \end{axis}
  \end{tikzpicture}
\end{center}

\section{Central limit theorem}

If \(X\) is a random variable with mean \(\mu\) and sd \(\sigma\), then with an adequate sample size \(n\) the distribution of the sample mean \(\overline{X}\) is approximately normal with mean \(\operatorname{E}(\overline{X}) = \mu\) and \(\operatorname{sd}(\overline{X}) = \frac{\sigma}{\sqrt{n}}\).
\section{Confidence intervals}

\begin{itemize}
  \item \textbf{Point estimate:} single-valued estimate of the population mean from the value of the sample mean \(\overline{x}\)
  \item \textbf{Interval estimate:} confidence interval for population mean \(\mu\)
\end{itemize}

\subsection*{95\% confidence interval}

\[ \left( \overline{x} - 1.96 \dfrac{\sigma}{\sqrt{n}},\; \overline{x} + 1.96 \dfrac{\sigma}{\sqrt{n}} \right) \]

where: \\
\(\mu\) is the population mean (unknown) \\
\(\overline{x}\) is the sample mean \\
\(\sigma\) is the population sd \\
\(n\) is the sample size from which \(\overline{x}\) was calculated

Always express \(z\) as positive. Express confidence \emph{interval} as ordered pair.

\colorbox{cas}{\textbf{On CAS}}

Menu \(\rightarrow\) Stats \(\rightarrow\) Calc \(\rightarrow\) Interval \\
Set Type = One-Sample Z Int, Variable

\subsection*{Interpretation of confidence intervals}

95\% confidence interval \(\implies\) 95\% of the confidence intervals calculated from repeated samples will contain the population mean \(\mu\).

\subsection*{Margin of error}

For a 95\% confidence interval for \(\mu\), the margin of error \(M\) is:

\begin{align*}
  M &= 1.96 \times \dfrac{\sigma}{\sqrt{n}} \\
  \implies n &= \left( \dfrac{1.96 \sigma}{M} \right)^2
\end{align*}

\subsection*{General case}

A confidence interval of \(C\)\% for \(\mu\) is given by
\[ \overline{x} \pm k \dfrac{\sigma}{\sqrt{n}} \]
where \(k\) is such that \(\Pr(-k < Z < k) = \frac{C}{100}\).

A 95\% confidence interval for \(\mu\) (\(k = 1.96\)) will have margin of error \(M\) when \(n = \left( \dfrac{1.96 \sigma}{M} \right)^2\); in general, \(n = \left( \dfrac{k \sigma}{M} \right)^2\).


\end{document}