\documentclass[a4paper]{article}
\usepackage[a4paper, margin=2cm]{geometry}
\usepackage{array}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tcolorbox}
\usepackage{fancyhdr}
\usepackage{pgfplots}
\usepackage{tabularx}
\usepackage{keystroke}
\usepackage{listings}
\usepackage{xcolor}% used only to show the phantomed stuff
\definecolor{cas}{HTML}{e6f0fe}
\usepackage{mathtools}
\pgfplotsset{compat=1.16}

\pagestyle{fancy}
\fancyhead[LO,LE]{Unit 4 Specialist --- Statistics}
\fancyhead[CO,CE]{Andrew Lorimer}

\setlength\parindent{0pt}

\begin{document}

\title{Statistics}
\author{}
\date{}
\maketitle

\section{Linear combinations of random variables}

\subsection*{Continuous random variables}

A continuous random variable \(X\) has a pdf \(f\) such that:

\begin{enumerate}
\item \(f(x) \ge 0 \quad \forall x\)
\item \(\int^\infty_{-\infty} f(x) \> dx = 1\)
\end{enumerate}

\[\Pr(X \le c) = \int^c_{-\infty} f(x) \> dx \]

\subsubsection*{Linear functions \(X \rightarrow aX+b\)}

For \(Y = aX+b\) with \(a > 0\) (the inequality reverses when \(a < 0\)):
\begin{align*}
\Pr(Y \le y) &= \Pr(aX+b \le y) \\
 &= \Pr\left(X \le \dfrac{y-b}{a}\right) \\
 &= \int^{\frac{y-b}{a}}_{-\infty} f(x) \> dx
\end{align*}

\begin{align*}
\textbf{Mean:} && \operatorname{E}(aX+b) &= a\operatorname{E}(X)+b \\
\textbf{Variance:} && \operatorname{Var}(aX+b) &= a^2\operatorname{Var}(X)
\end{align*}

\subsection*{Linear combination of two random variables}

\begin{align*}
\textbf{Mean:} && \operatorname{E}(aX+bY) &= a\operatorname{E}(X)+b\operatorname{E}(Y) \\
\textbf{Variance:} && \operatorname{Var}(aX+bY) &= a^2\operatorname{Var}(X) + b^2\operatorname{Var}(Y) \tag{if \(X\) and \(Y\) are independent}
\end{align*}

\section{Sample mean}

Approximation of the \textbf{population mean} determined experimentally.
\[\overline{x} = \dfrac{\sum x}{n} \]
where \(n\) is the size of the sample (number of sample points) and \(x\) is the value of a sample point.

\begin{tcolorbox}[colframe=cas!75!black, title=On CAS]

\begin{enumerate}
\item Spreadsheet
\item In cell A1: \verb;mean(randNorm(sd, mean, sample size));
\item Edit \(\rightarrow\) Fill \(\rightarrow\) Fill Range
\item Input range as A1:An where \(n\) is the number of samples
\item Graph \(\rightarrow\) Histogram
\end{enumerate}
\end{tcolorbox}

\subsubsection*{Sample size of \(n\)}

\[\overline{X} = \sum_{i=1}^n \frac{x_i}{n} = \dfrac{\sum x}{n} \]

The sample mean is distributed with mean \(\mu\) and sd \(\frac{\sigma}{\sqrt{n}}\) (observed values approach these for increasing sample size \(n\)).

For a new distribution \(X^\prime\) formed from the mean of \(n\) trials, \(\operatorname{E}(X^\prime) = \operatorname{E}(X), \quad \operatorname{sd}(X^\prime) = \dfrac{\operatorname{sd}(X)}{\sqrt{n}}\).

\begin{tcolorbox}[colframe=cas!75!black, title=On CAS]

\begin{itemize}
\item Spreadsheet \(\rightarrow\) Catalog \(\rightarrow\) \verb;randNorm(sd, mean, n); where \verb;n; is the number of samples. Show the histogram with the Histogram key in the top left.
\item To calculate parameters of a dataset: Calc \(\rightarrow\) One-variable
\end{itemize}
\end{tcolorbox}

\section{Normal distributions}

mean = mode = median

\[ Z = \frac{X - \mu}{\sigma} \]

Normal distributions must have area (total probability)
of 1 \(\implies \int^\infty_{-\infty} f(x) \> dx = 1\)

% gauss(mu, sigma): normal probability density with mean #1 and standard
% deviation #2, evaluated at the plot variable x.
\pgfmathdeclarefunction{gauss}{2}{%
  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}%
}

% Bell curve drawn on two stacked axes that share the same horizontal scale:
% the upper axis is labelled in terms of mu and sigma, the lower (shifted) axis
% shows the corresponding standard scores Z.
\begin{center}
\begin{tikzpicture}
  \pgfplotsset{set layers}
  % Axis 1: the density against x, ticks labelled mu + k*sigma.
  % The curve uses sigma = 1 so that the ticks at -2..2 really sit at
  % mu + k*sigma and line up with the Z axis below.
  \begin{axis}[
      every axis plot post/.append style={
        mark=none, domain=-3:3, samples=50, smooth},
      axis x line=bottom,
      axis y line=left,
      enlargelimits=upper,
      x=\textwidth/10,
      % peak height of gauss(0,1) is 1/sqrt(2*pi), approximately 0.3989
      ytick={0.3989},
      yticklabels={\(\frac{1}{\sigma \sqrt{2\pi}}\)},
      xtick={-2,-1,0,1,2},
      x tick label style={font=\footnotesize},
      xticklabels={\((\mu-2\sigma)\), \((\mu-\sigma)\), \(\mu\), \((\mu+\sigma)\), \((\mu+2\sigma)\)},
      xlabel={\(x\)},
      every axis x label/.style={at={(current axis.right of origin)}, anchor=north west},
      every axis y label/.style={at={(axis description cs:-0.02,0.2)}, anchor=south west, rotate=90},
      % vertical axis is the density f(x); Pr(X = x) is zero for continuous X
      ylabel={\(f(x)\)}]
    \addplot{gauss(0,1)};
  \end{axis}
  % Axis 2: same curve again with the y axis hidden, only to obtain a second
  % x axis (shifted down by 30pt) carrying the Z values.
  \begin{axis}[
      every axis plot post/.append style={
        mark=none, domain=-3:3, samples=50, smooth},
      axis x line=bottom,
      enlargelimits=upper,
      x=\textwidth/10,
      xtick={-2,-1,0,1,2},
      axis x line shift=30pt,
      hide y axis,
      x tick label style={font=\footnotesize},
      xlabel={\(Z\)},
      every axis x label/.style={at={(axis description cs:1,-0.25)}, anchor=south west}]
    \addplot{gauss(0,1)};
  \end{axis}
\end{tikzpicture}
\end{center}

\section{Central limit theorem}

If \(X\) is randomly distributed with mean \(\mu\) and sd \(\sigma\), then with an adequate sample size \(n\) the distribution of the sample mean \(\overline{X}\) is approximately normal with mean \(\operatorname{E}(\overline{X}) = \mu\) and \(\operatorname{sd}(\overline{X}) = \frac{\sigma}{\sqrt{n}}\).

\section{Confidence intervals}

\begin{itemize}
\item \textbf{Point estimate:} single-valued estimate of the population mean from the value of the sample mean \(\overline{x}\)
\item \textbf{Interval estimate:} confidence interval for population mean \(\mu\)
\end{itemize}

\subsection*{95\% confidence interval}

\[\left( \overline{x} - 1.96\dfrac{\sigma}{\sqrt{n}},\; \overline{x} + 1.96\dfrac{\sigma}{\sqrt{n}} \right)\]

where: \\
\(\mu\) is the population mean (unknown) \\
\(\overline{x}\) is the sample mean \\
\(\sigma\) is the population sd \\
\(n\) is the sample size from which \(\overline{x}\) was calculated

Always express \(z\) as positive. Express the confidence \emph{interval} as an ordered pair.

\colorbox{cas}{\textbf{On CAS}}

Menu \(\rightarrow\) Stats \(\rightarrow\) Calc \(\rightarrow\) Interval \\
Set Type = One-Sample Z Int, Variable

\subsection*{Interpretation of confidence intervals}

95\% confidence interval \(\implies\) 95\% of the confidence intervals constructed from repeated samples will contain the population mean \(\mu\).

\subsection*{Margin of error}

For a 95\% confidence interval for \(\mu\), the margin of error \(M\) is:

\begin{align*}
 M &= 1.96\times \dfrac{\sigma}{\sqrt{n}} \\
\implies n &= \left( \dfrac{1.96\sigma}{M}\right)^2
\end{align*}

\subsection*{General case}

A confidence interval of \(C\)\% for \(\mu\) replaces 1.96 with a general multiplier \(k\).

A 95\% confidence interval for \(\mu\) will have margin of error \(M = k \frac{\sigma}{\sqrt{n}}\) when \(k = 1.96\); in general the \(C\)\% interval is given by
\[\overline{x}\pm k \dfrac{\sigma}{\sqrt{n}} \]
where \(k\) is such that \(\Pr(-k < Z < k) = \frac{C}{100}\).


\end{document}