Andrew's git - notes.git/blob - spec/statistics.tex

   1\documentclass[a4paper]{article}
   2\usepackage[a4paper, margin=2cm]{geometry}
   3\usepackage{array}
   4\usepackage{amsmath}
   5\usepackage{amssymb}
   6\usepackage{tcolorbox}
   7\usepackage{fancyhdr}
   8\usepackage{pgfplots}
   9\usepackage{tabularx}
  10\usepackage{keystroke}
  11\usepackage{listings}
  12\usepackage{xcolor} % used only to show the phantomed stuff
  13\definecolor{cas}{HTML}{e6f0fe}
  14\usepackage{mathtools}
  15\pgfplotsset{compat=1.16}
  16
  17\pagestyle{fancy}
  18\fancyhead[LO,LE]{Unit 4 Specialist --- Statistics}
  19\fancyhead[CO,CE]{Andrew Lorimer}
  20
  21\setlength\parindent{0pt}
  22
  23\begin{document}
  24
  25  \title{Statistics}
  26  \author{}
  27  \date{}
  28  \maketitle
  29
  30  \section{Linear combinations of random variables}
  31
  32  \subsection*{Continuous random variables}
  33
  34  A continuous random variable \(X\) has a pdf \(f\) such that:
  35
  36  \begin{enumerate}
  37    \item \(f(x) \ge 0 \forall x \)
  38    \item \(\int^\infty_{-\infty} f(x) \> dx = 1\)
  39  \end{enumerate}
  40
  41  \[ \Pr(X \le c) = \int^c_{-\infty} f(x) \> dx \]
  42
  43  \subsubsection*{Linear functions \(X \rightarrow aX+b\)}
  44
  45  \begin{align*}
  46    \Pr(Y \le y) &= \Pr(aX+b \le y) \\
  47    &= \Pr\left(X \le \dfrac{y-b}{a}\right) \qquad \text{(for } a > 0 \text{)} \\
  48    &= \int^{\frac{y-b}{a}}_{-\infty} f(x) \> dx
  49  \end{align*}
  50
  51  \begin{align*}
  52    \textbf{Mean:} && \operatorname{E}(aX+b) & = a\operatorname{E}(X)+b \\
  53    \textbf{Variance:} && \operatorname{Var}(aX+b) &= a^2 \operatorname{Var}(X) \\
  54  \end{align*}
  55
  56  \subsection*{Linear combination of two random variables}
  57
  58  \begin{align*}
  59    \textbf{Mean:} && \operatorname{E}(aX+bY) & = a\operatorname{E}(X)+b\operatorname{E}(Y) \\
  60    \textbf{Variance:} && \operatorname{Var}(aX+bY) &= a^2 \operatorname{Var}(X) + b^2 \operatorname{Var}(Y) \tag{if \(X\) and \(Y\) are independent}\\
  61  \end{align*}
  62
  63  \section{Sample mean}
  64
  65  Approximation of the \textbf{population mean} determined experimentally.
  66
  67  \[ \overline{x} = \dfrac{\sum x}{n} \]
  68
  69  where \(n\) is the size of the sample (number of sample points) and \(x\) is the value of a sample point
  70
  71  \begin{tcolorbox}[colframe=cas!75!black, title=On CAS]
  72
  73  \begin{enumerate}
  74    \item Spreadsheet
  75    \item In cell A1: \verb;mean(randNorm(sd, mean, sample size));
  76    \item Edit \(\rightarrow\) Fill \(\rightarrow\) Fill Range
  77    \item Input range as A1:An where \(n\) is the number of samples
  78    \item Graph \(\rightarrow\) Histogram
  79  \end{enumerate}
  80  \end{tcolorbox}
  81
  82  \subsubsection*{Sample size of \(n\)}
  83
  84  \[ \overline{X} = \sum_{i=1}^n \frac{x_i}{n} = \dfrac{\sum x}{n} \]
  85
  86  Sample mean is distributed with mean \(\mu\) and sd \(\frac{\sigma}{\sqrt{n}}\) (approaches these values for increasing sample size \(n\)).
  87
  88  For a new random variable \(X^\prime\) defined as the mean of \(n\) trials of \(X\): \(\operatorname{E}(X^\prime) = \operatorname{E}(X), \quad \operatorname{sd}(X^\prime) = \dfrac{\operatorname{sd}(X)}{\sqrt{n}}\)
  89
  90  \begin{tcolorbox}[colframe=cas!75!black, title=On CAS]
  91  
  92    \begin{itemize}
  93      \item Spreadsheet \(\rightarrow\) Catalog \(\rightarrow\) \verb;randNorm(sd, mean, n); where \verb;n; is the number of samples. Show histogram with Histogram key in top left
  94      \item To calculate parameters of a dataset: Calc \(\rightarrow\) One-variable
  95    \end{itemize}
  96  \end{tcolorbox}
  97  
  98  \section{Normal distributions}
  99
 100  mean = mode = median
 101
 102  \[ Z = \frac{X - \mu}{\sigma} \]
 103
 104  Normal distributions must have area (total prob.) of 1 \(\implies \int^\infty_{-\infty} f(x) \> dx = 1\)
 105\pgfmathdeclarefunction{gauss}{2}{% gauss(#1, #2): normal density with mean #1 and sd #2, evaluated at x
 106  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}% x is not an argument; presumably the pgfplots sampling variable -- TODO confirm
 107}
 108
 109{\begin{center} \begin{tikzpicture} % bell curve drawn twice: against x (in units of sigma) and against Z
 110  \pgfplotsset{set layers}
 111\begin{axis}[every axis plot post/.append style={
 112  mark=none,domain=-3:3,samples=50,smooth}, 
 113  axis x line=bottom, 
 114  axis y line=left,
 115  enlargelimits=upper,
 116  x=\textwidth/10,
 117  ytick={0.3989}, % peak height of gauss(0,1), i.e. 1/sqrt(2*pi)
 118  yticklabels={\(\frac{1}{\sigma \sqrt{2\pi}}\)}, 
 119  xtick={-2,-1,0,1,2}, % curve is plotted with sd 1, so integer ticks are whole multiples of sigma
 120  x tick label style = {font=\footnotesize},
 121  xticklabels={\((\mu-2\sigma)\), \((\mu-\sigma)\), \(\mu\), \((\mu+\sigma)\), \((\mu+2\sigma)\)},
 122  xlabel={\(x\)},
 123  every axis x label/.style={at={(current axis.right of origin)},anchor=north west},
 124  every axis y label/.style={at={(axis description cs:-0.02,0.2)}, anchor=south west, rotate=90},
 125  ylabel={\(f(x)\)}] % the curve is the density f; Pr(X=x) is 0 for a continuous variable
 126  \addplot {gauss(0,1)}; % sd 1 so the curve agrees with the sigma-labelled ticks
 127\end{axis}
 128\begin{axis}[every axis plot post/.append style={
 129  mark=none,domain=-3:3,samples=50,smooth}, 
 130  axis x line=bottom, 
 131  enlargelimits=upper,
 132  x=\textwidth/10,
 133  xtick={-2,-1,0,1,2},
 134  axis x line shift=30pt, % moves this x axis line 30pt away so it sits apart from the first axis
 135  hide y axis,
 136  x tick label style = {font=\footnotesize},
 137  xlabel={\(Z\)},
 138  every axis x label/.style={at={(axis description cs:1,-0.25)},anchor=south west}]
 139  \addplot {gauss(0,1)}; % same curve as the first axis, read against the Z scale (mean 0, sd 1)
 140\end{axis}
 141\end{tikzpicture}\end{center}}
 142
 143  \section{Central limit theorem}
 144
 145  If \(X\) is a random variable (with any distribution) with mean \(\mu\) and sd \(\sigma\), then for a sufficiently large sample size \(n\) the distribution of the sample mean \(\overline{X}\) is approximately normal with mean \(\operatorname{E}(\overline{X}) = \mu\) and \(\operatorname{sd}(\overline{X}) = \frac{\sigma}{\sqrt{n}}\).
 146
 147  \section{Confidence intervals}
 148
 149  \begin{itemize}
 150    \item \textbf{Point estimate:} single-valued estimate of the population mean from the value of the sample mean \(\overline{x}\)
 151    \item \textbf{Interval estimate:} confidence interval for population mean \(\mu\)
 152  \end{itemize}
 153
 154  \subsection*{95\% confidence interval}
 155
 156  \[ \left( \overline{x} - 1.96 \dfrac{\sigma}{\sqrt{n}},\; \overline{x} + 1.96 \dfrac{\sigma}{\sqrt{n}} \right) \]
 157
 158  where: \\
 159  \(\mu\) is the population mean (unknown) \\
 160  \(\overline{x}\) is the sample mean \\
 161  \(\sigma\) is the population sd \\
 162  \(n\) is the sample size from which \(\overline{x}\) was calculated
 163
 164  Always express \(z\) as +ve. Express confidence \textit{interval} as ordered pair.
 165
 166  \colorbox{cas}{\textbf{On CAS}}
 167
 168  Menu \(\rightarrow\) Stats \(\rightarrow\) Calc \(\rightarrow\) Interval \\
 169  Set Type = One-Sample Z Int, Variable
 170
 171  \subsection*{Interpretation of confidence intervals}
 172
 173  95\% confidence interval \(\implies\) 95\% of the confidence intervals constructed from repeated samples will contain the population mean \(\mu\).
 174
 175  \subsection*{Margin of error}
 176
 177  For a 95\% confidence interval for \(\mu\), margin of error \(M\) is:
 178
 179  \begin{align*}
 180    M &= 1.96 \times \dfrac{\sigma}{\sqrt{n}} \\
 181    \implies n &= \left( \dfrac{1.96 \sigma}{M} \right)^2
 182  \end{align*}
 183
 184  \subsection*{General case}
 185
 186  A confidence interval of \(C\)\% is given by
 187
 188  A 95\% confidence interval for \(\mu\) will have margin of error \(M\) when \(k = 1.96\); in general the interval is
 189  \[ \overline{x} \pm k \dfrac{\sigma}{\sqrt{n}} \]
 190
 191  where \(k\) is such that \(\Pr(-k < Z < k) = \frac{C}{100}\)
 192
 193
 194\end{document}