Andrew's git - notes.git/blob - spec/statistics.tex

   1\documentclass[a4paper]{article}
   2\usepackage[a4paper, margin=2cm]{geometry}
   3\usepackage{array}
   4\usepackage{amsmath}
   5\usepackage{amssymb}
   6\usepackage{tcolorbox}
   7\usepackage{fancyhdr}
   8\usepackage{pgfplots}
   9\usepackage{tabularx}
  10\usepackage{keystroke}
  11\usepackage{listings}
   12\usepackage{xcolor} % provides \definecolor, used for the CAS box colour
  13\definecolor{cas}{HTML}{e6f0fe}
  14\usepackage{mathtools}
  15\pgfplotsset{compat=1.16}
  16
  17\pagestyle{fancy}
  18\fancyhead[LO,LE]{Unit 4 Specialist --- Statistics}
  19\fancyhead[CO,CE]{Andrew Lorimer}
  20
  21\setlength\parindent{0pt}
  22
  23\begin{document}
  24
  25  \title{Statistics}
  26  \author{}
  27  \date{}
  28  \maketitle
  29
  30  \section{Linear combinations of random variables}
  31
  32  \subsection*{Continuous random variables}
  33
  34  A continuous random variable \(X\) has a pdf \(f\) such that:
  35
  36  \begin{enumerate}
  37    \item \(f(x) \ge 0 \forall x \)
  38    \item \(\int^\infty_{-\infty} f(x) \> dx = 1\)
  39  \end{enumerate}
  40
  41  \[ \Pr(X \le c) = \int^c_{-\infty} f(x) \> dx \]
  42
  43  \subsubsection*{Linear functions \(X \rightarrow aX+b\)}
  44
  45  \begin{align*}
   46    \Pr(Y \le y) &= \Pr(aX+b \le y) \tag{where \(Y = aX+b\)} \\
   47    &= \Pr\left(X \le \dfrac{y-b}{a}\right) \tag{if \(a > 0\)} \\
  48    &= \int^{\frac{y-b}{a}}_{-\infty} f(x) \> dx
  49  \end{align*}
  50
  51  \begin{align*}
  52    \textbf{Mean:} && \operatorname{E}(aX+b) & = a\operatorname{E}(X)+b \\
   53    \textbf{Variance:} && \operatorname{Var}(aX+b) &= a^2 \operatorname{Var}(X)
  54  \end{align*}
  55
  56  \subsection*{Linear combination of two random variables}
  57
  58  \begin{align*}
  59    \textbf{Mean:} && \operatorname{E}(aX+bY) & = a\operatorname{E}(X)+b\operatorname{E}(Y) \\
   60    \textbf{Variance:} && \operatorname{Var}(aX+bY) &= a^2 \operatorname{Var}(X) + b^2 \operatorname{Var}(Y) \tag{if \(X\) and \(Y\) are independent}
  61  \end{align*}
  62
  63  \section{Sample mean}
  64
  65  Approximation of the \textbf{population mean} determined experimentally.
  66
   67  \[ \overline{x} = \dfrac{\sum x}{n} \]
   68
   69  where \(n\) is the size of the sample (number of sample points) and \(x\) is the value of a sample point.
  70
  71  \begin{tcolorbox}[colframe=cas!75!black, title=On CAS]
  72
  73  \begin{enumerate}
  74    \item Spreadsheet
  75    \item In cell A1: \verb;mean(randNorm(sd, mean, sample size));
  76    \item Edit \(\rightarrow\) Fill \(\rightarrow\) Fill Range
  77    \item Input range as A1:An where \(n\) is the number of samples
  78    \item Graph \(\rightarrow\) Histogram
  79  \end{enumerate}
  80  \end{tcolorbox}
  81
  82  \subsubsection*{Sample size of \(n\)}
  83
  84  \[ \overline{X} = \sum_{i=1}^n \frac{x_i}{n} = \dfrac{\sum x}{n} \]
  85
  86  Sample mean is distributed with mean \(\mu\) and sd \(\frac{\sigma}{\sqrt{n}}\) (approaches these values for increasing sample size \(n\)).
  87
  88  \begin{tcolorbox}[colframe=cas!75!black, title=On CAS]
  89  
  90    \begin{itemize}
  91      \item Spreadsheet \(\rightarrow\) Catalog \(\rightarrow\) \verb;randNorm(sd, mean, n); where \verb;n; is the number of samples. Show histogram with Histogram key in top left
  92      \item To calculate parameters of a dataset: Calc \(\rightarrow\) One-variable
  93    \end{itemize}
  94  \end{tcolorbox}
  95  
  96  \section{Normal distributions}
  97
  98  mean = mode = median
  99
 100  \[ Z = \frac{X - \mu}{\sigma} \]
 101
 102  Normal distributions must have area (total prob.) of 1 \(\implies \int^\infty_{-\infty} f(x) \> dx = 1\)
  103\pgfmathdeclarefunction{gauss}{2}{% gauss(#1, #2): normal probability density with mean #1 and sd #2
  104  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}% evaluated at x, which is not a declared argument but is read from the surrounding pgfmath context (the sample variable of the \addplot that calls gauss)
  105}
 106
 107{\begin{center} \begin{tikzpicture}
 108  \pgfplotsset{set layers}
 109\begin{axis}[every axis plot post/.append style={
 110  mark=none,domain=-3:3,samples=50,smooth}, 
 111  axis x line=bottom, 
 112  axis y line=left,
 113  enlargelimits=upper,
 114  x=\textwidth/10,
 115  ytick={0.55},
 116  yticklabels={\(\frac{1}{\sigma \sqrt{2\pi}}\)}, 
 117  xtick={-2,-1,0,1,2},
 118  x tick label style = {font=\footnotesize},
 119  xticklabels={\((\mu-2\sigma)\), \((\mu-\sigma)\), \(\mu\), \((\mu+\sigma)\), \((\mu+2\sigma)\)},
 120  xlabel={\(x\)},
 121  every axis x label/.style={at={(current axis.right of origin)},anchor=north west},
 122  every axis y label/.style={at={(axis description cs:-0.02,0.2)}, anchor=south west, rotate=90},
 123  ylabel={\(\Pr(X=x)\)}]
 124  \addplot {gauss(0,0.75)};
 125\end{axis}
 126\begin{axis}[every axis plot post/.append style={
 127  mark=none,domain=-3:3,samples=50,smooth}, 
 128  axis x line=bottom, 
 129  enlargelimits=upper,
 130  x=\textwidth/10,
 131  xtick={-2,-1,0,1,2},
 132  axis x line shift=30pt,
 133  hide y axis,
 134  x tick label style = {font=\footnotesize},
 135  xlabel={\(Z\)},
 136  every axis x label/.style={at={(axis description cs:1,-0.25)},anchor=south west}]
 137  \addplot {gauss(0,0.75)};
 138\end{axis}
 139\end{tikzpicture}\end{center}}
 140
 141  \section{Central limit theorem}
 142
  143  If \(X\) is a random variable (with any distribution) with mean \(\mu\) and sd \(\sigma\), then for a sufficiently large sample size \(n\) the distribution of the sample mean \(\overline{X}\) is approximately normal with mean \(\operatorname{E}(\overline{X}) = \mu\) and \(\operatorname{sd}(\overline{X}) = \frac{\sigma}{\sqrt{n}}\).
 144
 145\end{document}