\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{mvyjayanti}
\pdfinfo{
  /Title (data-mining.pdf)
  /Creator (Cheatography)
  /Author (mvyjayanti)
  /Subject (data mining Cheat Sheet)
}

% Lengths and widths
% Enlarge the text area relative to the a4paper defaults (4-column layout).
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A3A3A3} \definecolor{LightBackground}{HTML}{F3F3F3} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{data mining Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{mvyjayanti} via \textcolor{DarkBackground}{\uline{cheatography.com/72036/cs/18263/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}mvyjayanti \\ \uline{cheatography.com/mvyjayanti} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 13th December, 2018.\\ Page {\thepage} of \pageref{LastPage}. 
\end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{4} \begin{tabularx}{3.833cm}{x{1.81949 cm} x{1.61351 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Naive Bayes and LogReg}} \tn % Row 0 \SetRowColor{LightBackground} P(A|C) = (P(C|A)P(A))/P(C) & predicts T/F, "S" shaped, from 0-1 \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} posterior = (likelihood x prior)/normalizing constant & log(odds) = log(p/(1-p)) \tn % Row Count 5 (+ 3) % Row 2 \SetRowColor{LightBackground} pros: easy/fast, assuming independence, categorical & z = estimated intercept/std error \tn % Row Count 8 (+ 3) % Row 3 \SetRowColor{white} cons: if not in set -\textgreater{} 0\% -{}- can use Laplace estimation (add 1), bad estimator, independent predictor assumption -\textgreater{} unlikely & y = log(F)B1 + log(T/F)B2 \tn % Row Count 14 (+ 6) % Row 4 \SetRowColor{LightBackground} LR: p = e\textasciicircum{}log(odds))\textasciicircum{}/(1+e\textasciicircum{}log(odds)\textasciicircum{}) & likelihood = mul. 
all T x all (1-F) \tn
% Row Count 16 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{3.833cm}}{log(L) = sum i to n(log(Tn)) + sum(log(Fn)) \newline R\textasciicircum{}2\textasciicircum{}=(SS(mean) - SS(fit))/SS(mean)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{ANNs}} \tn
% Row 0
\SetRowColor{LightBackground}
neuron = things that hold number from 0 to 1 & boolean: T=1, F=0 \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
ŷ = 1 if \seqsplit{w₁x₁+w₂x₂+...wnxn-t(bias} factor) \textgreater{}0\{\{noshy\}\} & , -1 if \textless{}0 \tn
% Row Count 7 (+ 4)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{ŷ=sign(w₁x₁+w₂x₂+}...wnxn-t=sign({\bf{w}}•{\bf{x}})\{\{nl\}\}λ=learning rate\{\{nl\}\}xij=val of jth attribute of training example xi & for weight update: wj\textasciicircum{}(k+1)\textasciicircum{} = weight param associated w/ i\textasciicircum{}th\textasciicircum{} input link after k\textasciicircum{}th\textasciicircum{} iteration \tn
% Row Count 14 (+ 7)
% Row 3
\SetRowColor{white}
wj\textasciicircum{}(k+1)\textasciicircum{}=wj\textasciicircum{}(k)+λ(yi-ŷi\textasciicircum{}k\textasciicircum{})xij & error = y - ŷ \tn
% Row Count 16 (+ 2)
% Row 4
\SetRowColor{LightBackground}
if error = 2, inc w of +ves & if error = -2, in w of -ves \tn
% Row Count 18 (+ 2)
% Row 5
\SetRowColor{white}
Error E = ΣEk | k∊outputs & Ek = 1/2(tk-ok)\textasciicircum{}2\textasciicircum{} \tn
% Row Count 20 (+ 2)
% Row 6
\SetRowColor{LightBackground}
output oi = 1/(1+e\textasciicircum{}-net i\textasciicircum{}) & net i = Σwij*oi \tn
% Row Count 22 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Inductive Bias, No Free Lunch}} \tn
% Row 0
\SetRowColor{LightBackground}
IB: anything influencing hypothesis choice other than training set & part of language accessible, method of choosing \tn
% Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
NFL: for any 2 algorithms A\&B, there exists a dataset for which A outperforms B & assuming uniform P(x,y)→\#of datasets for which A\textgreater{}B = \# B\textgreater{}A \tn
% Row Count 8 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{SVM}} \tn
% Row 0
\SetRowColor{LightBackground}
frontier that best segregates 2 classes by margins & polynomial kernel: k(x,y)=(x*y+1)\textasciicircum{}d\textasciicircum{} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
RBF kernel:k(x,y)=\{\{nl\}\}e\textasciicircum{}-𝛄(||x-y||$^{\textrm{2}}$)\textasciicircum{} & tune by k-fold cross-val (k=5) \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
adv: high dimension spaces, \#of dimensions \textgreater{} \#of samples & diff kernel functions for diff decisions \tn
% Row Count 9 (+ 3)
% Row 3
\SetRowColor{white}
k1+k2 = even more complex\{\{br\}\} & \{\{bt\}\}dis: if \#features \textgreater{} \#samples, CV \tn
% Row Count 11 (+ 2)
% Row 4
\SetRowColor{LightBackground}
min(||w||\textasciicircum{}2\textasciicircum{}) for linear\{\{nl\}\}ξ: how far ptᵢ is from correct side & wxᵢ+b\textgreater{}=1-ξ if yᵢ=1\{\{nl\}\}wxᵢ+b\textgreater{}=-1+ξ if yᵢ=-1 \tn
% Row Count 15 (+ 4)
% Row 5
\SetRowColor{white}
min(||w||+C(Σi=1→nξᵢ))\{\{noshy\}\} & max((Σλᵢ) -1/2(λᵢλⱼyᵢyⱼxᵢxⱼ))\{\{noshy\}\} \tn
% Row Count 18 (+ 3)
% Row 6
\SetRowColor{LightBackground}
dist btw parallel planes = 2/||w|| & ||w|| = sqrt(w₁\textasciicircum{}2\textasciicircum{}+w₂\textasciicircum{}2\textasciicircum{}...)
\tn % Row Count 20 (+ 2) % Row 7 \SetRowColor{white} generalization error\textless{}= p(bar)(1-s\textasciicircum{}2\textasciicircum{})/s\textasciicircum{}2\textasciicircum{} & p(bar) = avg correlation, s=strength \tn % Row Count 23 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Errors}} \tn % Row 0 \SetRowColor{LightBackground} P(+) = 1/(1+ e\textasciicircum{}-(w0+w1x1...)\textasciicircum{}) & error= misclassification \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} For new cases, predict: & 1 if (w0+w1x1+...)\textgreater{}=1, 0 else \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} if w0 inc as x inc, p(+) inc & error = (FP+FN)/All \tn % Row Count 6 (+ 2) % Row 3 \SetRowColor{white} sensitivity = TP/(TP+FN) & specificity = TN/(TN+FP) \tn % Row Count 8 (+ 2) % Row 4 \SetRowColor{LightBackground} +ve predicted val = TP/(TP+FP) & -ve predicted val = TN/(TN+FN) \tn % Row Count 10 (+ 2) % Row 5 \SetRowColor{white} true error: error on true underlying distribution (unmeasurable) & apparent error: error on example used to train model (underestimates TE) \tn % Row Count 14 (+ 4) % Row 6 \SetRowColor{LightBackground} generalization: ability to predict unseen cases & Occam's Razor: should not be multiplied beyond necessity \tn % Row Count 17 (+ 3) % Row 7 \SetRowColor{white} Overfitting: memorizing training set & test error: error on ex. 
held out of training \tn % Row Count 20 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.92248 cm} x{1.51052 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{KNN}} \tn % Row 0 \SetRowColor{LightBackground} select k: sqrt(n), if n is even, choose odd & Ri = \{x:d(x,xi)\textless{} d(x,x2), i!=j\} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{2}{x{3.833cm}}{euclidean distance =sqrt((x-x1)\textasciicircum{}2\textasciicircum{}+(y-y1)\textasciicircum{}2\textasciicircum{})} \tn % Row Count 3 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Model Eval}} \tn % Row 0 \SetRowColor{LightBackground} Holdout & train on 2/3, test on 1/3, one is validation set (high variance on estimate) \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} Leave-one-out & train on N-1, test on 1 (good estimate) \tn % Row Count 6 (+ 2) % Row 2 \SetRowColor{LightBackground} K-folds Cross Val & divide set into k parts, LOO each, repeat N times, compute mean and std dev for each \tn % Row Count 11 (+ 5) % Row 3 \SetRowColor{white} Bootstrapping\{\{noshy\}\} & randomly draw N points (can repeat), train, test on S - S1 \tn % Row Count 14 (+ 3) % Row 4 \SetRowColor{LightBackground} Compare 2 methods: H0: meanLR = meanNB, H1: meanLR\textless{}meanNB & t=(meanNB-meanLR)/S (S:pooled variance), reject H0 if t\textgreater{}t alpha \tn % Row Count 18 (+ 4) % Row 5 \SetRowColor{white} OR H1: meanLR!=meanNB & 2-tailed t. t alpha/2. \seqsplit{((meanVar)x(sqrt(n)))} / S \tn % Row Count 21 (+ 3) % Row 6 \SetRowColor{LightBackground} OR H1: meanLR!=meanNB & 2-tailed t. t alpha/2. 
\seqsplit{((meanVar)x(sqrt(n)))} / S \tn
% Row Count 24 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{3.833cm}}{Better: stratify each fold to contain same \% of positives and negatives} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Decision Trees}} \tn
% Row 0
\SetRowColor{LightBackground}
asks a question: classifies based on T/F & root, internal(arrows to and from), external(arrows to)(leaves) \tn
% Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
break into categories & T/F and Y/N for each \tn
% Row Count 6 (+ 2)
% Row 2
\SetRowColor{LightBackground}
P(Y|T), P(N|T), P(Y|F), P(N|F) & GI\textasciicircum{}2\textasciicircum{}= 1-(Y|F/(Y|F + N|F))\textasciicircum{}2\textasciicircum{}-(N|F/(Y|F + N|F))\textasciicircum{}2\textasciicircum{}\{\{noshy\}\} \tn
% Row Count 9 (+ 3)
% Row 3
\SetRowColor{white}
GI\textasciicircum{}1\textasciicircum{} = & 1-(Y|T/(Y|T + N|T))\textasciicircum{}2\textasciicircum{}-(N|T/(Y|T + N|T))\textasciicircum{}2\textasciicircum{}\{\{noshy\}\} \tn
% Row Count 12 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{3.833cm}}{GI\textasciicircum{}all\textasciicircum{}=(T/(T+F) x GI\textasciicircum{}1\textasciicircum{})+(F/(T+F) x GI\textasciicircum{}2\textasciicircum{})} \tn
% Row Count 13 (+ 1)
% Row 5
\SetRowColor{white}
entropy: \seqsplit{H(S)=-P(y)log2(P(y))-P(n)log2(P(n))} & find H(S\textasciicircum{}true\textasciicircum{}) and H(S\textasciicircum{}false\textasciicircum{}), H(S)-w1H(S\textasciicircum{}true\textasciicircum{})-w2H(S\textasciicircum{}false\textasciicircum{})\{\{noshy\}\} \tn
% Row Count 17 (+ 4)
% Row 6
\SetRowColor{LightBackground}
w1 = T instances/all\{\{noshy\}\} & w2 = F instances/all \tn
% Row Count 19 (+ 2)
% Row 7
\SetRowColor{white}
w1 = T instances/all\{\{noshy\}\} & w2 = F
instances/all \tn % Row Count 21 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{3.833cm}}{largest info gain, least GI} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{ROC and Lift Curves}} \tn % Row 0 \SetRowColor{LightBackground} ROC: sensitivity vs. (1-specificity), higher val the better, \{\{nl\}\}flatter line the worse & sens: TP rate,\{\{nl\}\} 1-spec: FP rate \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} Lift curves: find \% of each total response from sum of all & find \% of each +ve responses from total +ve responses \tn % Row Count 8 (+ 3) % Row 2 \SetRowColor{LightBackground} y = +ve \% / \% of total & x = \% of total \tn % Row Count 10 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{k-means clustering}} \tn % Row 0 \SetRowColor{LightBackground} user choose k, initialize k centers, loop: assign pts nearest those centers, move centroid of assigned pts & center in dense regions or random, optimizing (total distance)\textasciicircum{}2\textasciicircum{} \tn % Row Count 6 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{3.833cm}}{returns local solution} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Ensembles}} \tn % Row 0 \SetRowColor{LightBackground} Bagging: bootstrap aggregating\{\{noshy\}\} & \{\{bl\}\}Boosting: changing weights on pts and building series of classifiers, start w=1 \tn % Row Count 5 (+ 5) % Row 1 
\SetRowColor{white} \{\{bt\}\}incorrect pts weighed by \# that is inversely proportional to training error\{\{noshy\}\} & w inc if misclassified, dec else\{\{nl\}\}classifiers combined by weighting-accuracy of training set \tn % Row Count 10 (+ 5) % Row 2 \SetRowColor{LightBackground} \{\{bt\}\}Arcing(Adaptive resample\&combine):\{\{nl\}\}like boosting but change w by update method & eg. Arc x4: w(x) = 1+e(x)\textasciicircum{}4\textasciicircum{} \{\{nl\}\}e(x)=times x has been misclassified so far\{\{bt\}\} \tn % Row Count 15 (+ 5) % Row 3 \SetRowColor{white} \{\{noshy\}\}depends on: strength(perf of individuals), diversity (uncorrelated errors) & bagging error: from reducing var\{\{nl\}\}boosting can reduce bias\&var | bagging is \textgreater{} base classifier \tn % Row Count 20 (+ 5) % Row 4 \SetRowColor{LightBackground} boosting better or overfit noisy & Random forests: for tree,choose pts,for node, features subset w/ best IG,split, end,recurse,end \tn % Row Count 25 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{feature selection}} \tn % Row 0 \SetRowColor{LightBackground} removing irrelevant info for a better, faster model & drop missing values or encode them \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} drop: if all values are the same & if highly correlated, one of them \tn % Row Count 5 (+ 2) % Row 2 \SetRowColor{LightBackground} if low correlation with target|\{\{nl\}\}trees with least info gain & forward, backward, stepwise selection: best model with f1, then keep going until validation error stops dropping \tn % Row Count 11 (+ 6) % Row 3 \SetRowColor{white} beam or heuristic search & for computation interpretability |\{\{nl\}\}genetic algorithms \tn % Row Count 14 (+ 3) % Row 4 \SetRowColor{LightBackground} 1) filters: all above + other correlation & 2) wrappers: build a 
classifier with a subset+eval on validation data. but 2\textasciicircum{}d\textasciicircum{} possible subsets \tn
% Row Count 19 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{Bias and Var}} \tn
% Row 0
\SetRowColor{LightBackground}
PCA : dimensionality reduction & linear combo of OG features \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
max. variance: smallest \# until 90\% var explained & μ=E(y|x)=T(uk)\{\{nl\}\}ŷ=f(x,Ө) \tn
% Row Count 5 (+ 3)
% Row 2
\SetRowColor{LightBackground}
error: MSE = (ŷ-μ)\textasciicircum{}2\textasciicircum{}\{\{nl\}\}var: E(ŷ-E(ŷ))\textasciicircum{}2\textasciicircum{}\{\{nl\}\}bias:(E(ŷ)-μ)\textasciicircum{}2\textasciicircum{}+noise & \textasciicircum{}best estimate of y given x and fixed params Ө \tn
% Row Count 9 (+ 4)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{3.833cm}}{KNN,ANN,DT: low bias, high var} \tn
% Row Count 10 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{3.833cm}}{var: how much does my estimate var across datasets| bias: systematic error prediction, inability to fit} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{3.833cm}{x{1.7165 cm} x{1.7165 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{3.833cm}}{\bf\textcolor{white}{EM Expectation Maximization clust.}} \tn
% Row 0
\SetRowColor{LightBackground}
hard clustering: each pt only belongs to one cluster & soft clustering: can belong to more than one cluster by \% \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
EM: automatically discover all params for k "sources"→but we may not know source\{\{nl\}\} if we know μ,σ, can find likeliness & mixture models: probabilistic way of soft clustering\{\{nl\}\}each cluster Gaussian or multinomial \tn
% Row Count 10 (+ 7)
% Row 2
\SetRowColor{LightBackground} 1/sqrt(2πσ\textasciicircum{}2\textasciicircum{})*exp(-(xᵢ - μᵦ)\textasciicircum{}2\textasciicircum{}/2σᵦ\textasciicircum{}2\textasciicircum{})\{\{nl\}\}aᵢ=1-bᵢ=P(aᵢ) & Bayesian posterior: bᵢ = P(b|xᵢ) = (P(xᵢ|b)P(b)) / (P(xᵢ|b)P(b) + P(xᵢ|a)P(a)) \tn % Row Count 15 (+ 5) % Row 3 \SetRowColor{white} σᵦ\textasciicircum{}2\textasciicircum{}=(b₁(x₁-μᵦ)\textasciicircum{}2\textasciicircum{}+...) /(b₁+b₂+...) & μᵦ = \seqsplit{(b₁x₁+b₂x₂+..)} / (b₁+b₂+...) \tn % Row Count 18 (+ 3) % Row 4 \SetRowColor{LightBackground} em: places randomly,for each pt P(b|xᵢ): does it look like it came from b & Working to adjust (μₐ, σₐ\textasciicircum{}2\textasciicircum{}) and (μᵦ, σᵦ\textasciicircum{}2\textasciicircum{}) to fit points assigned \tn % Row Count 22 (+ 4) % Row 5 \SetRowColor{white} Iterate until convergence\{\{nl\}\}P(a) = 1- P(b) & Could also estimate priors: P(b) = (b₁+b₂+...)/n \tn % Row Count 25 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{3.833cm}}{"What proportion of the data is each distribution describing"} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}