\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{angelica9373}
\pdfinfo{
  /Title (isds-474-midterm-2.pdf)
  /Creator (Cheatography)
  /Author (angelica9373)
  /Subject (ISDS 474 Midterm 2 Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{1FD655}
\definecolor{LightBackground}{HTML}{F1FCF4}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
  \SetRowColor{DarkBackground}
  \vspace{-7pt}
  {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
    \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
  }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
  \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{ISDS 474 Midterm 2 Cheat Sheet}}}} \\
  \normalsize{by \textcolor{DarkBackground}{angelica9373} via \textcolor{DarkBackground}{\uline{cheatography.com/208601/cs/44725/}}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\
  \vspace{-2pt}angelica9373 \\
  \uline{cheatography.com/angelica9373} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\
  \vspace{-2pt}Published 19th October, 2024.\\
  Updated 22nd October, 2024.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Chapter 7}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{How to select K in kNN (1Q)}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Select K by validation data: whichever K gives the lowest validation error.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Binary Classification with Even K's}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}DO NOT USE even numbers, since they can lead to a tie. XLMiner will pick the lowest probability and can choose an even K, but that doesn't mean an even K should be chosen.} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}K \textgreater{} 1: classify by the majority decision rule based on the nearest K records.} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Low K values:}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Capture local structure but may also capture noise. You can't rely on a single neighbour.} \tn
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{High K values:}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Provide more smoothing but may lose local detail. K can be as large as the training sample.} \tn
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Choose the K that gives you the lowest validation error rate.}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
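A minimal sketch of this selection rule, assuming Python with scikit-learn in place of the course's XLMiner (the array names are placeholders):

\begin{verbatim}
# Pick K by lowest validation
# error; odd K only, to avoid
# ties in binary classification.
from sklearn.neighbors import (
    KNeighborsClassifier)

def pick_k(X_tr, y_tr, X_va, y_va,
           k_max=9):
    best_k, best_err = 1, 1.0
    for k in range(1, k_max+1, 2):
        m = KNeighborsClassifier(
            n_neighbors=k)
        m.fit(X_tr, y_tr)
        err = 1 - m.score(X_va, y_va)
        if err < best_err:
            best_err = err
            best_k = k
    return best_k, best_err
\end{verbatim}
\par\addvspace{1.3em}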
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Euclidean Distance (1Q)}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Sometimes predictors need to be standardized to equalize scales before computing distances. Standardized = normalized (values fall roughly in $(-3, 3)$).}}} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{\# of possible partitions in Recursive Partitioning (2Q)}}} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Continuous:}} $(n-1) \times p$} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Categorical:}} 2 levels $\to$ 1 partition, 3 $\to$ 3, 4 $\to$ 7, 5 $\to$ 15 (in general $2^{L-1}-1$ for $L$ levels)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Cut Off Value in Classification}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Cutoff = 0.5 by default, because the estimated probability is the proportion of 1's among the k nearest neighbors; the majority decision rule corresponds to this cutoff for classifying records.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{You can adjust the cutoff value to improve accuracy.} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Y = 1 (if p \textgreater{} cutoff)} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{Y = 0 (if p \textless{} cutoff)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Cut Off Example Question}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Example: Suppose cutoff = 0.9, k = 7, and we observe 5 class-1 and 2 class-0 neighbors. Y = 1 or 0?} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}P(Y=1) = 5/7 = 0.71; since 0.71 \textless{} 0.9, Y = 0.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
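The same cutoff logic as a tiny Python sketch (the function and its names are illustrative, not from the course):

\begin{verbatim}
# Classify from the class-1
# proportion among k neighbors.
def classify(n_class1, k,
             cutoff=0.5):
    p = n_class1 / k
    return 1 if p > cutoff else 0

# Example above: 5 of 7 neighbors
# are class 1, cutoff = 0.9.
classify(5, 7, cutoff=0.9)  # -> 0
\end{verbatim}
\par\addvspace{1.3em}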
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Regression Tree}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Used with {\bf{continuous}} outcome variables. Many splits are attempted; choose the one that minimizes impurity. \newline
- The prediction is computed as the {\bf{average}} of the numerical target variable in the rectangle. \newline
- {\bf{Impurity is measured by the sum of squared deviations from the leaf mean.}} \newline
- Performance is measured by RMSE. \newline
A regression tree is used for prediction. Compared to a classification tree, we only have to ... \newline
{\bf{replace the impurity measure by the sum of squared deviations}}; everything else stays the same. \newline
Splitting by an irrelevant variable gives a bad impurity score, so \newline
only split by relevant variables.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{General Info}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Makes no assumptions about the data. \newline
- A record gets classified as whatever the predominant class is among nearby records. \newline
- The way to find the k nearest neighbors in kNN is through the Euclidean distance. \newline
{\bf{Rescaling:}} Only kNN requires rescaling, because it equalizes the contribution each variable makes to the distance. No need for logistic regression, since rescaling does not change the p-value or RMSE. \newline
No need for CART, since rescaling doesn't change the order of values within a variable. \newline
{\bf{XLMiner can only handle up to K = 10.}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Chapter 9}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Properties of CART (Classification And Regression Trees) (3Q)}}} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Model free} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Automatic variable selection} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Needs a large sample size (because it is model free)} \tn
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Only gives horizontal or vertical splits} \tn
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Both CART methods (classification and regression trees) are model free} \tn
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Best pruned tree}}: The natural end of the splitting process is 100\% purity in each leaf, which ends up fitting noise in the data, so you naturally get overfitting. The minimum error tree is still slightly overfitted, so partition a bit less, pruning back from the minimum error tree.} \tn
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Minimum error tree}}: The tree with the lowest validation error.} \tn
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Full tree}}: The largest tree; its training error equals zero; overfitted.} \tn
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Note: The full tree can be the same as the minimum error tree, BUT the best pruned tree should usually be smaller than the other trees.}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
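A rough sketch of locating the minimum error tree with validation data, assuming scikit-learn's cost-complexity pruning path as a stand-in for XLMiner's pruning (data names are placeholders):

\begin{verbatim}
# Grow a full tree, then keep the
# pruned subtree with the lowest
# validation error.
from sklearn.tree import (
    DecisionTreeClassifier)

def min_error_tree(X_tr, y_tr,
                   X_va, y_va):
    path = DecisionTreeClassifier(
        random_state=0
    ).cost_complexity_pruning_path(
        X_tr, y_tr)
    best, best_err = None, 1.0
    for a in path.ccp_alphas:
        t = DecisionTreeClassifier(
            ccp_alpha=a,
            random_state=0)
        t.fit(X_tr, y_tr)
        e = 1 - t.score(X_va, y_va)
        if e < best_err:
            best, best_err = t, e
    return best
\end{verbatim}
\par\addvspace{1.3em}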
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Impurity Score}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- A metric for the homogeneity of the resulting subgroups of observations. \newline
- For both measures, the lower the better. \newline
- Neither one has an advantage over the other. \newline
{\bf{Gini Index}}: ranges over $(0, 0.50)$ if binary. \newline
{\bf{Entropy Measure}}: ranges over $(0, \log_2 2 = 1)$ if binary, or $(0, \log_2 m)$, where $m$ is the total \# of classes of Y. \newline
{\bf{Overall Impurity Measure:}} weighted average of the impurity of the individual rectangles, the weights being the proportion of cases in each rectangle. \newline
Choose the split that reduces impurity the most (split points become nodes on the tree).} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Check notes for the distance-to-weighted-average ratio.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
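Both measures as a short Python sketch (the label vectors are illustrative); a pure leaf scores 0 under both, and a 50/50 binary split hits the maxima quoted above:

\begin{verbatim}
from collections import Counter
from math import log2

def gini(labels):
    n = len(labels)
    return 1 - sum((c/n)**2 for c
        in Counter(labels).values())

def entropy(labels):
    n = len(labels)
    return -sum((c/n)*log2(c/n)
        for c
        in Counter(labels).values())

gini([0, 0, 1, 1])     # 0.5
entropy([0, 0, 1, 1])  # 1.0
\end{verbatim}
\par\addvspace{1.3em}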
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Dimensional Predictors Q's}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Continuous Partitions}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}$(n-1) \times p$ for $p$-dimensional predictors (holds for more than 2 dimensions)} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Categorical Partitions}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}abcd split: (3 levels, 3 partitions), (4 levels, 7 partitions)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{{\bf{XLMiner only supports binary categorical variables}}}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{When to Stop Partitioning}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- The error rate as a function of the number of splits, for training vs.\ validation data, indicates overfitting. \newline
We can continue partitioning until a {\bf{FULL tree}} is obtained. A full tree is usually overfitted, so we have to impose an EARLY STOP ... \newline
- The training error rate approaches 0 as you partition further, so stop early, before letting it touch 0: {\bf{early stop}} (minimum error tree or best pruned tree), OR stop based on {\bf{chi-square tests}}: \newline
- If the improvement from the additional split is {\bf{statistically significant}}, continue; if not, STOP. \newline
{\bf{Largest to smallest}}: full tree \textgreater{} min error tree \textgreater{} best pruned tree (the best pruned tree is usually smaller than the min error tree). {\emph{Keep in mind: the full tree CAN BE THE SAME as your min error tree.}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Error Rate as you continue Splitting}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/angelica9373_1729374403_Screenshot 2024-10-19 144510.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Training error decreases as you split more. \newline Validation error decreases and then increases with the tree size.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
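A sketch of how this curve could be traced, using tree depth as a proxy for the number of splits (scikit-learn, placeholder data names):

\begin{verbatim}
from sklearn.tree import (
    DecisionTreeClassifier)

def error_curve(X_tr, y_tr,
                X_va, y_va,
                max_depth=10):
    out = []
    for d in range(1, max_depth+1):
        t = DecisionTreeClassifier(
            max_depth=d,
            random_state=0)
        t.fit(X_tr, y_tr)
        out.append((d,
          1 - t.score(X_tr, y_tr),
          1 - t.score(X_va, y_va)))
    return out
\end{verbatim}
\par\addvspace{1.3em}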
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Chapter 10}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Assumptions For Logistic Regression (1Q)}}} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Generalized linearity} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Logistic Regression Equation (2Q)}}} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{NOT model free -\textgreater{} based on the following equations} \tn
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{$\log(\mathrm{odds}) = \beta_0 + \beta_1 X_1 + \dots + \beta_q X_q$} \tn
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{$\log \frac{p}{1-p} = \beta_0 + \beta_1 X_1 + \dots + \beta_q X_q$} \tn
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{$p = 1 / (1 + e^{-(\beta_0 + \beta_1 X_1 + \dots + \beta_q X_q)})$} \tn
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{Ex.\ The fitted probability could look like this: $P = 1/(1+\exp(0.6535 - 0.4535 X))$, where you can sub in $X$.} \tn
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{The direct interpretation of $\beta_1$ is that per unit increase of $X_1$, the log odds will increase by $\beta_1$. ``Increase by $\beta_1$'' alone is not clear, so you must say:} \tn
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{``The log odds are going to increase by $\beta_1$.''} \tn
% Row 10
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Types of Regression:}}} \tn
% Row 11
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Logistic regression (binary outcome)} \tn
% Row 12
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Multiple linear regression (continuous outcome)} \tn
% Row 13
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Multinomial logistic regression (categorical outcome of 3 or more levels)} \tn
% Row 14
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Ordinal logistic regression (categorical outcome of 3 or more ordinal levels)} \tn
% Row 15
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Poisson regression (count outcome)} \tn
% Row 16
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Negative binomial regression (count outcome)} \tn
% Row 17
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{All these regression models can be called generalized linear models (GLM)!}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- The 3 equations are equivalent to each other. \newline - The MLR equation is never true if Y is binary, and thus that model cannot be used. \newline Instead, change Y into P (a probability, which is continuous); this eliminates the error term, since the randomness comes from the probability itself.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{The Odds}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{The odds ratio is the exponential form of beta.}} \newline
- Beta is the coefficient in your regression model. \newline
The odds: $p/(1-p)$, where $p$ is the probability. \newline
{\bf{Odds ratio: $e^{\beta_1}$}} ($= 1$ when $\beta_1 = 0$, i.e., no effect). \newline
{\bf{Logit}}: $\log(\mathrm{odds})$. It takes values from $-\infty$ to $+\infty$. The dependent variable. \newline
Probability: $p = \mathrm{odds}/(1+\mathrm{odds})$} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
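A numeric sketch tying these pieces together, using the illustrative fitted equation above (coefficients as given there; the helper functions are made up for this example):

\begin{verbatim}
from math import exp

def prob(x):
    # P = 1/(1+exp(0.6535-0.4535x))
    return 1 / (1 + exp(0.6535
                      - 0.4535 * x))

def odds(p):
    return p / (1 - p)

# Implied log odds are
# -0.6535 + 0.4535x, so the odds
# ratio per unit of x is:
exp(0.4535)  # about 1.57
\end{verbatim}
\par\addvspace{1.3em}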
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Comparing 2 Models}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{First criterion:}} pick the model with the lowest validation error. \newline
{\bf{Second criterion:}} when the validation errors are comparable, pick the one with fewer variables. \newline
E.g.\ suppose models 1 and 2 have validation errors of 26.2\% and 26.3\%, and their model sizes are 10 and \ldots, respectively. Which model is better? \newline
- Initially go based off the lowest validation error, but when the errors are too similar (26.2\% vs.\ 26.3\% is comparable), go based off the LOWEST model size.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Performance Evaluation}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{(1) Partition the data into training and validation sets} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{Training set: used to grow the tree} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Validation set: used to assess classification performance} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{(2) More than 2 classes (m \textgreater{} 2)} \tn
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Same structure, except that the terminal nodes would take one of the m class labels} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example Problem}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/angelica9373_1729568391_Classification Regression.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Check if the variable is binary or continuous. \newline If binary: use the odds ratio DIRECTLY and say \_\_\_\_ ``times''. \newline If continuous: odds ratio $-$ 1, then convert to \%.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}