\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{angelica9373}
\pdfinfo{
  /Title (isds-474-midterm-2.pdf)
  /Creator (Cheatography)
  /Author (angelica9373)
  /Subject (ISDS 474 Midterm 2 Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}   % Space between columns
\setlength{\headsep}{-12pt}     % Reduce space between header and content
\setlength{\headheight}{85pt}   % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{1FD655}
\definecolor{LightBackground}{HTML}{F1FCF4}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{ISDS 474 Midterm 2 Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{angelica9373} via \textcolor{DarkBackground}{\uline{cheatography.com/208601/cs/44725/}}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}angelica9373 \\
\uline{cheatography.com/angelica9373} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Published 19th October, 2024.\\
  Updated 19th October, 2024.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Chapter 7}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{How to select K in kNN (1Q)}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}K = 1: by validation data; whichever k gives the lowest validation error.} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Binary Classification With Even K's}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}DO NOT USE even numbers, since they can lead to a tie. XLMiner will pick the lowest probability and can choose an even number, but that doesn't mean it should be chosen} \tn
% Row Count 8 (+ 5)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}K \textgreater{} 1: classify by the majority decision rule based on the nearest k records} \tn
% Row Count 10 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Low K values:}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Capture local structure but may also capture noise. You can't rely on one neighbour} \tn
% Row Count 13 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{High K values:}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Provide more smoothing but may lose local detail. K can be as large as the training sample} \tn
% Row Count 16 (+ 3)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Choose the K that gives you the lowest validation error rate}}} \tn
% Row Count 17 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
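% Added block (not from the original sheet): a minimal worked sketch of
% standardization and Euclidean distance, since kNN below relies on both.
% All numbers are made up for illustration.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Worked Sketch: Euclidean Distance}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Illustrative sketch (added; numbers are made up):}} standardize each predictor with $z = (x - \bar{x})/s$, then measure the distance between records $a$ and $b$ as $d(a,b) = \sqrt{\sum_{i=1}^{p}(a_i - b_i)^2}$. E.g., for $a = (1, 2)$ and $b = (4, 6)$: $d = \sqrt{(1-4)^2 + (2-6)^2} = \sqrt{9 + 16} = 5$. kNN then classifies by the k records with the smallest d.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}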
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Euclidean Distance (1Q)}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Sometimes predictors need to be standardized to equalize scales before computing distances. Standardized = normalized (-3, 3)}}} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{\# of possible partitions in Recursive Partition (2Q)}} (see the worked sketch below)} \tn
% Row Count 5 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Continuous:}} (n-1)*p} \tn
% Row Count 6 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Categorical:}} 1ID - 0P, 2ID - 1P, 3ID - 3P, 4ID - 7P, 5ID - 15P} \tn
% Row Count 8 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
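% Added block (not from the original sheet): worked partition counts using the
% formulas above. The continuous example (n = 11, p = 3) is made up; the
% categorical counts follow the standard 2^(m-1) - 1 rule, which matches the
% sheet's own 3-level/4-level/5-level values.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Worked Sketch: Partition Counts}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Illustrative sketch (added; numbers are made up):}} continuous predictors allow $(n-1)\cdot p$ splits, e.g. $n = 11$ observations and $p = 3$ predictors give $10 \cdot 3 = 30$ possible partitions. A categorical predictor with $m$ levels allows $2^{m-1} - 1$ binary splits: $m = 3 \rightarrow 3$P, $m = 4 \rightarrow 7$P, $m = 5 \rightarrow 15$P, matching the table above.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}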
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Cut Off Value in Classification}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Cutoff = 0.5 by default, because the predicted probability is the proportion of 1's among the k nearest neighbors. The majority decision rule is related to the cutoff value for classifying records} \tn
% Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{You can adjust the cutoff value to improve accuracy} \tn
% Row Count 6 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Y = 1 (if p \textgreater{} cutoff)} \tn
% Row Count 7 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{Y = 0 (if p \textless{} cutoff)} \tn
% Row Count 8 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Cut Off Example Question}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Example: Suppose cutoff = 0.9, k = 7, and we observe 5 C1 and 2 C0. Y = 1 or 0?} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}- Probability (Y=1) = 5/7 = 0.71 -{}-{}-\textgreater{} 0.71 \textless{} 0.9 -\textgreater{} Y = 0} \tn
% Row Count 4 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Advantages and Disadvantages}}  \tn
% Row 0
\SetRowColor{LightBackground}
Simple and intuitive & Curse of dimensionality (the required sample size grows quickly with many predictors) \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
No assumptions required about the data -\textgreater{} the model form is never wrong & \# of observations needed scales with the \# of predictors; e.g., 50 predictors may need 5 mil observations \tn
% Row Count 7 (+ 4)
% Row 2
\SetRowColor{LightBackground}
Effective with large training data & n/a \tn
% Row Count 9 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{{\bf{General Info}}}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Makes no assumptions about the data \newline
% Row Count 1 (+ 1)
- A record gets classified as whatever the predominant class is among nearby records \newline
% Row Count 3 (+ 2)
- The way to find the k nearest neighbors in kNN is through the Euclidean distance \newline
% Row Count 5 (+ 2)
{\bf{Rescaling:}} Only for kNN do you need to rescale, because rescaling equalizes the contribution from each variable. No need for logistic regression, since rescaling does not change the p-value or RMSE \newline
% Row Count 9 (+ 4)
No need for CART, since rescaling doesn't change the order of values in a variable \newline
% Row Count 11 (+ 2)
{\bf{XLMiner can only handle up to K = 10}}% Row Count 12 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Chapter 9}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Properties of CART (3Q)}}} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Model free} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Automatic variable selection} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Needs a large sample size (because it is model free)} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Only gives horizontal or vertical splits} \tn
% Row Count 5 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Training error gets smaller and smaller with the tree size} \tn
% Row Count 7 (+ 2)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Validation error decreases and then increases with the tree size} \tn
% Row Count 9 (+ 2)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Both methods of CART are model free} \tn
% Row Count 10 (+ 1)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Trees}} (see the worked sketch below)} \tn
% Row Count 11 (+ 1)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Best pruned tree}}: the tree whose validation error equals the minimum error plus one standard error; usually smaller than the minimum error tree. You naturally get overfitting because the natural end of the process is 100\% purity in each leaf, which ends up fitting noise in the data. The minimum error tree is slightly overfitted, so people prune a bit further back from it} \tn
% Row Count 19 (+ 8)
% Row 10
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Minimum error tree}}: the tree with the lowest validation error} \tn
% Row Count 21 (+ 2)
% Row 11
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Full tree}}: the largest tree; training error equals zero; overfitted} \tn
% Row Count 23 (+ 2)
% Row 12
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Note: the full tree can be the same as the minimum error tree, BUT the best pruned tree should usually be smaller than the other trees}}} \tn
% Row Count 26 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
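% Added block (not from the original sheet): a small numeric sketch of the
% "minimum error plus one standard error" rule described above. The error
% values and standard error are made up for illustration.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Worked Sketch: Best Pruned Tree}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Illustrative sketch (added; numbers are made up):}} suppose the minimum validation error across tree sizes is 0.20 with standard error 0.02. The best pruned tree is then the smallest tree whose validation error is within $0.20 + 0.02 = 0.22$, so it is usually smaller than the minimum error tree.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}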
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Recursive Partitioning}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{(1) Enumerate all possible partitions and select the one with the lowest impurity score} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Impurity Score:}} Gini or Entropy measure} \tn
% Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{(2) Each partition after the first step is a subset partition of the same dataset -\textgreater{} repeat, choosing the lowest impurity score each time, and drop the split point already used} \tn
% Row Count 6 (+ 3)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Identify the midway point of the two lowest values of the predictor (14.0 \& 14.8 -\textgreater{} split at 14.4)} \tn
% Row Count 9 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- Repeat with the lowest value dropped, therefore comparing the 2nd and 3rd lowest values (14.8 \& 16.0) -\textgreater{} split at 15.4} \tn
% Row Count 12 (+ 3)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{(3) Continue the partitioning until ALL regions contain either class 1 or class 0 only} \tn
% Row Count 14 (+ 2)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- But you must impose an early stop to prevent overfitting, since you can split too much and lower the training error to 0 while the validation error becomes very HIGH} \tn
% Row Count 18 (+ 4)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{The algorithm decides where to partition}}} \tn
% Row Count 19 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Impurity Score}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Metric to determine the homogeneity of the resulting subgroups of observations \newline
% Row Count 2 (+ 2)
- For both, the lower the better \newline
% Row Count 3 (+ 1)
- Neither one has an advantage over the other \newline
% Row Count 4 (+ 1)
{\bf{Gini Index}}: (0, 0.50 if binary) \newline
% Row Count 5 (+ 1)
{\bf{Entropy Measure}}: (0, log\textasciitilde{}2\textasciitilde{}(2) = 1 if binary) OR (0, log\textasciitilde{}2\textasciitilde{}(m)) -\textgreater{} m is the total \# of classes of Y \newline
% Row Count 8 (+ 3)
{\bf{Overall Impurity Measure:}} weighted average of the impurity of the individual rectangles, with weights being the proportion of cases in each rectangle \newline
% Row Count 11 (+ 3)
Choose the split that reduces impurity the most (split points become nodes on the tree); see the worked sketch below% Row Count 13 (+ 2)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Check notes for the distance-to-weighted-average ratio} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
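% Added block (not from the original sheet): a worked impurity calculation with
% made-up class counts, using the standard Gini and entropy formulas.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Worked Sketch: Impurity}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Illustrative sketch (added; counts are made up):}} Gini $= 1 - \sum_k p_k^2$; entropy $= -\sum_k p_k \log_2 p_k$. For a rectangle with 8 of class 1 and 2 of class 0: $p = (0.8, 0.2)$, so Gini $= 1 - (0.64 + 0.04) = 0.32$. A 50/50 rectangle gives the binary maxima: Gini $= 0.5$, entropy $= 1$. Overall impurity of a split is the weighted average, e.g. rectangles of 10 and 30 cases with Gini 0.32 and 0.10 give $\frac{10}{40}(0.32) + \frac{30}{40}(0.10) = 0.155$.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}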
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Dimensional Predictors Q's}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{With 21 observations and 2-dimensional continuous predictors, how many partitions can we have?} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}\# of partitions = \# of observations - 1 -\textgreater{} 20 P} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Continuous Partitions}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}(n-1) x p -\textgreater{} p-dimensional predictors (more than 2-dimensional predictors)} \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Categorical Partitions}}} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}abcd split: (3 levels, 3P), (4 levels, 7P)} \tn
% Row Count 8 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{ {\bf{XLMiner only supports binary categorical variables}} }}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{When to Stop Partitioning}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Error rate as a function of the number of splits for training vs validation data -\textgreater{} indicates overfitting \newline
% Row Count 3 (+ 3)
We can continue partitioning the tree, so a {\bf{FULL tree}} will be obtained in the end. A full tree is usually overfitted, so we have to impose an EARLY STOP ... \newline
% Row Count 7 (+ 4)
- The training error rate approaches 0 as you partition further, but you must stop early before letting it touch 0 \newline
% Row Count 10 (+ 3)
{\bf{Early Stop}} (Minimum Error Tree or Best Pruned Tree): \newline
% Row Count 12 (+ 2)
OR \newline
% Row Count 13 (+ 1)
Stop based off {\bf{Chi-square tests}} (not commonly used for CART; people use the min error tree or best pruned tree instead): \newline
% Row Count 16 (+ 3)
- If the improvement from the additional split is {\bf{statistically significant}} -\textgreater{} continue. If not, STOP. \newline
% Row Count 19 (+ 3)
{\bf{Largest to smallest}}: Full tree \textgreater{} Min error tree \textgreater{} Best pruned tree (usually smaller than the min error tree). {\emph{Keep in mind: the full tree CAN BE THE SAME as your min error tree}}% Row Count 23 (+ 4)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Regression Tree}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Used with {\bf{continuous}} outcome variables. Many splits are attempted; choose the one that minimizes impurity \newline
% Row Count 3 (+ 3)
- Prediction is computed as the {\bf{average}} of the numerical target variable in the rectangle \newline
% Row Count 5 (+ 2)
- {\bf{Impurity is measured by the sum of squared deviations from the leaf mean}} (see the worked sketch below) \newline
% Row Count 7 (+ 2)
- Performance is measured by RMSE \newline
% Row Count 8 (+ 1)
A regression tree is used for prediction. Compared to the classification tree, we only have to ... \newline
% Row Count 10 (+ 2)
{\bf{Replace the impurity measure by the sum of squared deviations}}; everything else stays the same \newline
% Row Count 13 (+ 3)
Splitting on irrelevant variables = bad impurity score \newline
% Row Count 15 (+ 2)
Only split on relevant variables% Row Count 16 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
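% Added block (not from the original sheet): a tiny numeric sketch of a
% regression-tree leaf; the y-values are made up.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Worked Sketch: Regression Tree Leaf}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Illustrative sketch (added; values are made up):}} a leaf containing $y = 10, 12, 14$ predicts the leaf mean $\bar{y} = 12$. Its impurity is the sum of squared deviations $\sum_i (y_i - \bar{y})^2 = (-2)^2 + 0^2 + 2^2 = 8$. Splits are chosen to minimize this, and performance is then reported as RMSE on the validation set.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}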
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Error Rate as you continue Splitting}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/angelica9373_1729374403_Screenshot 2024-10-19 144510.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Performance Evaluation}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{(1) Partition the data into training and validation sets} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{Training set: used to grow the tree} \tn
% Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Validation set: used to assess classification performance} \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{(2) More than 2 classes (m \textgreater{} 2)} \tn
% Row Count 6 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Same structure, except that the terminal nodes would take one of the m class labels} \tn
% Row Count 8 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Chapter 10}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Assumptions For Logistic Regression (1Q)}}} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{- Generalized linearity} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\bf{Logistic Regression Equation (2Q)}} (see the worked sketch below)} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{NOT model free -\textgreater{} based on the following equations} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{log odds = beta\textasciitilde{}0\textasciitilde{} + beta\textasciitilde{}1\textasciitilde{}X\textasciitilde{}1\textasciitilde{} + ... + beta\textasciitilde{}q\textasciitilde{}X\textasciitilde{}q\textasciitilde{}}}} \tn
% Row Count 6 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\emph{log p/(1-p) = beta\textasciitilde{}0\textasciitilde{} + beta\textasciitilde{}1\textasciitilde{}X\textasciitilde{}1\textasciitilde{} + ... + beta\textasciitilde{}q\textasciitilde{}X\textasciitilde{}q\textasciitilde{}}}} \tn
% Row Count 8 (+ 2)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{P = 1/(1 + exp(-(beta\textasciitilde{}0\textasciitilde{} + beta\textasciitilde{}1\textasciitilde{}X\textasciitilde{}1\textasciitilde{} + ... + beta\textasciitilde{}q\textasciitilde{}X\textasciitilde{}q\textasciitilde{})))}}} \tn
% Row Count 10 (+ 2)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{The direct interpretation of beta\textasciitilde{}1\textasciitilde{} is that per unit increase of X\textasciitilde{}1\textasciitilde{}, the log odds will increase by beta\textasciitilde{}1\textasciitilde{} -\textgreater{} not clear, so you must say:} \tn
% Row Count 13 (+ 3)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{The log odds are going to increase by beta\textasciitilde{}1\textasciitilde{}} \tn
% Row Count 14 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{- The 3 equations are equivalent to each other \newline
- All regression models can be called generalized linear models \newline
- The MLR equation is never true if Y is binary, and thus we cannot use that model \newline
Instead, change Y into P (probability), which is continuous; this eliminates the error term, since the probability itself supplies the randomness} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
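% Added block (not from the original sheet): a worked logistic-regression
% calculation; the coefficients and X value are made up.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Worked Sketch: Logistic Equation}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{{\emph{Illustrative sketch (added; coefficients are made up):}} with $\beta_0 = -2$, $\beta_1 = 0.5$ and $X_1 = 2$: log odds $= -2 + 0.5(2) = -1$, so $P = 1/(1 + e^{1}) \approx 0.27$. The odds ratio for $X_1$ is $e^{\beta_1} = e^{0.5} \approx 1.65$: each unit increase in $X_1$ multiplies the odds by about 1.65.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}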
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{The Odds}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{The odds ratio is the exponential form of beta}} \newline
% Row Count 2 (+ 2)
- Beta is your coefficient in the regression model% Row Count 4 (+ 2)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Comparing 2 Models}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{{\bf{First criterion:}} pick the model with the lowest validation error \newline
% Row Count 2 (+ 2)
{\bf{Second criterion:}} when the validation errors are comparable, pick the one with fewer variables \newline
% Row Count 4 (+ 2)
E.g., suppose models 1 and 2 have validation errors of 26.2\% and 26.3\%. Their model sizes are \newline
% Row Count 7 (+ 3)
10 and, respectively. Which model is better? \newline
% Row Count 9 (+ 2)
- Initially go based off the lowest validation error, but when the errors are too similar (26.2\% and 26.3\% -\textgreater{} comparable), go based off the LOWEST model size% Row Count 13 (+ 4)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}
\end{document}