\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{Ivan Patel (patelivan)}
\pdfinfo{
  /Title (supervised-learning-in-r-regression.pdf)
  /Creator (Cheatography)
  /Author (Ivan Patel (patelivan))
  /Subject (Supervised Learning in R: Regression Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% Table-row colour helpers used throughout the generated body.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{00008B}
\definecolor{LightBackground}{HTML}{F7F7FB}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Supervised Learning in R: Regression Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{Ivan Patel (patelivan)} via \textcolor{DarkBackground}{\uline{cheatography.com/135316/cs/28145/}}}
\end{tabulary}
\end{multicols}}
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\
  \vspace{-2pt}Ivan Patel (patelivan) \\
  \uline{cheatography.com/patelivan} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\
  \vspace{-2pt}Published 28th May, 2021.\\
  Updated 28th May, 2021.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{What is a Regression?}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{From a machine learning perspective, the term regression generally encompasses the prediction of continuous values. Statistically, these predictions are the expected value, or the average value one would observe for the given input values. \newline % Row Count 5 (+ 5)
In R, use the lm() function, shown above, to code an OLS model. You can simply print the model to observe the coefficients' sign and magnitude. \newline % Row Count 8 (+ 3)
After estimating the coefficients, you can always make predictions on your training data to assess the actual values v. predicted values. Use ggplot2() and a reference line of slope 1 to see if the points are close to the line or not. \newline % Row Count 13 (+ 5)
You can also plot a gain curve to measure how well your model sorts the outcome.
Useful when sorting instances is more important than predicting the exact outcome.% Row Count 17 (+ 4)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: example gain curve (image plus explanation).
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{{\bf{An example of the Gain Curve.}}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{17.67cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/patelivan_1622159679_Screen Shot 2021-05-27 at 4.49.42 PM.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{The diagonal line represents the gain curve if the outcome was sorted randomly. A perfect model would trace out the green curve. And our model traces out the blue curve. Per the above graph, the model correctly identified the top 30\% highest home values and sorted them by price correctly. \newline \newline We can also see that the top 30\% of highest priced homes are 50\% of total home sales (\$).} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: basic regression functions.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{Basic Regression Functions to get started}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{lm(formula = dependent variable \textasciitilde{} independent variable1 + independent variable2 + ..., data).} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{summary() - To get more details about the model results.} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{broom::glance() - See the model details in a tidier form.} \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{predict(model, newdata)} \tn
% Row Count 7 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{ggplot(dframe, aes(x = pred, y = outcome)) + geom\_point() + geom\_abline()} \tn
% Row Count 9 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{GainCurvePlot(data, "prediction", "price", "model")} \tn
% Row Count 11 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: evaluating a regression model (RMSE and R^2).
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{Evaluating a regression model}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{RMSE = sqrt(mean(res\textasciicircum{}2\textasciicircum{})). It is the typical prediction error of your model on the data. Want to minimize this.} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{R\textasciicircum{}2\textasciicircum{} = 1-(RSS/TSS). A measure of how well the model fits or explains the data.} \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{One heuristic is to compare the RMSE to the standard deviation of the outcome. With a good model, the RMSE should be smaller. \newline \newline RSS = sum(res\textasciicircum{}2\textasciicircum{}).
\newline TSS = sum(outcome value - mean of outcome)\textasciicircum{}2\textasciicircum{}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: pros of regression and pitfalls to watch for.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{Important things to remember about Regression}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{{\bf{Pros of Regression}}} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{Easy to fit and apply} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{Concise} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{Less prone to overfit} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{Interpretable - One can easily observe the signs and magnitude of each coefficient.} \tn
% Row Count 6 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{{\bf{Things to look out for}}} \tn
% Row Count 7 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{Cannot express complex, non-linear, non-additive relationships. Non-linear relationships can be made linear by applying transformations.} \tn
% Row Count 10 (+ 3)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{Collinearity is when input variables are partially correlated. When independent variables are correlated, signs of the variables may not be what you expect. Moreover, it can be difficult to separate out the individual effects of collinear variables on the response. Try manually removing offending variables, dimension reduction, or regularization.} \tn
% Row Count 17 (+ 7)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{Variance among residuals should be constant. Plot the residuals on the y-axis and predicted values on the x-axis.
The errors should be evenly distributed between positive and negative, and have about the same magnitude above and below.} \tn
% Row Count 22 (+ 5)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{Autocorrelation - Errors are independent and uncorrelated. Plot the row\_number() on the x-axis and residuals on the y-axis.} \tn
% Row Count 25 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: train/test split illustration.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{Training a regression model}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{17.67cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/patelivan_1622161306_Screen Shot 2021-05-27 at 5.18.41 PM.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{It is crucial to split your data into training and test sets to evaluate how your model does on unseen data.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: k-fold cross-validation overview.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{k-fold cv}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{k-fold cross-validation (aka k-fold CV) is a resampling method that randomly divides the training data into k groups (aka folds) of approximately equal size. The model is fit on k-1 folds and then the remaining fold is used to compute model performance. This procedure is repeated k times; each time, a different fold is treated as the validation set.
Thus, the k-fold CV estimate is computed by averaging the k test errors, providing us with an approximation of the error we might expect on unseen data.% Row Count 11 (+ 11)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Table: R code sample for k-fold CV with vtreat::kWayCrossValidation.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bf\textcolor{white}{Training and testing using k-fold cv.}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\# Returns a list of nSplits. \newline \# Each sub-list will contain two vectors; train and app. \newline \newline splitPlan \textless{}- \seqsplit{vtreat::kWayCrossValidation(nRows}, nSplits, NULL, NULL) \newline \newline \# Initialize a column of the appropriate length \newline dframe\$pred.cv \textless{}- 0 \newline \newline \# k is the number of folds \newline \# splitPlan is the cross validation plan \newline \newline for(i in 1:k) \{ \newline \# Get the ith split \newline split \textless{}- splitPlan{[}{[}i{]}{]} \newline \newline \# Build a model on the training data from this split (lm, in this case) \newline model \textless{}- lm(fmla, data = dframe{[}split\$train,{]}) \newline \newline \# make predictions on the application data from this split \newline dframe\$pred.cv{[}split\$app{]} \textless{}- predict(model, newdata = dframe{[}split\$app,{]}) \newline \}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{nRows - number of rows in training data \newline nSplits - number of folds.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\end{document}