\documentclass[10pt,a4paper]{article}

% Preamble for a two-column cheat sheet.
% Every statement sits on its own line: a `%' comment runs to the end of the
% physical line, so code placed after a comment on the same line is ignored.

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Provides the multicols environments
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{Niki (worlddoit)}
\pdfinfo{
  /Title (r-project-example.pdf)
  /Creator (Cheatography)
  /Author (Niki (worlddoit))
  /Subject (r project example Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% Table helper macros.
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% (inside \noalign, so it can sit between rows) and applies it via \rowcolor.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose column spec is
% prefixed with \columncolor{\RowColorName}, i.e. the colour last set above.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{ED1515}
\definecolor{LightBackground}{HTML}{FDF0F0}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author/URL line on the right.
% NOTE(review): the logo is an absolute path on the generating server;
% presumably it will not resolve elsewhere -- confirm before building locally.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bfseries{\textcolor{DarkBackground}{\textrm{r project example Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{Niki (worlddoit)} via \textcolor{DarkBackground}{\uline{cheatography.com/170195/cs/35989/}}}
\end{tabulary}
\end{multicols}}
% Footer: three boxes (author, sheet status with page counter, sponsor).
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}Niki (worlddoit) \\
    \uline{cheatography.com/worlddoit} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Not Yet Published.\\
    Updated 9th December, 2022.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns
% Set the body font size. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Footnote-size body font.
\begin{multicols*}{2}
%
% NOTE(review): in the cells below the tokens \{\{noshy\}\} and
% \{\{popup="..."\}\}...\{\{/popup\}\} and the back-ticks around code are
% typeset literally. They look like unprocessed source markup -- confirm
% whether they should be stripped or converted before publishing.
% The "Row Count" comments are kept as found; their meaning is not defined here.
%
% --- Intro ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Intro}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{This tutorial is a good first step for someone looking to learn the steps needed for exploring data, cleaning data, and training/evaluating some basic machine learning algorithms. \newline % Row Count 4 (+ 4)
Resources: \newline % Row Count 5 (+ 1)
\{\{popup="https://www.kaggle.com/code/camnugent/introduction-to-machine-learning-in-r-tutorial"\}\}Main\{\{/popup\}\} \newline % Row Count 8 (+ 3)
\{\{popup="https://www.r-bloggers.com/2022/02/beginners-guide-to-machine-learning-in-r-with-step-by-step-tutorial/"\}\}Additional 1\{\{/popup\}\} \newline % Row Count 11 (+ 3)
\{\{popup="https://machinelearningmastery.com/machine-learning-in-r-step-by-step/"\}\}Additional 2\{\{/popup\}\}% Row Count 14 (+ 3)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Step 1: Load in the data ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 1: Load in the data.}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`library(tidyverse)`\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`library(reshape2)`\{\{noshy\}\} \newline % Row Count 2 (+ 1)
`housing = \seqsplit{read.csv('../input/housing}.csv')`\{\{noshy\}\} \newline % Row Count 4 (+ 2)
`head(housing)`\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`summary(housing)`\{\{noshy\}\} \newline % Row Count 6 (+ 1)
{\bfseries{Output Picture 1}}\{\{noshy\}\} \newline % Row Count 7 (+ 1)
`par(mfrow=c(2,5))`\{\{noshy\}\} \newline % Row Count 8 (+ 1)
`colnames(housing)`\{\{noshy\}\} \newline % Row Count 9 (+ 1)
`ggplot(data = melt(housing), mapping = aes(x = value)) + `\{\{noshy\}\} \newline % Row Count 11 (+ 2)
` geom\_histogram(bins = 30) + facet\_wrap(\textasciitilde{}variable, scales = 'free\_x')`\{\{noshy\}\} \newline % Row Count 13 (+ 2)
{\bfseries{Output Picture 2}}\{\{noshy\}\}% Row Count 14 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Output Picture 1 ---
% NOTE(review): the image path is absolute and contains spaces, non-ASCII
% letters and several dots; kept verbatim -- confirm it resolves in the build.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 1}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670613737_ экрана 2022-12-09 в 20.17.50.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Output Picture 2 ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 2}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670613784_ экрана 2022-12-09 в 20.18.55.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
% --- Step 2: Clean the data (first part; ends the current column) ---
% One statement per line: a `%' comment runs to the end of the physical line,
% so nothing may follow a "Row Count" comment on the same line.
% NOTE(review): the \{\{noshy\}\} tokens and the back-ticks around code are
% typeset literally; they look like unprocessed source markup -- confirm
% whether they should be stripped before publishing.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 2: Clean the data}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Impute missing values\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`housing\$total\_bedrooms{[} is.na(housing\$total\_bedrooms){]} = \seqsplit{median(housing\$total\_bedrooms} , na.rm = TRUE)`\{\{noshy\}\} \newline % Row Count 4 (+ 3)
Fix the total columns - make them means\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`housing\$mean\_bedrooms = housing\$total\_bedrooms/housing\$households`\{\{noshy\}\} \newline % Row Count 7 (+ 2)
`housing\$mean\_rooms = housing\$total\_rooms/housing\$households`\{\{noshy\}\} \newline % Row Count 9 (+ 2)
`drops = c('total\_bedrooms', 'total\_rooms')`\{\{noshy\}\} \newline % Row Count 11 (+ 2)
`housing = housing{[} , !(names(housing) \%in\% drops){]}`\{\{noshy\}\} \newline % Row Count 13 (+ 2)
Turn categoricals into booleans\{\{noshy\}\} \newline % Row Count 14 (+ 1)
`categories = unique(housing\$ocean\_proximity)`\{\{noshy\}\} \newline % Row Count 16 (+ 2)
`\#split the categories off`\{\{noshy\}\} \newline % Row Count 17 (+ 1)
`cat\_housing = \seqsplit{data.frame(ocean\_proximity} = housing\$ocean\_proximity)`\{\{noshy\}\} \newline % Row Count 19 (+ 2)
`for(cat in categories)\{`\{\{noshy\}\} \newline % Row Count 20 (+ 1)
` cat\_housing{[},cat{]} = rep(0, times= nrow(cat\_housing))`\{\{noshy\}\} \newline % Row Count 22 (+ 2)
`\}`\{\{noshy\}\} \newline % Row Count 23 (+ 1)
`for(i in 1:length(cat\_housing\$ocean\_proximity))\{`\{\{noshy\}\} \newline % Row Count 25 (+ 2)
` cat = as.character(cat\_housing\$ocean\_proximity{[}i{]})`\{\{noshy\}\} \newline % Row Count 27 (+ 2)
` cat\_housing{[},cat{]}{[}i{]} = 1`\{\{noshy\}\} \newline % Row Count 28 (+ 1)
`\}`\{\{noshy\}\} \newline % Row Count 29 (+ 1)
`cat\_columns = names(cat\_housing)`\{\{noshy\}\} \newline % Row Count 30 (+ 1)
} \tn
\end{tabularx}
\par\addvspace{1.3em}
% Fill the rest of this column and continue in the next one.
\vfill
\columnbreak
% --- Step 2: Clean the data (continued) ---
% One statement per line: a `%' comment runs to the end of the physical line,
% so nothing may follow a "Row Count" comment on the same line.
% NOTE(review): the \{\{noshy\}\} tokens and the back-ticks around code are
% typeset literally; they look like unprocessed source markup -- confirm
% whether they should be stripped before publishing.
% NOTE(review): image paths below are absolute and contain spaces, non-ASCII
% letters and several dots; kept verbatim -- confirm they resolve in the build.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 2: Clean the data (cont)}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`keep\_columns = cat\_columns{[}cat\_columns != 'ocean\_proximity'{]}`\{\{noshy\}\} \newline % Row Count 2 (+ 2)
`cat\_housing = select(cat\_housing,one\_of(keep\_columns))`\{\{noshy\}\} \newline % Row Count 4 (+ 2)
Scale the numerical variables\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`drops = c('ocean\_proximity','median\_house\_value')`\{\{noshy\}\} \newline % Row Count 7 (+ 2)
`housing\_num = housing{[} , !(names(housing) \%in\% drops){]}`\{\{noshy\}\} \newline % Row Count 9 (+ 2)
`scaled\_housing\_num = scale(housing\_num)`\{\{noshy\}\} \newline % Row Count 11 (+ 2)
Merge the altered numerical and categorical dataframes\{\{noshy\}\} \newline % Row Count 13 (+ 2)
`cleaned\_housing = cbind(cat\_housing, scaled\_housing\_num, median\_house\_value=housing\$median\_house\_value)`\{\{noshy\}\} \newline % Row Count 16 (+ 3)
`head(cleaned\_housing)`\{\{noshy\}\} \newline % Row Count 17 (+ 1)
{\bfseries{Output Picture 3}} \newline % Row Count 18 (+ 1)
{\bfseries{Output Picture 4}}% Row Count 19 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Output Picture 3 ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 3}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670614414_ экрана 2022-12-09 в 20.28.49.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Output Picture 4 ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 4}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670614452_ экрана 2022-12-09 в 20.29.02.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Step 3: Create a test set of data ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 3: Create a test set of data}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`set.seed(1738)`\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`sample = sample.int(n = nrow(cleaned\_housing), size = \seqsplit{floor(.8*nrow(cleaned\_housing))}, replace = F)`\{\{noshy\}\} \newline % Row Count 4 (+ 3)
`train = cleaned\_housing{[}sample, {]} \#just the samples`\{\{noshy\}\} \newline % Row Count 6 (+ 2)
`test = cleaned\_housing{[}-sample, {]} \#everything but the samples`\{\{noshy\}\} \newline % Row Count 8 (+ 2)
`nrow(train) + nrow(test) == nrow(cleaned\_housing)`\{\{noshy\}\} \newline % Row Count 10 (+ 2)
{\emph{TRUE}}% Row Count 11 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Step 4: Test some predictive models (first part; ends the column) ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 4: Test some predictive models.}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`library('boot')`\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`?cv.glm`\{\{noshy\}\} \newline % Row Count 2 (+ 1)
`glm\_house = glm(median\_house\_value\textasciitilde{}median\_income+ mean\_rooms+ population, data= cleaned\_housing)`\{\{noshy\}\} \newline % Row Count 5 (+ 3)
`k\_fold\_cv\_error = cv.glm(cleaned\_housing , glm\_house, K=5)`\{\{noshy\}\} \newline % Row Count 7 (+ 2)
`k\_fold\_cv\_error\$delta`\{\{noshy\}\} \newline % Row Count 8 (+ 1)
{\emph{6946162248.89155}}\{\{noshy\}\} \newline % Row Count 9 (+ 1)
{\emph{6942675168.18876}}\{\{noshy\}\} \newline % Row Count 10 (+ 1)
`glm\_cv\_rmse = sqrt(k\_fold\_cv\_error\$delta){[}1{]}`\{\{noshy\}\} \newline % Row Count 12 (+ 2)
`glm\_cv\_rmse` \{\{noshy\}\} \newline % Row Count 13 (+ 1)
{\emph{83343.6395227107}}\{\{noshy\}\} \newline % Row Count 14 (+ 1)
`glm\_house\$coefficients`\{\{noshy\}\} \newline % Row Count 15 (+ 1)
{\bfseries{Output Picture 5}}\{\{noshy\}\} \newline % Row Count 16 (+ 1)
Random forest model\{\{noshy\}\} \newline % Row Count 17 (+ 1)
`library('randomForest')`\{\{noshy\}\} \newline % Row Count 18 (+ 1)
`?randomForest`\{\{noshy\}\} \newline % Row Count 19 (+ 1)
`set.seed(1738)`\{\{noshy\}\} \newline % Row Count 20 (+ 1)
`train\_y = train{[},'median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 21 (+ 1)
`train\_x = train{[}, names(train) != 'median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 23 (+ 2)
`rf\_model = randomForest(train\_x, y = train\_y , ntree = 500, importance = TRUE)`\{\{noshy\}\} \newline % Row Count 25 (+ 2)
`rf\_model\$importance`\{\{noshy\}\} \newline % Row Count 26 (+ 1)
{\bfseries{Output Picture 6}}\{\{noshy\}\} \newline % Row Count 27 (+ 1)
The out-of-bag (oob) error estimate\{\{noshy\}\} \newline % Row Count 28 (+ 1)
`oob\_prediction = predict(rf\_model) `\{\{noshy\}\} \newline % Row Count 29 (+ 1)
`\#leaving out a data source forces OOB predictions`\{\{noshy\}\} \newline % Row Count 31 (+ 2)
} \tn
\end{tabularx}
\par\addvspace{1.3em}
% Fill the rest of this column and continue in the next one.
\vfill
\columnbreak
%
% --- Step 4: Test some predictive models (continued) ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 4: Test some predictive models. (cont)}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`train\_mse = mean(as.numeric ((oob\_prediction - train\_y)\textasciicircum{}2))`\{\{noshy\}\} \newline % Row Count 2 (+ 2)
`oob\_rmse = sqrt(train\_mse)`\{\{noshy\}\} \newline % Row Count 3 (+ 1)
`oob\_rmse`\{\{noshy\}\} \newline % Row Count 4 (+ 1)
{\emph{48976.2521584537}}\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`test\_y = test{[},'median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 6 (+ 1)
`test\_x = test{[}, names(test) !='median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 8 (+ 2)
`y\_pred = predict(rf\_model , test\_x)`\{\{noshy\}\} \newline % Row Count 9 (+ 1)
`test\_mse = mean(((y\_pred - test\_y)\textasciicircum{}2))`\{\{noshy\}\} \newline % Row Count 10 (+ 1)
`test\_rmse = sqrt(test\_mse)`\{\{noshy\}\} \newline % Row Count 11 (+ 1)
`test\_rmse`\{\{noshy\}\} \newline % Row Count 12 (+ 1)
{\emph{48354.9021429439}}\{\{noshy\}\}% Row Count 13 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Output Picture 5 ---
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 5}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670615379_ экрана 2022-12-09 в 20.42.12.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% --- Table titled "1" with a single empty row ---
% NOTE(review): this table has no content; presumably a leftover block --
% confirm whether it is intentional.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{1}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
% --- Output Picture 6 ---
% NOTE(review): the image path is absolute and contains spaces, non-ASCII
% letters and several dots; kept verbatim -- confirm it resolves in the build.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 6}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670615421_ экрана 2022-12-09 в 20.43.43.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
% That's all folks
% The closing commands sit on their own lines so the comment above cannot
% swallow them.
\end{multicols*}
\end{document}