
% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}    % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}              % Languages

% Document Info
% \author records the author for LaTeX's title machinery.
% The /Key (value) entries are PDF document-information dictionary entries;
% they must be passed to pdfTeX's \pdfinfo primitive, otherwise they would be
% typeset as ordinary text in the preamble.
\author{Niki (worlddoit)}
\pdfinfo{
  /Title (r-project-example.pdf)
  /Creator (Cheatography)
  /Author (Niki (worlddoit))
  /Subject (r project example Cheat Sheet)
}

% Lengths and widths
% Table layout: space between columns.
\setlength{\tabcolsep}{0.2cm}
% Page header geometry: the negative \headsep reduces the space between
% header and content.
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\setlength{\headsep}{-12pt}
% fancyhdr rules: remove both the header line and the footer line.
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrulewidth}{0pt}
% Break point used by \seqsplit between characters: a plain break
% opportunity in math mode, a discretionary hyphen in text.
\renewcommand{\seqinsert}{%
  \ifmmode
    \allowbreak
  \else
    \-%
  \fi
}
% These two commands together give roughly
% the right line height in the tables

% Commands
% Shortcut for row colour.
% #1: colour name. It is stored globally in \RowColorName (inside \noalign,
% so it can sit between table rows) and then applied to the row via \rowcolor.
\newcommand{\SetRowColor}[1]{%
  \noalign{\gdef\RowColorName{#1}}%
  \rowcolor{\RowColorName}%
}
% Coloured multi-column cell.
% #1: number of columns to span, #2: column specification, #3: cell content.
% The column specification is prefixed with a \columncolor that uses the
% current value of \RowColorName.
\newcommand{\mymulticolumn}[3]{%
  \multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}%
}
% Column type x{<width>}: a paragraph (p) column of the given width whose
% content is set ragged-right.
\newcolumntype{x}[1]{%
  >{\raggedright}%
  p{#1}%
}
% Short alias for \tabularnewline; required because a custom (ragged-right)
% column type is in use.
\newcommand{\tn}{%
  \tabularnewline
}

% Font and Colours

% Set the font size with a size declaration (currently \footnotesize).
% Switch to any other size command from this page to resize the
% cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Footnote-size text.


% Box "Intro": bold white title row, body row, trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the escaped \{\{popup="..."\}\} ... \{\{/popup\}\} markers are
% typeset literally; they look like unprocessed link markup -- confirm the
% intended output before changing them.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Intro}}  \tn
\mymulticolumn{1}{x{8.4cm}}{This tutorial is a good first step for someone looking to learn the steps needed for exploring data, cleaning data, and training/evaluating some basic machine learning algorithms. \newline % Row Count 4 (+ 4)
Resources: \newline % Row Count 5 (+ 1)
\{\{popup="https://www.kaggle.com/code/camnugent/introduction-to-machine-learning-in-r-tutorial"\}\}Main\{\{/popup\}\} \newline % Row Count 8 (+ 3)
\{\{popup="https://www.r-bloggers.com/2022/02/beginners-guide-to-machine-learning-in-r-with-step-by-step-tutorial/"\}\}Additional 1\{\{/popup\}\} \newline % Row Count 11 (+ 3)
\{\{popup="https://machinelearningmastery.com/machine-learning-in-r-step-by-step/"\}\}Additional 2\{\{/popup\}\}% Row Count 14 (+ 3)
} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Step 1: Load in the data.": bold white title row, body row with the
% R commands, trailing marker row.
% \bfseries / \textbf replace the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the backticks and the escaped \{\{noshy\}\} markers are typeset
% literally (backticks come out as opening quote marks); they look like
% unprocessed source markup -- confirm the intended output.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 1: Load in the data.}}  \tn
\mymulticolumn{1}{x{8.4cm}}{`library(tidyverse)`\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`library(reshape2)`\{\{noshy\}\} \newline % Row Count 2 (+ 1)
`housing = \seqsplit{read.csv('../input/housing}.csv')`\{\{noshy\}\} \newline % Row Count 4 (+ 2)
`head(housing)`\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`summary(housing)`\{\{noshy\}\} \newline % Row Count 6 (+ 1)
\textbf{Output Picture 1}\{\{noshy\}\} \newline % Row Count 7 (+ 1)
`par(mfrow=c(2,5))`\{\{noshy\}\} \newline % Row Count 8 (+ 1)
`colnames(housing)`\{\{noshy\}\} \newline % Row Count 9 (+ 1)
`ggplot(data = melt(housing), mapping = aes(x = value)) + `\{\{noshy\}\} \newline % Row Count 11 (+ 2)
`    geom\_histogram(bins = 30) + facet\_wrap(\textasciitilde{}variable, scales = 'free\_x')`\{\{noshy\}\} \newline % Row Count 13 (+ 2)
\textbf{Output Picture 2}\{\{noshy\}\}% Row Count 14 (+ 1)
} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Output Picture 1": bold white title row, centred image row (5.1cm
% wide), trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the image path is absolute and contains spaces and non-ASCII
% characters -- verify that it resolves in the build environment.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 1}}  \tn
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670613737_ экрана 2022-12-09 в 20.17.50.png}}} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Output Picture 2": bold white title row, centred image row (5.1cm
% wide), trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the image path is absolute and contains spaces and non-ASCII
% characters -- verify that it resolves in the build environment.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 2}}  \tn
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670613784_ экрана 2022-12-09 в 20.18.55.png}}} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Step 2: Clean the data": bold white title row followed by the body row
% with the R commands.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the backticks and the escaped \{\{noshy\}\} markers are typeset
% literally (backticks come out as opening quote marks); they look like
% unprocessed source markup -- confirm the intended output.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 2: Clean the data}}  \tn
\mymulticolumn{1}{x{8.4cm}}{Impute missing values\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`housing\$total\_bedrooms{[} is.na(housing\$total\_bedrooms){]} = \seqsplit{median(housing\$total\_bedrooms} , na.rm = TRUE)`\{\{noshy\}\} \newline % Row Count 4 (+ 3)
Fix the total columns - make them means\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`housing\$mean\_bedrooms = housing\$total\_bedrooms/housing\$households`\{\{noshy\}\} \newline % Row Count 7 (+ 2)
`housing\$mean\_rooms = housing\$total\_rooms/housing\$households`\{\{noshy\}\} \newline % Row Count 9 (+ 2)
`drops = c('total\_bedrooms', 'total\_rooms')`\{\{noshy\}\} \newline % Row Count 11 (+ 2)
`housing = housing{[} , !(names(housing) \%in\% drops){]}`\{\{noshy\}\} \newline % Row Count 13 (+ 2)
Turn categoricals into booleans\{\{noshy\}\} \newline % Row Count 14 (+ 1)
`categories = unique(housing\$ocean\_proximity)`\{\{noshy\}\} \newline % Row Count 16 (+ 2)
`\#split the categories off`\{\{noshy\}\} \newline % Row Count 17 (+ 1)
`cat\_housing = \seqsplit{data.frame(ocean\_proximity} = housing\$ocean\_proximity)`\{\{noshy\}\} \newline % Row Count 19 (+ 2)
`for(cat in categories)\{`\{\{noshy\}\} \newline % Row Count 20 (+ 1)
`    cat\_housing{[},cat{]} = rep(0, times= nrow(cat\_housing))`\{\{noshy\}\} \newline % Row Count 22 (+ 2)
`\}`\{\{noshy\}\} \newline % Row Count 23 (+ 1)
`for(i in 1:length(cat\_housing\$ocean\_proximity))\{`\{\{noshy\}\} \newline % Row Count 25 (+ 2)
`    cat = as.character(cat\_housing\$ocean\_proximity{[}i{]})`\{\{noshy\}\} \newline % Row Count 27 (+ 2)
`    cat\_housing{[},cat{]}{[}i{]} = 1`\{\{noshy\}\} \newline % Row Count 28 (+ 1)
`\}`\{\{noshy\}\} \newline % Row Count 29 (+ 1)
`cat\_columns = names(cat\_housing)`\{\{noshy\}\} \newline % Row Count 30 (+ 1)
} \tn

% Box "Step 2: Clean the data (cont)": bold white title row, body row with
% the R commands, trailing marker row.
% \bfseries / \textbf replace the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the backticks and the escaped \{\{noshy\}\} markers are typeset
% literally (backticks come out as opening quote marks); they look like
% unprocessed source markup -- confirm the intended output.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 2: Clean the data (cont)}}  \tn
\mymulticolumn{1}{x{8.4cm}}{`keep\_columns = cat\_columns{[}cat\_columns != 'ocean\_proximity'{]}`\{\{noshy\}\} \newline % Row Count 2 (+ 2)
`cat\_housing = select(cat\_housing,one\_of(keep\_columns))`\{\{noshy\}\} \newline % Row Count 4 (+ 2)
Scale the numerical variables\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`drops = c('ocean\_proximity','median\_house\_value')`\{\{noshy\}\} \newline % Row Count 7 (+ 2)
`housing\_num =  housing{[} , !(names(housing) \%in\% drops){]}`\{\{noshy\}\} \newline % Row Count 9 (+ 2)
`scaled\_housing\_num = scale(housing\_num)`\{\{noshy\}\} \newline % Row Count 11 (+ 2)
Merge the altered numerical and categorical dataframes\{\{noshy\}\} \newline % Row Count 13 (+ 2)
`cleaned\_housing = cbind(cat\_housing, scaled\_housing\_num, median\_house\_value=housing\$median\_house\_value)`\{\{noshy\}\} \newline % Row Count 16 (+ 3)
`head(cleaned\_housing)`\{\{noshy\}\} \newline % Row Count 17 (+ 1)
\textbf{Output Picture 3} \newline % Row Count 18 (+ 1)
\textbf{Output Picture 4}% Row Count 19 (+ 1)
} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Output Picture 3": bold white title row, centred image row (5.1cm
% wide), trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the image path is absolute and contains spaces and non-ASCII
% characters -- verify that it resolves in the build environment.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 3}}  \tn
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670614414_ экрана 2022-12-09 в 20.28.49.png}}} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Output Picture 4": bold white title row, centred image row (5.1cm
% wide), trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the image path is absolute and contains spaces and non-ASCII
% characters -- verify that it resolves in the build environment.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 4}}  \tn
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670614452_ экрана 2022-12-09 в 20.29.02.png}}} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Step 3: Create a test set of data": bold white title row, body row
% with the R commands, trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the backticks and the escaped \{\{noshy\}\} markers are typeset
% literally (backticks come out as opening quote marks); they look like
% unprocessed source markup -- confirm the intended output.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 3: Create a test set of data}}  \tn
\mymulticolumn{1}{x{8.4cm}}{`set.seed(1738)`\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`sample = sample.int(n = nrow(cleaned\_housing), size = \seqsplit{floor(.8*nrow(cleaned\_housing))}, replace = F)`\{\{noshy\}\} \newline % Row Count 4 (+ 3)
`train = cleaned\_housing{[}sample, {]} \#just the samples`\{\{noshy\}\} \newline % Row Count 6 (+ 2)
`test  = cleaned\_housing{[}-sample, {]} \#everything but the samples`\{\{noshy\}\} \newline % Row Count 8 (+ 2)
`nrow(train) + nrow(test) == nrow(cleaned\_housing)`\{\{noshy\}\} \newline % Row Count 10 (+ 2)
{\emph{TRUE}}% Row Count 11 (+ 1)
} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Step 4: Test some predictive models.": bold white title row followed
% by the body row with the R commands and their printed results.
% \bfseries / \textbf replace the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the backticks and the escaped \{\{noshy\}\} markers are typeset
% literally (backticks come out as opening quote marks); they look like
% unprocessed source markup -- confirm the intended output.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 4: Test some predictive models.}}  \tn
\mymulticolumn{1}{x{8.4cm}}{`library('boot')`\{\{noshy\}\} \newline % Row Count 1 (+ 1)
`?cv.glm`\{\{noshy\}\} \newline % Row Count 2 (+ 1)
`glm\_house = glm(median\_house\_value\textasciitilde{}median\_income+ mean\_rooms+ population, data= cleaned\_housing)`\{\{noshy\}\} \newline % Row Count 5 (+ 3)
`k\_fold\_cv\_error = cv.glm(cleaned\_housing , glm\_house, K=5)`\{\{noshy\}\} \newline % Row Count 7 (+ 2)
`k\_fold\_cv\_error\$delta`\{\{noshy\}\} \newline % Row Count 8 (+ 1)
{\emph{6946162248.89155}}\{\{noshy\}\} \newline % Row Count 9 (+ 1)
{\emph{6942675168.18876}}\{\{noshy\}\} \newline % Row Count 10 (+ 1)
`glm\_cv\_rmse = sqrt(k\_fold\_cv\_error\$delta){[}1{]}`\{\{noshy\}\} \newline % Row Count 12 (+ 2)
`glm\_cv\_rmse` \{\{noshy\}\} \newline % Row Count 13 (+ 1)
{\emph{83343.6395227107}}\{\{noshy\}\} \newline % Row Count 14 (+ 1)
`glm\_house\$coefficients`\{\{noshy\}\} \newline % Row Count 15 (+ 1)
\textbf{Output Picture 5}\{\{noshy\}\} \newline % Row Count 16 (+ 1)
Random forest model\{\{noshy\}\} \newline % Row Count 17 (+ 1)
`library('randomForest')`\{\{noshy\}\} \newline % Row Count 18 (+ 1)
`?randomForest`\{\{noshy\}\} \newline % Row Count 19 (+ 1)
`set.seed(1738)`\{\{noshy\}\} \newline % Row Count 20 (+ 1)
`train\_y = train{[},'median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 21 (+ 1)
`train\_x = train{[}, names(train)  != 'median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 23 (+ 2)
`rf\_model = randomForest(train\_x, y = train\_y ,  ntree = 500, importance = TRUE)`\{\{noshy\}\} \newline % Row Count 25 (+ 2)
`rf\_model\$importance`\{\{noshy\}\} \newline % Row Count 26 (+ 1)
\textbf{Output Picture 6}\{\{noshy\}\} \newline % Row Count 27 (+ 1)
The out-of-bag (oob) error estimate\{\{noshy\}\} \newline % Row Count 28 (+ 1)
`oob\_prediction = predict(rf\_model) `\{\{noshy\}\} \newline % Row Count 29 (+ 1)
`\#leaving out a data source forces OOB predictions`\{\{noshy\}\} \newline % Row Count 31 (+ 2)
} \tn

% Box "Step 4: Test some predictive models. (cont)": bold white title row,
% body row with the R commands and their printed results, trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the backticks and the escaped \{\{noshy\}\} markers are typeset
% literally (backticks come out as opening quote marks); they look like
% unprocessed source markup -- confirm the intended output.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Step 4: Test some predictive models. (cont)}}  \tn
\mymulticolumn{1}{x{8.4cm}}{`train\_mse = mean(as.numeric ((oob\_prediction - train\_y)\textasciicircum{}2))`\{\{noshy\}\} \newline % Row Count 2 (+ 2)
`oob\_rmse = sqrt(train\_mse)`\{\{noshy\}\} \newline % Row Count 3 (+ 1)
`oob\_rmse`\{\{noshy\}\} \newline % Row Count 4 (+ 1)
{\emph{48976.2521584537}}\{\{noshy\}\} \newline % Row Count 5 (+ 1)
`test\_y = test{[},'median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 6 (+ 1)
`test\_x = test{[}, names(test) !='median\_house\_value'{]}`\{\{noshy\}\} \newline % Row Count 8 (+ 2)
`y\_pred = predict(rf\_model , test\_x)`\{\{noshy\}\} \newline % Row Count 9 (+ 1)
`test\_mse = mean(((y\_pred - test\_y)\textasciicircum{}2))`\{\{noshy\}\} \newline % Row Count 10 (+ 1)
`test\_rmse = sqrt(test\_mse)`\{\{noshy\}\} \newline % Row Count 11 (+ 1)
`test\_rmse`\{\{noshy\}\} \newline % Row Count 12 (+ 1)
{\emph{48354.9021429439}}\{\{noshy\}\}% Row Count 13 (+ 1)
} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box "Output Picture 5": bold white title row, centred image row (5.1cm
% wide), trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the image path is absolute and contains spaces and non-ASCII
% characters -- verify that it resolves in the build environment.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 5}}  \tn
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670615379_ экрана 2022-12-09 в 20.42.12.png}}} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

% Box titled "1": bold white title row followed by a single empty body row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{1}}  \tn
% Row 0
\mymulticolumn{1}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)

% Box "Output Picture 6": bold white title row, centred image row (5.1cm
% wide), trailing marker row.
% \bfseries replaces the obsolete LaTeX 2.09 switch \bf.
% NOTE(review): the image path is absolute and contains spaces and non-ASCII
% characters -- verify that it resolves in the build environment.
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output Picture 6}}  \tn
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/worlddoit_1670615421_ экрана 2022-12-09 в 20.43.43.png}}} \tn
\mymulticolumn{1}{x{8.4cm}}{(***Advanced)}  \tn

