
R Project Example Cheat Sheet (DRAFT)

R example project, supplementing the lectures

This is a draft cheat sheet. It is a work in progress and is not finished yet.

Intro

This tutorial is a good first step for anyone looking to learn how to explore data, clean it, and train and evaluate some basic machine learning algorithms.

Resources:
Main
Additional 1
Additional 2

Step 1: Load in the data

library(tidyverse)
library(reshape2)

# Read the raw data and take a first look
housing = read.csv('../input/housing.csv')
head(housing)
summary(housing)

(output: first rows and summary statistics of housing)
colnames(housing)

# One histogram per numeric variable; melt() stacks the columns so
# facet_wrap() can draw a separate panel for each
ggplot(data = melt(housing), mapping = aes(x = value)) +
    geom_histogram(bins = 30) +
    facet_wrap(~variable, scales = 'free_x')

(output: grid of histograms, one panel per numeric variable)
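The summary above flags missing values in total_bedrooms. A quick added check (not part of the original tutorial) to confirm which columns have gaps before cleaning:

# Count missing values per column; only total_bedrooms should be non-zero
colSums(is.na(housing))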

Step 2: Clean the data

Impute missing values
housing$total_bedrooms[is.na(housing$total_bedrooms)] = median(housing$total_bedrooms, na.rm = TRUE)
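The median is a safer fill than the mean here because total_bedrooms is right-skewed (visible in the histograms from Step 1). If several columns had gaps, the same idea generalizes; a sketch of a generic median-imputation loop (an addition, not from the tutorial):

# Median-impute every numeric column that contains NAs
for (col in names(housing)) {
  if (is.numeric(housing[[col]]) && anyNA(housing[[col]])) {
    housing[[col]][is.na(housing[[col]])] = median(housing[[col]], na.rm = TRUE)
  }
}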


Convert the 'total' columns into per-household means
housing$mean_bedrooms = housing$total_bedrooms/housing$households

housing$mean_rooms = housing$total_rooms/housing$households

drops = c('total_bedrooms', 'total_rooms')

housing = housing[ , !(names(housing) %in% drops)]


Turn the categorical variable into boolean (one-hot) columns
categories = unique(housing$ocean_proximity)

#split the categories off

cat_housing = data.frame(ocean_proximity = housing$ocean_proximity)

# Add one indicator column per category, initialised to 0
for(cat in categories){

    cat_housing[,cat] = rep(0, times = nrow(cat_housing))

}

# For each row, set the indicator for its category to 1
for(i in 1:nrow(cat_housing)){

    cat = as.character(cat_housing$ocean_proximity[i])

    cat_housing[,cat][i] = 1

}

# Drop the original text column, keeping only the indicator columns
cat_columns = names(cat_housing)

keep_columns = cat_columns[cat_columns != 'ocean_proximity']

cat_housing = select(cat_housing, one_of(keep_columns))
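The loop above builds the indicator columns by hand, which makes the mechanics explicit. Base R's model.matrix can produce the same one-hot encoding in a single call; a sketch of the alternative (cat_housing_alt is a hypothetical name, not used later in the tutorial):

# '- 1' drops the intercept so every category gets its own 0/1 column
cat_housing_alt = as.data.frame(model.matrix(~ ocean_proximity - 1, data = housing))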


Scale the numerical variables
drops = c('ocean_proximity', 'median_house_value')

housing_num = housing[ , !(names(housing) %in% drops)]

# Centre each column to mean 0 and scale to standard deviation 1
scaled_housing_num = scale(housing_num)
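scale() also stores the centring and scaling vectors it used as attributes on the result, which is what you would need to apply the identical transform to new, unseen data; a small added illustration (new_data is a placeholder):

# Retrieve the per-column means and standard deviations used above
centers = attr(scaled_housing_num, 'scaled:center')
sds = attr(scaled_housing_num, 'scaled:scale')
# New data could then be transformed consistently with:
# scale(new_data, center = centers, scale = sds)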


Merge the altered numerical and categorical dataframes
cleaned_housing = cbind(cat_housing, scaled_housing_num, median_house_value=housing$median_house_value)


head(cleaned_housing)

(output: first rows of the cleaned_housing dataframe)
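Before splitting and modelling, a quick added sanity check (not in the original) that the cleaned frame is fully numeric with no remaining NAs:

any(is.na(cleaned_housing))               # expect FALSE
all(sapply(cleaned_housing, is.numeric))  # expect TRUE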
 

Step 3: Create a test set of data

set.seed(1738)

sample = sample.int(n = nrow(cleaned_housing), size = floor(.8 * nrow(cleaned_housing)), replace = FALSE)

train = cleaned_housing[sample, ] #just the samples

test  = cleaned_housing[-sample, ] #everything but the samples

nrow(train) + nrow(test) == nrow(cleaned_housing)

TRUE
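sample.int draws 80% of the row indices without replacement, so indexing with sample and -sample partitions the rows exactly; the check above confirms no rows were lost. A further added check that the two sets do not overlap:

length(intersect(rownames(train), rownames(test))) == 0  # expect TRUE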

Step 4: Test some predictive models

library('boot')

?cv.glm

glm_house = glm(median_house_value ~ median_income + mean_rooms + population, data = cleaned_housing)

k_fold_cv_error = cv.glm(cleaned_housing, glm_house, K = 5)

k_fold_cv_error$delta

6946162248.89155
6942675168.18876

delta holds the raw and the bias-corrected cross-validation MSE estimates; the square root converts the first into an RMSE in the units of the target (dollars).

glm_cv_rmse = sqrt(k_fold_cv_error$delta)[1]

glm_cv_rmse

83343.6395227107

glm_house$coefficients

(output: glm coefficient estimates)
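Note the glm above is fit and cross-validated on the full cleaned_housing, not just the training split. For a comparison on the same held-out rows the random forest will use, one could refit on train and score on test; a sketch (an addition, reusing the same three predictors; glm_train and glm_test_pred are hypothetical names):

# Fit on the training split only, then compute RMSE on the held-out test rows
glm_train = glm(median_house_value ~ median_income + mean_rooms + population, data = train)
glm_test_pred = predict(glm_train, newdata = test)
sqrt(mean((glm_test_pred - test$median_house_value)^2))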

Random forest model
library('randomForest')

?randomForest

set.seed(1738)

train_y = train[, 'median_house_value']

train_x = train[, names(train) != 'median_house_value']

rf_model = randomForest(train_x, y = train_y, ntree = 500, importance = TRUE)

rf_model$importance

(output: variable importance table for the random forest)
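randomForest also ships a plotting helper for the importance table; a one-line added visualization:

# Plot %IncMSE and IncNodePurity for each predictor
varImpPlot(rf_model)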

The out-of-bag (oob) error estimate
oob_prediction = predict(rf_model)

# with no newdata argument, predict() returns the out-of-bag predictions

train_mse = mean(as.numeric((oob_prediction - train_y)^2))

oob_rmse = sqrt(train_mse)

oob_rmse

48976.2521584537

test_y = test[,'median_house_value']

test_x = test[, names(test) !='median_house_value']

y_pred = predict(rf_model , test_x)

test_mse = mean((y_pred - test_y)^2)

test_rmse = sqrt(test_mse)

test_rmse

48354.9021429439
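The test RMSE is close to the OOB estimate and well below the glm's cross-validated RMSE, which suggests the forest generalizes well here. A compact added comparison of the three numbers computed above:

# Side-by-side RMSE comparison using the variables computed above
data.frame(model = c('glm (5-fold CV)', 'rf (out-of-bag)', 'rf (test set)'),
           rmse = c(glm_cv_rmse, oob_rmse, test_rmse))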