\documentclass[10pt,a4paper]{article}

% Preamble: package loading, PDF metadata and page geometry.
% One statement per line so that each trailing `%` comment ends at its own
% line break instead of commenting out the code that follows it.

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{skydlins}
\pdfinfo{
  /Title (r-programming.pdf)
  /Creator (Cheatography)
  /Author (skydlins)
  /Subject (R Programming Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores <colour> globally in \RowColorName and
% colours the current table row with it via \rowcolor.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose column spec is
% prefixed with \columncolor{\RowColorName}, so the spanning cell keeps the
% colour last set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
% x{<width>}: p{<width>} column with \raggedright applied to each cell.
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
% \tn: shorthand for \tabularnewline, used to end every table row.
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{FAA7EF}
\definecolor{LightBackground}{HTML}{FDE9FB}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author/link line next to it.
\fancyhead[L]{
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
      \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
  \end{tabulary}
  \columnbreak
  \begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{R Programming Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{skydlins} via \textcolor{DarkBackground}{\uline{cheatography.com/194596/cs/40633/}}}
  \end{tabulary}
  \end{multicols}}
% Footer: three tabulary boxes (Cheatographer, Cheat Sheet, Sponsor).
% This source line ends inside the second box; the footer argument is
% closed further down the file.
\fancyfoot[L]{ \footnotesize
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}skydlins \\
    \uline{cheatography.com/skydlins} \\
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Not Yet Published.\\
    Updated 5th October, 2023.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
% Third footer box: sponsor text (the sponsor image stays disabled).
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns
% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.
\begin{multicols*}{3}
% ---- Table: Data Structures ----
% Each row: set the row colour, then "<left cell> & <right cell> \tn".
% The "Row Count" comments are running line totals from the generator.
\begin{tabularx}{5.377cm}{x{1.4931 cm} x{3.4839 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Structures}} \tn
% Row 0
\SetRowColor{LightBackground}
{\bf{Vector}} & ordered array of elements of the same data type a\textless{}-c(3,1,5) \tn % Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Vector Naming & a\textless{}-c("desks" = 1, "tables" = 3, "chairs" = 4) \tn % Row Count 5 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Vector Coercion & a\textless{}-c(TRUE, FALSE, TRUE) = 1 0 1 \tn % Row Count 7 (+ 2)
% Row 3
\SetRowColor{white}
 & seq(1,9,2) and rep(c(2,3,4), 3) \tn % Row Count 9 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Vector Subsetting & materials \textless{}- c(wood = 17, cloth = 36, silver = 24, gold = 3) \tn % Row Count 12 (+ 3)
% Row 5
\SetRowColor{white}
 & materials{[}1{]} = wood = 17 \tn % Row Count 13 (+ 1)
% Row 6
\SetRowColor{LightBackground}
{\bf{Matrix}} & vector of elements arranged in two dimensions \tn % Row Count 15 (+ 2)
% Row 7
\SetRowColor{white}
 & m1\textless{}-matrix(3:8,ncol=3,nrow=2) \tn % Row Count 17 (+ 2)
% Row 8
\SetRowColor{LightBackground}
 & m2\textless{}-3:8 and dim(m2)\textless{}-c(3,2) \tn % Row Count 18 (+ 1)
% Row 9
\SetRowColor{white}
{\bf{Factor}} & used to store categorical variables (numeric or character) \tn % Row Count 21 (+ 3)
% Row 10
\SetRowColor{LightBackground}
 & a\textless{}-c(0,1,0,0,1) \tn % Row Count 22 (+ 1)
% Row 11
\SetRowColor{white}
 & a.f\textless{}-factor(a,labels = c("Male","Female")) \tn % Row Count 24 (+ 2)
% Row 12
\SetRowColor{LightBackground}
 & a.f = Male Female Male Male Female \tn % Row Count 26 (+ 2)
% Row 13
\SetRowColor{white}
gl() function & generate factors by specifying the pattern of their levels \tn % Row Count 29 (+ 3)
% Row 14
\SetRowColor{LightBackground}
 & gl(2,8,labels=c("male","female")) \tn % Row Count 31 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Data Structures (cont) ----
\begin{tabularx}{5.377cm}{x{1.4931 cm} x{3.4839 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Structures (cont)}} \tn
% Row 15
\SetRowColor{LightBackground}
{\bf{List}} & multiple types of elements list() \tn % Row Count 2 (+ 2)
% Row 16
\SetRowColor{white}
 & Mike\textless{}-list(Name="Mike",Salary=10000,Age=43,Children=c("Tom","Lily","Alice")) \tn % Row Count 5 (+ 3)
% Row 17
\SetRowColor{LightBackground}
\#\$ & is a convenient way to retrieve element by element name. \tn % Row Count 8 (+ 3)
% Row 18
\SetRowColor{white}
str() & display the internal structure \tn % Row Count 10 (+ 2)
% Row 19
\SetRowColor{LightBackground}
c() & combine several lists into one \tn % Row Count 12 (+ 2)
% Row 20
\SetRowColor{white}
{\bf{Array}} & multi-dimensional arrangement of data in a vector.
\tn % Row Count 14 (+ 2)
% The \tn above ends the table row whose text sits on the preceding source line.
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
% ---- Table: Exploring Data ----
\begin{tabularx}{5.377cm}{x{1.89126 cm} x{3.08574 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Exploring Data}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Missing Data}}} \tn % Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Causes & human error, system error, loopholes \tn % Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Dealing & summary() - how much data is missing \tn % Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
missing categorical data & set a new category called "Unknown" \tn % Row Count 7 (+ 2)
% Row 4
\SetRowColor{LightBackground}
missing numerical data & assign mean value or assign a value based on its relationship to other related variables \tn % Row Count 11 (+ 4)
% Row 5
\SetRowColor{white}
Other Data Problems & data entry, logical errors, outdated, inconsistent \tn % Row Count 14 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
% ---- Table: Data Visualization ----
\begin{tabularx}{5.377cm}{x{2.04057 cm} x{2.93643 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Visualization}} \tn
% Row 0
\SetRowColor{LightBackground}
Principles & Simplify, Compare, Attend (Details), Explore (Visual), View diversely, Ask why, Be skeptical, Respond \tn % Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
{\bf{GGPlot2}} & (+) allows us to make complex and aesthetically pleasing plots quickly and intuitively \tn % Row Count 9 (+ 4)
% Row 2
\SetRowColor{LightBackground}
 & (-) work exclusively with data tables \tn % Row Count 11 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Components}}} \tn % Row Count 12 (+ 1)
% Row 4
\SetRowColor{LightBackground}
data & data table in the example plot is summarized.
\tn % Row Count 14 (+ 2)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 5
\SetRowColor{white}
geometry & scatter plot, histograms, smooth densities, q-q plots, and box plots. \tn % Row Count 18 (+ 4)
% Row 6
\SetRowColor{LightBackground}
aesthetic mapping & x and y axis \tn % Row Count 20 (+ 2)
% Row 7
\SetRowColor{white}
scale & range of x-axis and y-axis appear to be defined by the range of the data \tn % Row Count 24 (+ 4)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{labels, title, legend,} \tn % Row Count 25 (+ 1)
% Row 9
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Creating a New Plot}}} \tn % Row Count 26 (+ 1)
% Row 10
\SetRowColor{LightBackground}
ggplot() function & specify the graph's data component. \tn % Row Count 28 (+ 2)
% Row 11 (\% is an escaped percent sign, not a comment)
\SetRowColor{white}
df \%\textgreater{}\% ggplot() & associates the dataset with the plotting object \tn % Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Data Visualization (cont) ----
\begin{tabularx}{5.377cm}{x{2.04057 cm} x{2.93643 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Visualization (cont)}} \tn
% Row 12
\SetRowColor{LightBackground}
geom\_point() & add a layer, assigning population to x and total to y \tn % Row Count 3 (+ 3)
% Row 13
\SetRowColor{white}
aes() & recognizes variables from the data component \tn % Row Count 5 (+ 2)
% Row 14
\SetRowColor{LightBackground}
geom\_label() and geom\_text() & functions to add text to the plot.
\tn % Row Count 7 (+ 2)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 15
\SetRowColor{white}
Size Color & geom\_point(size = 3, color = "blue") \tn % Row Count 9 (+ 2)
% Row 16
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{geom\_histogram()} \tn % Row Count 10 (+ 1)
% Row 17
\SetRowColor{white}
geom\_density() & create smooth densities \tn % Row Count 11 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
% ---- Table: Programming Structure and Functions ----
\begin{tabularx}{5.377cm}{x{2.18988 cm} x{2.78712 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Programming Structure and Functions}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Basic }}} \tn % Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
if-else & use curly braces "\{\}" \tn % Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{if(boolean condition)\{ expressions \} else\{ alternative expressions \}} \tn % Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
any() (similar to OR "|") & returns TRUE if any of the logicals are true \tn % Row Count 8 (+ 3)
% Row 4
\SetRowColor{LightBackground}
z \textless{}- c(TRUE, TRUE, FALSE) any(z) & TRUE \tn % Row Count 10 (+ 2)
% Row 5
\SetRowColor{white}
all() (similar to \&) & returns TRUE if all of the logicals are true \tn % Row Count 13 (+ 3)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Basic Functions}}} \tn % Row Count 14 (+ 1)
% Row 7
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{my\_function \textless{}- function(x)\{ operations that operate on x which is defined by user of function value of final line is returned \}} \tn % Row Count 17 (+ 3)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{For Loops}}} \tn % Row Count 18 (+ 1)
% Row 9
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{for (i in range of values)\{ operations that use i, which is changing across the range of values \}} \tn % Row Count 20 (+ 2)
% Row 10
\SetRowColor{LightBackground}
for (i in 1:5)\{ print(i) \} & \#\# {[}1{]} 1 \#\# {[}1{]} 2 \#\# {[}1{]} 3 \#\# {[}1{]} 4 \#\# {[}1{]} 5 \tn % Row Count 22 (+ 2)
% Row 11
\SetRowColor{white}
{\bf{apply()}} & apply a function to the margin of a matrix or a dataframe \tn % Row Count 25 (+ 3)
% Row 12
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{apply(x, MARGIN, FUNC, ...)} \tn % Row Count 26 (+ 1)
% Row 13
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{z \textless{}- cbind(A=1:3,B=4:6,C=7:9,D=10:12)} \tn % Row Count 27 (+ 1)
% Row 14
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{apply(z,2,sum)} \tn % Row Count 28 (+ 1)
% Row 15
\SetRowColor{white}
{\bf{lapply()}} & works on list or vector inputs instead of matrix/dataframe input. \tn % Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Programming Structure and Functions (cont) ----
\begin{tabularx}{5.377cm}{x{2.18988 cm} x{2.78712 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Programming Structure and Functions (cont)}} \tn
% Row 16
\SetRowColor{LightBackground}
 & returns a list of the same length as the given list or array. \tn % Row Count 3 (+ 3)
% Row 17
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{x \textless{}- list(A=1:4, B=seq(0.1,1,by=0.1))} \tn % Row Count 4 (+ 1)
% Row 18
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{lapply(x, mean)} \tn % Row Count 5 (+ 1)
% Row 19
\SetRowColor{white}
sapply() & wrapper of the lapply() function. It also takes in a list or vector, however it returns a vector instead of a list \tn % Row Count 11 (+ 6)
% Row 20
\SetRowColor{LightBackground}
vapply() & performs exactly like lapply() except that we can specify the return value type from FUNC \tn % Row Count 16 (+ 5)
% Row 21
\SetRowColor{white}
 & can be faster if we know that our output can use an atomic data type that takes up less memory space.
\tn % Row Count 21 (+ 5)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 22
\SetRowColor{LightBackground}
rapply() & applies a specified function to all elements of a list recursively \tn % Row Count 24 (+ 3)
% Row 23
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{x \textless{}- list(A=2,B=list(-1,3),C=list(-2,list(-5,6)))} \tn % Row Count 25 (+ 1)
% Row 24
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{rapply(x, function(x)\{x\textasciicircum{}2\}) \#returns a vector} \tn % Row Count 26 (+ 1)
% Row 25
\SetRowColor{white}
mapply() & take multiple vectors as inputs. \tn % Row Count 28 (+ 2)
% Row 26
\SetRowColor{LightBackground}
tapply() & applies the specified FUNC to each group of an array, grouped based on levels of certain factors. \tn % Row Count 33 (+ 5)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Programming Structure and Functions (cont) ----
\begin{tabularx}{5.377cm}{x{2.18988 cm} x{2.78712 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Programming Structure and Functions (cont)}} \tn
% Row 27
\SetRowColor{LightBackground}
Pivot Table & grouping data by different fields \tn % Row Count 2 (+ 2)
% Row 28
\SetRowColor{white}
 & summarize the data with your own function for specific purposes \tn % Row Count 5 (+ 3)
% Row 29 (\$ is an escaped dollar sign, not math mode)
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{data(murders) tapply(murders\$total, murders\$region, sum)} \tn % Row Count 7 (+ 2)
% Row 30
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{\seqsplit{tapply(murders\$total/murders\$population}, murders\$region, mean)} \tn % Row Count 9 (+ 2)
% Row 31
\SetRowColor{LightBackground}
split() & split a dataframe into a list of data frames based on a factor array.
\tn % Row Count 13 (+ 4)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 32
\SetRowColor{white}
tapply() & group data by multiple factors \tn % Row Count 15 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
% ---- Table: Basic Data Wrangling ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Basic Data Wrangling}} \tn
% Row 0
\SetRowColor{LightBackground}
{\bf{Data Frame}} & use the data.frame() function. elements in the same column should be of the same data type. \tn % Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
 & name \textless{}- c("Anne"), age \textless{}- c(28), child \textless{}- c(FALSE) \tn % Row Count 8 (+ 3)
% Row 2
\SetRowColor{LightBackground}
 & df \textless{}- data.frame(name, age, child) \tn % Row Count 10 (+ 2)
% Row 3
\SetRowColor{white}
Data Frame Naming & names(df) \textless{}- c("Name", "Age", "Child") \tn % Row Count 12 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Data Frame Structure & Data Frame in R is implemented as a list of vectors with an important restriction of equal length vectors. \tn % Row Count 18 (+ 6)
% Row 5
\SetRowColor{white}
 & R stores the character data type as a factor instead \tn % Row Count 21 (+ 3)
% Row 6
% NOTE(review): this row looks off. str() only displays structure (see the
% Data Structures table); the author presumably meant stringsAsFactors = FALSE
% and "factors" rather than "vectors". Confirm with the author before changing.
\SetRowColor{LightBackground}
str() & prevents R from converting the characters to vectors \tn % Row Count 24 (+ 3)
% Row 7
\SetRowColor{white}
Data Frame Subsetting & "{[}{]}" and "{[}{[}{]}{]}" and "\$" \tn % Row Count 26 (+ 2)
% Row 8
\SetRowColor{LightBackground}
 & df{[}3,2{]} \#r3c2 \tn % Row Count 27 (+ 1)
% Row 9
\SetRowColor{white}
c() & used to subset multiple portions of the Data Frame. \tn % Row Count 30 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Basic Data Wrangling (cont) ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Basic Data Wrangling (cont)}} \tn
% Row 10
\SetRowColor{LightBackground}
Data Frame Extension & adding new variables or observations to an existing Data Frame.
\tn % Row Count 4 (+ 4)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 11
\SetRowColor{white}
 & height \textless{}- c(163, 177, 163, 162, 157) \tn % Row Count 6 (+ 2)
% Row 12 (\$ is an escaped dollar sign, \# an escaped hash)
\SetRowColor{LightBackground}
 & df\$height \textless{}- height \tn % Row Count 7 (+ 1)
% Row 13
\SetRowColor{white}
Sorting & sort(df\$age) \#based on age \tn % Row Count 9 (+ 2)
% Row 14
\SetRowColor{LightBackground}
 & max(df\$age) \#getting the highest age \tn % Row Count 11 (+ 2)
% Row 15
\SetRowColor{white}
 & which.max(df\$age) \#index of the oldest person \tn % Row Count 14 (+ 3)
% Row 16
\SetRowColor{LightBackground}
Data Frame Indexing & find specific cases in DF \tn % Row Count 16 (+ 2)
% Row 17
\SetRowColor{white}
 & index \textless{}- df\$height \textgreater{} 171 \tn % Row Count 18 (+ 2)
% Row 18
\SetRowColor{LightBackground}
 & sum(index) \#number of people taller than the male average \tn % Row Count 21 (+ 3)
% Row 19
\SetRowColor{white}
 & df\$name{[}index{]} \#person who is taller: pete \tn % Row Count 24 (+ 3)
% Row 20
\SetRowColor{LightBackground}
finding those older than 30 without children. & index \textless{}- df\$age \textgreater{} 30 \& df\$child == FALSE \tn % Row Count 27 (+ 3)
% Row 21
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{library(dplyr)} \tn % Row Count 28 (+ 1)
% Row 22
\SetRowColor{LightBackground}
mutate() function & extend DF for row and col \tn % Row Count 30 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Basic Data Wrangling (cont) ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Basic Data Wrangling (cont)}} \tn
% Row 23
\SetRowColor{LightBackground}
 & df \textless{}- mutate(df, bmi = weight/height\textasciicircum{}2*10000) \tn % Row Count 3 (+ 3)
% Row 24
\SetRowColor{white}
 & or df\$bmi \textless{}- df\$weight/df\$height\textasciicircum{}2*10000 \tn % Row Count 5 (+ 2)
% Row 25
\SetRowColor{LightBackground}
filter() & subset rows \tn % Row Count 6 (+ 1)
% Row 26
\SetRowColor{white}
 & filter(df, bmi \textgreater{} 18.5 \& bmi \textless{} 24.9) \tn % Row Count 8 (+ 2)
% Row 27
\SetRowColor{LightBackground}
select() & health \textless{}- select(df, name, height, weight, bmi) \tn % Row Count 11 (+ 3)
% Row 28
\SetRowColor{white}
 & filter(health, bmi \textgreater{} 18.5 \& bmi \textless{} 24.9) \tn % Row Count 13 (+ 2)
% Row 29 (\% is an escaped percent sign, not a comment)
\SetRowColor{LightBackground}
\%\textgreater{}\% & chain these three functions together. \tn % Row Count 15 (+ 2)
% Row 30
\SetRowColor{white}
 & df \%\textgreater{}\% select(name, height, weight, bmi) \%\textgreater{}\% filter(bmi \textgreater{} 18.5 \& bmi \textless{} 24.9) \tn % Row Count 19 (+ 4)
% Row 31
\SetRowColor{LightBackground}
merge 2 df based on col & right\_join \& left\_join \tn % Row Count 21 (+ 2)
% Row 32
\SetRowColor{white}
suffix & added to the column names from each data frame to make them unique in the result.
\tn % Row Count 26 (+ 5)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 33
\SetRowColor{LightBackground}
 & should be a vector with two elements \tn % Row Count 28 (+ 2)
% Row 34
\SetRowColor{white}
 & \seqsplit{right\_join(driver\_q2}, constructors, by = c("constructor" = "constructor"),suffix = c("\_driver", "\_constructor")) \tn % Row Count 34 (+ 6)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Basic Data Wrangling (cont) ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Basic Data Wrangling (cont)}} \tn
% Row 35
\SetRowColor{LightBackground}
inner\_join & returns only the rows that have matching values in both data frames based on specified key columns \tn % Row Count 5 (+ 5)
% Row 36
\SetRowColor{white}
union & combine two or more data frames vertically, stacking them on top of each other. \tn % Row Count 9 (+ 4)
% Row 37
\SetRowColor{LightBackground}
anti\_join & filtering rows from the first data frame based on values that do not have matching values in the second data frame. \tn % Row Count 15 (+ 6)
% Row 38
\SetRowColor{white}
commonly used for df & rbind \& bind\_rows \tn % Row Count 16 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
% ---- Table: Advanced Data Wrangling ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Advanced Data Wrangling}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Importing Data}}} \tn % Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Via readr & read\_csv: comma separated values \tn % Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
 & read\_tsv: tab delimited separated values \tn % Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
 & read\_delim: general text file format \tn % Row Count 7 (+ 2)
% Row 4
\SetRowColor{LightBackground}
 & head() function display it as a tibble.
\tn % Row Count 9 (+ 2)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 5
\SetRowColor{white}
readxl & read\_excel,xls,xlsx \tn % Row Count 10 (+ 1)
% Row 6
\SetRowColor{LightBackground}
R-base & read.csv() and read.table() can be used without having to install any libraries \tn % Row Count 14 (+ 4)
% Row 7
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{R-base import function will automatically convert any character strings to factors} \tn % Row Count 16 (+ 2)
% Row 8
\SetRowColor{LightBackground}
CSV & widespread use in the data science community due to its efficiency at storing large amounts of data and also as it is platform agnostic. There is also no size limit with csv files. \tn % Row Count 25 (+ 9)
% Row 9
\SetRowColor{white}
Via URL & read\_csv(url) \tn % Row Count 26 (+ 1)
% Row 10
\SetRowColor{LightBackground}
tempdir() \& tempfile() & it is useful to have a temporary directory or filename auto generated to manage these URL imports \tn % Row Count 31 (+ 5)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Advanced Data Wrangling (cont) ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Advanced Data Wrangling (cont)}} \tn
% Row 11
\SetRowColor{LightBackground}
Via JSON & provided via API, library(jsonlite), fromJSON(url) \tn % Row Count 3 (+ 3)
% Row 12
\SetRowColor{white}
Via XML & crawling a website, \seqsplit{xmlParse("books.xml")} \tn % Row Count 5 (+ 2)
% Row 13
\SetRowColor{LightBackground}
xmlRoot() & access the root node of the tree.
\tn % Row Count 7 (+ 2)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 14
\SetRowColor{white}
xmlChildren() & use the children nodes of the tree \tn % Row Count 9 (+ 2)
% Row 15
\SetRowColor{LightBackground}
xmlToList(data), \seqsplit{xmlToDataFrame(books)} & convert the XML file to list or data frame format \tn % Row Count 12 (+ 3)
% Row 16
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Reshaping Data}}} \tn % Row Count 13 (+ 1)
% Row 17
\SetRowColor{LightBackground}
Wide to Tidy: gather() & convert the above wide data into tidy data \tn % Row Count 16 (+ 3)
% Row 18 (\% is an escaped percent sign, not a comment)
\SetRowColor{white}
country,year,fertility & new\_tidy\_data \textless{}- wide\_data \%\textgreater{}\% gather(year, fertility, '1960':'2015') \tn % Row Count 20 (+ 4)
% Row 19
\SetRowColor{LightBackground}
Tidy to Wide: spread() & The first argument of the spread() function is to declare which variables are to be used as column names. While the second argument is to specify the variables used to fill out the cells. \tn % Row Count 30 (+ 10)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Advanced Data Wrangling (cont) ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Advanced Data Wrangling (cont)}} \tn
% Row 20
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Separate and Unite} \tn % Row Count 1 (+ 1)
% Row 21
\SetRowColor{white}
separate() & requires the target column, the names for the new columns and the separator character.
\tn % Row Count 6 (+ 5)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 22 (\% is an escaped percent sign, not a comment)
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{dat \%\textgreater{}\% separate(key, c("year", "first\_variable\_name", \seqsplit{"second\_variable\_name")}, fill = "right")} \tn % Row Count 8 (+ 2)
% Row 23
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{spread()} \tn % Row Count 9 (+ 1)
% Row 24
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{dat \%\textgreater{}\% separate(key, c("year", "variable\_name"), extra = "merge") \%\textgreater{}\% spread(variable\_name, value)} \tn % Row Count 11 (+ 2)
% Row 25
\SetRowColor{white}
unite() & first name \& last name \tn % Row Count 13 (+ 2)
% Row 26
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Combining Data}}} \tn % Row Count 14 (+ 1)
% Row 27
\SetRowColor{white}
join() & combined so that matching rows are together \tn % Row Count 17 (+ 3)
% Row 28
\SetRowColor{LightBackground}
Inner Join & returns only the rows that have matching values in both tables \tn % Row Count 21 (+ 4)
% Row 29
\SetRowColor{white}
Left Join & returns all the rows from the left table and the matching rows from the right table \tn % Row Count 26 (+ 5)
% Row 30
\SetRowColor{LightBackground}
Full Join & all the rows from both tables, with NULL values in columns where there is no match in the other table \tn % Row Count 32 (+ 6)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
% ---- Table: Advanced Data Wrangling (cont) ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Advanced Data Wrangling (cont)}} \tn
% Row 31
\SetRowColor{LightBackground}
Semi Join & keep the part of the first table for which we have information in the second table, but doesn't add the columns of the second. \tn % Row Count 7 (+ 7)
% Row 32
\SetRowColor{white}
Anti Join & opposite of the semi\_join() function. It allows us to keep the part of the first table for which we have NO information in the second table, but doesn't add the columns of the second.
\tn % Row Count 17 (+ 10)
% The \tn above ends the table row whose text sits on the preceding source line.
% Row 33
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{{\bf{Set Operators}}} \tn % Row Count 18 (+ 1)
% Row 34
\SetRowColor{white}
Intersect: finds common elements shared among sets. & intersect(1:10, 6:15) = 6 7 8 9 10 \tn % Row Count 21 (+ 3)
% Row 35
% NOTE(review): the right-hand cell was truncated in the source ("interse");
% completed to "intersect" as the presumable intent. Confirm with the author.
\SetRowColor{LightBackground}
Union: combines sets into one, removing duplicates. & same with intersect \tn % Row Count 24 (+ 3)
% Row 36
\SetRowColor{white}
Setequal & helps us check if two sets are the same regardless of order. \tn % Row Count 27 (+ 3)
% Row 37
\SetRowColor{LightBackground}
Setdiff & find the elements that are in one set (or vector) but not in another set. \tn % Row Count 31 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
% That's all folks
\end{multicols*}
\end{document}