\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Raygun246} \pdfinfo{ /Title (data-202-r-studio-sql-wrangling-ggplot.pdf) /Creator (Cheatography) /Author (Raygun246) /Subject (DATA 202 | R Studio | SQL + Wrangling + GGplot Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{FFB5B8} \definecolor{LightBackground}{HTML}{FFF5F6} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{DATA 202 | R Studio | SQL + Wrangling + GGplot Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Raygun246} via \textcolor{DarkBackground}{\uline{cheatography.com/203728/cs/43414/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}Raygun246 \\ \uline{cheatography.com/raygun246} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 16th May, 2024.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{4} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 5}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{\#1) \newline library(DBI) \newline library(RSQLite) \newline \newline 2) \newline test\_conn \textless{}- \seqsplit{dbConnect(RSQLite::SQLite()}, \newline "test\_db.sqlite") \newline \newline 3) \newline file.info("test\_db.sqlite") \newline \newline 4) \newline test\_conn} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Needed Library's | Create Connection Special File | List Files in Folder | Test connection find where it exists} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 5 | Basic Operations}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline dbListTables(test\_conn) \newline \newline 2) \newline schools \textless{}- data.frame(School= \newline c("Mathematics and Statistics"), \newline Code=c("SMS","SGEES"), \newline Faculty=c("Science","Science") \newline ) \newline schools \newline \newline 3) \newline dbWriteTable(test\_conn, "vicschools", schools, overwrite=TRUE) \newline dbListTables(test\_conn) \newline \newline 4) \newline file.info("test\_db.sqlite") \newline \newline 5) \newline dbDisconnect(test\_conn)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Test whats in database | Define Dataframe in R | Copy into new database table called vicschools | Check db file size | Need to disconnect at end of session} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 5 | Useful Commands}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{This are the main ones we'll need. \newline \newline Connections \newline dbConnect() \newline dbDisconnect() \newline dbCanConnect() \newline \newline Finding out what is in the database \newline dbListTables() \newline dbExistsTable() \newline dbListFields() \newline \newline Fetching data from and Writing data to the database \newline dbReadTable() \newline dbWriteTable() (note overwrite and append options) \newline dbRemoveTable() \newline dbGetQuery() \newline \newline Controlling queries and changes to the database \newline dbExecute() \newline dbBegin() \newline dbCommit() \newline dbRollback() \newline dbFetch()} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 5 | Read + Delete Table}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline test\_conn \textless{}- \seqsplit{dbConnect(RSQLite::SQLite()}, "test\_db.sqlite") \newline dbReadTable(test\_conn, "vicschools") \newline \newline 2) \newline vv \textless{}- dbReadTable(test\_conn, "vicschools") \newline vv \newline \newline 3) \newline dbListTables(test\_conn) \newline dbRemoveTable(test\_conn, "vicschools")} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Reading | Storing | Deleting} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 5 | Rolling}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{By default when SQLite starts it is in auto-commit mode: \newline so that all changes that are requested are automatically made permanent. \newline \newline To make a set of tenative changes enter commit mode using \newline the dbBegin() command: \newline \newline dbBegin(test\_conn) \newline Then make a series of changes to the database. \newline \newline If you want to keep the changes go: \newline \newline dbCommit(test\_conn) \newline or if you want to abandon the changes go: \newline \newline dbRollback(test\_conn) \newline This abandons all changes made after the dbBegin() statement. \newline \newline After either of these two calls (dbCommit or dbRollback) \newline the database is back in auto-commit mode} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 5 | Rolling}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{By default when SQLite starts it is in auto-commit mode: \newline so that all changes that are requested are automatically made permanent. \newline \newline To make a set of tenative changes enter commit mode using \newline the dbBegin() command: \newline \newline dbBegin(test\_conn) \newline Then make a series of changes to the database. \newline \newline If you want to keep the changes go: \newline \newline dbCommit(test\_conn) \newline or if you want to abandon the changes go: \newline \newline dbRollback(test\_conn) \newline This abandons all changes made after the dbBegin() statement. \newline \newline After either of these two calls (dbCommit or dbRollback) \newline the database is back in auto-commit mode} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 6 | Using SQL select}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline library(DBI) \newline library(RSQLite) \newline test\_conn \textless{}- \seqsplit{dbConnect(RSQLite::SQLite()}, "test\_db.sqlite") \newline surf \textless{}- read.csv("surf.csv") \newline dbWriteTable(test\_conn, "surfshort", surf{[}1:10,1:8{]}, overwrite=TRUE) \newline \newline 2) \newline dbGetQuery(test\_conn, "SELECT {\emph{ FROM surfshort") \newline ss \textless{}- dbGetQuery(test\_conn, "SELECT }} FROM surfshort") \newline \newline \newline 3) \newline ```\{sql connection=test\_conn\} \newline SELECT * FROM surfshort \newline ``` \newline \newline 4) \newline SELECT marital, gender FROM surfshort} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Create n start table | Select n show table n save to var | What tp write in markdown | select specific column} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 6 | Where Clause}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline SELECT {\emph{ \newline FROM surfshort \newline WHERE Marital="never" \newline \newline 2) \newline SELECT }} \newline FROM surfshort \newline WHERE Marital\textless{}\textgreater{}"never" \newline \newline 3) \newline SELECT * FROM surfshort WHERE Marital="never" \newline \newline 4) \newline SELECT Age, Gender \newline FROM surfshort \newline WHERE Marital="never" AND Qualification="school" \newline \newline 5) \newline SELECT Age, Gender, Qualification, Marital, Marital, Qualification \newline FROM surfshort \newline WHERE (Marital="never" AND Qualification="school") OR Marital="married" \newline \newline 6) \newline SELECT Age, Gender AS Sex, Qualification, Marital, Marital AS MaritalStatus, Qualification \newline FROM surfshort \newline WHERE (Marital="never" AND Qualification="school") OR Marital="married" \newline \newline 7) \newline Common operators we want to use in WHERE clause are: \newline \newline AND \newline OR \newline NOT \newline and we make comparisons with \newline \newline =, \textless{}\textgreater{} \newline \textgreater{}, \textgreater{}=, \textless{}, \textless{}= \newline LIKE \newline IN \newline IS NULL, IS NOT NULL \newline Here NULL is the way SQL refers to missing data. \newline \newline 8) \newline SELECT Marital, Age, Qualification \newline FROM surfshort \newline WHERE Age IN (34,35,36,45) \newline ORDER BY Age DESC, Qualification} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Select using Conditions | Select not equal | select equal | multiple condition | Mulitple condition v2 pro | rename based on query | Common operators | Order by Ascending} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 6 | Creating Tables Manipulation}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline CREATE TABLE lecturers ( \newline first\_name TEXT, \newline last\_name TEXT, \newline start\_week INTEGER, \newline end\_week INTEGER, \newline school TEXT \newline \newline ) \newline \newline 2) \newline SELECT {\emph{ FROM lecturers \newline \newline \newline 3) \newline INSERT INTO lecturers (first\_name, last\_name, school) \newline VALUES \newline ("Richard","Arnold","SMS"), \newline ("Louise","McMillan","SMS"), \newline ("Ryan","Admiraal","SMS"), \newline ("John","Haywood","SMS") \newline \newline 4) \newline SELECT }} FROM lecturers \newline \newline 5) \newline UPDATE lecturers \newline SET start\_week=1, end\_week=6 \newline WHERE first\_name = "Richard" \newline \newline 6) \newline UPDATE lecturers SET school="Mathematics and Statistics" \newline \newline 7) \newline DELETE FROM lecturers WHERE first\_name="John" \newline \newline 8) \newline Delete Bunch \newline \newline 9) \newline DROP TABLE lecturers \newline \newline 10) \newline SELECT Marital, COUNT({\emph{) \newline FROM SURF \newline GROUP BY Marital \newline \newline SELECT Marital, COUNT(}}) AS Number, MIN(Age) as AgeMin, MAX(Age) as AgeMax \newline FROM SURF \newline WHERE Gender = "female" \newline GROUP BY Marital} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Create table | Insert Data | Insert Data V2 | Checking | Modify | Mulitple Rows at once | Delete | Delete bunch | Delete Table | Counts} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 6 | Joins}} \tn \SetRowColor{white} \mymulticolumn{1}{x{3.833cm}}{1) \newline % Row Count 1 (+ 1) SELECT * \newline % Row Count 2 (+ 1) FROM students LEFT JOIN enrolments \newline % Row Count 3 (+ 1) ON \seqsplit{students.idno=enrolments.idno} \newline % Row Count 4 (+ 1) ORDER BY idno \newline % Row Count 5 (+ 1) 2) \newline % Row Count 6 (+ 1) SELECT * \newline % Row Count 7 (+ 1) FROM enrolments INNER JOIN students \newline % Row Count 8 (+ 1) ON \seqsplit{students.idno=enrolments.idno} \newline % Row Count 9 (+ 1) ORDER BY idno \newline % Row Count 10 (+ 1) 3) \newline % Row Count 11 (+ 1) SELECT students.idno, enrolments.idno, "first.name", "last.name", course, grade \newline % Row Count 13 (+ 2) FROM students LEFT JOIN enrolments \newline % Row Count 14 (+ 1) ON \seqsplit{students.idno=enrolments.idno} \newline % Row Count 15 (+ 1) UNION \newline % Row Count 16 (+ 1) SELECT students.idno, enrolments.idno, "first.name", "last.name", course, grade \newline % Row Count 18 (+ 2) FROM enrolments LEFT JOIN students \newline % Row Count 19 (+ 1) ON \seqsplit{students.idno=enrolments.idno} \newline % Row Count 20 (+ 1) ORDER BY students.idno \newline % Row Count 21 (+ 1) 4) \newline % Row Count 22 (+ 1) xx \textless{}- \seqsplit{data.frame(colour=c("Red"},"Green","Blue"), \newline % Row Count 24 (+ 2) height=c("Tall","Tall","Short")) \newline % Row Count 25 (+ 1) yy \textless{}- \seqsplit{data.frame(width=c("wide"},"narrow")) \newline % Row Count 26 (+ 1) dbWriteTable(test\_conn, "xx", xx, overwrite=TRUE) \newline % Row Count 27 (+ 1) dbWriteTable(test\_conn, "yy", yy, overwrite=TRUE) \newline % Row Count 28 (+ 1) 5) \newline % Row Count 29 (+ 1) merge(xx, yy) \newline % Row Count 30 (+ 1) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 6 | Joins (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{3.833cm}}{merge(students, enrolments, by="idno") \newline % Row Count 1 (+ 1) 6) \newline % Row Count 2 (+ 1) Thus when combining two datasets with merge(): \newline % Row Count 3 (+ 1) all=FALSE (the default) keeps only matching records (inner join) \newline % Row Count 5 (+ 2) all=TRUE keeps all records from both datasets, whether matching or not (full outer join) \newline % Row Count 7 (+ 2) all.x=TRUE keeps all records from the first dataset (left join) \newline % Row Count 9 (+ 2) all.y=TRUE keeps all records from the second dataset (right join) \newline % Row Count 11 (+ 2) and \newline % Row Count 12 (+ 1) by=NULL does not use a matching key (cross join) \newline % Row Count 13 (+ 1) by="xxx" matches on column xxx in both tables \newline % Row Count 14 (+ 1) by.x="xxx", by.y="zzz" mathches column xxx in the first table with column zzz in the second. Thus if the matching key has different names in the two tables then the merge() command allows us to specify them separately.% Row Count 19 (+ 5) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Left Join | Inner Join | Full outer join | Cross Join | Merging in R | Sussy} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 6 | Subquery}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline We can use a subquery to define and populate a table \newline \newline CREATE TABLE counts \newline AS \newline SELECT idno, COUNT({\emph{) AS ncourses \newline FROM enrolments \newline GROUP BY idno \newline \newline 2) \newline dbRemoveTable(test\_conn, "counts") \newline \newline 3) \newline CREATE TABLE counts \newline (idno INTEGER, \newline ncourses INTEGER) \newline \newline 4) \newline INSERT INTO counts (idno, ncourses) \newline SELECT idno, COUNT(}}) AS ncourses \newline FROM enrolments \newline GROUP BY idno \newline \newline 5) \newline SELECT {\emph{ \newline FROM counts \newline WHERE ncourses = (SELECT MAX(ncourses) FROM counts) \newline \newline 6) \newline SELECT grade, COUNT(}}) AS num \newline FROM enrolments \newline GROUP BY grade \newline \newline 7) \newline SELECT grade, COUNT({\emph{) AS num, \newline ROUND(COUNT(}}){\emph{100.0/(SELECT COUNT(}}) FROM enrolments),1) AS pct \newline FROM enrolments \newline GROUP BY grade \newline \newline 8) \newline CREATE TABLE patients ( \newline PatientID INTEGER, \newline FirstName TEXT, \newline LastName TEXT, \newline DateOfBirth TEXT, \newline PRIMARY KEY(PatientID) \newline ) \newline \newline 9) \newline INSERT INTO patients (PatientID, FirstName, LastName, DateOfBirth) \newline VALUES \newline (1121, "Richard", "Arnold", "1/1/1965"), \newline (2155, "Ella", "Li", "6/7/1999"), \newline (2338, "Gemma", "Watson", "18/3/2001") \newline \newline 10) Chaning existing \newline DROP TABLE IF EXISTS simple \newline CREATE TABLE simple \newline (name TEXT) \newline INSERT INTO simple (name) \newline VALUES \newline ('Richard'), \newline ('John'), \newline ('Louise') \newline \newline We can rename the table \newline \newline DROP TABLE IF EXISTS csimple \newline ALTER TABLE simple RENAME TO csimple \newline dbListTables(test\_conn) \newline \newline 11) \newline We can insert further rows using a query to \newline \newline INSERT INTO simple \newline SELECT {\emph{ FROM csimple \newline Though we have to be sure that the column names coming in from csimple match those in simple or the INSERT won't work. \newline \newline We can add a column to an existing table: \newline \newline ALTER TABLE simple \newline ADD COLUMN \newline first\_week INTEGER \newline UPDATE simple SET first\_week = 1 WHERE name = 'Richard' \newline SELECT }} FROM simple \newline \newline 12) \newline Renaming a column is easy too: \newline \newline ALTER TABLE simple RENAME COLUMN first\_week to firstweek \newline \newline 13) date n time \newline DROP TABLE IF EXISTS dates \newline CREATE TABLE dates \newline (datestring TEXT) \newline INSERT INTO dates (datestring) \newline VALUES \newline ('2020-01-01'), \newline ('1977-12-25') \newline We can output any format we like using SELECT and various conversion functions. \newline \newline SELECT datestring, strftime("\%d/\%m/\%Y", datestring) FROM dates} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Create table | Remove Table | Create Table n Populate | Use where to find specific | Group by | Convert to percentages Rounded | Create primary key table so only one key per person| insert into new table | renaming |} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 7 | dyplr n tidyr}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{library(dplyr) \newline library(tidyr) \newline \newline 1) \newline Copying and renaming columns \newline Copying a data frame in base R \newline \newline surf.copy \textless{}- surf \newline Copying a data frame in dplyr \newline \newline surf.copy \textless{}- rename(surf) \newline but with the ability to rename columns as we go \newline \newline surf.copy \textless{}- rename(surf, Sex=Gender, \seqsplit{Highest\_Qualification=Qualification)} \newline surf{[}1:2,{]} \newline \newline 2) \newline Selecting specific columns \newline To list just the Age and Income columns in surf in Base R we go \newline \newline ageinc \textless{}- surf{[},c("Age","Income"){]} \newline ageinc{[}1:3,{]} \newline \newline \#\# Age Income \newline \#\# 1 15 87 \newline \#\# 2 40 596 \newline \#\# 3 38 497 \newline The select() function in dplyr allows us to go \newline \newline ageinc \textless{}- select(surf, Age, Income) \newline ageinc{[}1:3,{]} \newline \newline 3) \newline We can also omit columns, using the negative sign before the name \newline \newline noageinc \textless{}- select(surf, -Age, -Income) \newline noageinc{[}1:3,{]} \newline \newline 4) \newline and just like in Base R we can select columns by specifying their numeric locations: \newline \newline surf{[}1:3, c(1,6,7){]} \newline \newline 5) \newline base \newline surf{[}surf\$Gender=="female" \& surf\$Income\textgreater{}900,{]} \newline \newline In dplyr we can use the filter() function to achieve this \newline \newline filter(surf, Gender=="female" \& Income\textgreater{}900) \newline \newline 6) \newline tidyr \newline surf{[}surf\$Gender=="female" \& surf\$Income\textgreater{}900,{]} \newline \newline base \newline surf{[}which(surf\$Gender=="female" \& surf\$Income\textgreater{}900),{]} \newline \newline \newline 7) near certain tolerance \newline filter(starwars, near(height, 170, tol=5)) \newline \newline 8) \newline base \newline Reordering a data frame \newline We may want to reorder the rows of a data set by one more more variables. In base R the order() command allows us to to this. \newline \newline Here are the male high earners: \newline \newline mhe \textless{}- filter(surf, Income\textgreater{}1200, Gender=="male") \newline mhe \newline sort(mhe\$Age) \newline order(mhe\$Age) \newline \newline tidy \newline arrange(mhe, Age) \newline \newline two or more variables \newline mhe{[}order(mhe\$Qualification, mhe\$Age),{]} \newline \newline 9) \newline Creating new columns \newline In Base R we can create new columns by simply referring to a name that does not yet exist \newline \newline mhe\$AgeSquared \textless{}- mhe\$Age\textasciicircum{}2 \newline In dplyr we use the mutate() function - and we can create multiple new columns in one step: \newline \newline mhe \textless{}- mutate(mhe, N=nrow(mhe), AgeSquared=Age\textasciicircum{}2, AgeCubed=Age\textasciicircum{}3) \newline mhe \newline \newline 10) \newline subsurf \textless{}- surf \%\textgreater{}\% \newline select(-X) \%\textgreater{}\% \newline rename(Sex=Gender) \%\textgreater{}\% \newline \seqsplit{filter(Qualification\%in\%c("vocational"},"degre")) \%\textgreater{}\% \newline mutate(AgeSquared=Age\textasciicircum{}2) \newline In this chain of piped substatements, the pipe sends the output of each substatement to be the first argument of the function in the following substatement. We only specify the second and subsequent arguments. \newline \newline 11) \newline Now convert it to a tibble: \newline \newline mtcars \textless{}- as\_tibble(mtcars) \newline \newline We can convert a tibble back to a standard data frame with as.data.frame() \newline \newline mtcars \textless{}- as.data.frame(mtcars)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 7 | Extra}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1)Full join \newline \newline Keep all entries from A \newline and B \newline (i.e., keep entries in A \newline that do not have a match in B \newline , and keep entries in B \newline that do not have a match in A \newline ). \newline \newline merge(A, B, all = TRUE, ...) \newline merge(A, B, all.x = TRUE, all.y = TRUE, ...) \newline full\_join(A, B, by = id, ...) \newline Entries in A \newline that do not have matches in B \newline will have NAs in fields from B \newline , and vice versa. \newline \newline merge(students, enrolments, by="id", all=TRUE) \newline \newline 2) \newline In base R we can use reshape() \newline \newline reshape(failure\_data, \newline idvar="Course", \newline varying=c("D","E","Withdraw"), \newline times=c("D","E","Withdraw"), \newline timevar="Result", \newline v.names=c("Percentage"), \newline direction="long") \newline \newline We will do this using the pivot\_longer function, from the tidyr package: \newline \newline library(tidyr) \newline failure\_long \textless{}- \seqsplit{pivot\_longer(failure\_data}, \newline cols = c(D, E, Withdraw), \newline names\_to = "Result", \newline values\_to = "Percentage") \newline failure\_long \textless{}- arrange(failure\_long, Course) \newline kable(failure\_long) \newline \newline 3) \newline Transforming Data From Long to Wide Format \newline reshape(as.data.frame(gdp\_long\_2000s), \newline timevar="year", \newline v.names="gdpPercap", \newline direction="wide", \newline idvar="country") \newline \newline gdp\_wide \textless{}- \seqsplit{pivot\_wider(gdp\_long\_2000s}, names\_from = year, values\_from = gdpPercap) \newline kable(gdp\_wide)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Week 8 | GGplot}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{1) \newline Bar charts for categorical variables \newline barplot(table(rugby\$position), xlab="", ylab="Count", las=2) \newline \newline library(ggplot2) \newline \# Two ways to produce exactly the same bar chart of player position. \newline ggplot(rugby, aes(x = position)) + \newline geom\_bar() \newline \newline FLIP \newline ggplot(rugby) + \newline geom\_bar(aes(x = position)) + \newline coord\_flip() \newline \newline LABELS n THEME \newline ggplot(rugby) + \newline geom\_bar(aes(x = position, y = (..count..) / sum(..count..))) + \newline labs(x = "Position", y = "Proportion", title="Distributions over positions") + \newline theme(axis.title = element\_text(size=20)) \newline \newline 2) \newline BOXPLOT \newline \newline ggplot(rugby) + \newline geom\_boxplot(aes(x = weight\_kg)) + \newline labs(x="Weight (kg)") + \newline coord\_flip() \newline \newline 3) \newline HISTOGRAM \newline ggplot(rugby) + \newline geom\_histogram(aes(x = weight\_kg, y = ..density..), binwidth = 5) + \newline labs(x = "Weight (kg)", y = "Density") \newline \newline 4) \newline Frequency polygons and density plots for numeric variables \newline ggplot(rugby) + \newline geom\_freqpoly(aes(x = weight\_kg, y = ..density..), binwidth = 5) + \newline labs(x = "Weight (kg)", y = "Density") \newline \newline ggplot(rugby, aes(x = weight\_kg, y = ..density..)) + \newline geom\_histogram(binwidth = 5) + \newline labs(x = "Weight (kg)", y = "Density") + \newline geom\_freqpoly(binwidth = 5) \newline \newline ggplot(rugby) + \newline geom\_freqpoly(aes(x = weight\_kg), stat = "density") \newline \newline 5) \newline SCATTER TWO VARIABLES \newline \newline plot.settings \textless{}- ggplot(rugby, aes(x = height\_cm, y = weight\_kg)) + \newline labs(x = "Height (cm)", y = "Weight (kg)") + \newline theme\_classic() \newline \newline 6) \newline HEXSCATTER \newline library(hexbin) \newline plot.settings + \newline \newline 7) \newline our.scatterplot \textless{}- plot.settings + \newline geom\_point(position = "jitter") \# Scatterplot of weight versus height. \newline our.scatterplot + \newline geom\_smooth(method = "lm") \newline \newline 8) \newline BUNCH OF GRAPHS \newline ggplot(rugby) + \newline geom\_point(aes(x = height\_cm, y = weight\_kg), position = "jitter") + \newline facet\_wrap(\textasciitilde{}position) \newline \newline \newline 9) \newline Sidebyside box \newline ggplot(rugby) + \newline geom\_boxplot(aes(x = position, y = weight\_kg)) + \newline \newline 10) \newline Summary of Plot Types \newline Plot type geom type aes options Additional arguments \newline Bar chart geom\_bar x, y, fill position = "fill", position = "dodge", stat = "identity" \newline Histogram geom\_histogram x binwidth, bins \newline Boxplot geom\_boxplot x, y boxplot() \newline Scatterplot geom\_point x, y, colour, size, shape position = "jitter" \newline Line of best fit overlay, line plot geom\_line x, y, colour, linetype size \newline Hexagonally binned scatterplot geom\_hex x, y binwidth, bins \newline Bar/column chart geom\_col x, y, fill barplot() \newline labs(x="Position", y="Weight (kg)") \newline geom\_hex() \newline plot.settings + geom\_point()} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}