\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{datamansam} \pdfinfo{ /Title (cleaning-data-with-sql.pdf) /Creator (Cheatography) /Author (datamansam) /Subject (Cleaning Data with SQL Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A3A3A3} \definecolor{LightBackground}{HTML}{F3F3F3} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Cleaning Data with SQL Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{datamansam} via \textcolor{DarkBackground}{\uline{cheatography.com/139410/cs/32044/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}datamansam \\ \uline{cheatography.com/datamansam} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 30th September, 2022.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{3} \begin{tabularx}{5.377cm}{p{0.4977 cm} x{4.4793 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Regular Expressions}} \tn % Row 0 \SetRowColor{LightBackground} \textbackslash{}d & A digit \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} {[}{]} & Any item in brackets \tn % Row Count 2 (+ 1) % Row 2 \SetRowColor{LightBackground} ? & previous character 0 or 1 times \tn % Row Count 3 (+ 1) % Row 3 \SetRowColor{white} + & Previous character 1 or more times \tn % Row Count 4 (+ 1) % Row 4 \SetRowColor{LightBackground} * & Previous character 0 or more times \tn % Row Count 5 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Identifying Missing Data}} \tn % Row 0 \SetRowColor{LightBackground} Missing Completely at Random & Missing Not at Random \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} SELECT * FROM \seqsplit{restaurant\_inspection} WHERE score IS NULL; & SELECT inspection\_type, COUNT(*) as count FROM \seqsplit{restaurant\_inspection} WHERE score IS NULL \tn % Row Count 7 (+ 5) % Row 2 \SetRowColor{LightBackground} SELECT COUNT(*) FROM \seqsplit{restaurant\_inspection} WHERE score IS NULL; & GROUP BY inspection\_type ORDER BY count D \tn % Row Count 11 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Replacing null values with an average}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{UPDATE TABLE Patient \newline SET Income = (SELECT avg(Income ) FROM Patient) \newline WHERE Income IS NULL;} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Dropping Values where a column is null}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{SELECT count(*) \newline FROM Patient \newline WHERE Weight IS NULL;} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Dealing with Duplicates}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{WITH DuplicateWithNumber AS \newline ( SELECT Column1, Column2 \newline , row\_number() OVER ( PARTITION BY Column1, Column2 \newline ORDER BY Column3) AS NthAppearance \newline FROM YourTableName \newline ) \newline DELETE FROM DuplicateWithNumber \newline WHERE NthAppearance\textgreater{} 1} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Cleaning with Case}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{UPDATE Project..housing \newline SET SoldAsVacant=CASE \newline WHEN SoldAsVacant = 'Y' THEN 'Yes' \newline WHEN SoldAsVacant = 'N' THEN 'No' \newline ELSE SoldAsVacant \newline END} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Replacing missing values with COALESCE()}} \tn % Row 0 \SetRowColor{LightBackground} The COALESCE() functiin substitutes null valuees & SELECT name, COALESCE(score, -1), inspection\_type FROM \seqsplit{restaurant\_inspection} \tn % Row Count 4 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{1.55618 cm} x{1.51041 cm} x{1.51041 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{5.377cm}}{\bf\textcolor{white}{SQL Set Operations}} \tn % Row 0 \SetRowColor{LightBackground} EXCEPT/MINUS & INTERSECT & UNION \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} Return the rows that are found in one relation but not the other. & the rows that are found in both relations & e rows that are found in either relation. \tn % Row Count 6 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}---} \SetRowColor{LightBackground} \mymulticolumn{3}{x{5.377cm}}{Note that input relations must have the same number of columns and compatible data types for the respective columns.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}---} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Detecting out of range values}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{SELECT {\emph{ FROM series \newline WHERE rating NOT BETWEEN 0 AND 10 \newline \newline SELECT }} FROM series \newline WHERE rating \textless{} 0 OR rating \textgreater{} 10} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Replicate: Repeats string x no of times}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{REPLICATE (string, integer) \newline \newline REPLICATE('0', 9 - LEN(registration\_code))} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Checking if a value is a website}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{select \newline case \newline when email LIKE '\%.com' then email \newline else null \newline end AS email \newline from pension\_funds} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{p{0.54924 cm} x{2.24273 cm} x{1.78503 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{5.377cm}}{\bf\textcolor{white}{Handy Numeric Functions}} \tn % Row 0 \SetRowColor{LightBackground} IS \seqsplit{NUMERIC} & Returns true if the expression has a numeric value, false if not & \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} \seqsplit{CONVERT} & Converts a character string into a number & SELECT CONVERT(int, 25.65); \tn % Row Count 7 (+ 3) % Row 2 \SetRowColor{LightBackground} \seqsplit{TRUNCATE} & n truncated to d decimal places. If you skip d, then n is truncated to 0 decimal places. If d is a negative number, the function truncates the number n to d digits left to the decimal point. & SELECT \seqsplit{TRUNCATE(123.4567},2); 100 \tn % Row Count 17 (+ 10) \hhline{>{\arrayrulecolor{DarkBackground}}---} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{1.64241 cm} x{3.33459 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Column values must translate to type to stand}} \tn % Row 0 \SetRowColor{LightBackground} ALTER TABLE & tableName \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} ALTER COLUMN & columnName \tn % Row Count 2 (+ 1) % Row 2 \SetRowColor{LightBackground} TYPE smallint & USING columnName::smallint \tn % Row Count 3 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{String Operations}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{-{}- SUBSTR \newline \newline syntax is: \newline SUBSTR({\emph{string}}, {\emph{starting character position}}, {\emph{\# of characters}}): \newline SELECT incidnt\_num, \newline date, \newline SUBSTR(date, 4, 2) AS day \newline FROM \seqsplit{tutorial.sf\_crime\_incidents\_2014\_01} \newline \newline \# Extracting date and time specifically from a column \newline \newline SELECT incidnt\_num, \newline date, \newline LEFT(date, 10) AS cleaned\_date, \newline RIGHT(date, 17) AS cleaned\_time \newline FROM \seqsplit{tutorial.sf\_crime\_incidents\_2014\_01} \newline \newline \# TRIM, to remove characters \newline from the beginning and end of a string. \newline SELECT location, \newline TRIM(both '()' FROM location) \newline FROM \seqsplit{tutorial.sf\_crime\_incidents\_2014\_01} \newline \newline \# POSITION allows you to specify a substring, \newline then returns a numerical value equal to the \newline character number (counting from left) where that \newline substring first appears in the target string. \newline \newline SELECT incidnt\_num, \newline descript, \newline POSITION('A' IN descript) AS a\_position \newline FROM \seqsplit{tutorial.sf\_crime\_incidents\_2014\_01} \newline \newline \# COALESCE can replace the null values \newline \newline SELECT incidnt\_num, \newline descript, \newline COALESCE(descript, 'No Description') \newline FROM \seqsplit{tutorial.sf\_crime\_incidents\_cleandate} \newline ORDER BY descript DESC} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{Left for time \newline Right for date \newline \newline Trim to remove chars at beginning and end \newline \newline Position to return the location of a character! \newline \newline COALESCE to replace the nulls!} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}