\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Multi-column layout (multicols environment)
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}    % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}              % Languages

% Document Info
% \author records the author name for LaTeX's title machinery (no \maketitle
% appears in this file, so it is not typeset here). \pdfinfo is a pdfTeX
% primitive: the key/value pairs below are written verbatim into the PDF's
% document-information dictionary, so they use PDF syntax, not LaTeX syntax.
\author{datamansam}
\pdfinfo{
  /Title (cleaning-with-pyspark.pdf)
  /Creator (Cheatography)
  /Author (datamansam)
  /Subject (Cleaning with PySpark Cheat Sheet)
}

% Lengths and widths
% Widen the text block by 6cm and shift the page origin 3cm to the left, so
% the extra width is taken equally from both side margins. Vertically the
% text block is shortened by 1cm and the origin is moved up by 2cm.
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
% Break point inserted by \seqsplit between characters: a plain break
% opportunity in math mode, a discretionary hyphen (\-) in text mode.
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands

% \SetRowColor{<colour>}
%   Stores <colour> globally in \RowColorName (inside \noalign, so it is legal
%   between table rows) and then paints the upcoming row with that colour.
\newcommand{\SetRowColor}[1]{%
  \noalign{\gdef\RowColorName{#1}}%
  \rowcolor{\RowColorName}%
}

% \mymulticolumn{<span>}{<column spec>}{<content>}
%   A \multicolumn whose cell is filled with the colour most recently set by
%   \SetRowColor, so spanning cells keep the current row colour.
\newcommand{\mymulticolumn}[3]{%
  \multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}%
}

% Column type x{<width>}: a p{<width>} paragraph column set ragged-right.
\newcolumntype{x}[1]{%
  >{\raggedright}p{#1}%
}

% \tn ends a table row. \tabularnewline is used instead of \\ because the
% custom ragged-right column type above is in use.
\newcommand{\tn}{\tabularnewline}

% Font and Colours
% Named colours (hex RGB via the HTML model). Usage within this file:
%   HeadBackground  - defined but not referenced in this file
%   FootBackground  - title rows of the three footer boxes
%   TextColor       - default colour of all body text (see \color below)
%   DarkBackground  - section title rows, table bottom rules, header text
%   LightBackground - shaded content rows
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{A3A3A3}
\definecolor{LightBackground}{HTML}{F3F3F3}
% Make the sans-serif family the document default.
\renewcommand{\familydefault}{\sfdefault}
% Issued in the preamble so the whole document is set in TextColor.
\color{TextColor}

% Header and Footer
% Select fancyhdr's "fancy" page style and clear every header/footer field
% before the left-hand fields are filled in below.
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Left header field: a three-column multicols layout. The first tabulary
% (5.8cm wide) holds the Cheatography logo; after \columnbreak the second
% tabulary (11cm wide) holds the sheet title and the "by <author> via <url>"
% byline, both coloured DarkBackground.
% NOTE(review): the logo is loaded from an absolute filesystem path, so this
% presumably only compiles where that path exists -- confirm before building
% elsewhere.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Cleaning with PySpark Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{datamansam} via \textcolor{DarkBackground}{\uline{cheatography.com/139410/cs/32141/}}}
\end{tabulary}
\end{multicols}}

% Left footer field, set in \footnotesize: a three-column multicols layout
% with one 5.8cm tabulary per column. Each box starts with a white-on-
% FootBackground title row:
%   1. "Cheatographer" - author name and underlined profile URL
%   2. "Cheat Sheet"   - publication/update dates and "Page n of N", where N
%                        comes from the LastPage label (lastpage package)
%   3. "Sponsor"       - sponsor text (the sponsor image is commented out)
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}datamansam \\
  \uline{cheatography.com/datamansam} \\
  \end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
   \vspace{-2pt}Published 3rd September, 2022.\\
   Updated 12th September, 2022.\\
   Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}


\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Defining Schema}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{from pyspark.sql.types import * \newline Schema=StructType({[} \newline   StructField('Store',StringType(),nullable=True), \newline   StructField('StoreType',StringType(),nullable=True), \newline   StructField('Assortment',StringType(),nullable=True), \newline   \seqsplit{StructField('CompetitionDistance'},FloatType(),nullable=True), \newline   \seqsplit{StructField('CompetitionOpenSinceMonth'},IntegerType(),nullable=True), \newline   \seqsplit{StructField('CompetitionOpenSinceYear'},IntegerType(),nullable=True), \newline   StructField('Promo2',IntegerType(),nullable=True), \newline   \seqsplit{StructField('Promo2SinceWeek'},IntegerType(),nullable=True), \newline   \seqsplit{StructField('Promo2SinceYear'},IntegerType(),nullable=True), \newline   \seqsplit{StructField('PromoInterval'},StringType(),nullable=True) \newline {]}) \newline df = \seqsplit{spark.read.option("header"},True).schema(Schema).csv('store.csv') \newline  \newline \# We can drop invalid rows while reading the dataset by setting the read mode as "DROPMALFORMED" \newline df\_1=spark.read.option("header",True).option("mode",'DROPMALFORMED').csv('store.csv') \newline  \newline df.show()} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Spark does not always detect the schema correctly by itself, so we also need to define the schema for the data set.}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{PySpark DataTypes}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Type} \tn 
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{byte} \tn 
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{short} \tn 
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{int} \tn 
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{long} \tn 
% Row Count 5 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{float} \tn 
% Row Count 6 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{double} \tn 
% Row Count 7 (+ 1)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{DecimalType} \tn 
% Row Count 8 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Filtering Data}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{voter\_df.filter(voter\_df{[}'name'{]}.isNotNull()) \newline   OR   \newline voter\_df.where(\textasciitilde{} voter\_df.\_c1.isNull()) \newline  \newline voter\_df.filter(voter\_df.date.year \textgreater{} 1800)             \newline voter\_df.where(voter\_df{[}'\_c0'{]}.contains('VOTE')) \newline  \newline \#Multiple Conditions  \newline whereDF = \seqsplit{flattenDF.where((col("firstName")} == "xiangrui") | (col("firstName") == \seqsplit{"michael")).sort(asc("lastName"))} \newline whereDF.show(truncate=False) \newline  \newline  \newline \#Unique Values \newline voter\_df = df.select(df{[}"VOTER NAME"{]}).distinct() \newline  \newline \# Show the rows with 10 highest IDs in the set \newline voter\_df.orderBy(voter\_df.ROW\_ID.desc()).show(10)} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{User Defined Functions}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{1. Define a Python method \newline def reverseString(mystr): \newline return mystr{[}::-1{]} \newline  \newline 2. Wrap the function and store as a variable \newline udfReverseString = udf(reverseString, StringType()) \newline  \newline 3. Use with Spark \newline user\_df = \seqsplit{user\_df.withColumn('ReverseName'}, \newline udfReverseString(user\_df.Name))} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Using SQL to clean script}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{df.createOrReplaceTempView("table1")}} \tn 
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{df2 = spark.sql("SELECT field1, field2 FROM table1")} \tn 
% Row Count 3 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{1.59264 cm} x{3.38436 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{String Data Types}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{StringType} \tn 
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\seqsplit{VarcharType(length)} & A variant of StringType which has a length limitation. Data writing will fail if the input string exceeds the length limitation \tn 
% Row Count 6 (+ 5)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{CharType(length)} & Reading column of type CharType(n) always returns string values of length n. Char type column comparison will pad the short one to the longer length. \tn 
% Row Count 12 (+ 6)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Adding, renaming and removing columns}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{add - withColumn \newline voter\_df.withColumn('year', voter\_df.date.year) \newline  \newline renaming - withColumnRenamed \newline test\_df\_sex = \seqsplit{test\_df.withColumnRenamed('Gender'}, 'Sex') \newline  \newline drop \newline  \newline voter\_df.drop('unused\_column') \newline  \newline  \newline from pyspark.sql import functions as F \newline  \newline add\_n = udf(lambda x, y: x + y, IntegerType()) \newline  \newline \# We register a UDF that adds a column to the DataFrame,  \newline and we cast the id column to an Integer type. \newline df = \seqsplit{df.withColumn('id\_offset'}, add\_n(F.lit(1000),  \newline df.id.cast(IntegerType())))} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Validating with Joins}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{parsed\_df = \seqsplit{spark.read.parquet('parsed\_data.parquet')} \newline company\_df = \seqsplit{spark.read.parquet('companies.parquet')} \newline verified\_df = \seqsplit{parsed\_df.join(company\_df}, parsed\_df.company == company\_df.company) \newline  \newline \# This automatically removes any rows with a company not in the company\_df!} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{View data/actions:}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{printSchema(), head(), show(), count(), columns and describe()} \tn 
% Row Count 2 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{show() - Displays/Prints a number of rows in a tabular format. By default it displays 20 rows and to change the default number, you can pass a value to show(n). \newline  \newline whereas take(n) returns the first n rows as an Array of row objects. It is an alias for first(). \newline  \newline count() -  total rows}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Complex Data Types}}  \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{ArrayType(elementType}, containsNull) & Represents values comprising a sequence of elements \tn 
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
MapType(keyType, valueType, valueContainsNull) & Represents values comprising a set of key-value pairs. The data type of keys is described by keyType and the data type of values is described by valueType. For a MapType value, keys are not allowed to have null values. valueContainsNull is used to indicate if values of a MapType value can have null values. \tn 
% Row Count 19 (+ 16)
% Row 2
\SetRowColor{LightBackground}
StructType(fields) & Represents values with the structure described by a sequence of StructFields (fields) \tn 
% Row Count 24 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{If, elif, else equivalent}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{.when(\textless{}if condition\textgreater{}, \textless{}then x\textgreater{}) \newline  \newline df.select(df.Name, df.Age, \newline .when(df.Age \textgreater{}= 18, "Adult") \newline .when(df.Age \textless{} 18, "Minor")) \newline  \newline  \newline .otherwise() is like else \newline  \newline df.select(df.Name, df.Age, \newline .when(df.Age \textgreater{}= 18, "Adult") \newline .otherwise("Minor"))} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Remove duplicate rows \& replace values}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{dropDuplicates() \newline test\_df\_no\_dup = \newline  \seqsplit{test\_df.select('User\_ID'},'Gender', 'Age').dropDuplicates() \newline  \newline  \newline  \newline fillna()  \newline used to replace null value with any other value \newline df.fillna(value=-99,subset= \newline {[}"Promo2SinceWeek","Promo2SinceYear"{]}).show() \newline  \newline  \newline .withColumn()  ,when() \newline creating a new column, with value equal to 1 if  \newline Promo2SinceYear \textgreater{} 2000 otherwise 0 \newline  \newline df.withColumn("greater\_than\_2000", \newline when(df.CompetitionDistance==2000,1).otherwise(0) \newline .alias('value\_desc')).show()} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}


% That's all folks
\end{multicols*}

\end{document}