\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{datamansam}
\pdfinfo{
  /Title (apache-spark.pdf)
  /Creator (Cheatography)
  /Author (datamansam)
  /Subject (Apache Spark Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}   % Space between columns
\setlength{\headsep}{-12pt}     % Reduce space between header and content
\setlength{\headheight}{85pt}   % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column type for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{A3A3A3}
\definecolor{LightBackground}{HTML}{F3F3F3}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
  \SetRowColor{DarkBackground}
  \vspace{-7pt}
  {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
    \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
  }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
  \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Apache Spark Cheat Sheet}}}} \\
  \normalsize{by \textcolor{DarkBackground}{datamansam} via \textcolor{DarkBackground}{\uline{cheatography.com/139410/cs/30084/}}}
\end{tabulary}
\end{multicols}}
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}datamansam \\
  \uline{cheatography.com/datamansam} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Published 17th January, 2022.\\
  Updated 28th February, 2022.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Introduction to Apache Spark}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{An open-source, distributed processing system used for big data workloads.} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Utilizes in-memory caching and optimized query execution for fast analytic queries against data of any size.} \tn
% Row Count 5 (+ 3)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Provides:} \tn
% Row Count 6 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Development APIs for batch processing, interactive queries, real-time analytics, machine learning, and graph processing.} \tn
% Row Count 9 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Apache Spark vs. Apache Hadoop} \tn
% Row Count 10 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Hadoop MapReduce is a programming model for processing big data sets with a parallel, distributed algorithm.} \tn
% Row Count 13 (+ 3)
% Row 6
\SetRowColor{LightBackground}
With each step, MapReduce reads data from the cluster, performs its operations, and writes the results back to HDFS. & Because each step requires a disk read and write, MapReduce jobs are slower due to the latency of disk I/O. \tn
% Row Count 25 (+ 12)
% Row 7
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Spark was created to address the limitations of MapReduce.} \tn
% Row Count 27 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Spark does its processing in memory, reducing the number of steps in a job and reusing data across multiple parallel operations. & With Spark, only one step is needed: data is read into memory, operations are performed, and the results are written back. \tn
% Row Count 34 (+ 7)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
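\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example: One-Step In-Memory Pipeline (Sketch)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{A minimal sketch of the one-step model described above, assuming an existing SparkSession named spark; the input path is hypothetical. Intermediate results stay in memory and nothing executes until the final action.} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\# Hypothetical path; word count kept in memory end to end \newline text = \seqsplit{spark.sparkContext.textFile("/tmp/input.txt")} \newline counts = text.flatMap(lambda line: line.split(" ")) \textbackslash{} \newline .map(lambda word: (word, 1)) \textbackslash{} \newline \seqsplit{.reduceByKey(lambda} a, b: a + b) \newline \# The action below triggers the whole pipeline \newline counts.collect()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}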
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{DDL}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Data Definition Language} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Resilient Distributed Dataset (RDD) is the fundamental data structure of Spark.} \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Immutable (and therefore fault-tolerant) distributed collections of objects of any type.} \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
Each dataset in an RDD is divided into logical partitions across the cluster & and thus can be operated on in parallel, on different nodes of the cluster. \tn
% Row Count 9 (+ 4)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{RDD features} \tn
% Row Count 10 (+ 1)
% Row 5
\SetRowColor{white}
Lazy Evaluation & Transformations are not computed when they are declared; results are only materialized when an action requires them \tn
% Row Count 13 (+ 3)
% Row 6
\SetRowColor{LightBackground}
In-Memory Computation & Data is kept in RAM (random access memory) instead of the slower disk drives \tn
% Row Count 17 (+ 4)
% Row 7
\SetRowColor{white}
Fault Tolerance & Tracks data lineage information so lost data can be rebuilt automatically on failure \tn
% Row Count 22 (+ 5)
% Row 8
\SetRowColor{LightBackground}
Immutability & Rules out a whole class of potential problems caused by concurrent updates from multiple threads. \tn
% Row Count 28 (+ 6)
% Row 9
\SetRowColor{white}
 & Immutable data is also safer to share across processes \tn
% Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{DDL (cont)}}  \tn
% Row 10
\SetRowColor{LightBackground}
Partitioning & Each node in a Spark cluster contains one or more partitions. \tn
% Row Count 4 (+ 4)
% Row 11
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Two ways to apply operations on RDDs} \tn
% Row Count 5 (+ 1)
% Row 12
\SetRowColor{LightBackground}
1. Transformations: operations applied to an RDD to create a new RDD; filter(), groupBy() and map() are examples. & Narrow transformations: all the elements required to compute the records in a single partition live in that same partition. \tn
% Row Count 13 (+ 8)
% Row 13
\SetRowColor{white}
 & Wide transformations: the elements required to compute the records in a single partition may live in many partitions of the parent RDD, e.g. groupByKey() and reduceByKey(). \tn
% Row Count 23 (+ 10)
% Row 14
\SetRowColor{LightBackground}
2. Actions: operations applied to an RDD that instruct Spark to perform the computation and send the result back to the driver. & count(), collect(), take(n), top(), countByValue(), reduce(), fold(), aggregate(), foreach(). \tn
% Row Count 31 (+ 8)
\end{tabularx}
\par\addvspace{1.3em}
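\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example: Transformations vs. Actions (Sketch)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{An illustrative sketch of the lazy-evaluation behaviour above, assuming an existing SparkSession named spark; the data is made up. The transformations only build a plan; the action runs it.} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{rdd = \seqsplit{spark.sparkContext}.parallelize({[}1, 2, 3, 4, 5{]}) \newline \# Narrow transformation: lazy, nothing runs yet \newline doubled = rdd.map(lambda x: x * 2) \newline \# Still lazy \newline evens = doubled.filter(lambda x: x \% 4 == 0) \newline \# Action: triggers the computation, returns 2 \newline evens.count()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}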
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{DDL (cont)}}  \tn
% Row 15
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Create DataFrames} \tn
% Row Count 1 (+ 1)
% Row 16
\SetRowColor{white}
via CSV & df = \seqsplit{spark.read.option("header",True)} \textbackslash{} \seqsplit{.csv("/tmp/resources/simple-zipcodes.csv")} \tn
% Row Count 6 (+ 5)
% Row 17
\SetRowColor{LightBackground}
 & If your input file has a header row with column names, you need to explicitly set the header option to True \tn
% Row Count 11 (+ 5)
% Row 18
\SetRowColor{white}
 & df = \seqsplit{spark.read.csv("path1},path2,path3") ; df = \seqsplit{spark.read.csv("Folder} path") \tn
% Row Count 15 (+ 4)
% Row 19
\SetRowColor{LightBackground}
 & read.csv() can also read multiple CSV files: pass all the file names, separated by commas, as a single path string, or pass a folder path \tn
% Row Count 21 (+ 6)
% Row 20
\SetRowColor{white}
 & The nullValue option specifies a string in the CSV to read as null. For example, to treat a date column value of "1900-01-01" as null on the DataFrame, set nullValue to that string. \tn
% Row Count 30 (+ 9)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{DDL (cont)}}  \tn
% Row 21
\SetRowColor{LightBackground}
Partition & partitionBy() splits a large dataset (DataFrame) into smaller files based on one or more columns while writing to disk \tn
% Row Count 7 (+ 7)
% Row 22
\SetRowColor{white}
 & \seqsplit{df.write.option("header",True)} \textbackslash{} \seqsplit{.partitionBy("state")} \textbackslash{} .mode("overwrite") \textbackslash{} \seqsplit{.csv("/tmp/zipcodes-state")} \tn
% Row Count 14 (+ 7)
% Row 23
\SetRowColor{LightBackground}
 & PySpark splits the records on the partition column and stores each partition's data in its own sub-directory. If we have a total of 6 different states, it creates 6 directories. \tn
% Row Count 24 (+ 10)
% Row 24
\SetRowColor{white}
 & \seqsplit{df.write.option("header",True)} \textbackslash{} \seqsplit{.partitionBy("state","city")} \textbackslash{} .mode("overwrite") \textbackslash{} \seqsplit{.csv("/tmp/zipcodes-state")} \tn
% Row Count 31 (+ 7)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{DDL (cont)}}  \tn
% Row 25
\SetRowColor{LightBackground}
 & It creates a folder hierarchy for each partition: with state as the first partition column followed by city, it creates a city folder inside each state folder (one folder for each city in a state). \tn
% Row Count 11 (+ 11)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
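\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example: Read and Partition a CSV (Sketch)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{A sketch assembling the rows above into one flow; the paths reuse the examples above and the nullValue choice is illustrative, assuming an existing SparkSession named spark.} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\# Read a headered CSV, treating a sentinel date as null \newline df = \seqsplit{spark.read.option("header"}, True) \textbackslash{} \newline \seqsplit{.option("nullValue"}, "1900-01-01") \textbackslash{} \newline \seqsplit{.csv("/tmp/resources/simple-zipcodes.csv")} \newline \newline \# Write it back, one sub-directory per state \newline \seqsplit{df.write.option("header"}, True) \textbackslash{} \newline \seqsplit{.partitionBy("state")} \textbackslash{} \newline .mode("overwrite") \textbackslash{} \newline \seqsplit{.csv("/tmp/zipcodes-state")}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}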
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Queries}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{from pyspark.sql import functions as F \newline \# Select columns \newline df.select("firstName").show() \newline df.select("firstName","lastName") \textbackslash{} \newline .show() \newline \newline \# Split multiple array column data into rows \newline df2 = df.select(df.name, \seqsplit{F.explode(df.subjectandID))} \newline \newline \# Show all entries where age \textgreater{} 24 \newline df.filter(df{[}'age'{]} \textgreater{} 24).show() \newline \newline \# Show name and 0 or 1 depending on age \textgreater{} or \textless{} than 30 \newline df.select("Name", \newline F.when(df.age \textgreater{} 30, 1) \newline .otherwise(0)) \textbackslash{} \newline .show() \newline \newline \# Show firstName if in the given options \newline df{[}df.firstName.isin("Jane","Boris"){]}.collect() \newline \newline \# Show firstName, and lastName if lastName is Smith \newline df.select("firstName", \newline \seqsplit{df.lastName.like("Smith")}) \textbackslash{} \newline .show() \newline \newline \# like() also accepts wildcard matches \newline df.select("firstName", \newline \seqsplit{df.lastName.like("\%Sm")}) \textbackslash{} \newline .show() \newline \newline \# Startswith - Endswith \newline \# Show firstName, and TRUE if lastName starts with Sm \newline df.select("firstName", \newline \seqsplit{df.lastName.startswith("Sm")}) \textbackslash{} \newline .show() \newline \newline \# Show last names ending in th \newline \seqsplit{df.select(df.lastName.endswith("th"))} \textbackslash{} \newline .show() \newline \newline \# Substring: return substrings of firstName \newline df.select(df.firstName.substr(1, 3) \newline .alias("name")) \textbackslash{} \newline .collect() \newline \newline \# Between: show values where age is between 22 and 24 \newline df.select(df.age.between(22, 24)) \textbackslash{} \newline .show() \newline \newline \# Show all entries in firstName and age + 1 \newline df.select(df{[}"firstName"{]}, df{[}"age"{]} + 1).show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{1.51041 cm} x{1.51041 cm} x{1.55618 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bf\textcolor{white}{DML}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{5.377cm}}{Dealing with nulls} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
df = \seqsplit{df.na.drop(how='any'}, thresh=2) & To drop rows with null values, use the na property with the drop() method. & how: 'any' or 'all'. If 'any', drop a row if it contains any nulls. If 'all', drop a row only if all its values are null. \tn
% Row Count 12 (+ 11)
% Row 2
\SetRowColor{LightBackground}
 &  & thresh: default None. If specified, drop rows that have fewer than thresh non-null values. This overrides the how parameter. \tn
% Row Count 22 (+ 10)
% Row 3
\SetRowColor{white}
 &  & subset: optional list of column names to consider. \tn
% Row Count 27 (+ 5)
% Row 4
\SetRowColor{LightBackground}
To fill nulls & \seqsplit{df.na.fill(50)} &  \tn
% Row Count 29 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{3}{x{5.377cm}}{} \tn
% Row Count 29 (+ 0)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{5.377cm}}{The union() method of the DataFrame is used to merge two DataFrames of the same structure/schema.} \tn
% Row Count 31 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
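\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example: Handling Nulls (Sketch)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{A short illustrative sketch of the na functions above; the column names and fill values are hypothetical.} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\# Keep rows having at least 2 non-null values \newline df\_clean = \seqsplit{df.na.drop(thresh=2)} \newline \newline \# Replace remaining nulls in numeric columns with 50 \newline df\_filled = \seqsplit{df\_clean.na.fill(50)} \newline \newline \# Or fill per column with a dict (hypothetical columns) \newline df\_filled = df\_clean.na.fill(\{"age": 0, "city": "unknown"\})} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}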
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{1.51041 cm} x{1.51041 cm} x{1.55618 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bf\textcolor{white}{DML (cont)}}  \tn
% Row 7
\SetRowColor{LightBackground}
 & unionDF = \seqsplit{df.union(df2)} & returns a new DataFrame with all rows from both DataFrames, regardless of duplicate data. \tn
% Row Count 7 (+ 7)
% Row 8
\SetRowColor{white}
 & To avoid duplicates in the merged result, & use the distinct() function to return just one record when duplicates exist. \tn
% Row Count 19 (+ 12)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Creating a Session}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{import pyspark \# importing the module \newline \newline \# importing the SparkSession module \newline from pyspark.sql import SparkSession \newline \newline \# creating a session \newline session = \seqsplit{SparkSession.builder} \textbackslash{} \newline \seqsplit{.appName('First} App') \textbackslash{} \newline .getOrCreate() \newline \newline \# calling the session variable \newline session} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Creating delta tables}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\# Define the input and output formats and paths, and the table name \newline read\_format = 'delta' \newline write\_format = 'delta' \newline load\_path = \seqsplit{'/databricks-datasets/learning-spark-v2/people/people-10m}.delta' \newline save\_path = '/tmp/delta/people-10m' \newline table\_name = 'default.people10m' \newline \newline \# Load the data from its source \newline people = spark \textbackslash{} \newline .read \textbackslash{} \newline .format(read\_format) \textbackslash{} \newline .load(load\_path) \newline \newline \# Write the data to its target \newline people.write \textbackslash{} \newline .format(write\_format) \textbackslash{} \newline .save(save\_path) \newline \newline \# Create the table \newline spark.sql("CREATE TABLE " + table\_name + " USING DELTA LOCATION '" + save\_path + "'")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
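\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example: Reading the Delta Table Back (Sketch)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{A hedged follow-up sketch: reading the table created above, either by its metastore name or directly from its path (table\_name and save\_path come from the previous block).} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\# By the metastore name registered above \newline people\_df = \seqsplit{spark.table(table\_name)} \newline \newline \# Or directly from the Delta path \newline people\_df = \seqsplit{spark.read.format('delta').load(save\_path)} \newline \newline people\_df.show(5)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}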
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Data preprocessing}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{To select one or multiple columns, use the select() function: \newline \newline \seqsplit{dataframe.select(column\_name)} \# selecting one column \newline dataframe.select(column\_1, column\_2, ..., column\_N) \newline \# selecting many columns \newline \newline dataframe.withColumn() \newline To add a column, the \seqsplit{dataframe.withColumn()} function takes two parameters: \newline \newline the new column name to add, and \newline an existing column to derive it from (not needed if the new column has nothing to do with an existing column) \newline \newline \# adding a column to the dataframe \newline data = \seqsplit{data.withColumn('Age\_after\_3\_y'}, data{[}'Age'{]}+3) \newline \newline To change a data type, you also need cast() along with withColumn(). \newline The statement below changes the datatype of the salary column from String to Integer. \newline \newline \seqsplit{df.withColumn("salary",col("salary").cast("Integer")).show()} \newline \newline Change a value \newline Pass an existing column name as the first argument \newline and the column expression to assign as the second argument: \newline \newline \seqsplit{df.withColumn("salary",col("salary")*100).show()} \newline \newline Drop a column \newline df.drop("salary") \textbackslash{} \newline .show() \newline \newline \seqsplit{withColumnRenamed()} \newline renames an existing column: \newline \newline \seqsplit{df.withColumnRenamed("gender","sex")} \textbackslash{} \newline .show(truncate=False)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Adding columns - df.withColumn('newCol', newVal) \newline Changing data types - \seqsplit{df.withColumn("newCol",col("OldCol").cast("NewDT")).show()} \newline Changing values - df.withColumn('oldCol', col("oldCol") operation) \newline Dropping - df.drop("col") \newline Renaming - \seqsplit{withColumnRenamed()}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.23965 cm} x{2.73735 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Sorting and Grouping}}  \tn
% Row 0
\SetRowColor{LightBackground}
df.sort("col", ascending=False) & The default sort order used by sort()/orderBy() is ascending (ASC) \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{df.groupBy("col").agg() / \seqsplit{df.groupBy("age").count()}} \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Spark SQL}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{spark.sql("SELECT * FROM tablename")} \tn
% Row Count 1 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
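\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Example: Column Operations End to End (Sketch)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{An illustrative sketch chaining the column, sorting, grouping and SQL operations above; the column names (salary, gender, age) follow the examples in this sheet and are hypothetical.} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{from pyspark.sql.functions import col \newline \newline df2 = \seqsplit{df.withColumn("salary"}, \seqsplit{col("salary").cast("Integer"))} \textbackslash{} \newline \seqsplit{.withColumn("bonus"}, col("salary") * 0.1) \textbackslash{} \newline \seqsplit{.withColumnRenamed("gender"}, "sex") \textbackslash{} \newline .drop("age") \newline \newline df2.sort("salary", ascending=False).show() \newline \seqsplit{df2.groupBy("sex").count().show()} \newline \newline \# Register and query with Spark SQL \newline \seqsplit{df2.createOrReplaceTempView("people")} \newline spark.sql("SELECT sex, salary FROM people").show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}
\end{document}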