\documentclass[10pt,a4paper]{article}

% ---------------------------------------------------------------------------
% Preamble: packages, PDF metadata and page geometry for the cheat sheet.
% Every statement sits on its own line: a `%' comment runs to the end of the
% physical line, so statements must never share a line with a preceding
% comment.
% ---------------------------------------------------------------------------

% Packages
\usepackage{fancyhdr}        % For header and footer
\usepackage{multicol}        % multicols environments (header, footer and body)
\usepackage{tabularx}        % Intelligent column widths
\usepackage{tabulary}        % Used in header and footer
\usepackage{hhline}          % Border under tables
\usepackage{graphicx}        % For images
\usepackage{xcolor}          % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc}     % Without this we get weird character replacements
\usepackage{colortbl}        % For coloured tables
\usepackage{setspace}        % For line height
\usepackage{lastpage}        % Needed for total page number
\usepackage{seqsplit}        % Splits long words.
%\usepackage{opensans}       % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}  % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}         % Symbols
\usepackage{MnSymbol}        % Symbols
\usepackage{wasysym}         % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{Datacademy.ai (Datacademy.ai)}
\pdfinfo{
  /Title (pyspark.pdf)
  /Creator (Cheatography)
  /Author (Datacademy.ai (Datacademy.ai))
  /Subject (PySpark Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}       % Space between columns
\setlength{\headsep}{-12pt}         % Reduce space between header and content
\setlength{\headheight}{85pt}       % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt}  % Remove footer line
\renewcommand{\headrulewidth}{0pt}  % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% ---------------------------------------------------------------------------
% Helper macros used by every box of the cheat sheet.
% ---------------------------------------------------------------------------
% \SetRowColor{<colour>}: remember the colour globally in \RowColorName and
% colour the current row with it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}}
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose background is the
% colour most recently stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}}
% x{<width>}: ragged-right paragraph column.
\newcolumntype{x}[1]{>{\raggedright}p{#1}}
% \tn ends a table row; required because the x column type uses \raggedright.
\newcommand{\tn}{\tabularnewline}
% \PyPrompt: the Python interactive prompt ">>>". The empty groups keep the
% three characters as separate glyphs. Write "\PyPrompt{} code".
\newcommand*{\PyPrompt}{\textgreater{}\textgreater{}\textgreater{}}
% Location of the header logo. The header only includes it when the file
% exists, so the document still compiles on machines without this path.
\newcommand*{\CheatographyLogo}{/web/www.cheatography.com/public/images/cheatography_logo.pdf}

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{00059E}
\definecolor{LightBackground}{HTML}{F7F7FB}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo in the first column, title and author line in the second.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\IfFileExists{\CheatographyLogo}%
            {\includegraphics[width=5.8cm]{\CheatographyLogo}}%
            {}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{PySpark Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{Datacademy.ai (Datacademy.ai)} via \textcolor{DarkBackground}{\uline{cheatography.com/174553/cs/36684/}}
\end{tabulary}
\end{multicols}}

% Footer: three columns (author, publication status and page number, sponsor).
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
  \vspace{-2pt}Datacademy.ai (Datacademy.ai) \\
  \uline{cheatography.com/datacademy-ai} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 24th January, 2023.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

% Each box below is one tabularx: a dark title row, one or more content rows,
% and a closing rule in the title colour.

% Box: What is PySpark?
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{What is PySpark?}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{PySpark is an interface for Apache Spark in Python. It not only allows you to write Spark applications using Python APIs, but also provides the PySpark shell for interactively analyzing your data in a distributed environment.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Initializing SparkSession
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Initializing SparkSession}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} from pyspark.sql import SparkSession \newline
  \PyPrompt{} spark = SparkSession \textbackslash{} \newline
  .builder \textbackslash{} \newline
  .appName("Python Spark SQL basic example") \textbackslash{} \newline
  \seqsplit{.config("spark.some.config.option"},"some-value") \textbackslash{} \newline
  .getOrCreate()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{A SparkSession can be used to create DataFrames, register \newline
  DataFrames as tables, execute SQL over tables, cache \newline
  tables, and read Parquet files} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Creating DataFrames
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Creating DataFrames}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{From RDDs}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} from pyspark.sql.types import *} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Creating DataFrames from RDDs (infer schema)
\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{Creating DataFrames from RDDs}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\textbf{Infer Schema}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\PyPrompt{} sc = spark.sparkContext \newline
  \PyPrompt{} lines = \seqsplit{sc.textFile("people.txt")} \newline
  \PyPrompt{} parts = lines.map(lambda l: l.split(",")) \newline
  \PyPrompt{} people = parts.map(lambda p: Row(name=p{[}0{]},age=int(p{[}1{]}))) \newline
  \PyPrompt{} peopledf = \seqsplit{spark.createDataFrame(people)}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Creating DataFrames from RDDs (specify schema)
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Creating DataFrames from RDDs}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Specify Schema}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} people = parts.map(lambda p: Row(name=p{[}0{]}, age=int(p{[}1{]}.strip()))) \newline
  \PyPrompt{} schemaString = "name age" \newline
  \PyPrompt{} fields = {[}StructField(field\_name, StringType(), True) for field\_name in schemaString.split(){]} \newline
  \PyPrompt{} schema = StructType(fields) \newline
  \PyPrompt{} \seqsplit{spark.createDataFrame(people}, schema).show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: From Spark Data Sources (JSON)
\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{From Spark Data Sources}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\textbf{JSON}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\PyPrompt{} df = \seqsplit{spark.read.json("customer.json")} \newline
  \PyPrompt{} df.show() \newline
  \PyPrompt{} df2 = \seqsplit{spark.read.load("people.json"}, format="json")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Box: From Spark Data Sources (Parquet files)
\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{From Spark Data Sources}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\textbf{Parquet Files}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\PyPrompt{} df3 = \seqsplit{spark.read.load("users.parquet")}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Box: From Spark Data Sources (text files)
\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{From Spark Data Sources}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\textbf{Text Files}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\PyPrompt{} df4 = \seqsplit{spark.read.text("people.txt")}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Duplicate Values
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Duplicate Values}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df = df.dropDuplicates()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Inspect Data (two columns: call, description; rows alternate colours)
\begin{tabularx}{5.377cm}{x{2.04057 cm} x{2.93643 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{Inspect Data}} \tn
\SetRowColor{LightBackground}
\PyPrompt{} df.dtypes & Return df column names and data types \tn
\SetRowColor{white}
\PyPrompt{} df.show() & Display the content of df \tn
\SetRowColor{LightBackground}
\PyPrompt{} df.head() & Return first n rows \tn
\SetRowColor{white}
\PyPrompt{} df.first() & Return first row \tn
\SetRowColor{LightBackground}
\PyPrompt{} df.take(2) & Return the first n rows \tn
\SetRowColor{white}
\PyPrompt{} df.schema & Return the schema of df \tn
\SetRowColor{LightBackground}
\PyPrompt{} \seqsplit{df.describe().show()} & Compute summary statistics \tn
\SetRowColor{white}
\PyPrompt{} df.columns & Return the columns of df \tn
\SetRowColor{LightBackground}
\PyPrompt{} df.count() & Count the number of rows in df \tn
\SetRowColor{white}
\PyPrompt{} \seqsplit{df.distinct().count()} & Count the number of distinct rows in df \tn
\SetRowColor{LightBackground}
\PyPrompt{} \seqsplit{df.printSchema()} & Print the schema of df \tn
\SetRowColor{white}
\PyPrompt{} df.explain() & Print the (logical and physical) plans \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Queries
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Queries}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} from pyspark.sql import functions as F} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Select- Query
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Select- Query}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} \seqsplit{df.select("firstName").show()} \newline
  \PyPrompt{} df.select("firstName","lastName").show() \newline
  \PyPrompt{} df.select(df{[}"firstName"{]},df{[}"age"{]}+ 1).show() \newline
  \PyPrompt{} df.select(df{[}'age'{]} \textgreater{} 24).show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Between- Query
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Between- Query}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} \seqsplit{df.select(df.age.between(22}, 24)).show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Substring- Query
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Substring- Query}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{df.select(df.firstName.substr(1}, 3) \textbackslash{} \newline
  \seqsplit{.alias("name")).collect()}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Startswith, Endswith- Query
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Startswith, Endswith- Query}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.select("firstName", \newline
  \seqsplit{df.lastName.startswith("Sm")).show()} \newline
  \PyPrompt{} \seqsplit{df.select(df.lastName.endswith("th")).show()}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Like - Query
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Like - Query}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.select("firstName", \newline
  \seqsplit{df.lastName.like("Smith")).show()}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: When- Query
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{When- Query}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.select("firstName", \newline
  F.when(df.age \textgreater{} 30, 1) \textbackslash{} \newline
  .otherwise(0)) \textbackslash{} \newline
  .show() \newline
  \PyPrompt{} df{[}df.firstName.isin("Jane","Boris"){]}.collect()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: GroupBy
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{GroupBy}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} \seqsplit{df.groupBy("age").count().show()}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Filter
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Filter}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.filter(df{[}"age"{]}\textgreater{}24).show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Add Columns
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Add Columns}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df = df.withColumn('city',df.address.city) \textbackslash{} \newline
  .withColumn('postalCode',df.address.postalCode) \textbackslash{} \newline
  .withColumn('state',df.address.state) \textbackslash{} \newline
  \seqsplit{.withColumn('streetAddress'},df.address.streetAddress) \textbackslash{} \newline
  \seqsplit{.withColumn('telePhoneNumber'}, \newline
  \seqsplit{explode(df.phoneNumber.number))} \textbackslash{} \newline
  \seqsplit{.withColumn('telePhoneType'}, \newline
  \seqsplit{explode(df.phoneNumber.type))}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Update Columns
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Update Columns}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df = \seqsplit{df.withColumnRenamed('telePhoneNumber'}, \textbackslash{} \newline
  'phoneNumber')} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Remove Columns
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Remove Columns}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df = df.drop("address", "phoneNumber") \newline
  \PyPrompt{} df = \seqsplit{df.drop(df.address).drop(df.phoneNumber)}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Sort
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Sort}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} \seqsplit{peopledf.sort(peopledf.age.desc()).collect()} \newline
  \PyPrompt{} df.sort("age", \seqsplit{ascending=False).collect()} \newline
  \PyPrompt{} df.orderBy({[}"age","city"{]},ascending={[}0,1{]}).collect()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Missing & Replacing Values
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Missing \& Replacing Values}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.na.fill(50).show() \newline
  \PyPrompt{} df.na.drop().show() \newline
  \PyPrompt{} df.na.replace(10, 20).show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Repartitioning
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Repartitioning}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} \seqsplit{df.repartition(10).rdd.getNumPartitions()} \newline
  \PyPrompt{} \seqsplit{df.coalesce(1).rdd.getNumPartitions()}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Registering DataFrames as Views
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Registering DataFrames as Views}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} \seqsplit{peopledf.createGlobalTempView("people")} \newline
  \PyPrompt{} \seqsplit{df.createTempView("customer")} \newline
  \PyPrompt{} \seqsplit{df.createOrReplaceTempView("customer")}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Query Views
% The SQL wildcard is a literal "*"; it must not be turned into emphasis markup.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Query Views}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df5 = spark.sql("SELECT * FROM customer").show() \newline
  \PyPrompt{} peopledf2 = spark.sql("SELECT * FROM global\_temp.people") \textbackslash{} \newline
  .show()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Output- Data Structures
% Each entry is a code row followed by a description row that starts with a
% small rule. The description row has no \SetRowColor of its own and takes
% its background from \RowColorName via \mymulticolumn.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Output- Data Structures}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} rdd1 = df.rdd} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Convert df into an RDD} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.toJSON().first()} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Convert df into an RDD of strings} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.toPandas()} \tn
\mymulticolumn{1}{x{5.377cm}}{\hspace*{6 px}\rule{2px}{6px}\hspace*{6 px}Return the contents of df as a pandas DataFrame} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Output- Write & Save to Files
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Output- Write \& Save to Files}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} df.select("firstName", "city") \textbackslash{} \newline
  .write.save("nameAndCity.parquet") \newline
  \newline
  \PyPrompt{} df.select("firstName", "age") \textbackslash{} \newline
  .write.save("namesAndAges.json",format="json")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Box: Stopping SparkSession
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Stopping SparkSession}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\PyPrompt{} spark.stop()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}