\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{datamansam} \pdfinfo{ /Title (spark-sql-joins.pdf) /Creator (Cheatography) /Author (datamansam) /Subject (Spark SQL Joins Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A3A3A3} \definecolor{LightBackground}{HTML}{F3F3F3} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Spark SQL Joins Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{datamansam} via \textcolor{DarkBackground}{\uline{cheatography.com/139410/cs/34410/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}datamansam \\ \uline{cheatography.com/datamansam} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 29th September, 2022.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{2} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Spark Joins that have a SQL Equivalent}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Basic Syntax: \newline df1.join(df2, \seqsplit{df1("common\_col\_from1")} === \seqsplit{df2("common\_col\_from2")}, \seqsplit{"method").show()/.show(false)} \newline \newline Self: \newline Though there is no self-join type available, we can use any of the above-explained join types to join DataFrame to itself. \newline \newline println("self join") \newline \seqsplit{empDF.as("emp1").join(empDF.as("emp2")}, \newline col("emp1.sup\_emp\_id") === col("emp2.emp\_id"),"inner") \newline \seqsplit{.select(col("emp1.emp\_id")},col("emp1.name"), \newline \seqsplit{col("emp2.emp\_id").as("sup\_emp\_id")}, \newline \seqsplit{col("emp2.name").as("sup\_emp\_name"))} \newline .show(false) \newline Inner: \newline empDF.join(deptDF,empDF("emp\_dept\_id") === deptDF("dept\_id"),"inner") \newline .show(false) \newline \newline Full outer: \newline empDF.join(deptDF,empDF("emp\_dept\_id") === deptDF("dept\_id"),"full") \newline .show(false) \newline \newline Left Outer: \newline empDF.join(deptDF,empDF("emp\_dept\_id") === deptDF("dept\_id"),"left") \newline .show(false)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Spark Joins that are not in SQL!}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Left semi join \newline Returns all rows from the left DF on records match in the right dataset on join expression, records not matched on join expression are ignored from both left and right datasets. \newline \newline empDF.join(deptDF,empDF("emp\_dept\_id") === deptDF("dept\_id"),"leftsemi") \newline .show(false) \newline \newline Left anti join \newline leftanti join returns only columns from the left DataFrame/Dataset for non-matched records. \newline empDF.join(deptDF,empDF("emp\_dept\_id") === deptDF("dept\_id"),"leftanti") \newline .show(false) \newline \newline Cross Join \newline use the CROSS JOIN syntax to allow cartesian products between these \newline relations, or: enable implicit cartesian products by setting the configuration \newline variable \seqsplit{spark.sql.crossJoin.enabled=true;} \newline \newline println("cross join") \newline empDF.join(deptDF,empDF("emp\_dept\_id") === deptDF("dept\_id"),"cross") \newline .show(false) \newline \newline println("Using crossJoin()") \newline \seqsplit{empDF.crossJoin(deptDF).show(false)}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}