\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Ryan (ryan2002)} \pdfinfo{ /Title (spark-scala-api-v2-3.pdf) /Creator (Cheatography) /Author (Ryan (ryan2002)) /Subject (Spark Scala API v2.3 Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands 
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{142B6B} \definecolor{LightBackground}{HTML}{F7F8FA} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Spark Scala API v2.3 Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Ryan (ryan2002)} via \textcolor{DarkBackground}{\uline{cheatography.com/108946/cs/21509/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}Ryan (ryan2002) \\ \uline{cheatography.com/ryan2002} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Published 31st January, 2020.\\ Updated 5th February, 2020.\\ Page {\thepage} of \pageref{LastPage}. 
\end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{tabularx}{17.67cm}{p{1.687 cm} x{7.2541 cm} x{7.9289 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Sources - read}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{format} & "csv", "text", "json", "parquet" (default), "orc", "jdbc" & \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} \seqsplit{option} & csv & {\bf{sep}} (default ,): sets a single character as a separator for each field and value. \{\{nl\}\}{\bf{quote}} (default "): sets a single character used for escaping quoted values where the separator can be part of the value. If you would like to turn off quotations, you need to set not null but an empty string. This behaviour is different from \seqsplit{com.databricks.spark.csv.} \{\{nl\}\}{\bf{escape}} (default \textbackslash{}): sets a single character used for escaping quotes inside an already quoted value. \seqsplit{charToEscapeQuoteEscaping} (default escape or \textbackslash{}0): sets a single character used for escaping the escape for the quote character. The default value is escape character when escape and quote characters are different, \textbackslash{}0 otherwise. comment (default empty string): sets a single character used for skipping lines beginning with this character. By default, it is disabled. \{\{nl\}\}{\bf{header}} (default false): uses the first line as names of columns. 
\{\{nl\}\}{\bf{inferSchema}} (default false): infers the input schema automatically from data. It requires one extra pass over the data. \{\{nl\}\}{\bf{mode}} (default PERMISSIVE): allows a mode for dealing with corrupt records during parsing. It supports the following case-insensitive modes. \{\{nl\}\}~~~~{\emph{PERMISSIVE}} : sets other fields to null when it meets a corrupted record, and puts the malformed string into a field configured by \seqsplit{columnNameOfCorruptRecord}. To keep corrupt records, a user can set a string type field named \seqsplit{columnNameOfCorruptRecord} in a user-defined schema. If a schema does not have the field, it drops corrupt records during parsing. When a length of parsed CSV tokens is shorter than an expected length of a schema, it sets null for extra fields. \{\{nl\}\}~~~~{\emph{DROPMALFORMED}} : ignores the whole corrupted records. \{\{nl\}\}~~~~{\emph{FAILFAST}} : throws an exception when it meets corrupted records. \{\{nl\}\}{\bf{nullValue}} (default empty string): sets the string representation of a null value. Since 2.0.1, this applies to all supported types including the string type. \{\{nl\}\}{\bf{nanValue}} (default NaN): sets the string representation of a non-number value. \{\{nl\}\}{\bf{dateFormat}} (default yyyy-MM-dd): sets the string that indicates a date format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to date type. \{\{nl\}\}{\bf{timestampFormat}} (default \seqsplit{yyyy-MM-dd'T'HH:mm:ss}.SSSXXX): sets the string that indicates a timestamp format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to timestamp type. \{\{nl\}\}{\bf{maxColumns}} (default 20480): defines a hard limit of how many columns a record can have. maxCharsPerColumn (default -1): defines the maximum number of characters allowed for any given value being read. 
By default, it is -1 meaning unlimited length \{\{nl\}\}{\bf{multiLine}} (default false): parse one record, which may span multiple lines. \{\{nl\}\}{\bf{encoding}} (default UTF-8): decodes the CSV files by the given encoding type. \{\{nl\}\}{\bf{ignoreLeadingWhiteSpace}} (default false): a flag indicating whether or not leading whitespaces from values being read should be skipped. \{\{nl\}\}{\bf{ignoreTrailingWhiteSpace}} (default false): a flag indicating whether or not trailing whitespaces from values being read should be skipped. \{\{nl\}\}{\bf{positiveInf}} (default Inf): sets the string representation of a positive infinity value. \{\{nl\}\}{\bf{negativeInf}} (default -Inf): sets the string representation of a negative infinity value. \{\{nl\}\}{\bf{columnNameOfCorruptRecord}} (default is the value specified in \seqsplit{spark.sql.columnNameOfCorruptRecord):} allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides \seqsplit{spark.sql.columnNameOfCorruptRecord.} \tn % Row Count 210 (+ 206) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{p{1.687 cm} x{7.2541 cm} x{7.9289 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Sources - read (cont)}} \tn % Row 2 \SetRowColor{LightBackground} & text & {\bf{wholetext}}( default false) \tn % Row Count 2 (+ 2) % Row 3 \SetRowColor{white} & json & {\bf{mode}} (default PERMISSIVE): allows a mode for dealing with corrupt records during parsing. \{\{nl\}\}~~~~{\emph{PERMISSIVE}} : sets other fields to null when it meets a corrupted record, and puts the malformed string into a field configured by \seqsplit{columnNameOfCorruptRecord}. To keep corrupt records, an user can set a string type field named \seqsplit{columnNameOfCorruptRecord} in an user-defined schema. If a schema does not have the field, it drops corrupt records during parsing. 
When inferring a schema, it implicitly adds a \seqsplit{columnNameOfCorruptRecord} field in an output schema. \{\{nl\}\}~~~~{\emph{DROPMALFORMED}} : ignores the whole corrupted records. \{\{nl\}\}~~~~{\emph{FAILFAST}} : throws an exception when it meets corrupted records. \{\{nl\}\}{\bf{columnNameOfCorruptRecord}} (default is the value specified in \seqsplit{spark.sql.columnNameOfCorruptRecord):} allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides \seqsplit{spark.sql.columnNameOfCorruptRecord.} \{\{nl\}\}{\bf{dateFormat}} (default yyyy-MM-dd): sets the string that indicates a date format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to date type. \{\{nl\}\}{\bf{timestampFormat}} (default \seqsplit{yyyy-MM-dd'T'HH:mm:ss}.SSSXXX): sets the string that indicates a timestamp format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to timestamp type. \{\{nl\}\}{\bf{multiLine}} (default false): parse one record, which may span multiple lines, per file \{\{nl\}\}{\bf{primitivesAsString}} (default false): infers all primitive values as a string type \{\{nl\}\}{\bf{prefersDecimal}} (default false): infers all floating-point values as a decimal type. If the values do not fit in decimal, then it infers them as doubles. \{\{nl\}\}{\bf{allowComments}} (default false): ignores Java/C++ style comment in JSON records \{\{nl\}\}{\bf{allowUnquotedFieldNames}} (default false): allows unquoted JSON field names \{\{nl\}\}{\bf{allowSingleQuotes}} (default true): allows single quotes in addition to double quotes \{\{nl\}\}{\bf{allowNumericLeadingZeros}} (default false): allows leading zeros in numbers (e.g. 
00012) \{\{nl\}\}{\bf{allowBackslashEscapingAnyCharacter}} (default false): allows accepting quoting of all character using backslash quoting mechanism \{\{nl\}\}{\bf{allowUnquotedControlChars}} (default false): allows JSON Strings to contain unquoted control characters (ASCII characters with value less than 32, including tab and line feed characters) or not. \tn % Row Count 139 (+ 137) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{p{1.687 cm} x{7.2541 cm} x{7.9289 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Sources - read (cont)}} \tn % Row 4 \SetRowColor{LightBackground} & parquet & {\bf{mergeSchema}} (default is the value specified in \seqsplit{spark.sql.parquet.mergeSchema):} sets whether we should merge schemas collected from all Parquet part-files. This will override \seqsplit{spark.sql.parquet.mergeSchema.} \tn % Row Count 12 (+ 12) % Row 5 \SetRowColor{white} & orc & \tn % Row Count 13 (+ 1) % Row 6 \SetRowColor{LightBackground} & jdbc & {\bf{url}}: The JDBC URL for Spark to connect to. At the minimum, it should contain the host, port, and database name. For MySQL, it may look something like this: \seqsplit{jdbc:mysql://localhost:3306/sakila}. \{\{nl\}\}{\bf{dbtable}}: The name of a database table for Spark to read data from or write data to. \{\{nl\}\}{\bf{user}} \{\{nl\}\}{\bf{password}} \{\{nl\}\}{\bf{driver}}: The class name of the JDBC driver that Spark will instantiate to connect to the previous URL. Consult the JDBC driver documentation that you are using. 
For the MySQL Connector/J driver, the class name is \seqsplit{com.mysql.jdbc.Driver.} \tn % Row Count 45 (+ 32) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{p{1.687 cm} x{7.2541 cm} x{7.9289 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Sources - read (cont)}} \tn % Row 7 \SetRowColor{LightBackground} \seqsplit{schema} & \{\{fa-bolt\}\}can use """...""" to define the schema; need to use the Scala data type.\{\{nl\}\}e.g. \seqsplit{`schema("""stockticker} STRING, tradedate INT, openprice FLOAT""")`\{\{nl\}\}\{\{nl\}\}`//~Mode 1` \{\{nl\}\}`val movieSchema = \seqsplit{StructType(Array(StructField("stockticker"}, StringType, true), \{\{nl\}\}~~StructField("tradedate", IntegerType, true), \{\{nl\}\}~~StructField("openprice", FloatType, true)))` \{\{nl\}\}\{\{nl\}\}`//~Mode 2: equivalent to mode 1` \{\{nl\}\}`val movieSchema = """stockticker STRING, tradedate INT, openprice FLOAT"""` & \tn % Row Count 32 (+ 32) \hhline{>{\arrayrulecolor{DarkBackground}}---} \SetRowColor{LightBackground} \mymulticolumn{3}{x{17.67cm}}{\seqsplit{`DataFrameReader.format(...).option("key"}, \seqsplit{"value").schema(...).load(paths:} String*)` \newline can give multiple paths, can give directory path to read all files in the directory, can use wildcard "*" in the path \newline To get a DataFrameReader, use `spark.read`} \tn \hhline{>{\arrayrulecolor{DarkBackground}}---} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{7.7715 cm} x{9.4985 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Two ways to define Schema}} \tn % Row 0 \SetRowColor{LightBackground} Define a schema programmatically: & `val schema = \seqsplit{StructType(Array(StructField("author"}, StringType, false),\{\{nl\}\}~ StructField("title", StringType, false),\{\{nl\}\}~~StructField("pages", IntegerType, false)))` \tn % Row Count 9 (+ 9) % Row 1 \SetRowColor{white} Define a schema with a DDL String & `val schema = 
"author STRING, title STRING, pages INT"` \tn % Row Count 12 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.2297 cm} x{5.9045 cm} x{5.7358 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Source - write}} \tn % Row 0 \SetRowColor{LightBackground} format & "csv", "text", "json", "parquet" (default), "orc", "jdbc" & \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} mode & "overwrite", "append", "ignore", \seqsplit{"error/errorIfExists"} (default) & \tn % Row Count 10 (+ 5) % Row 2 \SetRowColor{LightBackground} option & csv & {\bf{sep}} (default ,): sets a single character as a separator for each field and value. \{\{nl\}\}{\bf{quote}} (default "): sets a single character used for escaping quoted values where the separator can be part of the value. If an empty string is set, it uses u0000 (null character). \{\{nl\}\}{\bf{escape}} (default \textbackslash{}): sets a single character used for escaping quotes inside an already quoted value. \seqsplit{charToEscapeQuoteEscaping} (default escape or \textbackslash{}0): sets a single character used for escaping the escape for the quote character. The default value is escape character when escape and quote characters are different, \textbackslash{}0 otherwise. \{\{nl\}\}{\bf{escapeQuotes}} (default true): a flag indicating whether values containing quotes should always be enclosed in quotes. Default is to escape all values containing a quote character. \{\{nl\}\}{\bf{quoteAll}} (default false): a flag indicating whether all values should always be enclosed in quotes. Default is to only escape values containing a quote character. \{\{nl\}\}{\bf{header}} (default false): writes the names of columns as the first line. \{\{nl\}\}{\bf{nullValue}} (default empty string): sets the string \seqsplit{representation} of a null value. \{\{nl\}\}{\bf{compression}} (default null): compression codec to use when saving to file. 
This can be one of the known \seqsplit{case-insensitive} shorten names (none, bzip2, gzip, lz4, snappy and deflate). \{\{nl\}\}{\bf{dateFormat}} (default yyyy-MM-dd): sets the string that indicates a date format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to date type. \{\{nl\}\}{\bf{timestampFormat}} (default \seqsplit{yyyy-MM-dd'T'HH:mm:ss}.SSSXXX): sets the string that indicates a timestamp format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to timestamp type. \{\{nl\}\}{\bf{ignoreLeadingWhiteSpace}} (default true): a flag indicating whether or not leading whitespaces from values being written should be skipped. \{\{nl\}\}{\bf{ignoreTrailingWhiteSpace}} (default true): a flag indicating whether or not trailing whitespaces from values being written should be skipped. \tn % Row Count 168 (+ 158) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.2297 cm} x{5.9045 cm} x{5.7358 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Source - write (cont)}} \tn % Row 3 \SetRowColor{LightBackground} & text & {\bf{compression}} (default null): compression codec to use when saving to file. This can be one of the known \seqsplit{case-insensitive} shorten names (none, bzip2, gzip, lz4, snappy and deflate). \tn % Row Count 15 (+ 15) % Row 4 \SetRowColor{white} & json & {\bf{compression}} (default null): compression codec to use when saving to file. This can be one of the known \seqsplit{case-insensitive} shorten names (none, bzip2, gzip, lz4, snappy and deflate). \{\{nl\}\}{\bf{dateFormat}} (default yyyy-MM-dd): sets the string that indicates a date format. Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to date type. \{\{nl\}\}{\bf{timestampFormat}} (default \seqsplit{yyyy-MM-dd'T'HH:mm:ss}.SSSXXX): sets the string that indicates a timestamp format. 
Custom date formats follow the formats at \seqsplit{java.text.SimpleDateFormat.} This applies to timestamp type. \tn % Row Count 61 (+ 46) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.2297 cm} x{5.9045 cm} x{5.7358 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Source - write (cont)}} \tn % Row 5 \SetRowColor{LightBackground} & parquet & {\bf{compression}} (default is the value specified in \seqsplit{spark.sql.parquet.compression.codec):} compression codec to use when saving to file. This can be one of the known \seqsplit{case-insensitive} shorten names(none, snappy, gzip, and lzo). This will override \seqsplit{spark.sql.parquet.compression.codec.} \tn % Row Count 22 (+ 22) % Row 6 \SetRowColor{white} & orc & {\bf{compression}} (default is the value specified in \seqsplit{spark.sql.orc.compression.codec):} compression codec to use when saving to file. This can be one of the known \seqsplit{case-insensitive} shorten names(none, snappy, zlib, and lzo). This will override orc.compress and \seqsplit{spark.sql.orc.compression.codec.} If orc.compress is given, it overrides \seqsplit{spark.sql.orc.compression.codec.} \tn % Row Count 50 (+ 28) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.2297 cm} x{5.9045 cm} x{5.7358 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Source - write (cont)}} \tn % Row 7 \SetRowColor{LightBackground} & jdbc & {\bf{truncate}} (default false): use TRUNCATE TABLE instead of DROP TABLE.\{\{nl\}\}In case of failures, users should turn off truncate option to use DROP TABLE again. Also, due to the different behavior of TRUNCATE TABLE among DBMS, it's not always safe to use this. MySQLDialect, DB2Dialect, \seqsplit{MsSqlServerDialect}, DerbyDialect, and \seqsplit{OracleDialect} supports this while \seqsplit{PostgresDialect} and default JDBCDirect doesn't. 
For unknown and unsupported JDBCDirect, the user option truncate is ignored. \tn % Row Count 38 (+ 38) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.2297 cm} x{5.9045 cm} x{5.7358 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Source - write (cont)}} \tn % Row 8 \SetRowColor{LightBackground} \seqsplit{saveAsTable(tableName:} String): Unit & Saves the content of the DataFrame as the specified table. \{\{nl\}\}\{\{nl\}\}In the case the table already exists, behavior of this function depends on the save mode, specified by the mode function (default to throwing an exception). When mode is Overwrite, the schema of the DataFrame does not need to be the same as that of the existing table. \{\{nl\}\}\{\{nl\}\}When mode is Append, if there is an existing table, we will use the format and options of the existing table. The column order in the schema of the DataFrame doesn't need to be same as that of the existing table. Unlike insertInto, saveAsTable will use the column names to find the correct column positions. For example: \{\{nl\}\}\{\{nl\}\}`scala\textgreater{} Seq((1, 2)).toDF("i", \seqsplit{"j").write.mode("overwrite").saveAsTable("t1")} scala\textgreater{} Seq((3, 4)).toDF("j", \seqsplit{"i").write.mode("append").saveAsTable("t1")} \{\{nl\}\}scala\textgreater{} sql("select * from t1").show` \{\{nl\}\}+-{}-{}-+-{}-{}-+ \{\{nl\}\}|~~~~i|~~~~j| \{\{nl\}\}+-{}-{}-+-{}-{}-+ \{\{nl\}\}|~~~~1|~~~~2| \{\{nl\}\}|~~~~4|~~~~3| \{\{nl\}\}+-{}-{}-+-{}-{}-+ \{\{nl\}\}In this method, save mode is used to determine the behavior if the data source table exists in Spark catalog. We will always overwrite the underlying data of data source (e.g. a table in JDBC data source) if the table doesn't exist in Spark catalog, and will always append to the underlying data of data source if the table already exists. 
\{\{nl\}\}\{\{nl\}\}When the DataFrame is created from a \seqsplit{non-partitioned} \seqsplit{HadoopFsRelation} with a single input path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC and Parquet), the table is persisted in a Hive compatible format, which means other systems like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL specific format. & \tn % Row Count 133 (+ 133) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.2297 cm} x{5.9045 cm} x{5.7358 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Source - write (cont)}} \tn % Row 9 \SetRowColor{LightBackground} \seqsplit{insertInto(tableName:} String): Unit & Inserts the content of the DataFrame to the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. \{\{nl\}\}\{\{nl\}\} Unlike saveAsTable, insertInto ignores the column names and just uses \seqsplit{position-based} resolution. 
For example: \{\{nl\}\}\{\{nl\}\} `scala\textgreater{} Seq((1, 2)).toDF("i", \seqsplit{"j").write.mode("overwrite").saveAsTable("t1")} \{\{nl\}\}scala\textgreater{} Seq((3, 4)).toDF("j", \seqsplit{"i").write.insertInto("t1")} \{\{nl\}\}scala\textgreater{} Seq((5, 6)).toDF("a", \seqsplit{"b").write.insertInto("t1")} \{\{nl\}\}scala\textgreater{} sql("select * from t1").show` \{\{nl\}\}+-{}-{}-+-{}-{}-+ \{\{nl\}\}|~~~~~~i|~~~~~~j| \{\{nl\}\}+-{}-{}-+-{}-{}-+ \{\{nl\}\}|~~~~~~5|~~~~~~6| \{\{nl\}\}|~~~~~~3|~~~~~~4| \{\{nl\}\}|~~~~~~1|~~~~~~2| \{\{nl\}\}+-{}-{}-+-{}-{}-+ \{\{nl\}\}\{\{nl\}\} {\bf{Because it inserts data to an existing table, format or options will be ignored.}} & \tn % Row Count 73 (+ 73) \hhline{>{\arrayrulecolor{DarkBackground}}---} \SetRowColor{LightBackground} \mymulticolumn{3}{x{17.67cm}}{\seqsplit{`DataFrameWriter.format(...).mode(...).option(...).partitionBy(colNames:} String{\emph{).bucketBy(numBuckets: Int, colName: String, colNames: String}}).sortBy(colName: String, colNames: String{\emph{).save(path: String)` \newline `DataFrameWriter.format(...).mode(...).option(...).partitionBy(colNames: String}}).bucketBy(numBuckets: Int, colName: String, colNames: String{\emph{).sortBy(colName: String, colNames: String}}).saveAsTable/insertInto(tableName: String)` \newline To get DataFrameWriter, use `dataFrame.write`} \tn \hhline{>{\arrayrulecolor{DarkBackground}}---} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{3.5427 cm} x{6.748 cm} x{6.5793 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{17.67cm}}{\bf\textcolor{white}{Data Type}} \tn % Row 0 \SetRowColor{LightBackground} {\bf{Spark}} & {\bf{Scala}} & {\bf{Java}} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \seqsplit{ByteType} & Byte & byte or Byte \tn % Row Count 3 (+ 1) % Row 2 \SetRowColor{LightBackground} \seqsplit{ShortType} & Short & short or Short \tn % Row Count 5 (+ 2) % Row 3 \SetRowColor{white} \seqsplit{IntegerType} & Int & int or Integer \tn % Row Count 7 
(+ 2) % Row 4 \SetRowColor{LightBackground} \seqsplit{LongType} & Long & long or Long \tn % Row Count 8 (+ 1) % Row 5 \SetRowColor{white} \seqsplit{FloatType} & Float & float or Float \tn % Row Count 10 (+ 2) % Row 6 \SetRowColor{LightBackground} \seqsplit{DoubleType} & Double & double or Double \tn % Row Count 12 (+ 2) % Row 7 \SetRowColor{white} \seqsplit{DecimalType} & \seqsplit{java.math.BigDecimal} & java.math.BigDecimal \tn % Row Count 14 (+ 2) % Row 8 \SetRowColor{LightBackground} \seqsplit{StringType} & String & String \tn % Row Count 16 (+ 2) % Row 9 \SetRowColor{white} \seqsplit{BinaryType} & Array{[}Byte{]} & byte{[}{]} \tn % Row Count 18 (+ 2) % Row 10 \SetRowColor{LightBackground} \seqsplit{BooleanType} & Boolean & boolean or Boolean \tn % Row Count 20 (+ 2) % Row 11 \SetRowColor{white} \seqsplit{DateType} & java.sql.Date & java.sql.Date \tn % Row Count 21 (+ 1) % Row 12 \SetRowColor{LightBackground} \seqsplit{TimestampType} & \seqsplit{java.sql.Timestamp} & \seqsplit{java.sql.Timestamp} \tn % Row Count 23 (+ 2) % Row 13 \SetRowColor{white} \seqsplit{ArrayType} & \seqsplit{scala.collection.Seq} & java.util.List \tn % Row Count 25 (+ 2) % Row 14 \SetRowColor{LightBackground} MapType & \seqsplit{scala.collection.Map} & java.util.Map \tn % Row Count 27 (+ 2) % Row 15 \SetRowColor{white} \seqsplit{StructType} & \seqsplit{org.apache.spark.sql.Row} & \seqsplit{org.apache.spark.sql.Row} \tn % Row Count 29 (+ 2) % Row 16 \SetRowColor{LightBackground} \mymulticolumn{3}{x{17.67cm}}{StructField} \tn % Row Count 30 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}---} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.5626 cm} x{10.7074 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Expressions}} \tn % Row 0 \SetRowColor{LightBackground} Computational expressions & `(((col("someCol") + 5) * 200) - 6) \textless{} col("otherCol")` \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} Relational expressions & 
`expr("(((someCol + 5) * 200) - 6) \textless{} otherCol")` \tn % Row Count 5 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{An expression is a set of transformations on one or more values in a record in a DataFrame. Think of it like a function that takes as input one or more column names, resolves them, and then potentially applies more expressions to create a single value for each record in the dataset. Importantly, this "single value" can actually be a complex type like a Map or Array.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.5626 cm} x{10.7074 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Converting to Spark Types - functions}} \tn % Row 0 \SetRowColor{LightBackground} lit(literal: Any): Column & Creates a Column of literal value.\{\{nl\}\}The passed in object is returned directly if it is already a Column. If the object is a Scala Symbol, it is converted into a Column also. Otherwise, a new Column is created to represent the literal value. \tn % Row Count 11 (+ 11) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.7353 cm} x{10.5347 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Change the Column Data Type - Column}} \tn % Row 0 \SetRowColor{LightBackground} cast(to: String): Column & Casts the column to a different data type, using the canonical string representation of the type. The supported types are: {\bf{string, boolean, byte, short, int, long, float, double, decimal, date, timestamp}}. 
\{\{nl\}\}`//~Casts colA to integer.` \{\{nl\}\}`df.select(col("colA").cast("int"))` \tn % Row Count 13 (+ 13) % Row 1 \SetRowColor{white} cast(to: DataType): Column & Casts the column to a different data type. \{\{nl\}\}`//~Casts colA to IntegerType. \{\{nl\}\}import \seqsplit{org.apache.spark.sql.types.IntegerType} \{\{nl\}\}df.select(col("colA").cast(IntegerType)) \{\{nl\}\} \{\{nl\}\}//~equivalent to \{\{nl\}\}df.select(df("colA").cast("int"))` \tn % Row Count 24 (+ 11) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{e.g. `df.withColumn("id", \seqsplit{col("id").cast("string"))`}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.Dataset} - others}} \tn % Row 0 \SetRowColor{LightBackground} first(): T & Returns the first row. Alias for head(). \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} head(): T & Returns the first row. \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} head(n: Int): Array{[}T{]} & Returns the first n rows. \tn % Row Count 6 (+ 2) % Row 3 \SetRowColor{white} take(n: Int): Array{[}T{]} & Returns the first n rows in the Dataset. \tn % Row Count 8 (+ 2) % Row 4 \SetRowColor{LightBackground} takeAsList(n: Int): List{[}T{]} & Returns the first n rows in the Dataset as a list. \tn % Row Count 11 (+ 3) % Row 5 \SetRowColor{white} collect(): Array{[}T{]} & Returns an array that contains all rows in this Dataset. \tn % Row Count 14 (+ 3) % Row 6 \SetRowColor{LightBackground} collectAsList(): List{[}T{]} & Returns a Java list that contains all rows in this Dataset. \tn % Row Count 17 (+ 3) % Row 7 \SetRowColor{white} count(): Long & Returns the number of rows in the Dataset. 
\tn % Row Count 20 (+ 3) % Row 8 \SetRowColor{LightBackground} show(): Unit & Displays the top 20 rows of Dataset in a tabular form. Strings more than 20 characters will be truncated, and all cells will be aligned right. \tn % Row Count 28 (+ 8) % Row 9 \SetRowColor{white} show(numRows: Int): Unit & Displays the Dataset in a tabular form. Strings more than 20 characters will be truncated, and all cells will be aligned right. \tn % Row Count 35 (+ 7) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.Dataset} - others (cont)}} \tn % Row 10 \SetRowColor{LightBackground} show(truncate: Boolean): Unit & Displays the top 20 rows of Dataset in a tabular form. \tn % Row Count 3 (+ 3) % Row 11 \SetRowColor{white} show(numRows: Int, truncate: Boolean): Unit & Displays the Dataset in a tabular form. \tn % Row Count 6 (+ 3) % Row 12 \SetRowColor{LightBackground} printSchema(): Unit & Prints the schema to the console in a nice tree format. \tn % Row Count 9 (+ 3) % Row 13 \SetRowColor{white} explain(): Unit & Prints the physical plan to the console for debugging purposes. \tn % Row Count 13 (+ 4) % Row 14 \SetRowColor{LightBackground} explain(extended: Boolean): Unit & Prints the plans (logical and physical) to the console for debugging purposes. \tn % Row Count 17 (+ 4) % Row 15 \SetRowColor{white} schema: StructType & Returns the schema of this Dataset. \tn % Row Count 19 (+ 2) % Row 16 \SetRowColor{LightBackground} columns: Array{[}String{]} & Returns all column names as an array. 
\tn % Row Count 21 (+ 2) % Row 17 \SetRowColor{white} describe(cols: String*): DataFrame & Computes basic statistics for numeric and string columns, including count, mean, stddev, min, and max.\{\{nl\}\}`ds.describe("age",~"height").show()`\{\{nl\}\}\{\{nl\}\}`//~output:`\{\{nl\}\}`//~summary~age~~~height`\{\{nl\}\}`//~count~~~10.0~~10.0`\{\{nl\}\}`//~mean~~~~53.3~~178.05`\{\{nl\}\}`//~stddev~~11.6~~15.7`\{\{nl\}\}`//~min~~~~~18.0~~163.0`\{\{nl\}\}`//~max~~~~~92.0~~192.0` \tn % Row Count 49 (+ 28) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.Dataset} - others (cont)}} \tn % Row 18 \SetRowColor{LightBackground} summary(statistics: String*): DataFrame & Computes specified statistics for numeric and string columns. Available statistics are:\{\{nl\}\}- count - mean - stddev - min - max - arbitrary approximate percentiles specified as a percentage (eg, 75\%)\{\{nl\}\}`ds.summary().show()` \{\{nl\}\} \{\{nl\}\}`//~output:` \{\{nl\}\}`//~summary~age~~~height` \{\{nl\}\}`//~count~~~10.0~~10.0` \{\{nl\}\}`//~mean~~~~53.3~~178.05` \{\{nl\}\}`//~stddev~~11.6~~15.7` \{\{nl\}\}`//~min~~~~~18.0~~163.0` \{\{nl\}\}`//~25\%~~~~~24.0~~176.0` \{\{nl\}\}`//~50\%~~~~~24.0~~176.0` \{\{nl\}\}`//~75\%~~~~~32.0~~180.0` \{\{nl\}\}`//~max~~~~~92.0~~192.0`\{\{nl\}\} \{\{nl\}\}`ds.summary("count",~"min",~"25\%",~"75\%",~"max").show()` \{\{nl\}\} \{\{nl\}\}`//~output:` \{\{nl\}\}`//~summary~age~~~height` \{\{nl\}\}`//~count~~~10.0~~10.0` \{\{nl\}\}`//~mean~~~~53.3~~178.05` \{\{nl\}\}`//~min~~~~~18.0~~163.0` \{\{nl\}\}`//~25\%~~~~~24.0~~176.0` \{\{nl\}\}`//~75\%~~~~~32.0~~180.0` \{\{nl\}\}`//~max~~~~~92.0~~192.0`\{\{nl\}\} \{\{nl\}\}To do a summary for specific columns first select them: \{\{nl\}\} \{\{nl\}\}`ds.select("age", \seqsplit{"height").summary().show()`} \tn % Row Count 79 (+ 79) \end{tabularx} \par\addvspace{1.3em} 
\begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.Dataset} - others (cont)}} \tn % Row 19 \SetRowColor{LightBackground} cache(): Dataset.this.type & Persist this Dataset with the default storage level \seqsplit{(MEMORY\_AND\_DISK).} \tn % Row Count 4 (+ 4) % Row 20 \SetRowColor{white} persist(): Dataset.this.type & Persist this Dataset with the default storage level \seqsplit{(MEMORY\_AND\_DISK).} \tn % Row Count 8 (+ 4) % Row 21 \SetRowColor{LightBackground} persist(newLevel: StorageLevel): Dataset.this.type & Persist this Dataset with the given storage level.\{\{nl\}\}newLevel\{\{nl\}\}One of: MEMORY\_ONLY, MEMORY\_AND\_DISK, MEMORY\_ONLY\_SER, \seqsplit{MEMORY\_AND\_DISK\_SER}, DISK\_ONLY, MEMORY\_ONLY\_2, \seqsplit{MEMORY\_AND\_DISK\_2}, etc. \tn % Row Count 18 (+ 10) % Row 22 \SetRowColor{white} unpersist(): Dataset.this.type & Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk. \tn % Row Count 23 (+ 5) % Row 23 \SetRowColor{LightBackground} unpersist(blocking: Boolean): Dataset.this.type & Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk.\{\{nl\}\}blocking: Whether to block until all blocks are deleted. \tn % Row Count 31 (+ 8) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.Dataset} - others (cont)}} \tn % Row 24 \SetRowColor{LightBackground} storageLevel: StorageLevel & Get the Dataset's current storage level, or StorageLevel.NONE if not persisted. \tn % Row Count 4 (+ 4) % Row 25 \SetRowColor{white} rdd: RDD{[}T{]} & Represents the content of the Dataset as an RDD of T. \tn % Row Count 7 (+ 3) % Row 26 \SetRowColor{LightBackground} toDF(): DataFrame & Converts this strongly typed collection of data to generic Dataframe. 
\tn % Row Count 11 (+ 4) % Row 27 \SetRowColor{white} toDF(colNames: String*): DataFrame & Converts this strongly typed collection of data to generic DataFrame with columns renamed.`\{\{nl\}\}val~rdd:~RDD{[}(Int,~String){]}~=~... \{\{nl\}\}rdd.toDF()~~//~this~implicit~conversion~creates~a~DataFrame~with~column~name~\textbackslash{}`\_1\textbackslash{}`~and~\textbackslash{}`\_2\textbackslash{}`\{\{nl\}\}rdd.toDF("id",~"name")~~//~this~creates~a~DataFrame~with~column~name~"id"~and~"name"` \tn % Row Count 36 (+ 25) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.Dataset} - others (cont)}} \tn % Row 28 \SetRowColor{LightBackground} \seqsplit{coalesce(numPartitions:} Int): Dataset{[}T{]} & Returns a new Dataset that has exactly numPartitions partitions, when fewer partitions are requested. \tn % Row Count 6 (+ 6) % Row 29 \SetRowColor{white} \seqsplit{repartition(numPartitions:} Int): Dataset{[}T{]} & Returns a new Dataset that has exactly numPartitions partitions. \tn % Row Count 10 (+ 4) % Row 30 \SetRowColor{LightBackground} \seqsplit{repartition(numPartitions:} Int, partitionExprs: Column*): Dataset{[}T{]} & Returns a new Dataset partitioned by the given partitioning expressions into numPartitions. The resulting Dataset is hash partitioned. \tn % Row Count 17 (+ 7) % Row 31 \SetRowColor{white} \seqsplit{repartition(partitionExprs:} Column*): Dataset{[}T{]} & Returns a new Dataset partitioned by the given partitioning expressions, using \seqsplit{spark.sql.shuffle.partitions} as number of partitions. 
\tn % Row Count 24 (+ 7) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations}} \tn % Row 0 \SetRowColor{LightBackground} select(col: String, cols: String*): DataFrame & Selects a set of columns.\{\{nl\}\}`ds.select("colA", "colB")` \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} select(cols: Column*): DataFrame & Selects a set of column based expressions.\{\{nl\}\}`ds.select(\$"colA", \$"colB")` \tn % Row Count 7 (+ 4) % Row 2 \SetRowColor{LightBackground} selectExpr(exprs: String*): DataFrame & Selects a set of SQL expressions.\{\{nl\}\}`// The following are equivalent:`\{\{nl\}\}`ds.selectExpr("colA", "colB as newName", "abs(colC)")`\{\{nl\}\}`ds.select(expr("colA"), expr("colB as newName"), expr("abs(colC)"))`\{\{nl\}\}\{\{nl\}\} `df.selectExpr("*","(produced\_year - (produced\_year \% 10)) as decade")` \tn % Row Count 22 (+ 15) % Row 3 \SetRowColor{white} \seqsplit{where(conditionExpr:} String): Dataset{[}T{]} & Filters rows using the given SQL expression.\{\{nl\}\}To filter a DataFrame, you can also just specify a Boolean column: `df.where("isExpensive")`\{\{nl\}\}`peopleDs.where("age \textgreater{} 15")` \tn % Row Count 31 (+ 9) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 4 \SetRowColor{LightBackground} where(condition: Column): Dataset{[}T{]} & Filters rows using the given condition.\{\{nl\}\}`peopleDs.where(\$"age" \textgreater{} 15)` \tn % Row Count 4 (+ 4) % Row 5 \SetRowColor{white} \seqsplit{filter(conditionExpr:} String): Dataset{[}T{]} & Filters rows using the given SQL expression.\{\{nl\}\}`peopleDs.filter("age \textgreater{} 15")` \tn % Row Count 8 (+ 4) % Row 6 
\SetRowColor{LightBackground} filter(condition: Column): Dataset{[}T{]} & Filters rows using the given condition.\{\{nl\}\}// The following are equivalent:\{\{nl\}\}`peopleDs.filter(\$"age" \textgreater{} 15)`\{\{nl\}\}`peopleDs.where(\$"age" \textgreater{} 15)` \tn % Row Count 16 (+ 8) % Row 7 \SetRowColor{white} filter(func: (T) ⇒ Boolean): Dataset{[}T{]} & Returns a new Dataset that only contains elements where func returns true. \tn % Row Count 20 (+ 4) % Row 8 \SetRowColor{LightBackground} orderBy(sortExprs: Column*): Dataset{[}T{]} & Returns a new Dataset sorted by the given expressions. This is an alias of the sort function.\{\{nl\}\}`movieTitles.orderBy('title\_length.desc, 'produced\_year)` \tn % Row Count 28 (+ 8) % Row 9 \SetRowColor{white} orderBy(sortCol: String, sortCols: String*): Dataset{[}T{]} & Returns a new Dataset sorted by the given expressions. This is an alias of the sort function. \tn % Row Count 33 (+ 5) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 10 \SetRowColor{LightBackground} sort(sortExprs: Column*): Dataset{[}T{]} & Returns a new Dataset sorted by the given expressions. \{\{nl\}\}e.g. `ds.sort(\$"col1", \$"col2".desc)` \tn % Row Count 5 (+ 5) % Row 11 \SetRowColor{white} sort(sortCol: String, sortCols: String*): Dataset{[}T{]} & Returns a new Dataset sorted by the specified column, all in ascending order.\{\{nl\}\}`// The following 3 are equivalent`\{\{nl\}\}`ds.sort("sortcol")`\{\{nl\}\}`ds.sort(\$"sortcol")`\{\{nl\}\}`ds.sort(\$"sortcol".asc)` \tn % Row Count 16 (+ 11) % Row 12 \SetRowColor{LightBackground} distinct(): Dataset{[}T{]} & Returns a new Dataset that contains only the unique rows from this Dataset. This is an alias for dropDuplicates. 
\tn % Row Count 22 (+ 6) % Row 13 \SetRowColor{white} dropDuplicates(): Dataset{[}T{]} & Returns a new Dataset that contains only the unique rows from this Dataset. This is an alias for distinct. \tn % Row Count 28 (+ 6) % Row 14 \SetRowColor{LightBackground} \seqsplit{dropDuplicates(col1:} String, cols: String*): Dataset{[}T{]} & Returns a new Dataset with duplicate rows removed, considering only the subset of columns.\{\{nl\}\}`//~The following are equivalent \{\{nl\}\}movies.select("movie\_title").distinct.selectExpr("count(movie\_title) as movies").show \{\{nl\}\}movies.dropDuplicates("movie\_title").selectExpr("count(movie\_title) as movies").show` \tn % Row Count 44 (+ 16) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 15 \SetRowColor{LightBackground} \seqsplit{dropDuplicates(colNames:} Seq{[}String{]}): Dataset{[}T{]} & Returns a new Dataset with duplicate rows removed, considering only the subset of columns. \tn % Row Count 5 (+ 5) % Row 16 \SetRowColor{white} \seqsplit{dropDuplicates(colNames:} Array{[}String{]}): Dataset{[}T{]} & Returns a new Dataset with duplicate rows removed, considering only the subset of columns. \tn % Row Count 10 (+ 5) % Row 17 \SetRowColor{LightBackground} limit(n: Int): Dataset{[}T{]} & Returns a new Dataset by taking the first n rows. The difference between this function and head is that head is an action and returns an array (by triggering query execution) while limit returns a new Dataset. \tn % Row Count 21 (+ 11) % Row 18 \SetRowColor{white} withColumn(colName: String, col: Column): DataFrame & Returns a new Dataset by adding a column or replacing the existing column that has the same name. 
\{\{nl\}\}However, if the given column name matches one of the existing ones, then that column is replaced with the given column expression.\{\{nl\}\}`//~adding a new column based on a certain column expression \{\{nl\}\}movies.withColumn("decade", ('produced\_year - 'produced\_year \% 10))` \tn % Row Count 40 (+ 19) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 19 \SetRowColor{LightBackground} \seqsplit{withColumnRenamed(existingName:} String, newName: String): DataFrame & Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain existingName.\{\{nl\}\}Notice that if the provided existingColName doesn't exist in the schema, Spark doesn't throw an error, and it will silently do nothing.\{\{nl\}\}`movies.withColumnRenamed("actor\_name", "actor")` \tn % Row Count 15 (+ 15) % Row 20 \SetRowColor{white} drop(colName: String): DataFrame & Returns a new Dataset with columns dropped. This is a no-op if schema doesn't contain column name(s).\{\{nl\}\}`movies.drop("actor\_name")` \tn % Row Count 22 (+ 7) % Row 21 \SetRowColor{LightBackground} drop(colNames: String*): DataFrame & Returns a new Dataset with columns dropped. This is a no-op if schema doesn't contain column name(s). \{\{nl\}\}You can specify one or more column names to drop, but only the ones that exist in the schema will be dropped and the ones that don't will be silently ignored.\{\{nl\}\}`movies.drop("actor\_name", "me")` \tn % Row Count 38 (+ 16) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 22 \SetRowColor{LightBackground} drop(col: Column): DataFrame & Returns a new Dataset with a column dropped. 
This version of drop accepts a Column rather than a name. This is a no-op if the Dataset doesn't have a column with an equivalent expression.\{\{nl\}\}`movies.drop(\$"actor\_name")` \tn % Row Count 11 (+ 11) % Row 23 \SetRowColor{white} union(other: Dataset{[}T{]}): Dataset{[}T{]} & Returns a new Dataset containing union of rows in this Dataset and another Dataset. \{\{nl\}\}This is equivalent to UNION ALL in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by a distinct.\{\{nl\}\}Notice that the column positions in the schema aren't necessarily matched with the fields in the strongly typed objects in a Dataset. This function resolves columns by their positions in the schema, not the fields in the strongly typed objects. Use unionByName to resolve columns by field name in the typed objects.\{\{nl\}\}`val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2")`\{\{nl\}\}`val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0")`\{\{nl\}\}`df1.union(df2).show`\{\{nl\}\}\{\{nl\}\}`// output:`\{\{nl\}\}`// +-{}-{}-{}-+-{}-{}-{}-+-{}-{}-{}-+`\{\{nl\}\}`// |col0|col1|col2|`\{\{nl\}\}`// +-{}-{}-{}-+-{}-{}-{}-+-{}-{}-{}-+`\{\{nl\}\}`// |~~~1|~~~2|~~~3|`\{\{nl\}\}`// |~~~4|~~~5|~~~6|`\{\{nl\}\}`// +-{}-{}-{}-+-{}-{}-{}-+-{}-{}-{}-+` \tn % Row Count 61 (+ 50) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 24 \SetRowColor{LightBackground} unionByName(other: Dataset{[}T{]}): Dataset{[}T{]} & Returns a new Dataset containing union of rows in this Dataset and another Dataset. \{\{nl\}\}This is different from both UNION ALL and UNION DISTINCT in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by a distinct. 
\{\{nl\}\}The difference between this function and union is that this function resolves columns by name (not by position).\{\{nl\}\}`val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2")`\{\{nl\}\}`val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0")`\{\{nl\}\}`df1.unionByName(df2).show`\{\{nl\}\}\{\{nl\}\}`// output:`\{\{nl\}\}`// +-{}-{}-{}-+-{}-{}-{}-+-{}-{}-{}-+`\{\{nl\}\}`// |col0|col1|col2|`\{\{nl\}\}`// +-{}-{}-{}-+-{}-{}-{}-+-{}-{}-{}-+`\{\{nl\}\}`// |~~~1|~~~2|~~~3|`\{\{nl\}\}`// |~~~6|~~~4|~~~5|`\{\{nl\}\}`// +-{}-{}-{}-+-{}-{}-{}-+-{}-{}-{}-+` \tn % Row Count 41 (+ 41) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{org.apache.spark.sql - Transformations (cont)}} \tn % Row 25 \SetRowColor{LightBackground} intersect(other: Dataset{[}T{]}): Dataset{[}T{]} & Returns a new Dataset containing rows only in both this Dataset and another Dataset. This is equivalent to INTERSECT in SQL. \tn % Row Count 7 (+ 7) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.1169 cm} x{9.1531 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Booleans - Column}} \tn % Row 0 \SetRowColor{LightBackground} ===(other: Any): Column & Equality test.\{\{nl\}\}`df.where(col("InvoiceNo") === 536365)` \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} equalTo(other: Any): Column & Equality test. \{\{nl\}\}`df.where(col("InvoiceNo").equalTo(536365))` \tn % Row Count 7 (+ 4) % Row 2 \SetRowColor{LightBackground} \textless{}=\textgreater{}(other: Any): Column & Equality test that is safe for null values. \tn % Row Count 10 (+ 3) % Row 3 \SetRowColor{white} =!=(other: Any): Column & Inequality test. \{\{nl\}\}`df.where(col("InvoiceNo") =!= 536365)` \tn % Row Count 13 (+ 3) % Row 4 \SetRowColor{LightBackground} \textless{}(other: Any): Column & Less than. 
\tn % Row Count 15 (+ 2) % Row 5 \SetRowColor{white} \textless{}=(other: Any): Column & Less than or equal to. \tn % Row Count 17 (+ 2) % Row 6 \SetRowColor{LightBackground} \textgreater{}(other: Any): Column & Greater than. \tn % Row Count 19 (+ 2) % Row 7 \SetRowColor{white} \textgreater{}=(other: Any): Column & Greater than or equal to an expression. \tn % Row Count 21 (+ 2) % Row 8 \SetRowColor{LightBackground} \&\&(other: Any): Column & Boolean AND. \tn % Row Count 23 (+ 2) % Row 9 \SetRowColor{white} ||(other: Any): Column & Boolean OR. \tn % Row Count 25 (+ 2) % Row 10 \SetRowColor{LightBackground} isNaN: Column & True if the current expression is NaN. \tn % Row Count 27 (+ 2) % Row 11 \SetRowColor{white} isNotNull: Column & True if the current expression is NOT null. \tn % Row Count 30 (+ 3) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.1169 cm} x{9.1531 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Booleans - Column (cont)}} \tn % Row 12 \SetRowColor{LightBackground} isNull: Column & True if the current expression is null. \tn % Row Count 2 (+ 2) % Row 13 \SetRowColor{white} isin(list: Any*): Column & A boolean expression that is evaluated to true if the value of this expression is contained by the evaluated values of the arguments.\{\{nl\}\}{\bf{According to documentation, isin takes a vararg, not a list. List is actually a confusing name here.}}\{\{nl\}\}`val items = List("a", "b", "c")` \{\{nl\}\}`df.filter(\$"c1".isin(items:\_*))`\{\{nl\}\}or\{\{nl\}\}`df.filter(\$"c1".isin("a", "b", "c"))` \tn % Row Count 20 (+ 18) % Row 14 \SetRowColor{LightBackground} like(literal: String): Column & SQL like expression. Returns a boolean column based on a SQL LIKE match.\{\{nl\}\}\{\{nl\}\}{\bf{SQL Wildcards}}\{\{nl\}\}{\bf{\%}}: Represents zero or more characters, e.g. `bl\%` finds bl, black, blue, and blob\{\{nl\}\}{\bf{\_}}: Represents a single character, e.g. 
`h\_t` finds hot, hat, and hit\{\{nl\}\}{\bf{{[}{]}}}: Represents any single character within the brackets, e.g. `h{[}oa{]}t` finds hot and hat, but not hit\{\{nl\}\}{\bf{\textasciicircum{}}}: Represents any character not in the brackets, e.g. `h{[}\textasciicircum{}oa{]}t` finds hit, but not hot and hat\{\{nl\}\}{\bf{-}}: Represents a range of characters, e.g. `c{[}a-b{]}t` finds cat and cbt\{\{nl\}\} \tn % Row Count 48 (+ 28) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.1169 cm} x{9.1531 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Booleans - Column (cont)}} \tn % Row 15 \SetRowColor{LightBackground} rlike(literal: String): Column & SQL RLIKE expression (LIKE with Regex). \tn % Row Count 2 (+ 2) % Row 16 \SetRowColor{white} \seqsplit{startsWith(literal:} String): Column & String starts with another string literal. Returns a boolean column based on a string match. \tn % Row Count 7 (+ 5) % Row 17 \SetRowColor{LightBackground} startsWith(other: Column): Column & String starts with. \tn % Row Count 9 (+ 2) % Row 18 \SetRowColor{white} endsWith(literal: String): Column & String ends with another string literal. Returns a boolean column based on a string match. \tn % Row Count 14 (+ 5) % Row 19 \SetRowColor{LightBackground} endsWith(other: Column): Column & String ends with. Returns a boolean column based on a string match. \tn % Row Count 18 (+ 4) % Row 20 \SetRowColor{white} contains(other: Any): Column & Contains the other element. Returns a boolean column based on a string match. 
\tn % Row Count 22 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.Column}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.5626 cm} x{10.7074 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Booleans - functions}} \tn % Row 0 \SetRowColor{LightBackground} not(e: Column): Column & Inversion of boolean expression, i.e. NOT. \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} isnan(e: Column): Column & Return true iff the column is NaN. \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} isnull(e: Column): Column & Return true iff the column is null. \tn % Row Count 6 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.8718 cm} x{11.3982 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Numbers - Column}} \tn % Row 0 \SetRowColor{LightBackground} +(other: Any): Column & Sum of this expression and another expression. \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} -(other: Any): Column & Subtraction. Subtract the other expression from this expression. \tn % Row Count 5 (+ 3) % Row 2 \SetRowColor{LightBackground} *(other: Any): Column & Multiplication of this expression and another expression. \tn % Row Count 8 (+ 3) % Row 3 \SetRowColor{white} /(other: Any): Column & Division this expression by another expression. 
\tn % Row Count 10 (+ 2) % Row 4 \SetRowColor{LightBackground} \%(other: Any): Column & Modulo (a.k.a. remainder),\{\{nl\}\}e.g. `11 mod 4 = 3` \tn % Row Count 12 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.Column}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Numbers - functions}} \tn % Row 0 \SetRowColor{LightBackground} abs(e: Column): Column & Computes the absolute value. \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} round(e: Column): Column & Returns the value of the column e rounded to 0 decimal places with HALF\_UP round mode. \tn % Row Count 7 (+ 5) % Row 2 \SetRowColor{LightBackground} round(e: Column, scale: Int): Column & Round the value of e to scale decimal places with HALF\_UP round mode if scale is greater than or equal to 0 or at integral part when scale is less than 0. \tn % Row Count 15 (+ 8) % Row 3 \SetRowColor{white} bround(e: Column): Column & Returns the value of the column e rounded to 0 decimal places with HALF\_EVEN round mode. \{\{nl\}\}HALF\_EVEN round towards the "nearest neighbor" unless both neighbors are equidistant, in which case, round towards the even neighbor. \tn % Row Count 27 (+ 12) % Row 4 \SetRowColor{LightBackground} bround(e: Column, scale: Int): Column & Round the value of e to scale decimal places with HALF\_EVEN round mode if scale is greater than or equal to 0 or at integral part when scale is less than 0. 
\tn % Row Count 35 (+ 8) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Numbers - functions (cont)}} \tn % Row 5 \SetRowColor{LightBackground} pow(l: Double, rightName: String): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 5 (+ 5) % Row 6 \SetRowColor{white} pow(l: Double, r: Column): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 10 (+ 5) % Row 7 \SetRowColor{LightBackground} pow(leftName: String, r: Double): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 15 (+ 5) % Row 8 \SetRowColor{white} pow(l: Column, r: Double): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 20 (+ 5) % Row 9 \SetRowColor{LightBackground} pow(leftName: String, rightName: String): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 25 (+ 5) % Row 10 \SetRowColor{white} pow(leftName: String, r: Column): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 30 (+ 5) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Numbers - functions (cont)}} \tn % Row 11 \SetRowColor{LightBackground} pow(l: Column, rightName: String): Column & Returns the value of the first argument raised to the power of the second argument. \tn % Row Count 5 (+ 5) % Row 12 \SetRowColor{white} pow(l: Column, r: Column): Column & Returns the value of the first argument raised to the power of the second argument. 
\tn % Row Count 10 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - Column}} \tn % Row 0 \SetRowColor{LightBackground} contains(other: Any): Column & Contains the other element. Returns a boolean column based on a string match. \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} startsWith(literal: String): Column & String starts with another string literal. Returns a boolean column based on a string match. \tn % Row Count 9 (+ 5) % Row 2 \SetRowColor{LightBackground} startsWith(other: Column): Column & String starts with. Returns a boolean column based on a string match. \tn % Row Count 13 (+ 4) % Row 3 \SetRowColor{white} endsWith(literal: String): Column & String ends with another string literal. Returns a boolean column based on a string match. \tn % Row Count 18 (+ 5) % Row 4 \SetRowColor{LightBackground} endsWith(other: Column): Column & String ends with. Returns a boolean column based on a string match. \tn % Row Count 22 (+ 4) % Row 5 \SetRowColor{white} substr(startPos: Int, len: Int): Column & An expression that returns a substring.\{\{nl\}\} startPos begins with {\bf{1}}.\{\{nl\}\}{\emph{In scala, String has also a function substring(int beginIndex, int endIndex), here the beginIndex starts from {\bf{0}}.}} \tn % Row Count 32 (+ 10) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - Column (cont)}} \tn % Row 6 \SetRowColor{LightBackground} substr(startPos: Column, len: Column): Column & An expression that returns a substring. 
\tn % Row Count 3 (+ 3) % Row 7 \SetRowColor{white} like(literal: String): Column & SQL like expression. Returns a boolean column based on a SQL LIKE match.\{\{nl\}\}\{\{nl\}\}{\bf{SQL Wildcards}}\{\{nl\}\}{\bf{\%}}: Represents zero or more characters, e.g. `bl\%` finds bl, black, blue, and blob\{\{nl\}\}{\bf{\_}}: Represents a single character, e.g. `h\_t` finds hot, hat, and hit\{\{nl\}\}{\bf{{[}{]}}}: Represents any single character within the brackets, e.g. `h{[}oa{]}t` finds hot and hat, but not hit\{\{nl\}\}{\bf{\textasciicircum{}}}: Represents any character not in the brackets, e.g. `h{[}\textasciicircum{}oa{]}t` finds hit, but not hot and hat\{\{nl\}\}{\bf{-}}: Represents a range of characters, e.g. `c{[}a-b{]}t` finds cat and cbt\{\{nl\}\} \tn % Row Count 32 (+ 29) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - Column (cont)}} \tn % Row 8 \SetRowColor{LightBackground} rlike(literal: String): Column & SQL RLIKE expression (LIKE with Regex). \tn % Row Count 2 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.Column}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions}} \tn % Row 0 \SetRowColor{LightBackground} initcap(e: Column): Column & Returns a new string column by converting the first letter of each word to uppercase. \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} lower(e: Column): Column & Converts a string column to lower case. \tn % Row Count 7 (+ 2) % Row 2 \SetRowColor{LightBackground} upper(e: Column): Column & Converts a string column to upper case. 
\tn % Row Count 9 (+ 2) % Row 3 \SetRowColor{white} trim(e: Column): Column & Trim the spaces from both ends for the specified string column. \tn % Row Count 13 (+ 4) % Row 4 \SetRowColor{LightBackground} trim(e: Column, trimString: String): Column & Trim the specified character from both ends for the specified string column. \tn % Row Count 17 (+ 4) % Row 5 \SetRowColor{white} ltrim(e: Column): Column & Trim the spaces from left end for the specified string value. \tn % Row Count 21 (+ 4) % Row 6 \SetRowColor{LightBackground} ltrim(e: Column, trimString: String): Column & Trim the specified character string from left end for the specified string column. \tn % Row Count 26 (+ 5) % Row 7 \SetRowColor{white} rtrim(e: Column): Column & Trim the spaces from right end for the specified string value. \tn % Row Count 30 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions (cont)}} \tn % Row 8 \SetRowColor{LightBackground} rtrim(e: Column, trimString: String): Column & Trim the specified character string from right end for the specified string column. \tn % Row Count 5 (+ 5) % Row 9 \SetRowColor{white} lpad(str: Column, len: Int, pad: String): Column & Left-pad the string column with pad to a length of len. If the string column is longer than len, the return value is shortened to len characters. \tn % Row Count 13 (+ 8) % Row 10 \SetRowColor{LightBackground} rpad(str: Column, len: Int, pad: String): Column & Right-pad the string column with pad to a length of len. If the string column is longer than len, the return value is shortened to len characters. 
\tn % Row Count 21 (+ 8) % Row 11 \SetRowColor{white} substring(str: Column, pos: Int, len: Int): Column & Substring starts at pos and is of length len when str is String type or returns the slice of byte array that starts at pos in byte and is of length len when str is Binary type.\{\{nl\}\} {\bf{{\emph{Note:}} The position is not zero based, but 1 based index.}}\{\{nl\}\}{\emph{In Scala, String also has a function substring(int beginIndex, int endIndex), here the beginIndex starts from {\bf{0}}.}} \tn % Row Count 40 (+ 19) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions (cont)}} \tn % Row 12 \SetRowColor{LightBackground} \seqsplit{substring\_index(str:} Column, delim: String, count: Int): Column & Returns the substring from string str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. substring\_index performs a case-sensitive match when searching for delim. \tn % Row Count 19 (+ 19) % Row 13 \SetRowColor{white} regexp\_extract(e: Column, exp: String, groupIdx: Int): Column & Extract a specific group matched by a Java regex, from the specified string column. If the regex did not match, or the specified group did not match, an empty string is returned.\{\{nl\}\}`val rhymeDF = Seq(("A fox saw a crow sitting on a tree singing \textbackslash{}"Caw! Caw! Caw!\textbackslash{}"")).toDF("rhyme")`\{\{nl\}\}`rhymeDF.select(regexp\_extract('rhyme, "{[}a-z{]}*o{[}xw{]}",0).as("substring")).show`\{\{nl\}\}There could be multiple matches of the pattern in a string; therefore, the group index (starts with 0) is needed to identify which one. 
\tn % Row Count 45 (+ 26) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions (cont)}} \tn % Row 14 \SetRowColor{LightBackground} regexp\_replace(e: Column, pattern: Column, replacement: Column): Column & Replace all substrings of the specified string value that match regexp with rep. \tn % Row Count 4 (+ 4) % Row 15 \SetRowColor{white} regexp\_replace(e: Column, pattern: String, replacement: String): Column & Replace all substrings of the specified string value that match regexp with rep.\{\{nl\}\}`val rhymeDF = Seq(("A fox saw a crow sitting on a tree singing \textbackslash{}"Caw! Caw! Caw!\textbackslash{}"")).toDF("rhyme")`\{\{nl\}\}`rhymeDF.select(regexp\_replace('rhyme, "fox|crow", \seqsplit{"animal").as("new\_rhyme")).show(false)`} \{\{nl\}\}`rhymeDF.select(regexp\_replace('rhyme, "{[}a-z{]}*o{[}xw{]}", \seqsplit{"animal").as("new\_rhyme")).show(false)`} \tn % Row Count 24 (+ 20) % Row 16 \SetRowColor{LightBackground} repeat(str: Column, n: Int): Column & Repeats a string column n times, and returns it as a new string column. \tn % Row Count 28 (+ 4) % Row 17 \SetRowColor{white} reverse(str: Column): Column & Reverses the string column and returns it as a new string column. \tn % Row Count 32 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions (cont)}} \tn % Row 18 \SetRowColor{LightBackground} split(str: Column, pattern: String): Column & Splits str around pattern (pattern is a regular expression). \tn % Row Count 3 (+ 3) % Row 19 \SetRowColor{white} length(e: Column): Column & Computes the character length of a given string or number of bytes of a binary string. The length of character strings include the trailing spaces. 
The length of binary strings includes binary zeros. \tn % Row Count 13 (+ 10) % Row 20 \SetRowColor{LightBackground} translate(src: Column, matchingString: String, replaceString: String): Column & Translate any character in the src by a character in replaceString. The characters in replaceString correspond to the characters in matchingString. The translate will happen when any character in the string matches the character in the matchingString. \tn % Row Count 26 (+ 13) % Row 21 \SetRowColor{white} concat(exprs: Column*): Column & Concatenates multiple input columns together into a single column. If all inputs are binary, concat returns an output as binary. Otherwise, it returns as string. \tn % Row Count 35 (+ 9) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions (cont)}} \tn % Row 22 \SetRowColor{LightBackground} concat\_ws(sep: String, exprs: Column*): Column & Concatenates multiple input string columns together into a single string column, using the given separator. \tn % Row Count 6 (+ 6) % Row 23 \SetRowColor{white} instr(str: Column, substring: String): Column & Locate the position of the first occurrence of substr column in the given string. Returns null if either of the arguments are null.\{\{nl\}\} {\bf{{\emph{Note:}} The position is not zero based, but 1 based index. Returns 0 if substr could not be found in str.}} \tn % Row Count 19 (+ 13) % Row 24 \SetRowColor{LightBackground} locate(substr: String, str: Column, pos: Int): Column & Locate the position of the first occurrence of substr in a string column, after position pos.\{\{nl\}\} {\bf{{\emph{Note:}} The position is not zero based, but 1 based index. 
Returns 0 if substr could not be found in str.}} \tn % Row Count 30 (+ 11) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Strings - functions (cont)}} \tn % Row 25 \SetRowColor{LightBackground} locate(substr: String, str: Column): Column & Locate the position of the first occurrence of substr in a string column.\{\{nl\}\} {\bf{{\emph{Note:}} The position is not zero based, but 1 based index. Returns 0 if substr could not be found in str.}} \tn % Row Count 11 (+ 11) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Date/Time - functions}} \tn % Row 0 \SetRowColor{LightBackground} current\_date(): Column & Returns the current date as a date column. \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} \seqsplit{current\_timestamp():} Column & Returns the current timestamp as a timestamp column. \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} date\_add(start: Column, days: Int): Column & Returns the date that is days days after start \tn % Row Count 9 (+ 3) % Row 3 \SetRowColor{white} date\_sub(start: Column, days: Int): Column & Returns the date that is days days before start \tn % Row Count 12 (+ 3) % Row 4 \SetRowColor{LightBackground} datediff(end: Column, start: Column): Column & Returns the number of days from start to end. \tn % Row Count 15 (+ 3) % Row 5 \SetRowColor{white} \seqsplit{add\_months(startDate:} Column, numMonths: Int): Column & Returns the date that is numMonths after startDate.
\tn % Row Count 18 (+ 3) % Row 6 \SetRowColor{LightBackground} \seqsplit{months\_between(date1:} Column, date2: Column): Column & Returns number of months between dates date1 and date2. \tn % Row Count 21 (+ 3) % Row 7 \SetRowColor{white} year(e: Column): Column & Extracts the year as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 25 (+ 4) % Row 8 \SetRowColor{LightBackground} quarter(e: Column): Column & Extracts the quarter as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 29 (+ 4) % Row 9 \SetRowColor{white} month(e: Column): Column & Extracts the month as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 33 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Date/Time - functions (cont)}} \tn % Row 10 \SetRowColor{LightBackground} weekofyear(e: Column): Column & Extracts the week number as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 4 (+ 4) % Row 11 \SetRowColor{white} dayofyear(e: Column): Column & Extracts the day of the year as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 8 (+ 4) % Row 12 \SetRowColor{LightBackground} dayofmonth(e: Column): Column & Extracts the day of the month as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 12 (+ 4) % Row 13 \SetRowColor{white} dayofweek(e: Column): Column & Extracts the day of the week as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 16 (+ 4) % Row 14 \SetRowColor{LightBackground} hour(e: Column): Column & Extracts the hours as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 20 (+ 4) % Row 15 \SetRowColor{white} minute(e: Column): Column & Extracts the minutes as an integer from a given \seqsplit{date/timestamp/string}. 
\tn % Row Count 24 (+ 4) % Row 16 \SetRowColor{LightBackground} second(e: Column): Column & Extracts the seconds as an integer from a given \seqsplit{date/timestamp/string}. \tn % Row Count 28 (+ 4) % Row 17 \SetRowColor{white} to\_date(e: Column): Column & Converts the column into DateType by casting rules to DateType. \tn % Row Count 32 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Date/Time - functions (cont)}} \tn % Row 18 \SetRowColor{LightBackground} to\_date(e: Column, fmt: String): Column & Converts the column into a DateType with a specified format (see {[}http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html{]}) return null if fail.\{\{nl\}\}The format here is the format, which is used by Date to be saved in DF.\{\{nl\}\}DF.show() will display the date in default format yyyy-MM-dd. \tn % Row Count 16 (+ 16) % Row 19 \SetRowColor{white} to\_timestamp(s: Column): Column & Converts the column into TimestampType by casting rules to TimestampType.
\tn % Row Count 21 (+ 5) % Row 20 \SetRowColor{LightBackground} to\_timestamp(s: Column, fmt: String): Column & Convert time string to a Unix timestamp (in seconds) with a specified format (see {[}http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html{]}) to Unix timestamp (in seconds), return null if fail.\{\{nl\}\}The format here is the format, which is used by timestamp to be saved in DF.\{\{nl\}\}DF.show() will display the timestamp in default format yyyy-MM-dd HH:mm:ss \tn % Row Count 40 (+ 19) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Date/Time - functions (cont)}} \tn % Row 21 \SetRowColor{LightBackground} \seqsplit{date\_format(dateExpr:} Column, format: String): Column & Converts a \seqsplit{date/timestamp/string} to a value of string in the format specified by the date format given by the second argument. A pattern dd.MM.yyyy would return a string like 18.03.1993. All pattern letters of \seqsplit{java.text.SimpleDateFormat} can be used. \tn % Row Count 13 (+ 13) % Row 22 \SetRowColor{white} unix\_timestamp(): Column & Returns the current Unix timestamp (in seconds). \tn % Row Count 16 (+ 3) % Row 23 \SetRowColor{LightBackground} unix\_timestamp(s: Column): Column & Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds), using the default timezone and the default locale. Returns null if fails. \tn % Row Count 24 (+ 8) % Row 24 \SetRowColor{white} unix\_timestamp(s: Column, p: String): Column & Converts time string with given pattern to Unix timestamp (in seconds). Returns null if fails. \tn % Row Count 29 (+ 5) % Row 25 \SetRowColor{LightBackground} from\_unixtime(ut: Column, f: String): Column & Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string representing the timestamp of that moment in the current system time zone in the given format. 
\tn % Row Count 38 (+ 9) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Date/Time - functions (cont)}} \tn % Row 26 \SetRowColor{LightBackground} from\_unixtime(ut: Column): Column & Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string representing the timestamp of that moment in the current system time zone in the given format. \tn % Row Count 9 (+ 9) % Row 27 \SetRowColor{white} last\_day(e: Column): Column & Given a date column, returns the last day of the month which the given date belongs to. For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the month in July 2015. \tn % Row Count 19 (+ 10) % Row 28 \SetRowColor{LightBackground} next\_day(date: Column, dayOfWeek: String): Column & Given a date column, returns the first date which is later than the value of the date column that is on the specified day of the week.\{\{nl\}\}For example, \seqsplit{next\_day('2015-07-27'}, "Sunday") returns 2015-08-02 because that is the first Sunday after 2015-07-27.\{\{nl\}\}Day of the week parameter is case insensitive, and accepts: "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". \tn % Row Count 38 (+ 19) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions} \newline \newline The default date format these functions use is yyyy-MM-dd HH:mm:ss.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{5.181 cm} x{12.089 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Null/NaN - Column}} \tn % Row 0 \SetRowColor{LightBackground} isNull: Column & True if the current expression is null. 
\tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} isNotNull: Column & True if the current expression is NOT null. \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} isNaN: Column & True if the current expression is NaN. \tn % Row Count 6 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.Column}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Null/NaN - functions}} \tn % Row 0 \SetRowColor{LightBackground} isnull(e: Column): Column & Return true iff the column is null. \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} isnan(e: Column): Column & Return true iff the column is NaN. \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} nanvl(col1: Column, col2: Column): Column & Returns col1 if it is not NaN, or col2 if col1 is NaN. Both inputs should be floating point columns (DoubleType or FloatType). \tn % Row Count 11 (+ 7) % Row 3 \SetRowColor{white} coalesce(e: Column*): Column & Returns the first column that is not null, or null if all inputs are null. 
For example, coalesce(a, b, c) will return a if a is not null, or b if a is null and b is not null, or c if both a and b are null but c is not null.\{\{nl\}\}`// create a movie with null title`\{\{nl\}\}`case class \seqsplit{Movie(actor\_name:String}, movie\_title:String, produced\_year:Long)`\{\{nl\}\}`val badMoviesDF = Seq( Movie(null, null, 2018L), Movie("John Doe", "Awesome Movie", 2018L)).toDF`\{\{nl\}\}`// use coalese to handle null value in title column`\{\{nl\}\}`badMoviesDF.select(coalesce('actor\_name, \seqsplit{lit("no\_name")).as("new\_title"))}.show`\{\{nl\}\}+-{}-{}-{}-{}-{}-{}-{}-{}-{}-+\{\{nl\}\}|~~~~~new\_title|\{\{nl\}\}+-{}-{}-{}-{}-{}-{}-{}-{}-{}-+\{\{nl\}\}|~~~~~no\_name|\{\{nl\}\}|~~~~~John Doe|\{\{nl\}\}+-{}-{}-{}-{}-{}-{}-{}-{}-{}-+ \tn % Row Count 51 (+ 40) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Null/NaN - DataFrameNaFunctions}} \tn % Row 0 \SetRowColor{LightBackground} drop(): DataFrame & Returns a new DataFrame that drops rows containing {\bf{any}} null or NaN values. \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} drop(how: String): DataFrame & Returns a new DataFrame that drops rows containing null or NaN values.\{\{nl\}\}If how is {\bf{"any"}}, then drop rows containing any null or NaN values. If how is {\bf{"all"}}, then drop rows only if every column is null or NaN for that row. \tn % Row Count 16 (+ 12) % Row 2 \SetRowColor{LightBackground} drop(cols: Seq{[}String{]}): DataFrame & (Scala-specific) Returns a new DataFrame that drops rows containing any null or NaN values in the specified columns. 
\tn % Row Count 22 (+ 6) % Row 3 \SetRowColor{white} drop(how: String, cols: Seq{[}String{]}): DataFrame & (Scala-specific) Returns a new DataFrame that drops rows containing null or NaN values in the specified columns. If how is "any", then drop rows containing any null or NaN values in the specified columns. If how is "all", then drop rows only if every specified column is null or NaN for that row. \tn % Row Count 37 (+ 15) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Null/NaN - DataFrameNaFunctions (cont)}} \tn % Row 4 \SetRowColor{LightBackground} drop(minNonNulls: Int): DataFrame & Returns a new DataFrame that drops rows containing less than minNonNulls non-null and non-NaN values. \tn % Row Count 6 (+ 6) % Row 5 \SetRowColor{white} drop(minNonNulls: Int, cols: Seq{[}String{]}): DataFrame & (Scala-specific) Returns a new DataFrame that drops rows containing less than minNonNulls non-null and non-NaN values in the specified columns. \tn % Row Count 14 (+ 8) % Row 6 \SetRowColor{LightBackground} fill(value: \seqsplit{String/Boolean/Double/Long):} DataFrame & Returns a new DataFrame that replaces null values in string/boolean columns (or null or NaN values in numeric columns) with value. \tn % Row Count 21 (+ 7) % Row 7 \SetRowColor{white} fill(value: \seqsplit{String/Boolean/Double/Long}, cols: Seq{[}String{]}): DataFrame & (Scala-specific) Returns a new DataFrame that replaces null values in specified \seqsplit{string/boolean/double/long} columns. \tn % Row Count 27 (+ 6) % Row 8 \SetRowColor{LightBackground} fill(valueMap: Map{[}String, Any{]}): DataFrame & (Scala-specific) Returns a new DataFrame that replaces null values. The key of the map is the column name, and the value of the map is the replacement value. The value must be of the following type: Int, Long, Float, Double, String, Boolean. 
Replacement values are cast to the column data type.\{\{nl\}\}e.g. `df.na.fill(Map( "A" -\textgreater{} "unknown", "B" -\textgreater{} 1.0 ))` \tn % Row Count 45 (+ 18) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Null/NaN - DataFrameNaFunctions (cont)}} \tn % Row 9 \SetRowColor{LightBackground} replace{[}T{]}(col: String, replacement: Map{[}T, T{]}): DataFrame & (Scala-specific) Replaces values matching keys in replacement map.\{\{nl\}\}{\bf{col}} name of the column to apply the value replacement. If col is "*", replacement is applied on all string, numeric or boolean columns.\{\{nl\}\}{\bf{replacement}} value replacement map. Key and value of replacement map must have the same type, and can only be doubles, strings or booleans. The map value can have nulls.\{\{nl\}\}\{\{nl\}\}// Replaces all occurrences of 1.0 with 2.0 in column "height".\{\{nl\}\}df.na.replace("height", Map(1.0 -\textgreater{} 2.0));\{\{nl\}\}\{\{nl\}\}// Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name".\{\{nl\}\}df.na.replace("name", Map("UNKNOWN" -\textgreater{} "unnamed"));\{\{nl\}\}\{\{nl\}\}// Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns.\{\{nl\}\}df.na.replace("*", Map("UNKNOWN" -\textgreater{} "unnamed")); \tn % Row Count 40 (+ 40) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Null/NaN - DataFrameNaFunctions (cont)}} \tn % Row 10 \SetRowColor{LightBackground} replace{[}T{]}(cols: Seq{[}String{]}, replacement: Map{[}T, T{]}): DataFrame & (Scala-specific) Replaces values matching keys in replacement map.\{\{nl\}\}\{\{nl\}\}// Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight".\{\{nl\}\}df.na.replace("height" :: "weight" :: Nil, Map(1.0 -\textgreater{} 
2.0));\{\{nl\}\}\{\{nl\}\}// Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname".\{\{nl\}\}df.na.replace("firstname" :: "lastname" :: Nil, Map("UNKNOWN" -\textgreater{} "unnamed")); \tn % Row Count 21 (+ 21) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.DataFrameNaFunctions} \newline use `df.na` to get DataFrameNaFunctions} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.5626 cm} x{10.7074 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Sorting - Column}} \tn % Row 0 \SetRowColor{LightBackground} asc: Column & Returns a sort expression based on ascending order of the column. \{\{nl\}\}\{\{nl\}\}// Scala: sort a DataFrame by age column in ascending order.\{\{nl\}\}`df.sort(df("age").asc)` \tn % Row Count 7 (+ 7) % Row 1 \SetRowColor{white} \seqsplit{asc\_nulls\_first:} Column & Returns a sort expression based on ascending order of the column, and null values return before non-null values. \tn % Row Count 12 (+ 5) % Row 2 \SetRowColor{LightBackground} \seqsplit{asc\_nulls\_last:} Column & Returns a sort expression based on ascending order of the column, and null values appear after non-null values. \tn % Row Count 17 (+ 5) % Row 3 \SetRowColor{white} desc: Column & Returns a sort expression based on the descending order of the column. \tn % Row Count 20 (+ 3) % Row 4 \SetRowColor{LightBackground} \seqsplit{desc\_nulls\_first:} Column & Returns a sort expression based on the descending order of the column, and null values appear before non-null values. \tn % Row Count 25 (+ 5) % Row 5 \SetRowColor{white} \seqsplit{desc\_nulls\_last:} Column & Returns a sort expression based on the descending order of the column, and null values appear after non-null values. 
\tn % Row Count 30 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.Column}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Sorting - functions}} \tn % Row 0 \SetRowColor{LightBackground} asc(columnName: String): Column & Returns a sort expression based on ascending order of the column.\{\{nl\}\}\{\{nl\}\}`df.sort(asc("dept"), desc("age"))` \tn % Row Count 6 (+ 6) % Row 1 \SetRowColor{white} \seqsplit{asc\_nulls\_first(columnName:} String): Column & Returns a sort expression based on ascending order of the column, and null values return before non-null values. \tn % Row Count 12 (+ 6) % Row 2 \SetRowColor{LightBackground} \seqsplit{asc\_nulls\_last(columnName:} String): Column & Returns a sort expression based on ascending order of the column, and null values appear after non-null values. \tn % Row Count 18 (+ 6) % Row 3 \SetRowColor{white} desc(columnName: String): Column & Returns a sort expression based on the descending order of the column. \tn % Row Count 22 (+ 4) % Row 4 \SetRowColor{LightBackground} \seqsplit{desc\_nulls\_first(columnName:} String): Column & Returns a sort expression based on the descending order of the column, and null values appear before non-null values. \tn % Row Count 28 (+ 6) % Row 5 \SetRowColor{white} \seqsplit{desc\_nulls\_last(columnName:} String): Column & Returns a sort expression based on the descending order of the column, and null values appear after non-null values. 
\tn % Row Count 34 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions}} \tn % Row 0 \SetRowColor{LightBackground} count(columnName: String): TypedColumn{[}Any, Long{]} & Aggregate function: returns the number of items in a group.\{\{nl\}\}count("*"): count null values\{\{nl\}\}count(\textless{}column\_name\textgreater{}): not count null values \tn % Row Count 8 (+ 8) % Row 1 \SetRowColor{white} count(e: Column): Column & Aggregate function: returns the number of items in a group. \tn % Row Count 11 (+ 3) % Row 2 \SetRowColor{LightBackground} \seqsplit{countDistinct(columnName:} String, columnNames: String*): Column & Aggregate function: returns the number of distinct items in a group. \tn % Row Count 15 (+ 4) % Row 3 \SetRowColor{white} countDistinct(expr: Column, exprs: Column*): Column & Aggregate function: returns the number of distinct items in a group. \tn % Row Count 19 (+ 4) % Row 4 \SetRowColor{LightBackground} first(columnName: String): Column & Aggregate function: returns the first value of a column in a group.The function by default returns the first values it sees. It will return the first non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned. \tn % Row Count 32 (+ 13) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions (cont)}} \tn % Row 5 \SetRowColor{LightBackground} first(e: Column): Column & Aggregate function: returns the first value in a group. 
\tn % Row Count 3 (+ 3) % Row 6 \SetRowColor{white} first(columnName: String, ignoreNulls: Boolean): Column & Aggregate function: returns the first value of a column in a group. \tn % Row Count 7 (+ 4) % Row 7 \SetRowColor{LightBackground} first(e: Column, ignoreNulls: Boolean): Column & Aggregate function: returns the first value in a group. \tn % Row Count 10 (+ 3) % Row 8 \SetRowColor{white} last(columnName: String): Column & Aggregate function: returns the last value of the column in a group.The function by default returns the last values it sees. It will return the last non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned. \tn % Row Count 23 (+ 13) % Row 9 \SetRowColor{LightBackground} last(e: Column): Column & Aggregate function: returns the last value in a group. \tn % Row Count 26 (+ 3) % Row 10 \SetRowColor{white} last(columnName: String, ignoreNulls: Boolean): Column & Aggregate function: returns the last value of the column in a group. \tn % Row Count 30 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions (cont)}} \tn % Row 11 \SetRowColor{LightBackground} last(e: Column, ignoreNulls: Boolean): Column & Aggregate function: returns the last value in a group. \tn % Row Count 3 (+ 3) % Row 12 \SetRowColor{white} min(columnName: String): Column & Aggregate function: returns the minimum value of the column in a group. \tn % Row Count 7 (+ 4) % Row 13 \SetRowColor{LightBackground} min(e: Column): Column & Aggregate function: returns the minimum value of the expression in a group. \tn % Row Count 11 (+ 4) % Row 14 \SetRowColor{white} max(columnName: String): Column & Aggregate function: returns the maximum value of the column in a group. 
\tn % Row Count 15 (+ 4) % Row 15 \SetRowColor{LightBackground} max(e: Column): Column & Aggregate function: returns the maximum value of the expression in a group. \tn % Row Count 19 (+ 4) % Row 16 \SetRowColor{white} sum(columnName: String): Column & Aggregate function: returns the sum of all values in the given column. \tn % Row Count 23 (+ 4) % Row 17 \SetRowColor{LightBackground} sum(e: Column): Column & Aggregate function: returns the sum of all values in the expression. \tn % Row Count 27 (+ 4) % Row 18 \SetRowColor{white} \seqsplit{sumDistinct(columnName:} String): Column & Aggregate function: returns the sum of distinct values in the expression. \tn % Row Count 31 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions (cont)}} \tn % Row 19 \SetRowColor{LightBackground} sumDistinct(e: Column): Column & Aggregate function: returns the sum of distinct values in the expression. \tn % Row Count 4 (+ 4) % Row 20 \SetRowColor{white} avg(columnName: String): Column & Aggregate function: returns the average of the values in a group. \tn % Row Count 8 (+ 4) % Row 21 \SetRowColor{LightBackground} avg(e: Column): Column & Aggregate function: returns the average of the values in a group. \tn % Row Count 12 (+ 4) % Row 22 \SetRowColor{white} mean(columnName: String): Column & Aggregate function: returns the average of the values in a group. Alias for avg. \tn % Row Count 16 (+ 4) % Row 23 \SetRowColor{LightBackground} mean(e: Column): Column & Aggregate function: returns the average of the values in a group. Alias for avg. \tn % Row Count 20 (+ 4) % Row 24 \SetRowColor{white} \seqsplit{variance(columnName:} String): Column & Aggregate function: alias for var\_samp. 
\tn % Row Count 22 (+ 2) % Row 25 \SetRowColor{LightBackground} variance(e: Column): Column & Aggregate function: alias for var\_samp \tn % Row Count 24 (+ 2) % Row 26 \SetRowColor{white} \seqsplit{var\_samp(columnName:} String): Column & Aggregate function: returns the unbiased variance of the values in a group. \tn % Row Count 28 (+ 4) % Row 27 \SetRowColor{LightBackground} var\_samp(e: Column): Column & Aggregate function: returns the unbiased variance of the values in a group. \tn % Row Count 32 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions (cont)}} \tn % Row 28 \SetRowColor{LightBackground} \seqsplit{var\_pop(columnName:} String): Column & Aggregate function: returns the population variance of the values in a group. \tn % Row Count 4 (+ 4) % Row 29 \SetRowColor{white} var\_pop(e: Column): Column & Aggregate function: returns the population variance of the values in a group. \tn % Row Count 8 (+ 4) % Row 30 \SetRowColor{LightBackground} stddev(columnName: String): Column & Aggregate function: alias for stddev\_samp. \tn % Row Count 11 (+ 3) % Row 31 \SetRowColor{white} stddev(e: Column): Column & Aggregate function: alias for stddev\_samp. \tn % Row Count 14 (+ 3) % Row 32 \SetRowColor{LightBackground} \seqsplit{stddev\_samp(columnName:} String): Column & Aggregate function: returns the sample standard deviation of the expression in a group. \tn % Row Count 19 (+ 5) % Row 33 \SetRowColor{white} stddev\_samp(e: Column): Column & Aggregate function: returns the sample standard deviation of the expression in a group. \tn % Row Count 24 (+ 5) % Row 34 \SetRowColor{LightBackground} \seqsplit{stddev\_pop(columnName:} String): Column & Aggregate function: returns the population standard deviation of the expression in a group. 
\tn % Row Count 29 (+ 5) % Row 35 \SetRowColor{white} stddev\_pop(e: Column): Column & Aggregate function: returns the population standard deviation of the expression in a group. \tn % Row Count 34 (+ 5) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions (cont)}} \tn % Row 36 \SetRowColor{LightBackground} \seqsplit{skewness(columnName:} String): Column & Aggregate function: returns the skewness of the values in a group. \tn % Row Count 4 (+ 4) % Row 37 \SetRowColor{white} skewness(e: Column): Column & Aggregate function: returns the skewness of the values in a group. \tn % Row Count 8 (+ 4) % Row 38 \SetRowColor{LightBackground} \seqsplit{kurtosis(columnName:} String): Column & Aggregate function: returns the kurtosis of the values in a group. \tn % Row Count 12 (+ 4) % Row 39 \SetRowColor{white} kurtosis(e: Column): Column & Aggregate function: returns the kurtosis of the values in a group. \tn % Row Count 16 (+ 4) % Row 40 \SetRowColor{LightBackground} corr(columnName1: String, columnName2: String): Column & Aggregate function: returns the Pearson Correlation Coefficient for two columns. \tn % Row Count 20 (+ 4) % Row 41 \SetRowColor{white} corr(column1: Column, column2: Column): Column & Aggregate function: returns the Pearson Correlation Coefficient for two columns. \tn % Row Count 24 (+ 4) % Row 42 \SetRowColor{LightBackground} \seqsplit{covar\_samp(columnName1:} String, columnName2: String): Column & Aggregate function: returns the sample covariance for two columns. \tn % Row Count 28 (+ 4) % Row 43 \SetRowColor{white} \seqsplit{covar\_samp(column1:} Column, column2: Column): Column & Aggregate function: returns the sample covariance for two columns. 
\tn % Row Count 32 (+ 4) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate functions (cont)}} \tn % Row 44 \SetRowColor{LightBackground} \seqsplit{covar\_pop(columnName1:} String, columnName2: String): Column & Aggregate function: returns the population covariance for two columns. \tn % Row Count 4 (+ 4) % Row 45 \SetRowColor{white} covar\_pop(column1: Column, column2: Column): Column & Aggregate function: returns the population covariance for two columns. \tn % Row Count 8 (+ 4) % Row 46 \SetRowColor{LightBackground} \seqsplit{collect\_list(columnName:} String): Column & Aggregate function: returns a list of objects with duplicates. \tn % Row Count 12 (+ 4) % Row 47 \SetRowColor{white} collect\_list(e: Column): Column & Aggregate function: returns a list of objects with duplicates. \tn % Row Count 16 (+ 4) % Row 48 \SetRowColor{LightBackground} \seqsplit{collect\_set(columnName:} String): Column & Aggregate function: returns a set of objects with duplicate elements eliminated. \tn % Row Count 20 (+ 4) % Row 49 \SetRowColor{white} collect\_set(e: Column): Column & Aggregate function: returns a set of objects with duplicate elements eliminated. \tn % Row Count 24 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate - RelationalGroupedDataset}} \tn % Row 0 \SetRowColor{LightBackground} agg(expr: Column, exprs: Column*): DataFrame & Compute aggregates by specifying a series of aggregate columns. Note that this function by default retains the grouping columns in its output. 
To not retain grouping columns, set \seqsplit{spark.sql.retainGroupColumns} to false.\{\{nl\}\}\{\{nl\}\}`import \seqsplit{org.apache.spark.sql.functions}.\_`\{\{nl\}\}`df.groupBy("department").agg(max("age"), sum("expense"))` \tn % Row Count 17 (+ 17) % Row 1 \SetRowColor{white} agg(exprs: Map{[}String, String{]}): DataFrame & (Scala-specific) Compute aggregates by specifying a map from column name to aggregate methods. The resulting DataFrame will also contain the grouping columns.\{\{nl\}\}\{\{nl\}\}`df.groupBy("department").agg(Map( "age" -\textgreater{} "max", "expense" -\textgreater{} "sum" ))` \tn % Row Count 30 (+ 13) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate - RelationalGroupedDataset (cont)}} \tn % Row 2 \SetRowColor{LightBackground} count(): DataFrame & Count the number of rows for each group. The resulting DataFrame will also contain the grouping columns. \tn % Row Count 6 (+ 6) % Row 3 \SetRowColor{white} max(colNames: String*): DataFrame & Compute the max value for each numeric columns for each group. The resulting DataFrame will also contain the grouping columns. When specified columns are given, only compute the max values for them. \tn % Row Count 16 (+ 10) % Row 4 \SetRowColor{LightBackground} min(colNames: String*): DataFrame & Compute the min value for each numeric column for each group. The resulting DataFrame will also contain the grouping columns. When specified columns are given, only compute the min values for them. \tn % Row Count 26 (+ 10) % Row 5 \SetRowColor{white} sum(colNames: String*): DataFrame & Compute the sum for each numeric columns for each group. The resulting DataFrame will also contain the grouping columns. When specified columns are given, only compute the sum for them. 
\tn % Row Count 36 (+ 10) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate - RelationalGroupedDataset (cont)}} \tn % Row 6 \SetRowColor{LightBackground} avg(colNames: String*): DataFrame & Compute the mean value for each numeric columns for each group. \tn % Row Count 4 (+ 4) % Row 7 \SetRowColor{white} mean(colNames: String*): DataFrame & Compute the average value for each numeric columns for each group. This is an alias for avg. The resulting DataFrame will also contain the grouping columns. When specified columns are given, only compute the average values for them. \tn % Row Count 16 (+ 12) % Row 8 \SetRowColor{LightBackground} pivot(pivotColumn: String): \seqsplit{RelationalGroupedDataset} & Pivots a column of the current DataFrame and performs the specified aggregation. There are two versions of pivot function: one that requires the caller to specify the list of distinct values to pivot on, and one that does not. The latter is more concise but less efficient, because Spark needs to first compute the list of distinct values internally. \tn % Row Count 34 (+ 18) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Aggregate - RelationalGroupedDataset (cont)}} \tn % Row 9 \SetRowColor{LightBackground} pivot(pivotColumn: String, values: Seq{[}Any{]}): \seqsplit{RelationalGroupedDataset} & Pivots a column of the current DataFrame and performs the specified aggregation. There are two versions of pivot function: one that requires the caller to specify the list of distinct values to pivot on, and one that does not. The latter is more concise but less efficient, because Spark needs to first compute the list of distinct values internally. 
\tn % Row Count 18 (+ 18) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.RelationalGroupedDataset} \newline Use `df.groupBy("xxx")` to get RelationalGroupedDataset} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{8.635 cm} x{8.635 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Collection - functions}} \tn % Row 0 \SetRowColor{LightBackground} size(e: Column): Column & Returns length of array or map. \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \seqsplit{array\_contains(column:} Column, value: Any): Column & Returns null if the array is null, true if the array contains value, and false otherwise. \tn % Row Count 7 (+ 5) % Row 2 \SetRowColor{LightBackground} sort\_array(e: Column): Column & Sorts the input array for the given column in ascending order, according to the natural ordering of the array elements. \tn % Row Count 13 (+ 6) % Row 3 \SetRowColor{white} sort\_array(e: Column, asc: Boolean): Column & Sorts the input array for the given column in ascending or descending order, according to the natural ordering of the array elements. \tn % Row Count 20 (+ 7) % Row 4 \SetRowColor{LightBackground} explode(e: Column): Column & Creates a new row for each element in the given array or map column. \tn % Row Count 24 (+ 4) % Row 5 \SetRowColor{white} explode\_outer(e: Column): Column & Creates a new row for each element in the given array or map column. Unlike explode, if the array/map is null or empty then null is produced. 
\tn % Row Count 32 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{\seqsplit{org.apache.spark.sql.functions}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.0445 cm} x{11.2255 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Window - functions}} \tn % Row 0 \SetRowColor{LightBackground} rank(): Column & Window function: returns the rank of rows within a window partition.\{\{nl\}\}The difference between rank and dense\_rank is that dense\_rank leaves no gaps in ranking sequence when there are ties. That is, if you were ranking a competition using dense\_rank and had three people tie for second place, you would say that all three were in second place and that the next person came in third. Rank would give me sequential numbers, making the person that came in third place (after the ties) would register as coming in fifth.\{\{nl\}\} e.g. 1, 2, 2, 2, 5 \tn % Row Count 21 (+ 21) % Row 1 \SetRowColor{white} \seqsplit{dense\_rank():} Column & Window function: returns the rank of rows within a window partition, without any gaps.\{\{nl\}\}The difference between rank and dense\_rank is that denseRank leaves no gaps in ranking sequence when there are ties. That is, if you were ranking a competition using dense\_rank and had three people tie for second place, you would say that all three were in second place and that the next person came in third. Rank would give me sequential numbers, making the person that came in third place (after the ties) would register as coming in fifth.\{\{nl\}\}e.g. 
1, 2, 2, 2, 3 \tn % Row Count 43 (+ 22) \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{6.0445 cm} x{11.2255 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{Working with Window - functions (cont)}} \tn % Row 2 \SetRowColor{LightBackground} \seqsplit{percent\_rank():} Column & Window function: returns the relative rank (i.e. percentile) of rows within a window partition.\{\{nl\}\}This is computed by:\{\{nl\}\}(rank of row in its partition - 1) / (number of rows in the partition - 1) \tn % Row Count 8 (+ 8) % Row 3 \SetRowColor{white} \seqsplit{row\_number():} Column & Window function: returns a sequential number starting at 1 within a window partition. \tn % Row Count 12 (+ 4) % Row 4 \SetRowColor{LightBackground} cume\_dist(): Column & Window function: returns the cumulative distribution of values within a window partition, i.e.\{\{nl\}\}`N = total number of rows in the partition\{\{nl\}\} cumeDist(x) = number of values before (and including) x / N` \tn % Row Count 21 (+ 9) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{17.67cm}}{import \seqsplit{org.apache.spark.sql.expressions.Window} \newline import \seqsplit{org.apache.spark.sql.functions.col} \newline val windowSpec = \seqsplit{Window.partitionBy("CustomerId"}, \seqsplit{"date").orderBy(col("Quantity").desc).rowsBetween(Window.unboundedPreceding}, Window.currentRow) \newline \newline val purchaseRank = rank().over(windowSpec) \newline \newline dfWithDate.where("CustomerId IS NOT \seqsplit{NULL").orderBy("CustomerId")} \newline .select( \newline col("CustomerId"), \newline col("date"), \newline col("Quantity"), \newline \seqsplit{purchaseRank.alias("quantityRank")).show()}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{17.67cm}{x{12.9525 cm} p{4.3175 cm} } \SetRowColor{DarkBackground}
\mymulticolumn{2}{x{17.67cm}}{\bf\textcolor{white}{\seqsplit{org.apache.spark.sql.expressions.WindowSpec}}} \tn % Row 0 \SetRowColor{LightBackground} rowsBetween & todo \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} rangeBetween & todo \tn % Row Count 2 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \end{document}