\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Justin1209 (Justin1209)} \pdfinfo{ /Title (pandas.pdf) /Creator (Cheatography) /Author (Justin1209 (Justin1209)) /Subject (Pandas Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{5FD4CC} \definecolor{LightBackground}{HTML}{EBF9F8} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Pandas Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Justin1209 (Justin1209)} via \textcolor{DarkBackground}{\uline{cheatography.com/101982/cs/21202/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}Justin1209 (Justin1209) \\ \uline{cheatography.com/justin1209} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Published 23rd November, 2019.\\ Updated 31st January, 2020.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{3} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Import the Pandas Module}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{import}} {\emph{pandas}} {\bf{as}} {\emph{pd}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Create a DataFrame}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Method 1}} \newline df1 = pd.DataFrame(\{ \newline {\emph{'name'}}: {[}'John Smith', 'Jane Doe'{]}, \newline {\emph{'address'}}: {[}'13 Main St.', '46 Maple Ave.'{]}, \newline {\emph{'age'}}: {[}34, 28{]} \newline \}) \newline \newline {\bf{\# Method 2}} \newline df2 = pd.DataFrame({[} \newline {[}'John Smith', '123 Main St.', 34{]}, \newline {[}'Jane Doe', '456 Maple Ave.', 28{]}, \newline {[}'Joe Schmo', '9 Broadway', 51{]} \newline {]}, \newline {\bf{columns}}={[}{\emph{'name', 'address', 'age'}}{]})} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Loading and Saving CSVs}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Load a CSV File in to a DataFrame}} \newline df = \seqsplit{pd.read\_csv('my-csv-file.csv')} \newline \newline {\bf{\# Saving DataFrame to a CSV File}} \newline df.to\_csv('new-csv-file.csv') \newline \newline {\bf{\# Load DataFrame in Chunks (For large Datasets)}} \newline \# Initialize reader object: urb\_pop\_reader \newline urb\_pop\_reader = \seqsplit{pd.read\_csv('ind\_pop\_data.csv'}, chunksize=1000) \newline \newline \# Get the first DataFrame chunk: df\_urb\_pop \newline df\_urb\_pop = next(urb\_pop\_reader)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{0.9954 cm} x{3.9816 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Inspect a DataFrame}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{df.head(5)} & First 5 rows \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \seqsplit{df.info()} & Statistics of columns {\emph{(row count, null values, datatype)}} \tn % Row Count 4 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Reshape (for Scikit)}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{nums = np.array(range(1, 11)) \newline -\textgreater{} {[} 1 2 3 4 5 6 7 8 9 10{]} \newline \newline nums = nums.reshape(-1, 1) \newline -\textgreater{} {[} {[}1{]}, \newline {[}2{]}, \newline {[}3{]}, \newline {[}4{]}, \newline {[}5{]}, \newline {[}6{]}, \newline {[}7{]}, \newline {[}8{]}, \newline {[}9{]}, \newline {[}10{]}{]}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{You can think of {\bf{reshape()}} as rotating this array. Rather than one big row of numbers, nums is now a big column of numbers - there's one number in each row.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Converting Datatypes}} \tn \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\# {\emph{Convert argument to numeric type}} \newline % Row Count 1 (+ 1) `pandas.to\_numeric(arg, errors="raise")` \newline % Row Count 2 (+ 1) {\emph{errors:}} \newline % Row Count 3 (+ 1) "raise" -\textgreater{} raise an exception \newline % Row Count 4 (+ 1) "coerce" -\textgreater{} invalid parsing will be set as NaN% Row Count 5 (+ 1) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{DataFrame for Select Columns / Rows}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{df = pd.DataFrame({[} \newline {[}'January', 100, 100, 23, 100{]}, \newline {[}'February', 51, 45, 145, 45{]}, \newline {[}'March', 81, 96, 65, 96{]}, \newline {[}'April', 80, 80, 54, 180{]}, \newline {[}'May', 51, 54, 54, 154{]}, \newline {[}'June', 112, 109, 79, 129{]}{]}, \newline \newline columns={[}'month', 'east', 'north', 'south', 'west'{]} \newline )} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Select Columns}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Select one Column}} \newline {\bf{clinic\_north = df.north}} \newline -{}-\textgreater{} Reshape values for Scikit learn: clinic\_north.{\bf{values.reshape(-1, 1)}} \newline \newline {\bf{\# Select multiple Columns}} \newline {\bf{clinic\_north\_south}} = df{[}{[}'north', 'south'{]}{]}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{Make sure that you have a {\emph{double set of brackets}} {\bf{{[}{[} {]}{]}}}, or this command won't work!} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Select Rows}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Select one Row}} \newline march = df.iloc{[}2{]} \newline \newline \newline {\bf{\# Select multiple Rows}} \newline jan\_feb\_march = df.iloc{[}:3{]} \newline feb\_march\_april = df.iloc{[}1:4{]} \newline may\_june = df.iloc{[}-2:{]} \newline \newline \newline {\bf{\# Select Rows with Logic}} \newline january = df{[}df.month {\bf{==}} 'January'{]} \newline {\bf{-\textgreater{} \textless{}, \textgreater{}, \textless{}=, \textgreater{}=, !=, ==}} \newline \newline march\_april = df{[}(df.month == 'March') | (df.month == 'April'){]} \newline {\bf{-\textgreater{} \&, |}} \newline \newline january\_february\_march = df{[}df{\bf{.month.isin}}({[}'January', 'February', 'March'{]}){]} \newline {\bf{-\textgreater{} column\_name.isin({[}" ", " "{]})}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{Selecting a Subset of a Dataframe often results in {\bf{non-consecutive indices.}} \newline \newline Using {\bf{.reset\_index()}} will create a {\emph{new DataFrame}} move the old indices into a new colum called {\emph{index}}. \newline \newline Use {\bf{.reset\_index(drop=True)}} if you dont need the {\emph{index}} column. \newline Use {\bf{.reset\_index(inplace=True)}} to prevent a {\emph{new DataFrame}} from brein created.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Adding a Column}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{df = pd.DataFrame({[} \newline {[}1, '3 inch screw', 0.5, 0.75{]}, \newline {[}2, '2 inch nail', 0.10, 0.25{]}, \newline {[}3, 'hammer', 3.00, 5.50{]}, \newline {[}4, 'screwdriver', 2.50, 3.00{]} \newline {]}, \newline columns={[}'Product ID', 'Description', 'Cost to Manufacture', 'Price'{]} \newline ) \newline \newline {\bf{\# Add a Column with specified row-values}} \newline df{[}'Sold in Bulk?'{]} = {[}'Yes', 'Yes', 'No', 'No'{]} \newline \newline {\bf{\# Add a Column with same value in every row}} \newline df{[}'Is taxed?'{]} = 'Yes' \newline \newline {\bf{\# Add a Column with calculation}} \newline df{[}'Revenue'{]} = df{[}'Price'{]} - df{[}'Cost to Manufacture'{]}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Performing Column Operation}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{df = pd.DataFrame({[} \newline {[}'JOHN SMITH', 'john.smith@gmail.com'{]}, \newline {[}'Jane Doe', 'jdoe@yahoo.com'{]}, \newline {[}'joe schmo', 'joeschmo@hotmail.com'{]} \newline {]}, \newline columns={[}'Name', 'Email'{]}) \newline \newline {\bf{\# Changing a column with an Operation}} \newline df{[}'Name'{]} = df.Name.{\bf{apply(lower)}} \newline {\emph{-\textgreater{} lower, upper}} \newline \newline {\bf{\# Perform a lambda Operation on a Column}} \newline get\_last\_name = {\bf{lambda x: x.split(" "){[}-1{]}}} \newline df{[}'last\_name'{]} = df.Name.{\bf{apply(get\_last\_name)}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Performing a Operation on Multiple Columns}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{df = pd.DataFrame({[} \newline {[}"Apple", 1.00, "No"{]}, \newline {[}"Milk", 4.20, "No"{]}, \newline {[}"Paper Towels", 5.00, "Yes"{]}, \newline {[}"Light Bulbs", 3.75, "Yes"{]}, \newline {]}, \newline columns={[}"Item", "Price", "Is taxed?"{]}) \newline \newline \newline {\bf{\# Lambda Function}} \newline df{[}'Price with Tax'{]} = df.apply(lambda row: \newline row{[}'Price'{]} * 1.075 \newline if row{[}'Is taxed?'{]} == 'Yes' \newline else row{[}'Price'{]}, \newline {\bf{axis=1}} \newline )} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{We apply a {\bf{lambda to rows}}, as opposed to columns, when we want to perform functionality that needs to access more than one column at a time.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Rename Columns}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Method 1}} \newline df{\bf{.columns}} = {[}'NewName\_1', 'NewName\_2, 'NewName\_3', '...'{]} \newline \newline {\bf{\# Method 2}} \newline df{\bf{.rename}}(columns=\{ \newline 'OldName\_1': 'NewName\_1', \newline 'OldName\_2': 'NewName\_2' \newline \}, {\bf{inplace=True}})} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{Using {\bf{inplace=True}} lets us edit the original DataFrame.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Series vs. Dataframes}} \tn \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Dataframe and Series}} \newline % Row Count 1 (+ 1) \seqsplit{print(type(clinic\_north)):} \newline % Row Count 2 (+ 1) {\emph{\# \textless{}class 'pandas.core.series.Series'\textgreater{}}} \newline % Row Count 3 (+ 1) print(type(df)): \newline % Row Count 4 (+ 1) {\emph{\# \textless{}class 'pandas.core.frame.DataFrame'\textgreater{}}} \newline % Row Count 5 (+ 1) \seqsplit{print(type(clinic\_north\_south))} \newline % Row Count 6 (+ 1) {\emph{\# \textless{}class 'pandas.core.frame.DataFrame'\textgreater{}}}% Row Count 7 (+ 1) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{In Pandas \newline - a {\bf{series is a one-dimensional object}} that contains any type of data. \newline \newline - a {\bf{dataframe is a two-dimensional object}} that can hold multiple columns of different types of data. \newline \newline A {\emph{single column of a dataframe is a series}}, and a {\bf{dataframe is a container of two or more series objects}}.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.28942 cm} x{2.68758 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Column Statistics}} \tn % Row 0 \SetRowColor{LightBackground} Mean = Average & df.{\emph{column}}.{\bf{mean()}} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} Median & df.{\emph{column}}.{\bf{median()}} \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} Minimal Value & df.{\emph{column}}.{\bf{min()}} \tn % Row Count 5 (+ 1) % Row 3 \SetRowColor{white} Maximum Value & df.{\emph{column}}.{\bf{max()}} \tn % Row Count 6 (+ 1) % Row 4 \SetRowColor{LightBackground} Number of Values & df.{\emph{column}}.{\bf{count()}} \tn % Row Count 8 (+ 2) % Row 5 \SetRowColor{white} Unique Values & df.{\emph{column}}.{\bf{nunique()}} \tn % Row Count 10 (+ 2) % Row 6 \SetRowColor{LightBackground} Standard Deviation & df.{\emph{column}}.{\bf{std()}} \tn % Row Count 11 (+ 1) % Row 7 \SetRowColor{white} List of Unique Values & df.{\emph{column}}.{\bf{unique()}} \tn % Row Count 13 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{{\bf{Dont't forget reset\_index()}} at the end of a {\bf{groupby}} operation} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Calculating Aggregate Functions}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{\# Group By}} \newline grouped = df.{\bf{groupby({[}'col1', 'col2'{]})}}.col3 \newline .{\emph{measurement()}}.{\bf{reset\_index()}} \newline {\emph{\# -\textgreater{} group by column1 and column2, calculate values of column3}} \newline \newline {\bf{\# Percentile}} \newline high\_earners = \seqsplit{df.groupby('category').wage} \newline .{\bf{apply(lambda}} x: np.percentile(x, 75){\bf{)}} \newline .{\bf{reset\_index()}} \newline {\emph{\# np.percentile can calculate any percentile over an array of values}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{{\bf{Don't forget reset.index()}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Pivot Tables}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{orders = \seqsplit{pd.read\_csv('orders.csv')} \newline \newline shoe\_counts = orders. \newline groupby({[}'shoe\_type', 'shoe\_color'{]}). \newline id.count().reset\_index() \newline \newline shoe\_counts\_pivot = shoe\_counts.pivot( \newline {\bf{index}} = 'shoe\_type', \newline {\bf{columns}} = 'shoe\_color', \newline {\bf{values}} = 'id').reset\_index()} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{We have to build a temporary table where we group by the columns we want to include in the pivot table} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Merge (Same Column Name)}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{sales = \seqsplit{pd.read\_csv('sales.csv')} \newline targets = \seqsplit{pd.read\_csv('targets.csv')} \newline men\_women = \seqsplit{pd.read\_csv('men\_women\_sales.csv')} \newline \newline {\bf{\# Method 1}} \newline sales\_targets = pd{\bf{.merge}}(sales, targets, {\bf{how=}}" ") \newline {\emph{\# how: "inner"(default), "outer", "left", "right"}} \newline \newline {\bf{\#Method 2 (Method Chaining)}} \newline all\_data = sales{\bf{.merge}}(targets){\bf{.merge}}(men\_women)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Inner Merge (Different Column Name)}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{orders = \seqsplit{pd.read\_csv('orders.csv')} \newline products = \seqsplit{pd.read\_csv('products.csv')} \newline \newline {\bf{\# Method 1: Rename Columns}} \newline orders\_products = pd{\bf{.merge}}(orders, products{\bf{.rename}}(columns=\{'id':'product\_id'\}), {\bf{how=" "}}) \newline {\bf{.reset\_index()}} \newline {\emph{\# how: "inner"(default), "outer", "left", "right"}} \newline \newline {\bf{\# Method 2: }} \newline orders\_products = \newline pd{\bf{.merge}}(orders, products, \newline {\bf{left\_on}}="product\_id", \newline {\bf{right\_on}}="id", \newline {\bf{suffixes}}={[}"\_orders","\_products"{]})} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{Method 2: \newline If we use this syntax, we'll end up with {\bf{two columns called id.}} \newline Pandas won't let you have two columns with the same name, so it will change them to id\_x and id\_y. \newline We can help make them more useful by using the keyword {\bf{suffixes.}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Concatenate}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{bakery = \seqsplit{pd.read\_csv('bakery.csv')} \newline ice\_cream = \seqsplit{pd.read\_csv('ice\_cream.csv')} \newline \newline menu = pd{\bf{.concat}}({[}bakery, ice\_cream{]})} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Melt}} \tn \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{`pandas.melt(DataFrame, id\_vars, value\_vars, var\_name, value\_name='value')` \newline % Row Count 2 (+ 2) {\emph{id\_vars:}} Column(s) to use as identifier variables. \newline % Row Count 4 (+ 2) {\emph{value\_vars:}} Column(s) to unpivot. If not specified, uses all columns that are not set as id\_vars. \newline % Row Count 6 (+ 2) {\emph{var\_name:}} Name to use for the 'variable' column. \newline % Row Count 8 (+ 2) {\emph{value\_name:}} Name to use for the 'value' column.% Row Count 10 (+ 2) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{Assert Statements}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\# Test if country is of type object \newline {\bf{assert gapminder.country.dtypes == np.object}} \newline \newline \# Test if year is of type int64 \newline {\bf{assert gapminder.year.dtypes == np.int64}} \newline \newline \# Test if life\_expectancy is of type float64 \newline {\bf{assert \seqsplit{gapminder.life\_expectancy.dtypes} == np.float64}} \newline \newline \# Assert that country does not contain any missing values \newline {\bf{assert \seqsplit{pd.notnull(gapminder.country)}.all()}} \newline \newline \# Assert that year does not contain any missing values \newline {\bf{assert \seqsplit{pd.notnull(gapminder.year)}.all()}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}