\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Ianh} \pdfinfo{ /Title (python-pandas-top-25.pdf) /Creator (Cheatography) /Author (Ianh) /Subject (Python Pandas top 25 Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{3CA340} \definecolor{LightBackground}{HTML}{F2F9F3} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Python Pandas top 25 Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Ianh} via \textcolor{DarkBackground}{\uline{cheatography.com/85121/cs/20026/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}Ianh \\ \uline{cheatography.com/ianh} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 12th July, 2019.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{2} \begin{tabularx}{8.4cm}{x{3.52 cm} x{4.48 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{1. Show installed versions}} \tn % Row 0 \SetRowColor{LightBackground} pd.\_\_version\_\_ & Show Python version \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \seqsplit{pd.show\_versions()} & Show dependency version \tn % Row Count 3 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{2. Create an example DataFrame}} \tn % Row 0 \SetRowColor{LightBackground} df = pd.DataFrame(\{'col one':{[}100, 200{]}, 'col two':{[}300, 400{]}\}) & Pass a dictionary to the DataFrame constructor, keys are the column names and the dictionary are the values \tn % Row Count 6 (+ 6) % Row 1 \SetRowColor{white} \seqsplit{pd.DataFrame(np.random.rand(4}, 8)) & Use the rand function to create a larger data frame \tn % Row Count 9 (+ 3) % Row 2 \SetRowColor{LightBackground} \seqsplit{pd.DataFrame(np.random.rand(4}, 8), \seqsplit{columns=list('abcdefgh'))} & If you want non-numeric column names \tn % Row Count 12 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{3. Rename columns}} \tn % Row 0 \SetRowColor{LightBackground} df = df.rename(\{'col one':'col\_one', 'col two':'col\_two'\}, axis='columns') & Pass a dictionary; keys are the old names and the values are the new names, and you also specify the axis \tn % Row Count 6 (+ 6) % Row 1 \SetRowColor{white} df.columns = {[}'col\_one', 'col\_two'{]} & if you're going to rename all of the columns at once, a simpler method is just to overwrite the columns attribute of the DataFrame \tn % Row Count 13 (+ 7) % Row 2 \SetRowColor{LightBackground} df.columns = \seqsplit{df.columns.str.replace('} ', '\_') & To replace spaces with underscores, use the str.replace() method \tn % Row Count 17 (+ 4) % Row 3 \SetRowColor{white} df.add\_prefix('X\_') & Add a prefix \tn % Row Count 18 (+ 1) % Row 4 \SetRowColor{LightBackground} df.add\_suffix('\_Y') & Add a suffix \tn % Row Count 19 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{3.92 cm} x{4.08 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{4. Reverse row order}} \tn % Row 0 \SetRowColor{LightBackground} drinks.loc{[}::-1{]} & The most straightforward method is to use the loc accessor \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} drinks.loc{[}::-1{]}.reset\_index(drop=True) & Reset the index; use reset\_index() to drop the old index entirely \tn % Row Count 7 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{2.56 cm} x{5.44 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{5. Reverse column order}} \tn % Row 0 \SetRowColor{LightBackground} drinks.loc{[}:, ::-1{]} & Use loc to reverse the left-to-right order of your columns \tn % Row Count 3 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{6. Select columns by data type}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{drinks.select\_dtypes(include='number')} & To select only the numeric columns \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} drinks.select\_dtypes(include={[}'number', 'object', 'category', 'datetime'{]}) & To include multiple data types by passing a list \tn % Row Count 6 (+ 4) % Row 2 \SetRowColor{LightBackground} \seqsplit{drinks.select\_dtypes(exclude='number')} & To exclude certain data types \tn % Row Count 8 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{7. Convert strings to numbers}} \tn % Row 0 \SetRowColor{LightBackground} df.astype(\{'col\_one':'float', 'col\_two':'float'\}).dtypes & o convert the data types to numeric. You can use the astype() method \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} \seqsplit{pd.to\_numeric(df.col\_three}, \seqsplit{errors='coerce').fillna(0)} & use the to\_numeric() function on. If you know that the NaN values actually represent zeros, you can fill them with zeros using the fillna() method the third column and tell it to convert any invalid input into NaN values \tn % Row Count 16 (+ 12) % Row 2 \SetRowColor{LightBackground} df = \seqsplit{df.apply(pd.to\_numeric}, \seqsplit{errors='coerce').fillna(0)} & To apply this function to the entire DataFrame all at once by using the apply() method \tn % Row Count 21 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{8. Reduce DataFrame size}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{drinks.info(memory\_usage='deep')} & Review the memory usage of a dataframe \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} cols = {[}'beer\_servings', 'continent'{]} small\_drinks = \seqsplit{pd.read\_csv('http://bit.ly/drinksbycountry'}, usecols=cols) & The first step is to only read in the columns that you actually need, which we specify with the "usecols" parameter \tn % Row Count 8 (+ 6) % Row 2 \SetRowColor{LightBackground} dtypes = \{'continent':'category'\} smaller\_drinks = \seqsplit{pd.read\_csv('http://bit.ly/drinksbycountry'}, usecols=cols, dtype=dtypes) & The second step is to convert any object columns containing categorical data to the category data type, which we specify with the "dtype" parameter \tn % Row Count 16 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{9. Build a DataFrame from multiple files (row)}} \tn % Row 0 \SetRowColor{LightBackground} from glob import glob & Use the glog module \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} stock\_files = \seqsplit{sorted(glob('data/stocks*}.csv')) & In this case, glob is looking in the "data" subdirectory for all CSV files that start with the word "stocks"; {[}'data/stocks1.csv', 'data/stocks2.csv', 'data/stocks3.csv'{]} \tn % Row Count 11 (+ 9) % Row 2 \SetRowColor{LightBackground} \seqsplit{pd.concat((pd.read\_csv(file)} for file in stock\_files)) & use a generator expression to read each of the files using read\_csv() and pass the results to the concat() function, which will concatenate the rows into a single DataFrame \tn % Row Count 20 (+ 9) % Row 3 \SetRowColor{white} \seqsplit{pd.concat((pd.read\_csv(file)} for file in stock\_files), ignore\_index=True) & There are now duplicate values in the index. To avoid that, we can tell the concat() function to ignore the index and instead use the default integer index \tn % Row Count 28 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{10. Build a DF from multiple files (columns)ΒΆ}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{2}{x{8.4cm}}{drink\_files = \seqsplit{sorted(glob('data/drinks*}.csv'))} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \seqsplit{pd.concat((pd.read\_csv(file)} for file in drink\_files), axis='columns') & Tell the concat() function to concatenate along the columns axis \tn % Row Count 5 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{3.04 cm} x{4.96 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{11. Create a DataFrame from the clipboard}} \tn % Row 0 \SetRowColor{LightBackground} df = \seqsplit{pd.read\_clipboard()} & Just select the data and copy it to the clipboard. Then, you can use the read\_clipboard() function to read it into a DataFrame \tn % Row Count 6 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{22. Create a pivot table}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{titanic.pivot\_table(index='Sex'}, columns='Pclass', values='Survived', aggfunc='mean') & If you often create DataFrames, you might find it more convenient to use the pivot\_table() method instead. With a pivot table, you directly specify the index, the columns, the values, and the aggregation function. \tn % Row Count 11 (+ 11) % Row 1 \SetRowColor{white} \seqsplit{titanic.pivot\_table(index='Sex'}, columns='Pclass', values='Survived', aggfunc='mean', margins=True) & An added benefit of a pivot table is that you can easily add row and column totals by setting margins=True \tn % Row Count 17 (+ 6) % Row 2 \SetRowColor{LightBackground} \seqsplit{titanic.pivot\_table(index='Sex'}, columns='Pclass', values='Survived', aggfunc='count', margins=True) & Create a cross-tabulation just by changing the aggregation function from "mean" to "count" \tn % Row Count 22 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{23. Convert continuous data into categorical data}} \tn % Row 0 \SetRowColor{LightBackground} pd.cut(titanic.Age, bins={[}0, 18, 25, 99{]}, labels={[}'child', 'young adult', 'adult'{]}).head(10) & Label the age ranges, such as "child", "young adult", and "adult". The best way to do this is by using the cut() function. This assigned each value to a bin with a label. Ages 0 to 18 were assigned the label "child", ages 18 to 25 were assigned the label "young adult", and ages 25 to 99 were assigned the label "adult". \tn % Row Count 17 (+ 17) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{24. Change display options}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{pd.set\_option('display.float\_format'}, '\{:.2f\}'.format) & To standardise the display to use 2 decimal places \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} \seqsplit{pd.reset\_option('display.float\_format')} & Reset any option back to its default \tn % Row Count 5 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{25. Style a DataFrame}} \tn % Row 0 \SetRowColor{LightBackground} format\_dict = \{'Date':'\{:\%m/\%d/\%y\}', 'Close':'\$\{:.2f\}', 'Volume':'\{:,\}'\} & Create a dictionary of format strings that specifies how each column should be formatted \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} \seqsplit{stocks.style.format(format\_dict)} & Pass it to the DataFrame's style.format() method \tn % Row Count 8 (+ 3) % Row 2 \SetRowColor{LightBackground} \seqsplit{(stocks.style.format(format\_dict).hide\_index().highlight\_min('Close'}, \seqsplit{color='red').highlight\_max('Close'}, color='lightgreen') ) & We've now hidden the index, highlighted the minimum Close value in red, and highlighted the maximum Close value in green \tn % Row Count 15 (+ 7) % Row 3 \SetRowColor{white} \seqsplit{(stocks.style.format(format\_dict).hide\_index().background\_gradient(subset='Volume'}, cmap='Blues') ) & Highlight the minimum Close value in red, and highlighted the maximum Close value in green \tn % Row Count 20 (+ 5) % Row 4 \SetRowColor{LightBackground} \seqsplit{(stocks.style.format(format\_dict)} .hide\_index() .bar('Volume', color='lightblue', align='zero') \seqsplit{.set\_caption('Stock} Prices from October 2016') ) & There's now a bar chart within the Volume column and a caption above the DataFrame. \tn % Row Count 28 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{21. Reshape a MultiIndexed Series}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{titanic.groupby('Sex').Survived.mean()} & If you wanted to calculate the survival rate by a single category such as "Sex", you would use a groupby() \tn % Row Count 6 (+ 6) % Row 1 \SetRowColor{white} titanic.groupby({[}'Sex', 'Pclass'{]}).Survived.mean() & If you wanted to calculate the survival rate across two different categories at once, you would groupby() both of those categories \tn % Row Count 13 (+ 7) % Row 2 \SetRowColor{LightBackground} titanic.groupby({[}'Sex', 'Pclass'{]}).Survived.mean().unstack() & It can be hard to read and interact with data in this format, so it's often more convenient to reshape a MultiIndexed Series into a DataFrame by using the unstack() method \tn % Row Count 22 (+ 9) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{20. Select a slice of rows and columns}} \tn % Row 0 \SetRowColor{LightBackground} titanic.describe() & If you wanted a numerical summary of the dataset, you would use the describe() method \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} titanic.describe().loc{[}'min':'max'{]} & If you wanted to filter it to only show the "five-number summary", you can use the loc accessor and pass it a slice of the "min" through the "max" row labels \tn % Row Count 13 (+ 8) % Row 2 \SetRowColor{LightBackground} titanic.describe().loc{[}'min':'max', 'Pclass':'Parch'{]} & And if you're not interested in all of the columns, you can also pass it a slice of column labels \tn % Row Count 18 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{18. Aggregate by multiple functions}} \tn % Row 0 \SetRowColor{LightBackground} orders{[}orders.order\_id == 1{]}.item\_price.sum() & Each order has an order\_id and consists of one or more rows. To figure out the total price of an order, you sum the item\_price for that order\_id. For example, here's the total price of order number 1 \tn % Row Count 10 (+ 10) % Row 1 \SetRowColor{white} \seqsplit{orders.groupby('order\_id').item\_price.sum()} & If you wanted to calculate the total price of every order, you would groupby() order\_id and then take the sum of item\_price for each group \tn % Row Count 17 (+ 7) % Row 2 \SetRowColor{LightBackground} \seqsplit{orders.groupby('order\_id').item\_price}.agg({[}'sum', 'count'{]}) & However, you're not actually limited to aggregating by a single function such as sum(). To aggregate by multiple functions, you use the agg() method and pass it a list of functions such as sum() and count() \tn % Row Count 28 (+ 11) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{19. Combine the output of an aggregation with a DF}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{orders.groupby('order\_id').item\_price.sum()} & What if we wanted to create a new column listing the total price of each order? Recall that we calculated the total price using the sum() method \tn % Row Count 8 (+ 8) % Row 1 \SetRowColor{white} \seqsplit{len(orders.groupby('order\_id').item\_price.sum())} & In other words, the output of the sum() function \tn % Row Count 11 (+ 3) % Row 2 \SetRowColor{LightBackground} \seqsplit{len(orders.item\_price)} & ...is smaller than the input to the function \tn % Row Count 14 (+ 3) % Row 3 \SetRowColor{white} total\_price = \seqsplit{orders.groupby('order\_id').item\_price.transform('sum')} len(total\_price) & The solution is to use the transform() method, which performs the same calculation but returns output data that is the same shape as the input data \tn % Row Count 22 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{8.4cm}}{This needs more work!} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{16. Split a string into multiple columns}} \tn % Row 0 \SetRowColor{LightBackground} df.name.str.split(' ', expand=True) & What if we wanted to split the "name" column into three separate columns, for first, middle, and last name? We would use the str.split() method and tell it to split on a space character and expand the results into a DataFrame \tn % Row Count 12 (+ 12) % Row 1 \SetRowColor{white} df{[}{[}'first', 'middle', 'last'{]}{]} = df.name.str.split(' ', expand=True) df & These three columns can actually be saved to the original DataFrame in a single assignment statement \tn % Row Count 17 (+ 5) % Row 2 \SetRowColor{LightBackground} \seqsplit{df.location.str.split('}, ', expand=True) & What if we wanted to split a string, but only keep one of the resulting columns? For example, let's split the location column on "comma space" \tn % Row Count 25 (+ 8) % Row 3 \SetRowColor{white} df{[}'city'{]} = \seqsplit{df.location.str.split('}, ', expand=True){[}0{]} & If we only cared about saving the city name in column 0, we can just select that column and save it to the DataFrame \tn % Row Count 31 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{13. Filter a DataFrame by multiple categories}} \tn % Row 0 \SetRowColor{LightBackground} movies{[}(movies.genre == 'Action') | (movies.genre == 'Drama') | (movies.genre == 'Western'){]} & If we wanted to filter the DataFrame to only show movies with the genre Action or Drama or Western, we could use multiple conditions separated by the "or" operator \tn % Row Count 9 (+ 9) % Row 1 \SetRowColor{white} movies{[}movies.genre.isin({[}'Action', 'Drama', 'Western'{]}){]} & However, you can actually rewrite this code more clearly by using the isin() method and passing it a list of genres \tn % Row Count 15 (+ 6) % Row 2 \SetRowColor{LightBackground} movies{[}\textasciitilde{}movies.genre.isin({[}'Action', 'Drama', 'Western'{]}){]} & And if you want to reverse this filter, so that you are excluding (rather than including) those three genres, you can put a tilde in front of the condition \tn % Row Count 23 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{14. Filter a DataFrame by largest categories}} \tn % Row 0 \SetRowColor{LightBackground} counts.nlargest(3) & The Series method nlargest() makes it easy to select the 3 largest values in this Series \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} \seqsplit{counts.nlargest(3).index} & And all we actually need from this Series is the index \tn % Row Count 8 (+ 3) % Row 2 \SetRowColor{LightBackground} movies{[}movies.genre.isin(counts.nlargest(3).index){]} & Finally, we can pass the index object to isin(), and it will be treated like a list of genres \tn % Row Count 13 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{15. Handle missing values}} \tn % Row 0 \SetRowColor{LightBackground} ufo.isna().sum() & To find out how many values are missing in each column, you can use the isna() method and then take the sum() \tn % Row Count 6 (+ 6) % Row 1 \SetRowColor{white} ufo.isna().mean() & Similarly, you can find out the percentage of values that are missing by taking the mean() of isna() \tn % Row Count 11 (+ 5) % Row 2 \SetRowColor{LightBackground} \seqsplit{ufo.dropna(axis='columns')} & If you want to drop the columns that have any missing values, you can use the dropna() method \tn % Row Count 16 (+ 5) % Row 3 \SetRowColor{white} \seqsplit{ufo.dropna(thresh=len(ufo)*0.9}, axis='columns') & Or if you want to drop columns in which more than 10\% of the values are missing, you can set a threshold for dropna() \tn % Row Count 22 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}