\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Daryabi} \pdfinfo{ /Title (pandas-groupby.pdf) /Creator (Cheatography) /Author (Daryabi) /Subject (pandas groupby Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A3A3A3} \definecolor{LightBackground}{HTML}{F3F3F3} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{pandas groupby Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Daryabi} via \textcolor{DarkBackground}{\uline{cheatography.com/181709/cs/37835/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}Daryabi \\ \uline{cheatography.com/daryabi} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 21st March, 2023.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{2} \begin{tabularx}{8.4cm}{x{2 cm} x{6 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Pandas objects}} \tn % Row 0 \SetRowColor{LightBackground} Series & a one-dimensional labeled array that can hold any data type \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} DataFrame & a two-dimensional labeled data structure with columns of potentially different types \tn % Row Count 5 (+ 3) % Row 2 \SetRowColor{LightBackground} Index & a sequence of axis labels that can be used to identify rows or columns in a DataFrame \tn % Row Count 8 (+ 3) % Row 3 \SetRowColor{white} \seqsplit{MultiIndex} & a hierarchical index object that allows for more than one index level in a DataFrame. \tn % Row Count 11 (+ 3) % Row 4 \SetRowColor{LightBackground} Timestamp & a specific moment in time, represented in Pandas as a datetime object. \tn % Row Count 14 (+ 3) % Row 5 \SetRowColor{white} Period & a specific interval of time, represented in Pandas as a period object. \tn % Row Count 17 (+ 3) % Row 6 \SetRowColor{LightBackground} \seqsplit{DatetimeIndex} & an index of datetime objects, used to index Pandas objects like Series and DataFrame. \tn % Row Count 20 (+ 3) % Row 7 \SetRowColor{white} Timedelta & a duration of time, represented in Pandas as a timedelta object. \tn % Row Count 23 (+ 3) % Row 8 \SetRowColor{LightBackground} \seqsplit{Categorical} & a data type used to represent categorical variables, \tn % Row Count 25 (+ 2) % Row 9 \SetRowColor{white} Sparse & a data structure used to represent sparse data efficiently, \tn % Row Count 27 (+ 2) % Row 10 \SetRowColor{LightBackground} Interval & a data type used to represent intervals \tn % Row Count 29 (+ 2) % Row 11 \SetRowColor{white} \seqsplit{DatetimeTZ} & a datetime object with a timezone. \tn % Row Count 31 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{2.4 cm} x{5.6 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Pandas functions called on pd}} \tn % Row 0 \SetRowColor{LightBackground} \seqsplit{pd.read\_csv()} & used to read data from a CSV file and create a DataFrame. \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} \seqsplit{pd.DataFrame()} & a constructor function that is used to create a new DataFrame from data in memory. \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} \seqsplit{pd.Series():} & used to create a new Series object. \tn % Row Count 8 (+ 2) % Row 3 \SetRowColor{white} pd.concat() & used to concatenate two or more DataFrames or Series \tn % Row Count 10 (+ 2) % Row 4 \SetRowColor{LightBackground} pd.merge(): & used to merge two DataFrames based on a common column. \tn % Row Count 12 (+ 2) % Row 5 \SetRowColor{white} \seqsplit{pd.groupby()} & used to group data in a DataFrame based on one or more columns. \tn % Row Count 15 (+ 3) % Row 6 \SetRowColor{LightBackground} \seqsplit{pd.pivot\_table():} & used to create a pivot table from a DataFrame. \tn % Row Count 17 (+ 2) % Row 7 \SetRowColor{white} \seqsplit{pd.to\_datetime():} & used to convert a column of strings to datetime objects. \tn % Row Count 19 (+ 2) % Row 8 \SetRowColor{LightBackground} \seqsplit{pd.to\_numeric()} & used to convert a column of strings to numeric objects. \tn % Row Count 21 (+ 2) % Row 9 \SetRowColor{white} pd.cut() & This function is used to bin data into discrete intervals. \tn % Row Count 24 (+ 3) % Row 10 \SetRowColor{LightBackground} pd.qcut(): & This function is used to bin data into quantiles. \tn % Row Count 26 (+ 2) % Row 11 \SetRowColor{white} \seqsplit{pd.date\_range()} & used to create a range of dates or DatetimeIndex. \tn % Row Count 28 (+ 2) % Row 12 \SetRowColor{LightBackground} \seqsplit{pd.timedelta()} & used to create a timedelta object representing a duration of time. \tn % Row Count 31 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{3.04 cm} x{4.96 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{String vectorization}} \tn % Row 0 \SetRowColor{LightBackground} str.replace(): & used to replace a pattern in a string with another pattern \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} str.extract() & used to extract a pattern from a string and return the first match. \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} str.split() & used to split a string into a list of strings based on a delimiter. \tn % Row Count 9 (+ 3) % Row 3 \SetRowColor{white} str.join() & used to concatenate a list of strings with a separator string \tn % Row Count 12 (+ 3) % Row 4 \SetRowColor{LightBackground} \seqsplit{str.startswith()} & used to check if a string starts with a particular substring \tn % Row Count 15 (+ 3) % Row 5 \SetRowColor{white} str.endswith() & used to check if a string ends with a particular substring \tn % Row Count 18 (+ 3) % Row 6 \SetRowColor{LightBackground} str.lower(), str.upper(): & used to convert the case of a string to lowercase or uppercase, respectively \tn % Row Count 22 (+ 4) % Row 7 \SetRowColor{white} str.strip(): & used to remove leading and trailing whitespace from a string \tn % Row Count 25 (+ 3) % Row 8 \SetRowColor{LightBackground} str.contains() & used to check if a pattern exists in a string \tn % Row Count 27 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{3.84 cm} x{4.16 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Date Vectorization}} \tn % Row 0 \SetRowColor{LightBackground} pd.to\_datetime(): & used to convert a column of strings or Unix timestamps to a datetime object. \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} dt.date & used to extract the date component of a datetime object or DatetimeIndex. \tn % Row Count 8 (+ 4) % Row 2 \SetRowColor{LightBackground} dt.time & extract the time component of a datetime object or DatetimeIndex. \tn % Row Count 12 (+ 4) % Row 3 \SetRowColor{white} dt.hour, dt.minute, dt.second & used to extract the hour, minute, and second components of a datetime object or DatetimeIndex. \tn % Row Count 17 (+ 5) % Row 4 \SetRowColor{LightBackground} dt.dayofweek: & used to extract the day of the week as an integer, \tn % Row Count 20 (+ 3) % Row 5 \SetRowColor{white} dt.day\_name() & used to extract the name of the day of the week \tn % Row Count 23 (+ 3) % Row 6 \SetRowColor{LightBackground} dt.month\_name() & used to extract the name of the month \tn % Row Count 25 (+ 2) % Row 7 \SetRowColor{white} dt.tz\_localize() and dt.tz\_convert(): & used to set or convert the timezone of a datetime object or DatetimeIndex. \tn % Row Count 29 (+ 4) % Row 8 \SetRowColor{LightBackground} \seqsplit{dt.is\_month\_start} and dt.is\_month\_end & used to check if a datetime object or DatetimeIndex is at the start or end of a month, respectively. \tn % Row Count 34 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{pandas groupby}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Consider it as DataFrame; you can use all DataFrame attributes and functions on the group object.% Row Count 2 (+ 2) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{1.84 cm} x{6.16 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Pandas pivot\_table}} \tn % Row 0 \SetRowColor{LightBackground} data & The DataFrame or Series to be used for the pivot table. \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} values & The column to aggregate. If not specified, all numerical columns will be used. \tn % Row Count 5 (+ 3) % Row 2 \SetRowColor{LightBackground} index & The column(s) to use as row labels. \tn % Row Count 7 (+ 2) % Row 3 \SetRowColor{white} columns & The column(s) to use as column labels. \tn % Row Count 9 (+ 2) % Row 4 \SetRowColor{LightBackground} aggfunc & The function to use for aggregating the values. Defaults to 'mean'. \tn % Row Count 12 (+ 3) % Row 5 \SetRowColor{white} margins & Add all row/columns (e.g. 'All') label and compute grand total for that row/column. \tn % Row Count 15 (+ 3) % Row 6 \SetRowColor{LightBackground} \seqsplit{margins\_name} & The label for the row/column that contains the grand total. Defaults to 'All'. \tn % Row Count 18 (+ 3) % Row 7 \SetRowColor{white} dropna & Whether or not to exclude rows with missing values. Defaults to True. \tn % Row Count 21 (+ 3) % Row 8 \SetRowColor{LightBackground} sort & Whether or not to sort the rows by the values of the index. \tn % Row Count 23 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Pandas multiIndex}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\# Create a MultiIndex from two separate indexes \newline index1 = pd.Index({[}'A', 'B', 'C'{]}, name='Index1') \newline index2 = pd.Index({[}'X', 'Y', 'Z'{]}, name='Index2') \newline multi\_index = pd.MultiIndex.from\_product({[}index1, index2{]}, names={[}'Index1', 'Index2'{]}) \newline \newline \# Create a MultiIndex from a list of tuples \newline tuples = {[}('A', 'X'), ('A', 'Y'), ('B', 'X'), ('B', 'Y'){]} \newline multi\_index = \seqsplit{pd.MultiIndex.from\_tuples(tuples}, names={[}'Index1', 'Index2'{]}) \newline \newline \# Select data from a single level of the index \newline df.loc{[}'A'{]} \# selects all rows where Index1 = 'A' \newline df.loc{[}('A', 'X'){]} \# selects a single row where Index1 = 'A' and Index2 = 'X' \newline \newline \# Select data from multiple levels of the index \newline \# selects rows where Index1 is 'A' or 'B' and Index2 is 'X', and returns the 'Column1' values \newline df.loc{[}({[}'A', 'B'{]}, 'X'), 'Column1'{]} \newline \newline \# Stack a DataFrame to move a level of the MultiIndex to the columns \newline df.stack() \newline \newline \# Unstack a DataFrame to move a level of the columns to the index \newline df.unstack()} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}