\documentclass[10pt,a4paper]{article}

% Preamble for a Cheatography-style cheat sheet: package loading, PDF
% metadata and page geometry.
%
% Every statement sits on its own physical line. A `%` comment runs to the
% end of the line, so sharing a line with a comment would silently disable
% whatever code follows it.

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % multicols environment (used in header and footer)
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % tabulary environment (used in header and footer)
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{email2automate}
\pdfinfo{
  /Title (pandas-for-data-analysis.pdf)
  /Creator (Cheatography)
  /Author (email2automate)
  /Subject (Pandas for Data Analysis Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: store <colour> globally in \RowColorName and paint
% the current table row with it. Must be the first thing in a row, because it
% expands to \noalign.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<span>}{<colspec>}{<text>}: a \multicolumn cell whose
% background is the colour last stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{088395}
\definecolor{LightBackground}{HTML}{EFF7F8}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author/URL line next to it.
% No blank lines inside the argument: a blank line would insert \par.
% NOTE(review): the logo is referenced by an absolute path from the
% generating server; it presumably has to be adjusted to build elsewhere.
\fancyhead[L]{
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
      \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
  \end{tabulary}
  \columnbreak
  \begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\textbf{\textcolor{DarkBackground}{\textrm{Pandas for Data Analysis Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{email2automate} via \textcolor{DarkBackground}{\uline{cheatography.com/184557/cs/38509/}}}
  \end{tabulary}
  \end{multicols}}

% Footer: three columns (author, sheet status with page counter, sponsor).
\fancyfoot[L]{
  \footnotesize
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\textbf{\textcolor{white}{Cheatographer}}} \\
    \vspace{-2pt}email2automate \\
    \uline{cheatography.com/email2automate} \\
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Cheat Sheet}}} \\
    \vspace{-2pt}Not Yet Published.\\
    Updated 3rd May, 2023.\\
    Page {\thepage} of \pageref{LastPage}.
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Sponsor}}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
  \end{tabulary}
  \end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

% Each section below is one single-column tabularx: a dark title row followed
% by one or more content rows. Inside a table there must be no blank lines,
% and every "% Row Count" marker must end its physical line, otherwise the
% comment swallows the rest of the cell (including its closing brace).

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Shared Attributes}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{.head(n) \newline .tail(n) \newline .index \newline .values \newline .shape \newline .axes \newline .info()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{Common attributes shared between pd.Series and pd.DataFrame} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% nunique() accepts dropna; unique() exists only on Series and takes no
% arguments, so the dropna variants below are spelled with nunique().
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Null Unique Values}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{\textbf{Drop Null Values} \newline % Row Count 1 (+ 1)
`df.dropna(how= any/all, inplace = T/F)` \newline % Row Count 2 (+ 1)
`df.dropna(subset = {[}'col\_name1', 'col\_name2'{]}, how = any/all, inplace = T/F)` - from specific columns \newline % Row Count 5 (+ 3)
\textbf{Fill Null Values} \newline % Row Count 6 (+ 1)
`df.fillna(value, inplace = T/F)` - acts on entire data frame. \newline % Row Count 8 (+ 2)
`df{[}'col\_name'{]}.fillna(value = 'fill\_value', inplace = T/F)` - act on selected columns \newline % Row Count 10 (+ 2)
\textbf{Unique Values} \newline % Row Count 11 (+ 1)
`df{[}'col'{]}.unique()` - Returns unique values as a list(including null values) \newline % Row Count 13 (+ 2)
`df.nunique()` - No of unique Values.(by default doesn't count null) \newline % Row Count 15 (+ 2)
`df.nunique(dropna = False)` \newline % Row Count 16 (+ 1)
`df{[}'col'{]}.nunique()` \newline % Row Count 17 (+ 1)
`df{[}'col'{]}.nunique(dropna = True/False)` \newline % Row Count 18 (+ 1)
Pandas Series \newline % Row Count 19 (+ 1)
`pd.Series.hasnans` \newline % Row Count 20 (+ 1)
`pds.nunique()` - No of unique Values. \newline % Row Count 21 (+ 1)
`pds.unique()` - Returns unique values in a list.% Row Count 23 (+ 2)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% DataFrame.insert is a method call, hence parentheses rather than brackets.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Dealing Columns}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{`df.columns` -returns column names as a list \newline % Row Count 1 (+ 1)
`df{[}'new\_col'{]}=list/series` \newline % Row Count 2 (+ 1)
`df.insert(loc = n, column= col\_name, value= New\_value/array)` -inserts column at desired position / location% Row Count 5 (+ 3)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% pandas spells these options in lower case: na_position takes 'first' or
% 'last', and the value_counts keyword is `normalize`.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Sort, Rank, Count}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{\textbf{Sort} \newline % Row Count 1 (+ 1)
\seqsplit{`df.sort\_index(ascending} = True/False)` - Sort Index \newline % Row Count 3 (+ 2)
\seqsplit{`df.sort\_values(col\_name}, na\_position = "first/last", ascending = True/False)` - Sort based on values \newline % Row Count 6 (+ 3)
`df.sort\_values({[}'col\_name', 'col2'{]}, ascending = {[}True, False{]})` - Sort by Multiple Columns \newline % Row Count 8 (+ 2)
\textbf{Rank} \newline % Row Count 9 (+ 1)
`df{[}'col'{]}.rank(ascending = True/False)` - Ranks are assigned based on sorted values \newline % Row Count 11 (+ 2)
\textbf{counts} \newline % Row Count 12 (+ 1)
`df.value\_counts()` - counts exact rows \newline % Row Count 13 (+ 1)
\seqsplit{`df.value\_counts(normalize} = True)` \newline % Row Count 14 (+ 1)
`pds.value\_counts()` - pd series% Row Count 15 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Data Type Conversions and Optimization}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{`df{[}'col'{]} = pd.to\_datetime(df{[}'col'{]})` \newline % Row Count 1 (+ 1)
or we can parse dates in import itself. \newline % Row Count 2 (+ 1)
`pd.read\_csv(......, parse\_dates = {[}'col1','col2'{]})` \newline % Row Count 4 (+ 2)
`df{[}'col'{]} = df{[}'col'{]}.astype(dtype)` \newline % Row Count 5 (+ 1)
`dtype = bool, category, int`% Row Count 6 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Column labels are quoted on both sides; `keep` takes 'first', 'last' or False.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Filtering Data}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{\textbf{Multiple Conditional Filtering} \newline % Row Count 1 (+ 1)
`mask1 = df{[}'col1'{]} == 'value'` - Returns True/False boolean series \newline % Row Count 3 (+ 2)
`mask2 = df{[}'col2'{]} \textless{}= 'value'` \newline % Row Count 4 (+ 1)
`mask3 = df{[}'col3'{]} \textgreater{}= 'value'` \newline % Row Count 5 (+ 1)
`df{[}(mask1 \& mask2) | mask3{]}` \newline % Row Count 6 (+ 1)
\textbf{Inclusion Check} \newline % Row Count 7 (+ 1)
`mask = df{[}'col'{]}.isin({[}'val1','val2','val3'{]})` - Check for inclusion using isin() method. \newline % Row Count 9 (+ 2)
`mask1 = df{[}'col1'{]} == 'val1'` - isin method is equal to three conditional checks. \newline % Row Count 11 (+ 2)
`mask1 = df{[}'col1'{]} == 'val2'` \newline % Row Count 12 (+ 1)
`mask1 = df{[}'col1'{]} == 'val3'` \newline % Row Count 13 (+ 1)
\textbf{For NULL values} \newline % Row Count 14 (+ 1)
`mask = df{[}'col'{]}.isnull()` - Returns True/False boolean series \newline % Row Count 16 (+ 2)
`mask = df{[}'col'{]}.notnull()` - Returns True/False boolean series \newline % Row Count 18 (+ 2)
\textbf{Inclusion Check within a range} \newline % Row Count 19 (+ 1)
`mask = df{[}'col'{]}.between(val1, val2)` -Returns True/False boolean series. True for values within range. \newline % Row Count 22 (+ 3)
\textbf{Duplicate Values} \newline % Row Count 23 (+ 1)
`mask = df{[}'col'{]}.duplicated(keep = "first/last/False")` - Returns boolean series, True for Duplicates \newline % Row Count 26 (+ 3)
`df.drop\_duplicates()` -Deletes duplicate from df. if applied on df complete row should be identical. \newline % Row Count 29 (+ 3)
\seqsplit{`df.drop\_duplicates(subset} = {[}'col1','col2'{]})` - Drops if the combination of col1 and col2 are identical.% Row Count 32 (+ 3)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Data Extraction}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{SET INDEX \newline % Row Count 1 (+ 1)
RESET INDEX \newline % Row Count 2 (+ 1)
LOC ACCESSOR \newline % Row Count 3 (+ 1)
ILOC ACCESSOR \newline % Row Count 4 (+ 1)
`CODE`% Row Count 5 (+ 1)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\end{document}