% Cheat-sheet preamble: document class, packages, PDF metadata and page geometry.
% One statement per line, so that a trailing `%` comment can never swallow the
% command that follows it.
\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{wilecoyote}
\pdfinfo{
  /Title (data-analysis.pdf)
  /Creator (Cheatography)
  /Author (wilecoyote)
  /Subject (Data Analysis Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% Table helper macros.
% \SetRowColor{<colour>}: remembers <colour> in \RowColorName and colours the current row with it.
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose background is the remembered row colour.
% x{<width>}: ragged-right paragraph column of the given width.
% \tn: row terminator; needed because the x column type uses \raggedright.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{02A307}
\definecolor{LightBackground}{HTML}{F7FCF7}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author/link on the right.
% NOTE(review): the logo path is absolute; it only resolves on the machine that generated the sheet.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bfseries{\textcolor{DarkBackground}{\textrm{Data Analysis Cheat Sheet}}}}
    \\
    \normalsize{by \textcolor{DarkBackground}{wilecoyote} via \textcolor{DarkBackground}{\uline{cheatography.com/178221/cs/37165/}}}
\end{tabulary}
\end{multicols}}
% Footer: three columns (author, sheet status with page count, sponsor).
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
  \vspace{-2pt}wilecoyote \\
  \uline{cheatography.com/wilecoyote} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 25th February, 2023.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

% ---- Importing data ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Importing data}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{df = \seqsplit{pd.read\_csv('path/filename.csv')}} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{df = \seqsplit{pd.read\_csv('https://example.com/page')}} \tn
% Row Count 2 (+ 1)
% Row 2
% read_excel selects the worksheet with sheet_name=, and the extension is .xlsx
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{df = \seqsplit{pd.read\_excel('path/filename.xlsx'},sheet\_name='sheet1')} \tn
% Row Count 4 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{CSV options: index\_col='A', header=2, parse\_dates={[}'D1', 'D2'{]}, thousands="," \newline Excel options:} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Cleaning data ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Cleaning data}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{df.dropna(inplace=True)} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{df.drop(inplace=True, columns={[}"A","B","C"{]})} \tn
% Row Count 2 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Fixing dates ----
% Columns: date string | Timestamp -> string | string -> Timestamp
\begin{tabularx}{8.4cm}{x{0.912 cm} x{3.344 cm} x{3.344 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{8.4cm}}{\bfseries\textcolor{white}{Fixing dates}} \tn
% Row 0
\SetRowColor{LightBackground}
Date \seqsplit{string} & Timestamp to string & string to Timestamp \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\seqsplit{'Feb-2023'} & \seqsplit{pd.Timestamp('2023-02-25').strftime('\%b-\%Y')} & \seqsplit{pd.to\_datetime('Feb-2023'}, \seqsplit{format='\%b-\%Y')} \tn
% Row Count 6 (+ 3)
% Row 2
% The parsed string must match the \%m-\%Y format shown in this row.
\SetRowColor{LightBackground}
\seqsplit{'02-2023'} & \seqsplit{pd.Timestamp('2023-02-25').strftime('\%m-\%Y')} & \seqsplit{pd.to\_datetime('02-2023'}, \seqsplit{format='\%m-\%Y')} \tn
% Row Count 9 (+ 3)
% Row 3
\SetRowColor{white}
\mymulticolumn{3}{x{8.4cm}}{df{[}'ts'{]} = pd.to\_datetime(df{[}{[}'Year', 'Month'{]}{]}.apply(lambda x: '\{\} \{\} \{\}'.format(x{[}1{]}, 15, int(x{[}0{]})), axis=1))} \tn
% Row Count 12 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{8.4cm}}{df{[}'period'{]} = df{[}'ts'{]}.apply(lambda x: x.to\_period('M'))} \tn
% Row Count 14 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

% ---- New columns with apply ----
% axis= belongs to DataFrame.apply; Series.apply has no axis parameter.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{New columns with apply}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{s1 = s.apply(function) \newline df{[}'B'{]} = df{[}'A'{]}.apply(function, args=()) \newline df{[}'B'{]} = df.apply(function, axis=0|1, args=())} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{axis=0 is index, applies function to each column (e.g.\ sum down columns) \newline axis=1 is columns, applies function to each row (e.g.\ sum across rows) \newline each row or column in DataFrame or value in Series is passed to function \newline args are passed as additional positional parameters to function} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Selecting data ----
% Combining a boolean mask with a column label requires .loc.
\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Selecting data}} \tn
% Row 0
\SetRowColor{LightBackground}
Return series & df{[}'A'{]} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Return DataFrame & df{[}{[}'A'{]}{]} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
Return series of booleans & df{[}'A'{]} \textless{} 20 \tn
% Row Count 4 (+ 2)
% Row 3
\SetRowColor{white}
Return filtered DataFrame & df{[}df{[}'A'{]}\textless{}20{]} \tn
% Row Count 6 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Return filtered Series & df.loc{[}df{[}'A'{]}\textless{}20, 'Column'{]} \tn
% Row Count 8 (+ 2)
% Row 5
\SetRowColor{white}
Return filtered DataFrame & df.loc{[}df{[}'A'{]}\textless{}20, {[}'Column'{]}{]} \tn
% Row Count 10 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Combining data sets ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Combining data sets}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{ds1.merge(ds2, on='field', how='inner')} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{pd.concat({[}ds1,ds2,ds3{]}, join='inner')} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{pd.merge\_ordered(df1,df2)} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{pd.merge\_asof(df1,df2,on='field', direction='nearest')} \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{how/join='inner','outer','left','right' \newline left\_on, right\_on; suffixes=('\_left','\_right') \newline fill\_method='ffill' \newline When forward filling data on multi-index, generally put date last to use the correct fill.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Semi-joins ----
% NOTE(review): this example is a plain inner merge; confirm whether a true
% semi-join (filtering ds1 by keys present in ds2) was intended.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Semi-joins}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{ds1.merge(ds2, on='id', how='inner')} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}