\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}        % For header and footer
\usepackage{multicol}        % Provides the multicols environments
\usepackage{tabularx}        % Intelligent column widths
\usepackage{tabulary}        % Used in header and footer
\usepackage{hhline}          % Border under tables
\usepackage{graphicx}        % For images
\usepackage{xcolor}          % For hex colours
\usepackage[utf8]{inputenc}  % UTF-8 source; standard replacement for the old utf8x
                             % option (already the default on LaTeX 2018 and newer)
\usepackage[T1]{fontenc}     % Without this we get weird character replacements
\usepackage{colortbl}        % For coloured tables
\usepackage{setspace}        % For line height
\usepackage{lastpage}        % Needed for total page number
\usepackage{seqsplit}        % Splits long words.
%\usepackage{opensans}       % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}  % For underlining links

% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}         % Symbols
\usepackage{MnSymbol}        % Symbols
\usepackage{wasysym}         % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
% \pdfinfo is a pdfTeX primitive; balanced parentheses inside the
% PDF strings below do not need escaping.
\author{San (Bhartik)}
\pdfinfo{
  /Title (dataframe-python.pdf)
  /Creator (Cheatography)
  /Author (San (Bhartik))
  /Subject (DataFrame Python Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: remember the colour name globally (so that
% \mymulticolumn can reuse it) and colour the current table row with it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose cell is filled
% with the colour most recently set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % Ragged-right paragraph column of the given width
\newcommand{\tn}{\tabularnewline} % Row terminator; needed because the x column type uses \raggedright
% \codesnippet{<code>}: typeset a code fragment in the monospaced font.
% Use \newline inside the argument to start a new line of code.
\newcommand{\codesnippet}[1]{{\ttfamily #1}}

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{55A377}
\definecolor{LightBackground}{HTML}{F4F9F6}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    % NOTE(review): the logo is referenced by an absolute path; the file
    % must exist at exactly this location for the document to compile.
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}{\large\bfseries\textcolor{DarkBackground}{\textrm{DataFrame Python Cheat Sheet}}} \\
    {\normalsize by \textcolor{DarkBackground}{San (Bhartik)} via \textcolor{DarkBackground}{\uline{cheatography.com/153407/cs/33011/}}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
  \vspace{-2pt}San (Bhartik) \\
  \uline{cheatography.com/bhartik} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 4th July, 2022.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

% NOTE: do not insert blank lines inside the tables below; a paragraph
% break in front of \SetRowColor (which uses \noalign) is an error.

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{DATAFRAME}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Definition} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{The pandas module in Python provides a two-dimensional data structure with labeled rows and columns, similar to an Excel sheet or a table in a relational database.
This data structure in pandas is called DataFrame.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{ADVANTAGES}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} DataFrame is the most flexible constructor and allows its creation from many sources as discussed above.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} DataFrame, with its row/column names, index selection and slicing methods, provides a very flexible way to access and manipulate data.} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} Presence of indexes allows for search, filter and merge, which are very fast operations in a DataFrame.} \tn
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} Pandas is built on top of NumPy, so it allows for very efficient matrix and vectorized operations on the data stored in a DataFrame.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{DISADVANTAGES}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} The flexibility provided for access comes at the cost of a somewhat steeper learning curve for its users.
The different ways to access data can be a bit overwhelming and intimidating for new users initially.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} A pandas DataFrame can only handle data that fits in memory.
Additionally, DataFrame indexes and structure use additional memory to take care of the bookkeeping needed to maintain such a flexible data structure.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% This table is continued in the next column, hence no closing \hhline.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{BEST PRACTICES TO USE DATAFRAME}} \tn
% Row 0
% (A verbatim duplicate of this bullet used to follow it; it has been dropped.)
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} Many new programmers coming from other programming languages try to use an iterator to loop over data inside a DataFrame.
Iterator (``iter'') functions are expensive operations, so they should only be used if every other option has been exhausted.
Built-in functions provided by pandas/NumPy have been highly optimized, and in many cases vectorized, and thus are many times faster than a simple for-loop.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} Create a proper index: the choice of index can have a significant impact on performance while fetching and operating on data stored in a DataFrame.
As an example, it's best to change the data type to date or datetime for time-series data.
This choice of index allows for much more flexible and faster slicing of the data needed to perform time-series analysis on the stored data.} \tn
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} Use the data type most suitable for the data in each column of the DataFrame.
As an example, after creating a DataFrame from a CSV file, some of the columns could be of type ``O'' (pandas object).
It is best practice to change the column type to float or integer, depending upon the data stored.
This should allow more efficient operations to be performed on the data.} \tn
\end{tabularx}
\par\addvspace{1.3em}

\vfill
\columnbreak
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{BEST PRACTICES TO USE DATAFRAME (cont)}} \tn
% Row 3
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} As is well known, ``code is read much more often than it is written'', so care should be taken to write code that is descriptive, uses self-explanatory variable names and is properly documented.
Readability plays a key role in the understanding of code by others as well as in the long-term maintainability of the project as it grows in scope and codebase.} \tn
% Row 4
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbullet{} One of the main components of ease of readability and maintainability of code is following naming and formatting conventions.
This applies equally to column/row headers and index names stored in the DataFrame, to make access to the data meaningful.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{p{0.4577 cm} p{0.4577 cm} p{0.4577 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Creation}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{5.377cm}}{A DataFrame can be created using the following constructor, with the flexibility to provide data, index names, column names and column data type:} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{3}{x{5.377cm}}{\codesnippet{pd.DataFrame( data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None, )}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Create DataFrame from list}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{One of the quickest ways to generate a DataFrame in real life is to load data directly from an SQL database, CSV or Excel file.
A DataFrame can be created in different ways from various data sources.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\codesnippet{import pandas as pd\newline
\# list of strings\newline
list1 = {[}'One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven'{]}\newline
\#list of numbers\newline
list2 = {[}1,2,3,4,5,6,7{]}\newline
\# Calling DataFrame constructor on lists\newline
df = pd.DataFrame(\{'Words':list1, 'Number':list2\})\newline
print(df)}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{Shallow copy}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\codesnippet{\#Access shallow/reference copy which generates "SettingWithCopyWarning" warning\newline
df1 = df\newline
df1{[}'Words'{]}{[}0{]} = 'One only'}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Deep Copy}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Data contained in the DataFrame can be modified by slicing and using various selection methods.} \tn
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\codesnippet{\#Access data and manipulate using "deep" copy instead of shallow view change\newline
df1 = df{[}{[}'Words'{]}{]}.copy()\newline
df1.iloc{[}0{]} = 'One only'\newline
df1}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{Select Column}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\codesnippet{\#Subset: select one of the columns\newline
df{[}'Words'{]}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{Select row with row index}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\codesnippet{\#Select row with row index\newline
df.iloc{[}1{]}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{p{0.4977 cm} p{0.4977 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{Select row with .loc{[}{]}}} \tn
% Row 0
% .loc selects by index label (as opposed to .iloc, which selects by position).
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\codesnippet{\#Select row with row label\newline
df.loc{[}1{]}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}