\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{KarimAchaibou (KarimAchaibou)} \pdfinfo{ /Title (pandas.pdf) /Creator (Cheatography) /Author (KarimAchaibou (KarimAchaibou)) /Subject (Pandas Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{499AA3} \definecolor{LightBackground}{HTML}{F3F8F9} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Pandas Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{KarimAchaibou (KarimAchaibou)} via \textcolor{DarkBackground}{\uline{cheatography.com/121484/cs/22373/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}KarimAchaibou (KarimAchaibou) \\ \uline{cheatography.com/karimachaibou} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 3rd May, 2020.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{2} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Lists {[} {]}}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{A list is an ordered an mutable (you can change it) Python container} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{creating a list: {[} {]}} \tn % Row Count 3 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`numbers = {[}1,2,3,4{]}` \{\{nl\}\} `cities = {[}"Bruges", "Rome"{]}` \{\{nl\}\} or mix of different types as wel as duplicated elements} \tn % Row Count 6 (+ 3) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{list() constructor:} \tn % Row Count 7 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{of a string: `list("Karim")` -{}-\textgreater{} {[}"K","a","r","i","m"{]} \{\{nl\}\} of tuple: `list(("Bruges", "Rome"))` -{}-\textgreater{} {[}"Bruges", "Rome"{]} \{\{nl\}\} of a dictionary `list(\{"hydrogen":1,"helium":2\})` -{}-\textgreater{} {[}hydrogen","helium"{]} \{\{nl\}\} of a set `list(\{"Bruges", "Rome"\})` -{}-\textgreater{} {[}"Bruges", "Rome"{]} \{\{nl\}\} of a numpy array `list(np.array({[}1,2,3{]}))` -{}-\textgreater{} {[}1,2,3{]}} \tn % Row Count 14 (+ 7) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{accessing:} \tn % Row Count 15 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{starting idex = 0; last element = -1 \{\{nl\}\} `cities{[}0{]}` -{}-\textgreater{} {[}"Bruges"{]} \{\{nl\}\} `cities{[}-2{]}`-{}-\textgreater{} {[}"Bruges"{]}} \tn % Row Count 18 (+ 3) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{accessing multiple elements} \tn % Row Count 19 (+ 1) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{[}start:stop:step{]} \{\{nl\}\}~{\bf{start index is inclusive}} \{\{nl\}\}~{\bf{end index is exclusive}}\{\{nl\}\} ~default value for step is 1; other values can be omitted = include all} \tn % Row Count 23 (+ 4) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{modifying items}}} \tn % Row Count 24 (+ 1) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{replace second item: `cities{[}1{]} ="Gent"` \{\{nl\}\} replace first two items: `cities{[}:2{]} = {[}"Paris","London"{]}`} \tn % Row Count 27 (+ 3) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Removing elements (del, pop, remove)} \tn % Row Count 28 (+ 1) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{del{[} {]} keyword -{}-\textgreater{} delete first element: `del cities{[}0{]}`} \tn % Row Count 30 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Lists {[} {]} (cont)}} \tn % Row 13 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{list.pop(x) methode: removes the item at the given index, and returns it -{}-\textgreater{} `removed\_cities = cities.pop(1)`} \tn % Row Count 3 (+ 3) % Row 14 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{list.remove(x) methode: deletes the first matching element from a the list, and returns None -{}-\textgreater{} \seqsplit{`cities.remove("Bruges")`}} \tn % Row Count 6 (+ 3) % Row 15 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Inserting elements}}} \tn % Row Count 7 (+ 1) % Row 16 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{list.insert(i,x) -{}-\textgreater{} insert an element x (numbers, booleans, lists) at index i and returns none \{\{nl\}\}} \tn % Row Count 10 (+ 3) % Row 17 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{list.append(x) -{}-\textgreater{} adds an item to the end of the list - equivalent to list.insert(len(list),x)} \tn % Row Count 12 (+ 2) % Row 18 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Sorting}}} \tn % Row Count 13 (+ 1) % Row 19 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{function: sorted(iterable{[}, key{]}{[}, reverse{]}) -{}-\textgreater{} returns a sorted list =\textgreater{} add to variable} \tn % Row Count 15 (+ 2) % Row 20 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{methode: list.sort(key=…, reverse=) -{}-\textgreater{} sorts the list in-place} \tn % Row Count 17 (+ 2) % Row 21 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{arguments: \{\{nl\}\}~ - reverse : default = False = ascending\{\{nl\}\}~ - key: sort a list based on the value returned by the function (def or lambda) provided in the key parameter} \tn % Row Count 21 (+ 4) % Row 22 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Reversing}}} \tn % Row Count 22 (+ 1) % Row 23 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{function: reversed(seq) -{}-\textgreater{} to get a list use the list() constructor ex.: products\_reversed = list(reversed(products))} \tn % Row Count 25 (+ 3) % Row 24 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{methode: list.reverse() -{}-\textgreater{} reverses the list in-place returning {\bf{None}}} \tn % Row Count 27 (+ 2) % Row 25 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Concatenate list}}} \tn % Row Count 28 (+ 1) % Row 26 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{+ operator} \tn % Row Count 29 (+ 1) % Row 27 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{list.extend(iterable) -{}-\textgreater{} extends the list by appending all the items from the iterable} \tn % Row Count 31 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Lists {[} {]} (cont)}} \tn % Row 28 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Check if an element exists in a list}}} \tn % Row Count 1 (+ 1) % Row 29 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{~ {\bf{in}} → Evaluates to True if the object on the left side is included in the object on the right side. \{\{nl\}\} ~ {\bf{not in}} → Evaluates to True if the object on the left side is not included in the object on the right side.} \tn % Row Count 6 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{basics}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{import the package} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`import pandas as pd`} \tn % Row Count 2 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{check version} \tn % Row Count 3 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`pd.\_\_version\_\_`} \tn % Row Count 4 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{show all rows of dataframe (None = diplay all rows or fill in a number instead)} \tn % Row Count 6 (+ 2) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{\seqsplit{`pd.set\_option('display.max\_rows'}, None)`} \tn % Row Count 7 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\{\{ac\}\}\{\{link="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.set\_option.html"\}\}pandas.DataFrame.set\_option\{\{/popup\}\}} \tn % Row Count 10 (+ 3) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Copy data from clipboard} \tn % Row Count 11 (+ 1) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df = pd.read\_clipboard()`} \tn % Row Count 12 (+ 1) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{import data from csv-file ( .. = up one level)} \tn % Row Count 13 (+ 1) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df = \seqsplit{pd.read\_csv("path/file.csv")`}} \tn % Row Count 14 (+ 1) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Copy a data frame} \tn % Row Count 15 (+ 1) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df\_copy = df.copy()`} \tn % Row Count 16 (+ 1) % Row 13 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{head() - show first 5 rows (default) or X rows} \tn % Row Count 17 (+ 1) % Row 14 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.head()` or `df.head(10)`} \tn % Row Count 18 (+ 1) % Row 15 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{tail() - show last 5 rows (default) or X rows} \tn % Row Count 19 (+ 1) % Row 16 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.tail()` or `df.tail(10)`} \tn % Row Count 20 (+ 1) % Row 17 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{info() - This method prints information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage} \tn % Row Count 23 (+ 3) % Row 18 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`pd.info()`} \tn % Row Count 24 (+ 1) % Row 19 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{describe() - Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values.} \tn % Row Count 28 (+ 4) % Row 20 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`pd.describe()` chain `.round(2)` to clean up the table} \tn % Row Count 30 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{basics (cont)}} \tn % Row 21 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\{\{ac\}\}\{\{link="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html"\}\}pandas.DataFrame.describe\{\{/link\}\}} \tn % Row Count 3 (+ 3) % Row 22 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{column names} \tn % Row Count 4 (+ 1) % Row 23 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.columns`} \tn % Row Count 5 (+ 1) % Row 24 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{size of the dataframe} \tn % Row Count 6 (+ 1) % Row 25 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.shape`} \tn % Row Count 7 (+ 1) % Row 26 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Quantile() (like describe() but you can define your own values). Default axis = 0 =\textgreater{} row-wise} \tn % Row Count 9 (+ 2) % Row 27 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.quantile({[}0.1,0.4,0.7, 0.8, 0.9{]})`} \tn % Row Count 10 (+ 1) % Row 28 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Mean, Standard Deviation, Variance, Count, Median, Min, and Max on column level} \tn % Row Count 12 (+ 2) % Row 29 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df{[}"column name"{]}.mean()` or other function, native or self-made} \tn % Row Count 14 (+ 2) % Row 30 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{renaming columns} \tn % Row Count 15 (+ 1) % Row 31 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.rename(columns=\{'oldName1': 'newName1', 'oldName2': 'newName2'\}, inplace=True)`} \tn % Row Count 17 (+ 2) % Row 32 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Using the argument, {\bf{inplace}} = True =\textgreater{} save dataframe into itself. If we don't state {\bf{inplace}} = True you need to add result to a new or same dataframe with the "=" operator} \tn % Row Count 21 (+ 4) % Row 33 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{reorder columns - pass a list as a list and index} \tn % Row Count 22 (+ 1) % Row 34 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{order we want: \{\{nl\}\} `cols = {[}'col\_name\_4', 'col\_name\_2','col\_name\_3', 'col\_name\_1'{]}` \{\{nl\}\} overwrite the old dataframe with the same dataframe but new column order: \{\{nl\}\}`df= df{[}cols{]}`} \tn % Row Count 26 (+ 4) % Row 35 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{adding new columns} \tn % Row Count 27 (+ 1) % Row 36 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df{[}"new\_column\_name"{]} = ...` \{\{nl\}\} ... = {[}list{]} or a function applied to an other column or ...} \tn % Row Count 29 (+ 2) % Row 37 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Count unique rows} \tn % Row Count 30 (+ 1) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{basics (cont)}} \tn % Row 38 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`len(df{[}'column\_name'{]}.unique()` or `df{[}'column\_name'{]}.nunique} \tn % Row Count 2 (+ 2) % Row 39 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Get count of (unique) values for a particular column} \tn % Row Count 4 (+ 2) % Row 40 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.{\emph{column\_name}}.value\_counts()`} \tn % Row Count 5 (+ 1) % Row 41 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{transform dataframe to list} \tn % Row Count 6 (+ 1) % Row 42 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{chain with `.tolist()` -{}-\textgreater{} `df.columns.tolist()`} \tn % Row Count 7 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{making a dataframe}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{format: df = pd.DataFrame({\emph{data,index values,column names}})} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Creating df from list:} \tn % Row Count 3 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`lst = {[}'This', 'is', 'a', 'nice', 'cheat', 'sheet'{]}` \{\{nl\}\} `df1 = pd.DataFrame(lst)`} \tn % Row Count 5 (+ 2) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Creating df form dict:} \tn % Row Count 6 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`dict= \{"First Column Name": {[}"First value", "Second value",...{]}, "Second Column Name": {[}"First value", "Second value",...{]}, ... \} \{\{nl\}\} df2 = pd.DataFrame (dict)`} \tn % Row Count 10 (+ 4) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{another example:} \tn % Row Count 11 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df3 = \seqsplit{pd.DataFrame(np.random.randn(6}, 4), index = list('abcdef'), columns = list('ABCD'))`} \tn % Row Count 13 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Index}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\{\{link="https://towardsdatascience.com/pandas-index-explained-b131beaf6f7b"\}\}link: Pandas Index Explained\{\{/link\}\}} \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{get index values (strings) - rows ("0", "1", "2" , ...)} \tn % Row Count 5 (+ 2) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.index` \{\{nl\}\} \textgreater{}\textgreater{}\textgreater{} RangeIndex(start=0, stop=32561, step=1)} \tn % Row Count 7 (+ 2) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{get column index values} \tn % Row Count 8 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.columns`} \tn % Row Count 9 (+ 1) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{naming index (rows)} \tn % Row Count 10 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.index.name = "{\emph{name\_of\_choice}}"`} \tn % Row Count 11 (+ 1) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{reset index} \tn % Row Count 12 (+ 1) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df\_new = df.reset\_index()` (the df has already been sliced otherwise the old and new index will be the same)} \tn % Row Count 15 (+ 3) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Resetting the index will make it a column and recreate another default index. \{\{nl\}\} Parameters: \{\{nl\}\} ~ drop = True (default = False) paramater won't create that as column in the dataframe. \{\{nl\}\} ~ inplace = True (default = False)} \tn % Row Count 20 (+ 5) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{crosstabs has also index values} \tn % Row Count 21 (+ 1) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`cross = \seqsplit{pd.crosstab(df\_new.col\_name\_1},df\_new.col\_name\_2)` \{\{nl\}\} `cross.index` \{\{nl\}\} \textgreater{}\textgreater{}\textgreater{}`Index({[}'value\_1\_of\_col\_1', 'value\_2\_of\_col\_1', ...{]}, dtype='object', name='col\_name\_1')`\{\{nl\}\}individual items can be accessed like: \{\{nl\}\} `cross.loc{[}"value\_?\_of\_col\_1"{]}`} \tn % Row Count 27 (+ 6) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{using the old index (index befor reseting) to access initial dataframe} \tn % Row Count 29 (+ 2) % Row 13 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df{[}"col\_name"{]}{[}new\_df.index\_old{]}` -{}-\textgreater{} index\_old (see before name\_of\_choice where we give our index a name)} \tn % Row Count 32 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Index (cont)}} \tn % Row 14 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Filtering a complementary set from the data} \tn % Row Count 1 (+ 1) % Row 15 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df\_new =df{[}\textasciitilde{}df.index.isin(df\_sub.index){]}` -{}-\textgreater{} tilde sign \textasciitilde{} :negate data (True becomes False ...)} \tn % Row Count 3 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{df3}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/karimachaibou_1586612899_df3.png}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Selecting}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{slicing = getting and setting of subsets of the data set (3 ways)} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{.loc is primarily label based \{\{nl\}\} .iloc is primarily integer position based \{\{nl\}\}.loc, .iloc, and also {[} {]} indexing can accept a callable as indexer} \tn % Row Count 6 (+ 4) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.loc{[}row\_indexer,column\_indexer{]}` -{}-\textgreater{} : is the null slice} \tn % Row Count 8 (+ 2) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{selecting column(s):} \tn % Row Count 9 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df{[}'colname'{]} or through a list of columns df{[}{[}'colname1', 'colname2'{]}{]}`\{\{nl\}\} or directly as an attribute \{\{nl\}\}`df.colname`} \tn % Row Count 12 (+ 3) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{swapping columns} \tn % Row Count 13 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df{[}{[}'B', 'A'{]}{]} = df{[}{[}'A', 'B'{]}{]}`} \tn % Row Count 14 (+ 1) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{swapping column values on a subset ({\bf{you have to swap the raw data !}})} \tn % Row Count 16 (+ 2) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{df.loc{[}:, {[}'B', 'A'{]}{]} = df{[}{[}'A', 'B'{]}{]}.to\_numpy()} \tn % Row Count 17 (+ 1) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{create new column A with value 0 -{}-\textgreater{} length of df} \tn % Row Count 18 (+ 1) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df{[}'A'{]} = \seqsplit{list(range(len(df.index)))`}} \tn % Row Count 19 (+ 1) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{slicing using the {\bf{{[} {]}}} operator -{}-\textgreater{} {[} {]} slices the rows} \tn % Row Count 21 (+ 2) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{[}start:end:step{]} -{}-\textgreater{}{[}2:5{]} -{}-\textgreater{} starts at row 3 (row 2 not included); stops at row 5 (included); default step = 1 \{\{nl\}\} If step is negatief = start from the last element} \tn % Row Count 25 (+ 4) % Row 13 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{.loc - Selection by label (labels can NOT be integer values)} \tn % Row Count 27 (+ 2) % Row 14 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.loc{[}'index\_label\_x':'index\_label\_y''{]}` -{}-\textgreater{} index\_labels are row labels. \{\{nl\}\} When slicing with .loc {\bf{both the start bound AND the stop bound are included}}} \tn % Row Count 31 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Selecting (cont)}} \tn % Row 15 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Select all rows starting from row d, select all columns A to C} \tn % Row Count 2 (+ 2) % Row 16 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df3.loc{[}'d':, 'A':'C'{]}` -{}-\textgreater{} red square} \tn % Row Count 3 (+ 1) % Row 17 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{getting values with a boolean array} \tn % Row Count 4 (+ 1) % Row 18 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df3.loc{[}:, df3.loc{[}'a'{]} \textgreater{} 0{]}` -{}-\textgreater{} all rows and columns where row a \textgreater{}0 -{}-\textgreater{} green square} \tn % Row Count 6 (+ 2) % Row 19 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{select numeric columns (column names)} \tn % Row Count 7 (+ 1) % Row 20 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df\_numeric = df.select\_dtypes({\bf{include}} = {[}np.number{]})` \{\{nl\}\} `numeric\_cols = \seqsplit{df\_numeric.columns.values`}} \tn % Row Count 10 (+ 3) % Row 21 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{select non numeric columns (column names)} \tn % Row Count 11 (+ 1) % Row 22 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`df\_non\_numeric = df.select\_dtypes({\bf{exclude}}={[}np.number{]})`\{\{nl\}\} `non\_numeric\_cols = \seqsplit{df\_non\_numeric.columns.values`}} \tn % Row Count 14 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{setup environment}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{import pandas as pd \newline import numpy as np \newline import seaborn as sns \newline \newline import matplotlib.pyplot as plt \newline import matplotlib.mlab as mlab \newline import matplotlib \newline plt.style.use("ggplot") \newline from matplotlib.pyplot import figure \newline \newline \%matplotlib inline \newline matplotlib.rcParams{[}"figure.figsize"{]} = (12,8) \newline \newline pd.options.mode.chained\_assignment = None} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{dropping and filling}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{drop columns} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{1) focus on columns to keep (add columns to a new dateframe)} \tn % Row Count 3 (+ 2) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df\_new = df{[}{[}'col\_1\_to\_keep', 'col\_2\_to\_keep', ... {]}{]}`} \tn % Row Count 5 (+ 2) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{2) focus on columns to drop} \tn % Row Count 6 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df\_new = df.drop({[}'col\_1\_to\_drop','col\_2\_to\_drop','col\_3\_to\_drop', ...{]}, axis=1)`} \tn % Row Count 8 (+ 2) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{} \tn % Row Count 8 (+ 0) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{} \tn % Row Count 8 (+ 0) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{fill NaN with some value x} \tn % Row Count 9 (+ 1) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{`df.fillna(x)`} \tn % Row Count 10 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{datetime}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Import statement} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`from datetime import datetime` -{}-\textgreater{} python's default library for handling date and time} \tn % Row Count 3 (+ 2) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Creating datetime object} \tn % Row Count 4 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`datetime(year=2020, month=4, day=11)`\{\{nl\}\} \textgreater{}\textgreater{}\textgreater{} datetime.datetime(2020, 4, 11, 0, 0) \{\{nl\}\} {\bf{arguments}}: \seqsplit{year;month;day;hour;minute;second;millisecond}} \tn % Row Count 8 (+ 4) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Now()} \tn % Row Count 9 (+ 1) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`current\_time = datetime.now()`} \tn % Row Count 10 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Converting: string to datetime object} \tn % Row Count 11 (+ 1) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{\seqsplit{`datetime.strptime("11-04-2020}, 20:58:15", "\%d-\%m-\%Y, \%H:\%M:\%S")`\{\{nl\}\} \textgreater{}\textgreater{}\textgreater{}datetime.datetime(2020, 4, 11, 20, 58, 15)} \tn % Row Count 14 (+ 3) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\{\{link="http://strftime.org"\}\}formating arguments\{\{/link\}\}} \tn % Row Count 16 (+ 2) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Converting: datetime to string object} \tn % Row Count 17 (+ 1) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\seqsplit{`datetime.strftime(datetime(year=2020}, month=4, day=11), "\%d/\%m/\%Y")` \{\{nl\}\} \textgreater{}\textgreater{}\textgreater{} '11/04/2020'} \tn % Row Count 19 (+ 2) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Data range in Pandas} \tn % Row Count 20 (+ 1) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{\seqsplit{`pd.date\_range(start=datetime(year=2020}, month=4, day=11),periods=3,freq='D')`\{\{nl\}\} \textgreater{}\textgreater{}\textgreater{}DatetimeIndex({[}'2020-04-11', '2020-04-12', '2020-04-13'{]}, dtype='datetime64{[}ns{]}', freq='D')} \tn % Row Count 24 (+ 4) % Row 13 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{\{\{ac\}\}\{\{link="https://pandas.pydata.org/pandas-docs/stable/user\_guide/timeseries.html\#timeseries-offset-aliases"\}\}frequency aliases\{\{/link\}\}} \tn % Row Count 27 (+ 3) % Row 14 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{start argument can also be like: '2020-04-11' or '2020/04/11' or '2020, may 11'} \tn % Row Count 29 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{apply function}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{under construction} \tn % Row Count 1 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{missing data}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{heatmap} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`cols = df.columns{[}:30{]}` -{}-\textgreater{} select first 30 columns (names) \{\{nl\}\} `colours = {[}"blue","yellow"{]}` -{}-\textgreater{} missing data will be displayed as yellow lines in the heatmap \{\{nl\}\} `sns.heatmap(df{[}cols{]}.isnull(),cmap=sns.color\_palette(colours))` -{}-\textgreater{} isnull() returns a df with true's and false's. If value is NA then isnull() returns true = 1} \tn % Row Count 8 (+ 7) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{data percentage list} \tn % Row Count 9 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{`missing = \{\}`\{\{nl\}\} `for col in df.columns:`\{\{nl\}\} ~ `pct\_missing = np.mean(df{[}col{]}.isnull())` \{\{nl\}\} ~ `missing{[}col{]} = round(pct\_missing*100)` \{\{nl\}\} `missing\_sorted = \{key: value for key, value in sorted(missing.items(), key=lambda item: item{[}1{]},reverse=True)\}`} \tn % Row Count 15 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}