\documentclass[10pt,a4paper]{article}
% pandas cheat sheet (Cheatography template): a three-column page of small
% coloured tables with a custom header and footer drawn by fancyhdr.
%
% Source layout rule: keep one statement per line. Every `%' comment runs to
% the end of its physical line, so joining lines silently disables whatever
% follows the comment. Do not put blank lines inside the tabular environments
% or inside the \fancyhead/\fancyfoot arguments.

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
% NOTE(review): the table bodies below contain Chinese text, but no CJK-capable
% package or font setup is loaded in this preamble -- confirm how the build is
% expected to typeset it.
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}  % Languages

% Document Info
\author{solider245}
\pdfinfo{
  /Title (pandas.pdf)
  /Creator (Cheatography)
  /Author (solider245)
  /Subject (pandas Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: remember the colour name globally and colour the row.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn that reuses the colour
% stored by the most recent \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{1A10A3}
\definecolor{LightBackground}{HTML}{F0F0F9}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author next to it.
% NOTE(review): the logo is loaded from an absolute path; presumably it only
% resolves on the original build host -- confirm before building elsewhere.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bfseries{\textcolor{DarkBackground}{\textrm{pandas Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{solider245} via \textcolor{DarkBackground}{\uline{cheatography.com/83835/cs/19845/}}}
\end{tabulary}
\end{multicols}}

% Footer: three small tables (author, publication info, sponsor).
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}solider245 \\
  \uline{cheatography.com/solider245} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Published 14th June, 2019.\\
  Updated 14th June, 2019.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

% ---- Table: installing pandas ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{pandas安装}}  \tn
% Row 0
\SetRowColor{LightBackground}
conda install pandas & 获得 pandas \seqsplit{的最佳方式是通过} conda \tn
% Row 1
\SetRowColor{white}
python3 -m pip install -{}-upgrade pandas & 通过 PyPI \tn
% Row 2
\SetRowColor{LightBackground}
pip install \seqsplit{-i https://pypi.tuna.tsinghua.edu.cn/simple gevent} & \seqsplit{使用清华镜像安装} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{\seqsplit{国内推荐使用清华镜像安装}}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: abbreviations used throughout the sheet ----
\begin{tabularx}{5.377cm}{p{0.4977 cm} x{4.4793 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{关键缩写和包导入}}  \tn
% Row 0
\SetRowColor{LightBackground}
df & 任意的Pandas DataFrame对象 \tn
% Row 1
\SetRowColor{white}
s & 任意的Pandas Series对象 \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{缩写}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: import statements ----
\begin{tabularx}{5.377cm}{x{2.23965 cm} x{2.73735 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{引入方法}}  \tn
% Row 0
\SetRowColor{LightBackground}
`import pandas as pd` & \seqsplit{导入pandas并缩写为pd} \tn
% Row 1 (intentionally empty spacer row)
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{} \tn
% Row 2
\SetRowColor{LightBackground}
`import numpy as np` & \seqsplit{导入numpy并缩写为np} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{常规导入}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: selecting data ----
\begin{tabularx}{5.377cm}{x{1.54287 cm} x{3.43413 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{数据选取}}  \tn
% Row 0
\SetRowColor{LightBackground}
df{[}col{]} & \seqsplit{根据列名,并以Series的形式返回列} \tn
% Row 1
\SetRowColor{white}
df{[}{[}col1, col2{]}{]} & \seqsplit{以DataFrame形式返回多列} \tn
% Row 2
\SetRowColor{LightBackground}
s.iloc{[}0{]} & 按位置选取数据 \tn
% Row 3
\SetRowColor{white}
s.loc{[}'index\_one'{]} & 按索引选取数据 \tn
% Row 4
\SetRowColor{LightBackground}
df.iloc{[}0,:{]} & 返回第一行 \tn
% Row 5
\SetRowColor{white}
df.iloc{[}0,0{]} & \seqsplit{返回第一列的第一个元素} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: cleaning data ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{数据清理}}  \tn
% Row 0
\SetRowColor{LightBackground}
df.columns = {[}'a','b','c'{]} & 重命名列名 \tn
% Row 1
\SetRowColor{white}
pd.isnull() & \seqsplit{检查DataFrame对象中的空值,并返回一个Boolean数组} \tn
% Row 2
\SetRowColor{LightBackground}
pd.notnull() & \seqsplit{检查DataFrame对象中的非空值,并返回一个Boolean数组} \tn
% Row 3
\SetRowColor{white}
df.dropna() & \seqsplit{删除所有包含空值的行} \tn
% Row 4
\SetRowColor{LightBackground}
df.dropna(axis=1) & \seqsplit{删除所有包含空值的列} \tn
% Row 5
\SetRowColor{white}
df.dropna(axis=1,thresh=n) & \seqsplit{删除所有小于n个非空值的行} \tn
% Row 6
\SetRowColor{LightBackground}
df.fillna(x) & \seqsplit{用x替换DataFrame对象中所有的空值} \tn
% Row 7
\SetRowColor{white}
s.astype(float) & \seqsplit{将Series中的数据类型更改为float类型} \tn
% Row 8
\SetRowColor{LightBackground}
s.replace(1,'one') & \seqsplit{用'one'代替所有等于1的值} \tn
% Row 9
\SetRowColor{white}
s.replace({[}1,3{]},{[}'one','three'{]}) & \seqsplit{用'one'代替1,用'three'代替3} \tn
% Row 10
\SetRowColor{LightBackground}
\seqsplit{df.rename(columns=lambda} x: x + 1) & 批量更改列名 \tn
% Row 11
\SetRowColor{white}
df.rename(columns=\{'old\_name': 'new\_ name'\}) & \seqsplit{选择性更改列名} \tn
\end{tabularx}
\par\addvspace{1.3em}

% The cleaning table continues at the top of the next column.
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{数据清理 (cont)}} \tn
% Row 12
\SetRowColor{LightBackground}
\seqsplit{df.set\_index('column\_one')} & 更改索引列 \tn
% Row 13
\SetRowColor{white}
\seqsplit{df.rename(index=lambda} x: x + 1) & \seqsplit{批量重命名索引} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: importing data (third column is a method number) ----
\begin{tabularx}{5.377cm}{x{2.15119 cm} x{1.96811 cm} p{0.4577 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{导入数据}}  \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{pd.read\_csv(filename)} & \seqsplit{从CSV文件导入数据} & 1 \tn
% Row 1
\SetRowColor{white}
\seqsplit{pd.read\_table(filename)} & \seqsplit{从限定分隔符的文本文件导入数据} & 2 \tn
% Row 2
\SetRowColor{LightBackground}
\seqsplit{pd.read\_excel(filename)} & \seqsplit{从Excel文件导入数据} & 3 \tn
% Row 3
\SetRowColor{white}
\seqsplit{pd.read\_sql(query}, \seqsplit{connection\_object)} & \seqsplit{从SQL表/库导入数据} & 4 \tn
% Row 4
\SetRowColor{LightBackground}
\seqsplit{pd.read\_json(json\_string)} & \seqsplit{从JSON格式的字符串导入数据} & 5 \tn
% Row 5
\SetRowColor{white}
pd.read\_html(url) & \seqsplit{解析URL、字符串或者HTML文件,抽取其中的tables表格} & 6 \tn
% Row 6
\SetRowColor{LightBackground}
\seqsplit{pd.read\_clipboard()} & \seqsplit{从你的粘贴板获取内容,并传给read\_table()} & 7 \tn
% Row 7
\SetRowColor{white}
\seqsplit{pd.DataFrame(dict)} & \seqsplit{从字典对象导入数据,Key是列名,Value是数据} & 8 \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{5.377cm}}{\seqsplit{网上爬取表格一般使用方法6和方法2}}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: exporting data (first column is a method number) ----
\begin{tabularx}{5.377cm}{p{0.4577 cm} x{1.87657 cm} x{2.24273 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{导出数据}}  \tn
% Row 0
\SetRowColor{LightBackground}
1 & \seqsplit{df.to\_csv(filename)} & \seqsplit{导出数据到CSV文件} \tn
% Row 1
\SetRowColor{white}
2 & \seqsplit{df.to\_excel(filename)} & \seqsplit{导出数据到Excel文件} \tn
% Row 2
\SetRowColor{LightBackground}
3 & \seqsplit{df.to\_sql(table\_name}, \seqsplit{connection\_object)} & \seqsplit{导出数据到SQL表} \tn
% Row 3
\SetRowColor{white}
4 & \seqsplit{df.to\_json(filename)} & \seqsplit{以Json格式导出数据到文本文件} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{5.377cm}}{\seqsplit{一般文件名需要加`''`不知道为什么这里没加}}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: filtering, sorting and grouping ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\seqsplit{数据处理:Filter、Sort和GroupBy}}}  \tn
% Row 0
\SetRowColor{LightBackground}
df{[}df{[}col{]} \textgreater{} 0.5{]} & \seqsplit{选择col列的值大于0}.5的行 \tn
% Row 1
\SetRowColor{white}
\seqsplit{df.sort\_values(col1)} & \seqsplit{按照列col1排序数据,默认升序排列} \tn
% Row 2 (the description names col2 to match the call on the left)
\SetRowColor{LightBackground}
\seqsplit{df.sort\_values(col2}, ascending=False) & \seqsplit{按照列col2降序排列数据} \tn
% Row 3
\SetRowColor{white}
df.sort\_values({[}col1,col2{]}, ascending={[}True,False{]}) & \seqsplit{先按列col1升序排列,后按col2降序排列数据} \tn
% Row 4
\SetRowColor{LightBackground}
df.groupby(col) & \seqsplit{返回一个按列col进行分组的Groupby对象} \tn
% Row 5
\SetRowColor{white}
df.groupby({[}col1,col2{]}) & \seqsplit{返回一个按多列进行分组的Groupby对象} \tn
% Row 6
\SetRowColor{LightBackground}
df.groupby(col1){[}col2{]} & \seqsplit{返回按列col1进行分组后,列col2的均值} \tn
% Row 7
\SetRowColor{white}
\seqsplit{df.pivot\_table(index=col1}, values={[}col2,col3{]}, aggfunc=max) & \seqsplit{创建一个按列col1进行分组,并计算col2和col3的最大值的数据透视表} \tn
% Row 8
\SetRowColor{LightBackground}
\seqsplit{df.groupby(col1).agg(np.mean)} & \seqsplit{返回按列col1分组的所有列的均值} \tn
% Row 9
\SetRowColor{white}
data.apply(np.mean) & \seqsplit{对DataFrame中的每一列应用函数np}.mean \tn
\end{tabularx}
\par\addvspace{1.3em}

% The filter/sort/group table continues at the top of the next column.
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\seqsplit{数据处理:Filter、Sort和GroupBy} (cont)}} \tn
% Row 10
\SetRowColor{LightBackground}
data.apply(np.max,axis=1) & \seqsplit{对DataFrame中的每一行应用函数np}.max \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: creating test objects ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{创建测试对象}}  \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{pd.DataFrame(np.random.rand(20},5)) & \seqsplit{创建20行5列的随机数组成的DataFrame对象} \tn
% Row 1
\SetRowColor{white}
pd.Series(my\_list) & \seqsplit{从可迭代对象my\_list创建一个Series对象} \tn
% Row 2
\SetRowColor{LightBackground}
df.index = \seqsplit{pd.date\_range('1900/1/30'}, periods=df.shape{[}0{]}) & \seqsplit{增加一个日期索引} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: viewing and inspecting data ----
\begin{tabularx}{5.377cm}{x{2.18988 cm} x{2.78712 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{查看、检查数据}}  \tn
% Row 0
\SetRowColor{LightBackground}
df.head(n) & \seqsplit{查看DataFrame对象的前n行} \tn
% Row 1
\SetRowColor{white}
df.tail(n) & \seqsplit{查看DataFrame对象的最后n行} \tn
% Row 2 (shape is an attribute, so it is written without parentheses)
\SetRowColor{LightBackground}
df.shape & 查看行数和列数 \tn
% Row 3
\SetRowColor{white}
df.info() & \seqsplit{查看索引、数据类型和内存信息} \tn
% Row 4
\SetRowColor{LightBackground}
df.describe() & \seqsplit{查看数值型列的汇总统计} \tn
% Row 5
\SetRowColor{white}
\seqsplit{s.value\_counts(dropna=False)} & \seqsplit{查看Series对象的唯一值和计数} \tn
% Row 6
\SetRowColor{LightBackground}
\seqsplit{df.apply(pd.Series.value\_counts)} & \seqsplit{查看DataFrame对象中每一列的唯一值和计数} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: combining data ----
\begin{tabularx}{5.377cm}{x{2.23965 cm} x{2.73735 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{数据合并}}  \tn
% Row 0
\SetRowColor{LightBackground}
df1.append(df2) & \seqsplit{将df2中的行添加到df1的尾部} \tn
% Row 1 (concat is a top-level pandas function, hence pd.concat)
\SetRowColor{white}
pd.concat({[}df1, df2{]},axis=1) & \seqsplit{将df2中的列添加到df1的尾部} \tn
% Row 2
\SetRowColor{LightBackground}
df1.join(df2,on=col1,how='inner') & \seqsplit{对df1的列和df2的列执行SQL形式的join} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: summary statistics ----
\begin{tabularx}{5.377cm}{x{1.24425 cm} x{3.73275 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{数据统计}}  \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{df.describe()} & \seqsplit{查看数据值列的汇总统计} \tn
% Row 1
\SetRowColor{white}
df.mean() & 返回所有列的均值 \tn
% Row 2
\SetRowColor{LightBackground}
df.corr() & \seqsplit{返回列与列之间的相关系数} \tn
% Row 3
\SetRowColor{white}
\seqsplit{df.count()} & \seqsplit{返回每一列中的非空值的个数} \tn
% Row 4
\SetRowColor{LightBackground}
df.max() & \seqsplit{返回每一列的最大值} \tn
% Row 5
\SetRowColor{white}
df.min() & \seqsplit{返回每一列的最小值} \tn
% Row 6
\SetRowColor{LightBackground}
\seqsplit{df.median()} & \seqsplit{返回每一列的中位数} \tn
% Row 7
\SetRowColor{white}
df.std() & \seqsplit{返回每一列的标准差} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}