\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Multi-column layout (multicols environment)
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}    % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}              % Languages

% Document Info
% \author records the author name for LaTeX's title machinery.
\author{RJ Murray (murenei)}
% \pdfinfo (pdfTeX primitive) writes the key/value pairs below into the
% PDF document information dictionary.
% NOTE(review): /Title holds the output file name rather than the
% human-readable title given in /Subject -- confirm this is intended.
\pdfinfo{
  /Title (natural-language-processing-with-python-and-nltk.pdf)
  /Creator (Cheatography)
  /Author (RJ Murray (murenei))
  /Subject (Natural Language Processing with Python \& nltk Cheat Sheet)
}

% Lengths and widths
% Widen the text block by 6cm and shorten it by 1cm, then move the page
% origin 3cm to the left and 2cm up.
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
% Inserted by \seqsplit between characters: a discretionary hyphen in text
% mode, a plain break opportunity in math mode.
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Table helper macros
% \SetRowColor{<colour>}: store <colour> globally in \RowColorName, then
% colour the current table row with it.
\newcommand{\SetRowColor}[1]{%
  \noalign{\gdef\RowColorName{#1}}%
  \rowcolor{\RowColorName}%
}
% \mymulticolumn{<n>}{<colspec>}{<text>}: a \multicolumn cell whose
% background reuses the colour last stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{%
  \multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}%
}
% Column type x{<width>}: ragged-right paragraph column of the given width.
\newcolumntype{x}[1]{>{\raggedright}p{#1}}
% \tn: short alias for \tabularnewline, used to end rows because the
% custom ragged-right column type is in use.
\newcommand{\tn}{\tabularnewline}

% Font and Colours
% HeadBackground: defined here but not referenced in the visible source.
\definecolor{HeadBackground}{HTML}{333333}
% FootBackground: heading rows of the three footer tables.
\definecolor{FootBackground}{HTML}{666666}
% TextColor: default text colour, activated by \color below.
\definecolor{TextColor}{HTML}{333333}
% DarkBackground: table heading rows, header title text and bottom rules.
\definecolor{DarkBackground}{HTML}{257D22}
% LightBackground: every other body row of the tables (alternates with white).
\definecolor{LightBackground}{HTML}{F8FAF8}
% Make the sans-serif family the document default.
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Left header: the Cheatography logo in the first column, followed by the
% sheet title and the author/URL attribution in the next column.
% \large and \normalsize are size declarations (they take no argument), and
% \textbf replaces the obsolete LaTeX 2.09 \bf switch; output is unchanged.
% NOTE(review): the logo is loaded from an absolute server path; building
% elsewhere presumably needs that file supplied or the path adjusted.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{Natural Language Processing with Python \& nltk Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{RJ Murray (murenei)} via \textcolor{DarkBackground}{\uline{cheatography.com/58736/cs/15485/}}
\end{tabulary}
\end{multicols}}

% Left footer: three columns -- author details, publication details with
% the page counter, and a sponsor notice. Each column is a small table
% whose heading row is coloured with FootBackground.
% \textbf replaces the obsolete LaTeX 2.09 \bf switch; output is unchanged.
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\textbf{\textcolor{white}{Cheatographer}}}  \\
  \vspace{-2pt}RJ Murray (murenei) \\
  \uline{cheatography.com/murenei} \\
  \uline{\seqsplit{tutify}.com.au}
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Cheat Sheet}}}  \\
   \vspace{-2pt}Published 28th May, 2018.\\
   Updated 29th May, 2018.\\
   Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Sponsor}}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}


\begin{document}
% Left-align body text and do not stretch multicol columns to equal height.
\raggedright
\raggedcolumns

% Base font size for the sheet body. Replace \footnotesize with any other
% LaTeX size command from this page to resize the cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Body text size.

\begin{multicols*}{2}

% Table: Handling Text -- two columns (snippet, description), a coloured
% heading row, alternating row colours and a coloured bottom rule.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{2.96 cm} x{5.04 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Handling Text}}}  \tn
% Row 0
\SetRowColor{LightBackground}
`text='Some words'` & assign string \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
`list(text)` & Split text into character tokens \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
`set(text)` & Unique tokens \tn
% Row Count 5 (+ 1)
% Row 3
\SetRowColor{white}
`len(text)` & Number of characters \tn
% Row Count 6 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Accessing corpora and lexical resources -- two columns (snippet,
% description), a coloured heading row and alternating row colours.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{3.52 cm} x{4.48 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Accessing corpora and lexical resources}}}  \tn
% Row 0
\SetRowColor{LightBackground}
`from nltk.corpus import brown` & import CorpusReader object \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{`brown.words(text\_id)`} & Returns pretokenised document as list of words \tn
% Row Count 5 (+ 3)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{`brown.fileids()`} & Lists docs in Brown corpus \tn
% Row Count 7 (+ 2)
% Row 3
\SetRowColor{white}
\seqsplit{`brown.categories()`} & Lists categories in Brown corpus \tn
% Row Count 9 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Tokenization -- two columns (snippet, description).
% Row 1 names the NLTK function nltk.word_tokenize; the previously printed
% "word_tokenizer" is not an NLTK function.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{3.68 cm} x{4.32 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Tokenization}}}  \tn
% Row 0
\SetRowColor{LightBackground}
`text.split(" ")` & Split by space \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\seqsplit{`nltk.word\_tokenize(text)`} & nltk in-built word tokenizer \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{`nltk.sent\_tokenize(doc)`} & nltk in-built sentence tokenizer \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Lemmatization & Stemming -- two columns (snippet, description).
% Row 2 instantiates the stemmer with "()", so that the porter.stem(t) call
% in row 3 operates on an instance, matching the WordNetLemmatizer() row.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{4.64 cm} x{3.36 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Lemmatization \& Stemming}}}  \tn
% Row 0
\SetRowColor{LightBackground}
`input="List listed lists listing listings"` & Different suffixes \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{`words=input.lower().split('} ')` & Normalize (lowercase) words \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{`porter=nltk.PorterStemmer()`} & Initialise Stemmer \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
`{[}porter.stem(t) for t in words{]}` & Create list of stems \tn
% Row Count 8 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\seqsplit{`WNL=nltk.WordNetLemmatizer()`} & Initialise WordNet lemmatizer \tn
% Row Count 10 (+ 2)
% Row 5
\SetRowColor{white}
`{[}WNL.lemmatize(t) for t in words{]}` & Use the lemmatizer \tn
% Row Count 12 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Part of Speech (POS) Tagging -- two columns (snippet, description).
% Row 2 has an empty snippet cell and carries the author's placeholder note.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{3.44 cm} x{4.56 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Part of Speech (POS) Tagging}}}  \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{`nltk.help.upenn\_tagset('MD')`} & Lookup definition for a POS tag \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{`nltk.pos\_tag(words)`} & nltk in-built POS tagger \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
 & \textless{}use an alternative tagger to illustrate ambiguity\textgreater{} \tn
% Row Count 7 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Sentence Parsing -- two columns (snippet, description); rows 3-5
% are full-width snippets without a description.
% Row 4 uses the print(...) call form so it agrees with the other print
% snippets on this sheet and is valid in Python 3.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{4.24 cm} x{3.76 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Sentence Parsing}}}  \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{`g=nltk.data.load('grammar.cfg')`} & Load a grammar from a file \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{`g=nltk.CFG.fromstring("""...""")`} & Manually define grammar \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{`parser=nltk.ChartParser(g)`} & Create a parser out of the grammar \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{\seqsplit{`trees=parser.parse\_all(text)`}} \tn
% Row Count 7 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{`for tree in trees: ... print(tree)`} \tn
% Row Count 8 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{`from nltk.corpus import treebank`} \tn
% Row Count 9 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\seqsplit{`treebank.parsed\_sents('wsj\_0001.mrg')`} & Treebank parsed sentences \tn
% Row Count 11 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Text Classification -- two columns (snippet, description); row 0
% is a full-width import line.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
% NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 in
% favour of get_feature_names_out(); the snippet is left as published --
% confirm which library version the sheet should target.
\begin{tabularx}{8.4cm}{x{4.4 cm} x{3.6 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Text Classification}}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{`from \seqsplit{sklearn.feature\_extraction.text} import CountVectorizer, TfidfVectorizer`} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{`vect=CountVectorizer()}.fit(X\_train)` & Fit bag of words model to data \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{`vect.get\_feature\_names()`} & Get features \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
\seqsplit{`vect.transform(X\_train)`} & Convert to doc-term matrix \tn
% Row Count 8 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: Entity Recognition (Chunking/Chinking) -- two columns (snippet,
% description); row 6 is a full-width snippet.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{x{3.84 cm} x{4.16 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\textbf{\textcolor{white}{Entity Recognition (Chunking/Chinking)}}}  \tn
% Row 0
\SetRowColor{LightBackground}
`g="NP: \{\textless{}DT\textgreater{}?\textless{}JJ\textgreater{}*\textless{}NN\textgreater{}\}"` & Regex chunk grammar \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{`cp=nltk.RegexpParser(g)`} & Parse grammar \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\seqsplit{`ch=cp.parse(pos\_sent)`} & Parse tagged sent. using grammar \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
`print(ch)` & Show chunks \tn
% Row Count 7 (+ 1)
% Row 4
\SetRowColor{LightBackground}
`ch.draw()` & Show chunks in IOB tree \tn
% Row Count 9 (+ 2)
% Row 5
\SetRowColor{white}
\seqsplit{`cp.evaluate(test\_sents)`} & Evaluate against test doc \tn
% Row Count 11 (+ 2)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{\seqsplit{`sents=nltk.corpus.treebank.tagged\_sents()`}} \tn
% Row Count 12 (+ 1)
% Row 7
\SetRowColor{white}
\seqsplit{`print(nltk.ne\_chunk(sent))} ` & Print chunk tree \tn
% Row Count 14 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Table: RegEx with Pandas & Named Groups -- a single full-width column of
% snippets with a coloured heading row and alternating row colours.
% The heading uses \textbf instead of the obsolete LaTeX 2.09 \bf switch.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\textbf{\textcolor{white}{RegEx with Pandas \& Named Groups}}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\seqsplit{`df=pd.DataFrame(time\_sents}, columns={[}'text'{]})`} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.split().str.len()`} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.contains('word')`} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.count(r'\textbackslash{}d')`} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.findall(r'\textbackslash{}d')`} \tn
% Row Count 5 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.replace(r'\textbackslash{}w+day\textbackslash{}b', '???')`} \tn
% Row Count 6 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.replace(r'(\textbackslash{}w)', lambda x: x.groups(){[}0{]}{[}:3{]})`} \tn
% Row Count 8 (+ 2)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.extract(r'(\textbackslash{}d?\textbackslash{}d):(\textbackslash{}d\textbackslash{}d)')`} \tn
% Row Count 9 (+ 1)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.extractall(r'((\textbackslash{}d?\textbackslash{}d):(\textbackslash{}d\textbackslash{}d) ?({[}ap{]}m))')`} \tn
% Row Count 11 (+ 2)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{`df{[}'text'{]}.str.extractall(r'(?P\textless{}digits\textgreater{}\textbackslash{}d)')`} \tn
% Row Count 12 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}


% That's all folks
\end{multicols*}

\end{document}