% Preamble for a two-column cheat sheet: loads the table/colour/header
% packages, records PDF metadata and widens the text block.
\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
% NOTE(review): /Title looks like the published file name, so its spelling
% is left untouched; only the human-readable /Subject is corrected to NLTK.
\author{williamcollins}
\pdfinfo{
  /Title (ntlk-language-processing-python.pdf)
  /Creator (Cheatography)
  /Author (williamcollins)
  /Subject (NLTK Language Processing Python Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor stores the colour name globally in \RowColorName and colours
% the current row; \mymulticolumn reuses \RowColorName for the spanned cell.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{A3A3A3}
\definecolor{LightBackground}{HTML}{F3F3F3}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author/URL line next to it.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bfseries{\textcolor{DarkBackground}{\textrm{NLTK Language Processing Python Cheat Sheet}}}}
    \\
    \normalsize{by \textcolor{DarkBackground}{williamcollins} via \textcolor{DarkBackground}{\uline{cheatography.com/61610/cs/15901/}}}
\end{tabulary}
\end{multicols}}
% Footer: three small tables (author, publication info, sponsor).
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
  \vspace{-2pt}williamcollins \\
  \uline{cheatography.com/williamcollins} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
  \vspace{-2pt}Published 26th May, 2018.\\
  Updated 26th May, 2018.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

% Dashes in the cells are written as ASCII "--" (en-dash ligature) because
% inputenc is not loaded in the preamble.
\begin{multicols*}{2}

\begin{tabularx}{8.4cm}{x{2.16 cm} x{5.84 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Python Import}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{import nltk} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\seqsplit{nltk.download()} & This step will bring up a window in which you can download 'All Corpora' \tn
% Row Count 4 (+ 3)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{from nltk.book import *} \tn
% Row Count 5 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{BASICS}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.28 cm} x{4.72 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Tokens}} \tn
% Row 0
\SetRowColor{LightBackground}
% A Python slice excludes its end index, so 0:100 yields 100 tokens.
text1{[}0:100{]} & -- first 100 tokens (end index is excluded) \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
% Python indexing starts at 0, so index 5 is the sixth token.
text2{[}5{]} & -- sixth token (indexing starts at 0) \tn
% Row Count 2 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Concordance}} \tn
% Row 0
\SetRowColor{LightBackground}
\textgreater{} \seqsplit{text3.concordance('begat')} & - basic keyword-in-context \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\seqsplit{text1.concordance('sea'}, lines=100) & -{}- show other than default 25 lines \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\textgreater{} \seqsplit{text1.concordance('sea'}, lines=100) & - show other than default 25 lines \tn
% Row Count 7 (+ 3)
% Row 3
\SetRowColor{white}
\seqsplit{text1.concordance('sea'}, lines=all) & - show all results \tn
% Row Count 9 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\seqsplit{text1.concordance('sea'}, 10, lines=all) & - change left and right context width to 10 characters and show all results \tn
% Row Count 13 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{common\_contexts}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{text1.common\_contexts({[}'sea','ocean'{]})} \tn
% Row Count 1 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{COUNTING}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.92 cm} x{4.08 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{COUNTING}} \tn
% Row 0
\SetRowColor{LightBackground}
Count a String & len('this is a string of text') -- number of characters \tn
% Row Count 6 (+ 6)
% Row 1
\SetRowColor{white}
Count a list of tokens & len(text1) -- number of tokens \tn
% Row Count 8 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Make and Count a list of unique tokens & len(set(text1)) -- notice that set returns a list of unique tokens \tn
% Row Count 12 (+ 4)
% Row 3
\SetRowColor{white}
Count Occurrences & \seqsplit{text1.count('heaven')} -- how many times does a word occur? \tn
% Row Count 16 (+ 4)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{Frequency} \tn
% Row Count 17 (+ 1)
% Row 5
\SetRowColor{white}
 & fd = \seqsplit{nltk.FreqDist(text1)} -- creates a new data object that contains information about word frequency \tn
% Row Count 23 (+ 6)
% Row 6
\SetRowColor{LightBackground}
 & fd{[}'the'{]} -- how many occurrences of the word 'the' \tn
% Row Count 26 (+ 3)
% Row 7
\SetRowColor{white}
 & fd.keys() -- show the keys in the data object \tn
% Row Count 29 (+ 3)
% Row 8
\SetRowColor{LightBackground}
 & fd.values() -- show the values in the data object \tn
% Row Count 32 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}

\vfill
\columnbreak
\begin{tabularx}{8.4cm}{x{3.92 cm} x{4.08 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{COUNTING (cont)}} \tn
% Row 9
\SetRowColor{LightBackground}
 & fd.items() -- show everything \tn
% Row Count 2 (+ 2)
% Row 10
\SetRowColor{white}
 & fd.keys(){[}0:50{]} -- just show a portion of the info \tn
% Row Count 5 (+ 3)
% Row 11
\SetRowColor{LightBackground}
Frequency Plots & fd.plot(50,cumulative=False) -- generate a chart of the 50 most frequent words \tn
% Row Count 9 (+ 4)
% Row 12
\SetRowColor{white}
Other FreqDist functions & fd.hapaxes() \tn
% Row Count 11 (+ 2)
% Row 13
\SetRowColor{LightBackground}
 & fd.freq('the') \tn
% Row Count 12 (+ 1)
% Row 14
\SetRowColor{white}
Get word lengths & lengths = {[}len(w) for w in text1{]} \tn
% Row Count 14 (+ 2)
% Row 15
\SetRowColor{LightBackground}
And do FreqDist & fd = \seqsplit{nltk.FreqDist(lengths)} \tn
% Row Count 16 (+ 2)
% Row 16
\SetRowColor{white}
FreqDist as Table & fd.tabulate() \tn
% Row Count 17 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{PARTS OF SPEECH CODES}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{CC Coordinating conjunction \newline CD Cardinal number \newline DT Determiner \newline EX Existential there \newline FW Foreign word \newline IN Preposition or subordinating \newline conjunction \newline JJ Adjective \newline JJR Adjective, comparative \newline JJS Adjective, superlative \newline LS List item marker \newline MD Modal \newline NN Noun, singular or mass} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{PARTS OF SPEECH CODES}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{NNS Noun, plural \newline NNP Proper noun, singular \newline NNPS Proper noun, plural \newline PDT Predeterminer \newline POS Possessive ending \newline PRP Personal pronoun \newline PRP\$ Possessive pronoun \newline RB Adverb \newline RBR Adverb, comparative \newline RBS Adverb, superlative \newline RP Particle \newline SYM Symbol \newline TO to} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{PARTS OF SPEECH CODES}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{UH Interjection \newline VB Verb, base form \newline VBD Verb, past tense \newline VBG Verb, gerund or present \newline participle \newline VBN Verb, past participle \newline VBP Verb, non-3rd person singular \newline present \newline VBZ Verb, 3rd person singular \newline present \newline WDT Wh-determiner \newline WP Wh-pronoun \newline WP\$ Possessive wh-pronoun \newline WRB Wh-adverb} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{2.56 cm} x{5.44 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{NORMALIZING}} \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{De-punctuate} & {[}w for w in text1 if w.isalpha() {]} -- not so much getting rid of punctuation, but keeping alphabetic characters \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
\seqsplit{De-uppercaseify} (?) & \textgreater{}{[}w.lower() for w in text{]} -- make each word in the tokenized list lowercase \tn
% Row Count 8 (+ 3)
% Row 2
\SetRowColor{LightBackground}
 & {[}w.lower() for w in text if w.isalpha(){]} -- all in one go \tn
% Row Count 11 (+ 3)
% Row 3
\SetRowColor{white}
Sort & sorted(text1) -- careful with this! \tn
% Row Count 13 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Unique Words & set(text1) -- set is oddly named, but very powerful. Leaves you with a list of only one of each word. \tn
% Row Count 17 (+ 4)
% Row 5
\SetRowColor{white}
Exclude Stopwords & Make your own list of words to be excluded: \tn
% Row Count 19 (+ 2)
% Row 6
\SetRowColor{LightBackground}
 & stopwords = {[}'the','it','she','he'{]} \tn
% Row Count 21 (+ 2)
% Row 7
\SetRowColor{white}
 & mynewtext = {[}w for w in text1 if w not in stopwords{]} \tn
% Row Count 23 (+ 2)
% Row 8
\SetRowColor{LightBackground}
 & Or you can also use predefined stopword lists from NLTK: \tn
% Row Count 26 (+ 3)
% Row 9
\SetRowColor{white}
 & from nltk.corpus import stopwords \tn
% Row Count 28 (+ 2)
% Row 10
\SetRowColor{LightBackground}
 & stopwords = \seqsplit{stopwords.words('english')} \tn
% Row Count 30 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}

\vfill
\columnbreak
\begin{tabularx}{8.4cm}{x{2.56 cm} x{5.44 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{NORMALIZING (cont)}} \tn
% Row 11
\SetRowColor{LightBackground}
 & mynewtext = {[}w for w in text1 if w not in stopwords{]} \tn
% Row Count 2 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.28 cm} x{4.72 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{SEARCHING}} \tn
% Row 0
\SetRowColor{LightBackground}
Dispersion Plot & text4.dispersion\_plot({[}'American','Liberty','Government'{]}) \tn
% Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
Find Word that ends with... & {[}w for w in text4 if w.endswith('ness'){]} \tn
% Row Count 6 (+ 2)
% Row 2
\SetRowColor{LightBackground}
% The Python string method is startswith (single "s" before "with").
Find Word that starts with... & {[}w for w in text4 if w.startswith('ness'){]} \tn
% Row Count 9 (+ 3)
% Row 3
\SetRowColor{white}
Find Word that contains... & {[}w for w in text4 if 'ee' in w{]} \tn
% Row Count 11 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Combine them together & {[}w for w in text4 if 'ee' in w and w.endswith('ing'){]} \tn
% Row Count 18 (+ 7)
% Row 5
\SetRowColor{white}
Regular Expressions & 'Regular expressions' is a syntax for describing sequences of characters usually used to construct search queries. The Python 're' module must first be imported: \tn
% Row Count 26 (+ 8)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{Import} \tn
% Row Count 27 (+ 1)
% Row 7
\SetRowColor{white}
 & \textgreater{}\textgreater{}\textgreater{}import re \textgreater{}\textgreater{}\textgreater{}{[}w for w in text1 if re.search('\textasciicircum{}ab',w){]} -- 'Regular expressions' is too big of a topic to cover here. Google it! \tn
% Row Count 33 (+ 6)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{1.84 cm} x{6.16 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{CHUNKING}} \tn
% Row 0
\SetRowColor{LightBackground}
 & \textgreater{}\textgreater{}\textgreater{}import re \textgreater{}\textgreater{}\textgreater{}{[}w for w in text1 if re.search('\textasciicircum{}ab',w){]} -- 'Regular expressions' is too big of a topic to cover here. Google it! \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
\seqsplit{Collocations} & \textgreater{} text4.collocations() - multi-word expressions that commonly co-occur. Notice that this is not necessarily related to the frequency of the words \tn
% Row Count 10 (+ 5)
% Row 2
\SetRowColor{LightBackground}
 & \textgreater{}text4.collocations(num=100) -- alter the number of phrases returned. Bigrams, Trigrams, and n-grams are useful for comparing texts, particularly for plagiarism detection and collation \tn
% Row Count 17 (+ 7)
% Row 3
\SetRowColor{white}
Bi-grams & \textgreater{}nltk.bigrams(text4) -- returns every string of two words \tn
% Row Count 19 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\seqsplit{Tri-grams} & nltk.trigrams(text4) -- returns every string of three words \tn
% Row Count 21 (+ 2)
% Row 5
\SetRowColor{white}
n-grams & nltk.ngrams(text4, 5) \tn
% Row Count 22 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{2.8 cm} x{5.2 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{TAGGING}} \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{part-of-speech} tagging & mytext = \seqsplit{nltk.word\_tokenize("This} is my sentence") \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
 & nltk.pos\_tag(mytext) \tn
% Row Count 4 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.2 cm} x{4.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Working with your own texts:}} \tn
% Row 0
\SetRowColor{LightBackground}
Open a file for reading & \textgreater{}file = open('myfile.txt') -- make sure you are in the correct directory before starting Python \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
Read the file & t = file.read(); \tn
% Row Count 6 (+ 1)
% Row 2
\SetRowColor{LightBackground}
Tokenize the file & tokens = nltk.word\_tokenize(t) \tn
% Row Count 8 (+ 2)
% Row 3
\SetRowColor{white}
Convert to NLTK text object & text = nltk.Text(tokens) \tn
% Row Count 10 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% Final table of the sheet, followed by the end of the two-column layout.
\begin{tabularx}{8.4cm}{p{3.2 cm} p{4.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{QUITTING PYTHON}} \tn
% Row 0
\SetRowColor{LightBackground}
Quit & quit() \tn
% Row Count 1 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}