\documentclass[10pt,a4paper]{article}

% Preamble for the "Reg Ex CheatSheet" cheat sheet.
% One statement per line: a `%' comment runs to the end of the physical
% line, so every trailing comment below must be followed by a line break
% or it would comment out the code that comes after it.

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{datamansam}
\pdfinfo{
  /Title (reg-ex-cheatsheet.pdf)
  /Creator (Cheatography)
  /Author (datamansam)
  /Subject (Reg Ex CheatSheet Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores <colour> globally in \RowColorName and
% colours the current table row with it, so that \mymulticolumn can reuse
% the same colour for cells spanning several columns.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose cell is
% filled with the colour last stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{A3A3A3}
\definecolor{LightBackground}{HTML}{F3F3F3}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    % NOTE(review): the logo is referenced by an absolute server path;
    % the build needs this file to exist at exactly that location.
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bfseries{\textcolor{DarkBackground}{\textrm{Reg Ex CheatSheet Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{datamansam} via \textcolor{DarkBackground}{\uline{cheatography.com/139410/cs/29846/}}}
\end{tabulary}
\end{multicols}}
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}datamansam \\
    \uline{cheatography.com/datamansam} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Published 30th November, 2021.\\
    Updated 23rd November, 2021.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

\begin{tabularx}{5.377cm}{x{1.55618 cm} x{1.51041 cm} x{1.51041 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Regex}} \tn
% Row 0
\SetRowColor{LightBackground}
(*) indicates that the preceding character can occur 0 or more times.
  & meo*w
  & mew, meow, meooow, and \seqsplit{meoooooooooooow} \tn
% Row Count 6 (+ 6)
% Row 1
\SetRowColor{white}
? - character can appear either 0 or 1 time
  & humou?r
  & humour humor \tn
% Row Count 10 (+ 4)
% Row 2
\SetRowColor{LightBackground}
. and it can match any single character (letter, number, symbol or whitespace) in a piece of text
  & .........
  & any 9-character text \tn
% Row Count 18 (+ 8)
% Row 3
\SetRowColor{white}
{[}{]} will match any of the characters included within the brackets
  & con{[}sc{]}en{[}sc{]}us
  & consensus, concensus, consencus, and concencus \tn
% Row Count 23 (+ 5)
% Row 4
\SetRowColor{LightBackground}
\{\} contains the exact quantity
  & roa\{3\}r
  & roaaar \tn
% Row Count 26 (+ 3)
% Row 5
\SetRowColor{white}
\{\}n. the quantity range of characters to be matched
  & roa\{3,6\}r
  & roaaar, roaaaar, roaaaaar, or roaaaaaar \tn
% Row Count 30 (+ 4)
\end{tabularx}
\par\addvspace{1.3em}

\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{1.55618 cm} x{1.51041 cm} x{1.51041 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Regex (cont)}} \tn
% Row 6
\SetRowColor{LightBackground}
|, allows for the matching of either of two \seqsplit{subexpressions}.
  & \seqsplit{baboons|gorillas}
  & will match the text baboons as well as the text gorillas. \tn
% Row Count 5 (+ 5)
% Row 7
\SetRowColor{white}
Anchors (hat \textasciicircum{} and dollar sign \$) are used in regular expressions to match text at the start and end of a string, \seqsplit{respectively.}
  & \textasciicircum{}Monkeys: my mortal enemy\$
  & will completely match the text Monkeys: my mortal enemy but not match Spider Monkeys: my mortal enemy or Monkeys: my mortal enemy in the wild \tn
% Row Count 16 (+ 11)
% Row 8
\SetRowColor{LightBackground}
{[}letter-letter{]} or {[}n-n{]}
  & a range of characters that can be matched
  & {[}A-Z{]}. : match any uppercase letter {[}a-z{]}. : match any lowercase letter {[}0-9{]}. : match any digit {[}A-Za-z{]} : match any uppercase or lowercase letter \tn
% Row Count 28 (+ 12)
% Row 9
\SetRowColor{white}
Shorthand character classes simplify writing regular expressions
  & \textbackslash{}w represents the regex range {[}A-Za-z0-9\_{]}, \textbackslash{}d represents {[}0-9{]},
  & \textbackslash{}W represents {[}\textasciicircum{}A-Za-z0-9\_{]} matching any character not included by \textbackslash{}w, \textbackslash{}D represents {[}\textasciicircum{}0-9{]} matching any character not included by \textbackslash{}d \tn
% Row Count 39 (+ 11)
\end{tabularx}
\par\addvspace{1.3em}

\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{1.55618 cm} x{1.51041 cm} x{1.51041 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Regex (cont)}} \tn
% Row 10
\SetRowColor{LightBackground}
Negated character set
  & {[}\textasciicircum{}cdh{]}are
  & will match the m in mare. \tn
% Row Count 2 (+ 2)
% Row 11
\SetRowColor{white}
+ indicates that the preceding character can occur 1 or more times
  & meo+w
  & will match meow, meooow, and \seqsplit{meoooooooooooow}, but not match mew \tn
% Row Count 7 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{1.55618 cm} x{1.51041 cm} x{1.51041 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Text Preprocessing}} \tn
% Row 0
\SetRowColor{LightBackground}
Noise removal
  & import re result = re.sub(r'{[}\textbackslash{}.\textbackslash{}?\textbackslash{}!\textbackslash{},\textbackslash{}:\textbackslash{};\textbackslash{}"{]}', '', text)
  & Removes Punctuation \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
Tokenization is the text \seqsplit{preprocessing} task of breaking up text into smaller components of text
  & from \seqsplit{nltk.tokenize} import \seqsplit{word\_tokenize} text = "This is a text to tokenize" tokenized = \seqsplit{word\_tokenize(text)}
  & \seqsplit{print(tokenized)} \# {[}"This", "is", "a", "text", "to", "tokenize"{]} \tn
% Row Count 14 (+ 9)
% Row 2
\SetRowColor{LightBackground}
In natural language processing, \seqsplit{normalization} encompasses many text \seqsplit{preprocessing} tasks including
  & stemming, \seqsplit{lemmatization},
  & upper or lowercasing, and stopwords removal. \tn
% Row Count 22 (+ 8)
% Row 3
\SetRowColor{white}
Stemming In natural language processing, stemming is the text \seqsplit{preprocessing} \seqsplit{normalization} task concerned with bluntly removing word affixes (prefixes and suffixes).
  & from nltk.stem import \seqsplit{PorterStemmer} tokenized = {[}"So", "many", "squids", "are", "jumping"{]} stemmer = \seqsplit{PorterStemmer()} stemmed = {[}stemmer.stem(token) for token in tokenized{]}
  & \# {[}'So', 'mani', 'squid', 'are', 'jump'{]} \tn
% Row Count 36 (+ 14)
\end{tabularx}
\par\addvspace{1.3em}

\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{1.55618 cm} x{1.51041 cm} x{1.51041 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Text Preprocessing (cont)}} \tn
% Row 4
\SetRowColor{LightBackground}
\seqsplit{Lemmatization} In natural language processing, \seqsplit{lemmatization} is the text \seqsplit{preprocessing} \seqsplit{normalization} task concerned with bringing words down to their root forms.
  & from nltk.stem import \seqsplit{WordNetLemmatizer} tokenized = {[}"So", "many", "squids", "are", "jumping"{]} lemmatizer = \seqsplit{WordNetLemmatizer()} lemmatized = {[}lemmatizer.lemmatize(token) for token in tokenized{]}
  & {[}'So', 'many', 'squid', 'be', 'jump'{]} \tn
% Row Count 16 (+ 16)
% Row 5
\SetRowColor{white}
stopword removal is the process of removing words from a string that don't provide any information about the tone of a statement.
  & from nltk.corpus import stopwords \# define set of English stopwords stop\_words = \seqsplit{set(stopwords}.words('english'))
  & \# remove stopwords from tokens in dataset \seqsplit{statement\_no\_stop} = {[}word for word in word\_tokens if word not in stop\_words{]} \tn
% Row Count 27 (+ 11)
% Row 6
\SetRowColor{LightBackground}
parser. \seqsplit{chunk.RegexpParser}
  & Uses a set of regular expression patterns to specify the behavior of the parser
  & \{\textless{}DT|JJ\textgreater{}\} \# chunk determiners and adjectives \tn
% Row Count 34 (+ 7)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{5.377cm}}{Token = Smaller Component of Text \newline Stem = Remove prefix and suffix \newline Lemmatization = Bring down to root \newline Stopword = Remove meaningless} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{1.69349 cm} x{1.19002 cm} x{1.69349 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{5.377cm}}{\bfseries\textcolor{white}{Lists and Strings}} \tn
% Row 0
\SetRowColor{LightBackground}
z = 'Natural Language Processing'
  & \seqsplit{z.replace('} ', '\textbackslash{}n')
  & 'Natural\textbackslash{}nLanguage\textbackslash{}nProcessing' \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
  & list(z)
  & Split text into character tokens \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
  & set(z)
  & Unique tokens \tn
% Row Count 7 (+ 1)
% Row 3
% list.insert(0, item) prepends item; it does not sort the list.
\SetRowColor{white}
x = {[}'Natural', 'Language', 'Toolkit'{]}
  & \seqsplit{x.insert(0}, 'Python')
  & {[}'Python', 'Natural', 'Language', 'Toolkit'{]} \tn
% Row Count 12 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}