\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{sree017} \pdfinfo{ /Title (nlp.pdf) /Creator (Cheatography) /Author (sree017) /Subject (NLP Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{3514A3} \definecolor{LightBackground}{HTML}{F2F0F9} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{NLP Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{sree017} via \textcolor{DarkBackground}{\uline{cheatography.com/126402/cs/24446/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}sree017 \\ \uline{cheatography.com/sree017} \\ \end{tabulary} \vfill \columnbreak 
\begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Published 26th September, 2020.\\ Updated 26th September, 2020.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{4} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Tokenization}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Tokenization breaks raw text into smaller units, called tokens, such as words or sentences. These tokens help in understanding the context and in building NLP models. Splitting the text into words is called word tokenization; the same separation done for sentences is called sentence tokenization. \newline \newline \# NLTK \newline import nltk \newline nltk.download('punkt') \newline paragraph = "write your paragraph here to convert into tokens." \newline \newline sentences = \seqsplit{nltk.sent\_tokenize(paragraph)} \newline \newline words = \seqsplit{nltk.word\_tokenize(paragraph)} \newline \newline \# Spacy \newline from spacy.lang.en import English \newline nlp = English() \newline sbd = \seqsplit{nlp.create\_pipe('sentencizer')} \newline nlp.add\_pipe(sbd) \newline \newline doc = nlp(paragraph) \newline {[}sent for sent in doc.sents{]} \newline \newline nlp = English() \newline doc = nlp(paragraph) \newline {[}word for word in doc{]} \newline \newline \# Keras \newline from keras.preprocessing.text import text\_to\_word\_sequence \newline \seqsplit{text\_to\_word\_sequence(paragraph)} \newline \newline \# Gensim \newline from \seqsplit{gensim.summarization.textcleaner} import split\_sentences \newline split\_sentences(paragraph) \newline \newline from gensim.utils import tokenize \newline list(tokenize(paragraph))} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Bag Of Words \& TF-IDF}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{The Bag of Words model preprocesses text by converting it into a bag of words, which keeps a count of the total occurrences of the most frequently used words. \newline \newline \# counters = list of sentences after preprocessing (tokenization, \seqsplit{stemming/lemmatization}, stop-word removal) \newline \newline from \seqsplit{sklearn.feature\_extraction.text} import CountVectorizer \newline cv = \seqsplit{CountVectorizer(max\_features=1500)} \newline X = \seqsplit{cv.fit\_transform(counters).toarray()} \newline \newline Term Frequency-Inverse Document Frequency (TF-IDF): a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. \newline \newline TF = (no. of repetitions of the word in a sentence) / (no. of words in the sentence) \newline \newline IDF = log((no. of sentences) / (no. of sentences containing the word))
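\newline \newline A small worked example with illustrative numbers: suppose a corpus of 4 sentences in which the word "data" appears in 2 sentences, and a 10-word sentence contains "data" twice. Then: \newline \newline TF = 2/10 = 0.2 \newline IDF = log(4/2) $\approx$ 0.69 (natural log) \newline TF-IDF = 0.2 $\times$ 0.69 $\approx$ 0.14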
\newline \newline from \seqsplit{sklearn.feature\_extraction.text} import TfidfVectorizer \newline cv = TfidfVectorizer() \newline X = \seqsplit{cv.fit\_transform(counters).toarray()} \newline \newline N-gram Language Model: an N-gram is a sequence of N tokens (or words). \newline \newline A 1-gram (or unigram) is a one-word sequence. For the sentence "I love reading blogs about data science on Analytics Vidhya", the unigrams would simply be: "I", "love", "reading", "blogs", "about", "data", "science", "on", "Analytics", "Vidhya". \newline \newline A 2-gram (or bigram) is a two-word sequence of words, like "I love", "love reading", or "Analytics Vidhya". \newline \newline And a 3-gram (or trigram) is a three-word sequence of words, like "I love reading", "about data science" or "on Analytics Vidhya".} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em}
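\begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{N-gram Example}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{A minimal sketch of generating N-grams with NLTK's ngrams helper (assumes NLTK is installed and the punkt data is downloaded for word\_tokenize): \newline \newline from nltk import ngrams \newline from nltk.tokenize import word\_tokenize \newline \newline words = word\_tokenize("I love reading blogs about data science") \newline \newline \# bigrams: list of 2-token tuples \newline list(ngrams(words, 2)) \newline \newline \# trigrams: pass n = 3 \newline list(ngrams(words, 3))} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em}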
\begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Stemming \& Lemmatization}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Stemming is the process of reducing a word to its root form. We create the stem by removing the prefix or suffix of a word, so stemming a word may not result in an actual word. \newline \newline paragraph = "" \newline \# NLTK \newline from nltk.stem import PorterStemmer \newline from nltk import sent\_tokenize \newline from nltk import word\_tokenize \newline stem = PorterStemmer() \newline \newline sentence = sent\_tokenize(paragraph){[}1{]} \newline words = word\_tokenize(sentence) \newline {[}stem.stem(word) for word in words{]} \newline \newline \# Spacy \newline No stemming in spaCy \newline \newline \# Keras \newline No stemming in Keras \newline \newline Lemmatization: \newline Lemmatization does the same as stemming, with the difference that lemmatization ensures the root word belongs to the language. \newline \newline \# NLTK \newline from nltk.stem import WordNetLemmatizer \newline lemma = WordNetLemmatizer() \newline \newline sentence = sent\_tokenize(paragraph){[}1{]} \newline words = word\_tokenize(sentence) \newline {[}lemma.lemmatize(word) for word in words{]} \newline \newline \# Spacy \newline import spacy as spac \newline sp = \seqsplit{spac.load('en\_core\_web\_sm')} \newline ch = sp(u'warning warned') \newline for x in ch: \newline print(x.lemma\_) \newline \newline \# Keras \newline No lemmatization or stemming} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Word2Vec}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{In the BoW and TF-IDF approaches no semantic information is stored; TF-IDF gives importance to uncommon words, and there is a chance of overfitting. \newline \newline In Word2Vec each word is represented as a vector of 32 or more dimensions instead of a single number, so the semantic information and the relations between words are preserved. \newline \newline Steps: \newline 1. Tokenization of the sentences \newline 2. Create histograms \newline 3. Take the most frequent words \newline 4. Create a matrix with all the unique words. It also represents the occurrence relation between the words \newline \newline from gensim.models import Word2Vec \newline model = Word2Vec(sentences, min\_count=1) \newline words = model.wv.vocab \newline vector = model.wv{[}'freedom'{]} \newline similar = \seqsplit{model.wv.most\_similar('freedom')}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em}
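\begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Word2Vec: Preparing Input}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{gensim's Word2Vec expects sentences to be a list of token lists. A minimal sketch of building that input with NLTK (assumes paragraph holds your raw text): \newline \newline import nltk \newline nltk.download('punkt') \newline from nltk.tokenize import sent\_tokenize, word\_tokenize \newline \newline \# one token list per sentence, lowercased \newline sentences = {[}\seqsplit{word\_tokenize(s.lower())} for s in \seqsplit{sent\_tokenize(paragraph)}{]} \newline \newline from gensim.models import Word2Vec \newline model = Word2Vec(sentences, min\_count=1)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em}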
\begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Stop Words}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{Stopwords are the most common words in any natural language. For the purpose of analyzing text data and building NLP models, these stopwords might not add much value to the meaning of the document. \newline \newline \# NLTK \newline from nltk.corpus import stopwords \newline from nltk.tokenize import word\_tokenize \newline \newline stopwords = \seqsplit{set(stopwords.words('english'))} \newline word\_tokens = \seqsplit{word\_tokenize(paragraph)} \newline {[}word for word in word\_tokens if word not in stopwords{]} \newline \newline \# Spacy \newline from spacy.lang.en import English \newline from \seqsplit{spacy.lang.en.stop\_words} import STOP\_WORDS \newline \newline nlp = English() \newline my\_doc = nlp(paragraph) \newline \newline \# Create list of word tokens \newline token\_list = {[}token.text for token in my\_doc{]} \newline \newline \# Create list of word tokens after removing stopwords \newline filtered\_sentence = {[}{]} \newline \newline for word in token\_list: \newline lexeme = nlp.vocab{[}word{]} \newline if not lexeme.is\_stop: \newline \seqsplit{filtered\_sentence.append(word)} \newline \newline \# Gensim \newline from \seqsplit{gensim.parsing.preprocessing} import remove\_stopwords \newline remove\_stopwords(paragraph)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{3.833cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{3.833cm}}{\bf\textcolor{white}{Parts of Speech (POS) Tagging, Chunking \& NER}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{x{3.833cm}}{POS (parts of speech) tags explain how a word is used in a sentence. Within a sentence, a word can have different contexts and semantic meanings. Basic natural language processing (NLP) models like bag-of-words (BoW) fail to identify these relations between words, so we use POS tagging to mark each word with its POS tag based on its context in the data. POS tags are also used to extract relationships between words. \newline \newline \# NLTK \newline import nltk \newline from nltk.tokenize import word\_tokenize \newline from nltk import pos\_tag \newline nltk.download('averaged\_perceptron\_tagger') \newline \newline word\_tokens = word\_tokenize('Are you afraid of something?') \newline pos\_tag(word\_tokens) \newline \newline \# Spacy \newline nlp = \seqsplit{spacy.load("en\_core\_web\_sm")} \newline doc = nlp("Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India") \newline {[}token.pos\_ for token in doc{]} \newline \newline Chunking: \newline Chunking is the process of extracting phrases from unstructured text to give it more structure. It is also called shallow parsing. We can do it on top of POS tagging: it groups words into chunks, mainly noun phrases, defined using regular expressions. \newline \newline \# NLTK \newline \# example grammar: a noun phrase is an optional determiner, any adjectives, then a noun \newline grammar = "NP: \{<DT>?<JJ>*<NN>\}" \newline word\_tokens = word\_tokenize(text) \newline word\_pos = pos\_tag(word\_tokens) \newline chunkParser = \seqsplit{nltk.RegexpParser(grammar)} \newline tree = \seqsplit{chunkParser.parse(word\_pos)} \newline \newline Named Entity Recognition (NER): \newline NER is used to extract information from unstructured text and to classify the entities present in the text into categories like person, organization, event, place, etc. This gives you detailed knowledge about the text and the relationships between the different entities. \newline \newline \# Spacy \newline import spacy \newline nlp = \seqsplit{spacy.load("en\_core\_web\_sm")} \newline doc = nlp("Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India") \newline \newline for ent in doc.ents: \newline print(ent.text, ent.start\_char, ent.end\_char, ent.label\_)} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}