\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Remidy08} \pdfinfo{ /Title (machine-learning-with-python-cookbook.pdf) /Creator (Cheatography) /Author (Remidy08) /Subject (Machine Learning with Python Cookbook Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A30B0B} \definecolor{LightBackground}{HTML}{FCF7F7} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Machine Learning with Python Cookbook Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Remidy08} via \textcolor{DarkBackground}{\uline{cheatography.com/159206/cs/34199/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ 
\vspace{-2pt}Remidy08 \\ \uline{cheatography.com/remidy08} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 9th October, 2022.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{3} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Introduction}} \tn % Row 0 \SetRowColor{LightBackground} Creating a row Vector & np.array({[}1, 2, 3{]}) \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} Creating a column Vector & np.array({[}{[}1{]}, {[}2{]}, {[}3{]}{]}) \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} Creating a Matrix & np.array({[}{[}1, 2{]}, {[}1, 2{]}, {[}1, 2{]}{]}) \tn % Row Count 6 (+ 2) % Row 3 \SetRowColor{white} Creating a Sparse Matrix & from scipy import sparse \tn % Row Count 8 (+ 2) % Row 4 \SetRowColor{LightBackground} & \seqsplit{sparse.csr\_matrix(matrix)} \#shows the indixes of non zero elements \tn % Row Count 12 (+ 4) % Row 5 \SetRowColor{white} Select all elements of a vector & vector{[}:{]} \tn % Row Count 14 (+ 2) % Row 6 \SetRowColor{LightBackground} Select all rows and the second column & matrix{[}:,1:2{]} \tn % Row Count 16 (+ 2) % Row 7 \SetRowColor{white} View number of rows and columns & matrix.shape \tn % Row Count 18 (+ 2) % Row 8 \SetRowColor{LightBackground} View number of elements & matrix.size \tn % Row Count 20 (+ 2) % Row 9 \SetRowColor{white} View number of dimensions & matrix.ndim \tn % Row Count 22 (+ 2) % Row 10 \SetRowColor{LightBackground} Applying Operations to Elements & add\_100 = lambda i: i + 100 \tn % Row Count 24 (+ 2) % Row 11 \SetRowColor{white} & \seqsplit{vectorized\_add\_100} = \seqsplit{np.vectorize(add\_100)} \tn % Row Count 27 (+ 3) % Row 12 \SetRowColor{LightBackground} & \seqsplit{vectorized\_add\_100(matrix)} \tn % Row Count 29 (+ 2) % Row 13 \SetRowColor{white} maximum value in an array & np.max(matrix) \tn % Row Count 31 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Introduction (cont)}} \tn % Row 14 \SetRowColor{LightBackground} minimum value in an array & np.min(matrix) \tn % Row Count 2 (+ 2) % Row 15 \SetRowColor{white} Return mean & np.mean(matrix) \tn % Row Count 3 (+ 1) % Row 16 \SetRowColor{LightBackground} Return variance & np.var(matrix) \tn % Row Count 4 (+ 1) % Row 17 \SetRowColor{white} Return standard deviation & np.std(matrix) \tn % Row Count 6 (+ 2) % Row 18 \SetRowColor{LightBackground} Reshaping Arrays & matrix.reshape(2, 6) \tn % Row Count 7 (+ 1) % Row 19 \SetRowColor{white} Transposing a Vector or Matrix & matrix.T \tn % Row Count 9 (+ 2) % Row 20 \SetRowColor{LightBackground} You need to transform a matrix into a one-dimensional array & matrix.flatten() \tn % 
Row Count 12 (+ 3) % Row 21 \SetRowColor{white} Return matrix rank (This corresponds to the maximal number of linearly independent columns of the matrix) & \seqsplit{np.linalg.matrix\_rank(matrix)} \tn % Row Count 18 (+ 6) % Row 22 \SetRowColor{LightBackground} Calculating the Determinant & \seqsplit{np.linalg.det(matrix)} \tn % Row Count 20 (+ 2) % Row 23 \SetRowColor{white} Getting the diagonal of a matrix & \seqsplit{matrix.diagonal(offset=1)} (offset shifts the diagonal up or down; it can be negative) \tn % Row Count 25 (+ 5) % Row 24 \SetRowColor{LightBackground} Return trace (sum of the diagonal elements) & matrix.trace() \tn % Row Count 28 (+ 3) % Row 25 \SetRowColor{white} Finding Eigenvalues and Eigenvectors & eigenvalues, eigenvectors = \seqsplit{np.linalg.eig(matrix)} \tn % Row Count 31 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Introduction (cont)}} \tn % Row 26 \SetRowColor{LightBackground} Calculating Dot Products (sum of the product of the elements of two vectors) & np.dot(vector\_a, vector\_b) \tn % Row Count 4 (+ 4) % Row 27 \SetRowColor{white} Add two matrices & np.add(matrix\_a, matrix\_b) \tn % Row Count 6 (+ 2) % Row 28 \SetRowColor{LightBackground} Subtract two matrices & \seqsplit{np.subtract(matrix\_a}, matrix\_b) \tn % Row Count 8 (+ 2) % Row 29 \SetRowColor{white} & Alternatively, we can simply use the + and - operators \tn % Row Count 11 (+ 3) % Row 30 \SetRowColor{LightBackground} Multiplying Matrices & np.dot(matrix\_a, matrix\_b) \tn % Row Count 13 (+ 2) % Row 31 \SetRowColor{white} & Alternatively, in Python 3.5+ we can use the @ operator \tn % Row Count 16 (+ 3) % Row 32 \SetRowColor{LightBackground} Multiply two matrices element-wise & matrix\_a * matrix\_b \tn % Row Count 18 (+ 2) % Row 33 \SetRowColor{white} Inverting a Matrix & \seqsplit{np.linalg.inv(matrix)} \tn % Row Count 19 (+ 1) % Row 34 \SetRowColor{LightBackground} Set seed for random value generation & np.random.seed(0) \tn % Row Count 21 (+ 2) % Row 35 \SetRowColor{white} Generate three random floats between 0.0 and 1.0 & np.random.random(3) \tn % Row Count 24 (+ 3) % Row 36 \SetRowColor{LightBackground} Generate three random integers between 0 and 10 & np.random.randint(0, 11, 3) \tn % Row Count 27 (+ 3) % Row 37 \SetRowColor{white} Draw three numbers from a normal distribution with mean 0.0 and standard deviation of 1.0 & \seqsplit{np.random.normal(0.0}, 1.0, 3) \tn % Row Count 32 (+ 5) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Introduction (cont)}} \tn % Row 38 \SetRowColor{LightBackground} Draw three numbers from a logistic distribution with mean 0.0 and scale of 1.0 & \seqsplit{np.random.logistic(0.0}, 1.0, 3) \tn % Row Count 4 (+ 4) % Row 39 \SetRowColor{white} Draw three numbers greater than or equal to 1.0 and less than 2.0 & \seqsplit{np.random.uniform(1.0}, 2.0, 3) \tn % Row Count 8 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{We select elements from matrices and vectors much like we do in R.
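\newline For example (an illustrative sketch, assuming matrix = np.array({[}{[}1, 2, 3{]}, {[}4, 5, 6{]}, {[}7, 8, 9{]}{]})): \newline matrix{[}1, 1{]} -\textgreater{} 5 (second row, second column) \newline matrix{[}:2, :{]} -\textgreater{} the first two rows and all columns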
\newline \# Find maximum element in each column \newline np.max(matrix, axis=0) -\textgreater{} array({[}7, 8, 9{]}) \newline One useful argument in reshape is -1, which effectively means ``as many as needed,'' so reshape(1, -1) means one row and as many columns as needed:} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Clustering}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Clustering Using K-Means} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} Load libraries & from sklearn.cluster import KMeans \tn % Row Count 3 (+ 2) % Row 2 \SetRowColor{LightBackground} Create k-means object & cluster = \seqsplit{KMeans(n\_clusters=3}, random\_state=0, n\_jobs=-1) \tn % Row Count 6 (+ 3) % Row 3 \SetRowColor{white} Train model & model = \seqsplit{cluster.fit(features\_std)} \tn % Row Count 8 (+ 2) % Row 4 \SetRowColor{LightBackground} Predict observation's cluster & \seqsplit{model.predict(new\_observation)} \tn % Row Count 10 (+ 2) % Row 5 \SetRowColor{white} View predicted classes & model.labels\_ \tn % Row Count 11 (+ 1) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Speeding Up K-Means Clustering} \tn % Row Count 12 (+ 1) % Row 7 \SetRowColor{white} Load libraries & from sklearn.cluster import MiniBatchKMeans \tn % Row Count 15 (+ 3) % Row 8 \SetRowColor{LightBackground} Create k-means object & cluster = \seqsplit{MiniBatchKMeans(n\_clusters=3}, random\_state=0, batch\_size=100) \tn % Row Count 19 (+ 4) % Row 9 \SetRowColor{white} Train model & model = \seqsplit{cluster.fit(features\_std)} \tn % Row Count 21 (+ 2) % Row 10 \SetRowColor{LightBackground} Clustering Using Meanshift & Group observations without assuming the number of clusters or their shape \tn % Row Count 25 (+ 4) % Row 11 \SetRowColor{white} Load libraries & from sklearn.cluster import MeanShift \tn % Row Count 27 (+ 2) % Row 12 \SetRowColor{LightBackground} Create meanshift object & cluster = \seqsplit{MeanShift(n\_jobs=-1)} \tn % Row Count 29 (+ 2) % Row 13 \SetRowColor{white} Train model & model = \seqsplit{cluster.fit(features\_std)} \tn % Row Count 31 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Clustering (cont)}} \tn % Row 14 \SetRowColor{LightBackground} Note on meanshift & With cluster\_all=False, orphan observations are given the label -1 \tn % Row Count 4 (+ 4) % Row 15 \SetRowColor{white} Clustering Using DBSCAN & Group observations into clusters of high density \tn % Row Count 7 (+ 3) % Row 16 \SetRowColor{LightBackground} Load libraries & from sklearn.cluster import DBSCAN \tn % Row Count 9 (+ 2) % Row 17 \SetRowColor{white} Create DBSCAN object & cluster = DBSCAN(n\_jobs=-1) \tn % Row Count 11 (+ 2) % Row 18 \SetRowColor{LightBackground} Train model & model = \seqsplit{cluster.fit(features\_std)} \tn % Row Count 13 (+ 2) % Row 19 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{DBSCAN has three main parameters to set:} \tn % Row Count 14 (+ 1) % Row 20 \SetRowColor{LightBackground} eps & The maximum distance from an observation for another observation to be considered its neighbor.
\tn % Row Count 19 (+ 5) % Row 21 \SetRowColor{white} min\_samples & The minimum number of observations less than eps distance from an observation for it to be considered a core observation. \tn % Row Count 26 (+ 7) % Row 22 \SetRowColor{LightBackground} metric & The distance metric used by eps—for example, minkowski or euclidean. \tn % Row Count 30 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Clustering (cont)}} \tn % Row 23 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Clustering Using Hierarchical Merging} \tn % Row Count 1 (+ 1) % Row 24 \SetRowColor{white} Load libraries & from sklearn.cluster import \seqsplit{AgglomerativeClustering} \tn % Row Count 4 (+ 3) % Row 25 \SetRowColor{LightBackground} Create agglomerative clustering object & cluster = \seqsplit{AgglomerativeClustering(n\_clusters=3)} \tn % Row Count 7 (+ 3) % Row 26 \SetRowColor{white} Train model & model = \seqsplit{cluster.fit(features\_std)} \tn % Row Count 9 (+ 2) % Row 27 \SetRowColor{LightBackground} \seqsplit{AgglomerativeClustering} uses the linkage parameter to determine the merging strategy to minimize the following: & Variance of merged clusters (ward) \tn % Row Count 15 (+ 6) % Row 28 \SetRowColor{white} & Average distance between observations from pairs of clusters (average) \tn % Row Count 19 (+ 4) % Row 29 \SetRowColor{LightBackground} & Maximum distance between observations from pairs of clusters (complete) \tn % Row Count 23 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{MiniBatchKMeans works similarly to KMeans, with one significant difference: the batch\_size parameter.
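\newline A minimal usage sketch (assuming features\_std is the standardized feature matrix used above): \newline from sklearn.cluster import MiniBatchKMeans \newline model = \seqsplit{MiniBatchKMeans(n\_clusters=3}, batch\_size=100, \seqsplit{random\_state=0).fit(features\_std)} \newline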
batch\_size controls the number of randomly selected observations in each batch.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Categorical Data}} \tn % Row 0 \SetRowColor{LightBackground} Encoding Nominal Categorical Features & from \seqsplit{sklearn.preprocessing} import LabelBinarizer, MultiLabelBinarizer \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} Create one-hot encoder & one\_hot = LabelBinarizer() \tn % Row Count 6 (+ 2) % Row 2 \SetRowColor{LightBackground} One-hot encode feature & \seqsplit{one\_hot.fit\_transform(feature)} \tn % Row Count 8 (+ 2) % Row 3 \SetRowColor{white} View feature classes & one\_hot.classes\_ \tn % Row Count 9 (+ 1) % Row 4 \SetRowColor{LightBackground} Reverse the one-hot encoding & \seqsplit{one\_hot.inverse\_transform(one\_hot.transform(feature))} \tn % Row Count 12 (+ 3) % Row 5 \SetRowColor{white} Create dummy variables from feature & pd.get\_dummies(feature{[}:,0{]}) \tn % Row Count 14 (+ 2) % Row 6 \SetRowColor{LightBackground} Create multiclass one-hot encoder & \seqsplit{one\_hot\_multiclass} = \seqsplit{MultiLabelBinarizer()} \tn % Row Count 17 (+ 3) % Row 7 \SetRowColor{white} One-hot encode multiclass feature & \seqsplit{one\_hot\_multiclass}.fit\_transform(multiclass\_feature) \tn % Row Count 20 (+ 3) % Row 8 \SetRowColor{LightBackground} View the classes with the classes\_ attribute & \seqsplit{one\_hot\_multiclass.classes\_} \tn % Row Count 22 (+ 2) % Row 9 \SetRowColor{white} Encoding Ordinal Categorical Features & dataframe{[}"Score"{]}.replace(dict with the categories as keys and numbers as values) \tn % Row Count 26 (+ 4) % Row 10 \SetRowColor{LightBackground} Encoding Dictionaries of Features & from \seqsplit{sklearn.feature\_extraction} import DictVectorizer \tn % Row Count 29 (+ 3) % Row 11 \SetRowColor{white} Create dictionary & data\_dict = {[}\{"Red": 2, "Blue": 4\}, \{"Red": 4, "Blue": 3\}, \{"Red": 1, "Yellow": 2\}, \{"Red": 2, "Yellow": 2\}{]} \tn % Row Count 35 (+ 6) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Categorical Data (cont)}} \tn % Row 12 \SetRowColor{LightBackground} Create dictionary vectorizer & dictvectorizer = \seqsplit{DictVectorizer(sparse=False)} \tn % Row Count 3 (+ 3) % Row 13 \SetRowColor{white} Convert dictionary to feature matrix & features = \seqsplit{dictvectorizer.fit\_transform(data\_dict)} \tn % Row Count 6 (+ 3) % Row 14 \SetRowColor{LightBackground} Get feature names & feature\_names = \seqsplit{dictvectorizer.get\_feature\_names()} \tn % Row Count 9 (+ 3) % Row 15 \SetRowColor{white} Imputing Missing Class Values & from sklearn.neighbors import \seqsplit{KNeighborsClassifier} \tn % Row Count 12 (+ 3) % Row 16 \SetRowColor{LightBackground} Train KNN learner & clf = \seqsplit{KNeighborsClassifier(3}, weights='distance') \tn % Row Count 15 (+ 3) % Row 17 \SetRowColor{white} & trained\_model = clf.fit(X{[}:,1:{]}, X{[}:,0{]}) \tn % Row Count 17 (+ 2) % Row 18 \SetRowColor{LightBackground} Predict missing values' class & imputed\_values = trained\_model.predict(X\_with\_nan{[}:,1:{]}) \tn % Row Count 20 (+ 3) % Row 19 \SetRowColor{white} Join column of predicted class with their other features & X\_with\_imputed = \seqsplit{np.hstack((imputed\_values.reshape(-1},1),
X\_with\_nan{[}:,1:{]})) \tn % Row Count 24 (+ 4) % Row 20 \SetRowColor{LightBackground} Join two feature matrices & \seqsplit{np.vstack((X\_with\_imputed}, X)) \tn % Row Count 26 (+ 2) % Row 21 \SetRowColor{white} Use imputer to fill most frequen value & imputer = \seqsplit{Imputer(strategy='most\_frequent'}, axis=0) \tn % Row Count 29 (+ 3) % Row 22 \SetRowColor{LightBackground} Handling Imbalanced Classes & \seqsplit{RandomForestClassifier(class\_weight="balanced")} \tn % Row Count 32 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Categorical Data (cont)}} \tn % Row 23 \SetRowColor{LightBackground} downsample the majority class & i\_class0 = np.where(target == 0){[}0{]} \tn % Row Count 2 (+ 2) % Row 24 \SetRowColor{white} & i\_class1 = np.where(target == 1){[}0{]} \tn % Row Count 4 (+ 2) % Row 25 \SetRowColor{LightBackground} Number of observations in each class & n\_class0 = len(i\_class0) \tn % Row Count 6 (+ 2) % Row 26 \SetRowColor{white} & n\_class1 = len(i\_class1) \tn % Row Count 8 (+ 2) % Row 27 \SetRowColor{LightBackground} For every observation of class 0, randomly sample from class 1 without replacement & \seqsplit{i\_class1\_downsampled} = \seqsplit{np.random.choice(i\_class1}, size=n\_class0, replace=False) \tn % Row Count 13 (+ 5) % Row 28 \SetRowColor{white} Join together class 0's target vector with the downsampled class 1's target vector & np.hstack((target{[}i\_class0{]}, target{[}i\_class1\_downsampled{]})) \tn % Row Count 18 (+ 5) % Row 29 \SetRowColor{LightBackground} Join together class 0's feature matrix with the downsampled class 1's feature matrix & np.vstack((features{[}i\_class0,:{]}, features{[}i\_class1\_downsampled,:{]})){[}0:5{]} \tn % Row Count 23 (+ 5) % Row 30 \SetRowColor{white} upsample the minority class & \seqsplit{i\_class0\_upsampled} = \seqsplit{np.random.choice(i\_class0}, size=n\_class1, replace=True) \tn % Row Count 27 (+ 4) % Row 31 \SetRowColor{LightBackground} Join together class 0's upsampled target vector with class 1's target vector & np.concatenate((target{[}i\_class0\_upsampled{]}, target{[}i\_class1{]})) \tn % Row Count 31 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Categorical Data (cont)}} \tn % Row 32 \SetRowColor{LightBackground} Join together class 0's upsampled feature matrix with class 1's feature matrix & np.vstack((features{[}i\_class0\_upsampled,:{]}, features{[}i\_class1,:{]})){[}0:5{]} \tn % Row Count 4 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{A second strategy is to use a model evaluation metric better suited to imbalanced classes. Accuracy is often used as a metric for evaluating the performance of a model, but when imbalanced classes are present accuracy can be ill suited. 
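\newline A quick illustrative check (assuming target is the vector of class labels): np.unique(target, return\_counts=True) shows how skewed the class counts are.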
Some better metrics we discuss in later chapters are confusion matrices, precision, recall, F1 scores, and ROC curves.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Dimensionality Reduction Using Feature Extraction}} \tn % Row 0 \SetRowColor{LightBackground} Reducing Features Using Principal Components & from \seqsplit{sklearn.decomposition} import PCA \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} & from \seqsplit{sklearn.preprocessing} import StandardScaler \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} Standardize the feature matrix & features = \seqsplit{StandardScaler().fit\_transform(digits.data)} \tn % Row Count 9 (+ 3) % Row 3 \SetRowColor{white} Create a PCA that will retain 99\% of variance & pca = \seqsplit{PCA(n\_components=0.99}, whiten=True) \tn % Row Count 12 (+ 3) % Row 4 \SetRowColor{LightBackground} Conduct PCA & features\_pca = \seqsplit{pca.fit\_transform(features)} \tn % Row Count 15 (+ 3) % Row 5 \SetRowColor{white} Reducing Features When Data Is Linearly Inseparable & Use an extension of principal component analysis that uses kernels to allow for non-linear dimensionality reduction \tn % Row Count 21 (+ 6) % Row 6 \SetRowColor{LightBackground} & from \seqsplit{sklearn.decomposition} import PCA, KernelPCA \tn % Row Count 24 (+ 3) % Row 7 \SetRowColor{white} Apply kernel PCA with a radial basis function (RBF) kernel & kpca = \seqsplit{KernelPCA(kernel="rbf"}, gamma=15, n\_components=1) \tn % Row Count 27 (+ 3) % Row 8 \SetRowColor{LightBackground} & features\_kpca = \seqsplit{kpca.fit\_transform(features)} \tn % Row Count 30 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Dimensionality Reduction Using Feature Extraction (cont)}} \tn % Row 9 \SetRowColor{LightBackground} Reducing Features by Maximizing Class Separability & from \seqsplit{sklearn.discriminant\_analysis} import \seqsplit{LinearDiscriminantAnalysis} \tn % Row Count 4 (+ 4) % Row 10 \SetRowColor{white} Create and run an LDA, then use it to transform the features & lda = \seqsplit{LinearDiscriminantAnalysis(n\_components=1)} \tn % Row Count 7 (+ 3) % Row 11 \SetRowColor{LightBackground} & features\_lda = lda.fit(features, \seqsplit{target).transform(features)} \tn % Row Count 10 (+ 3) % Row 12 \SetRowColor{white} Amount of variance explained by each component & \seqsplit{lda.explained\_variance\_ratio\_} \tn % Row Count 13 (+ 3) % Row 13 \SetRowColor{LightBackground} Non-negative matrix factorization (NMF) to reduce the dimensionality of the feature matrix & from \seqsplit{sklearn.decomposition} import NMF \tn % Row Count 18 (+ 5) % Row 14 \SetRowColor{white} Create, fit, and apply NMF & nmf = \seqsplit{NMF(n\_components=10}, random\_state=1) \tn % Row Count 21 (+ 3) % Row 15 \SetRowColor{LightBackground} & features\_nmf = \seqsplit{nmf.fit\_transform(features)} \tn % Row Count 24 (+ 3) % Row 16 \SetRowColor{white} Reducing Features on Sparse Data (Truncated Singular Value Decomposition (TSVD)) & from \seqsplit{sklearn.decomposition} import TruncatedSVD \tn % Row Count 28 (+ 4) % Row 17 \SetRowColor{LightBackground} & from scipy.sparse import csr\_matrix \tn % Row Count 30 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm}
x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Dimensionality Reduction Using Feature Extraction (cont)}} \tn % Row 18 \SetRowColor{LightBackground} Standardize feature matrix & features = \seqsplit{StandardScaler().fit\_transform(digits.data)} \tn % Row Count 3 (+ 3) % Row 19 \SetRowColor{white} Make sparse matrix & features\_sparse = \seqsplit{csr\_matrix(features)} \tn % Row Count 5 (+ 2) % Row 20 \SetRowColor{LightBackground} Create a TSVD & tsvd = \seqsplit{TruncatedSVD(n\_components=10)} \tn % Row Count 7 (+ 2) % Row 21 \SetRowColor{white} Conduct TSVD on sparse matrix & \seqsplit{features\_sparse\_tsvd} = \seqsplit{tsvd.fit(features\_sparse).transform(features\_sparse)} \tn % Row Count 11 (+ 4) % Row 22 \SetRowColor{LightBackground} Sum of first three components' explained variance ratios & tsvd.explained\_variance\_ratio\_{[}0:3{]}.sum() \tn % Row Count 14 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{One major requirement of NMF is that, as the name implies, the feature matrix cannot contain negative values.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Trees and Forests}} \tn % Row 0 \SetRowColor{LightBackground} Training a Decision Tree Classifier & from sklearn.tree import \seqsplit{DecisionTreeClassifier} \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} Create decision tree classifier object & decisiontree = \seqsplit{DecisionTreeClassifier(random\_state=0)} \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} Train model & model = \seqsplit{decisiontree.fit(features}, target) \tn % Row Count 9 (+ 3) % Row 3 \SetRowColor{white} Predict observation's class & \seqsplit{model.predict(observation)} \tn % Row Count 11 (+ 2) % Row 4 \SetRowColor{LightBackground} Training a Decision Tree Regressor & from sklearn.tree import \seqsplit{DecisionTreeRegressor} \tn % Row Count 14 (+ 3) % Row 5 \SetRowColor{white} Create decision tree regressor object & decisiontree = \seqsplit{DecisionTreeRegressor(random\_state=0)} \tn % Row Count 17 (+ 3) % Row 6 \SetRowColor{LightBackground} Train model & model = \seqsplit{decisiontree.fit(features}, target) \tn % Row Count 20 (+ 3) % Row 7 \SetRowColor{white} Create decision tree regressor object using the MAE criterion & decisiontree\_mae = \seqsplit{DecisionTreeRegressor(criterion="mae"}, random\_state=0) \tn % Row Count 24 (+ 4) % Row 8 \SetRowColor{LightBackground} Visualizing a Decision Tree Model & from IPython.display import Image \tn % Row Count 26 (+ 2) % Row 9 \SetRowColor{white} & import pydotplus \tn % Row Count 27 (+ 1) % Row 10 \SetRowColor{LightBackground} & from sklearn import tree \tn % Row Count 29 (+ 2) % Row 11 \SetRowColor{white} Create DOT data & dot\_data = \seqsplit{tree.export\_graphviz(decisiontree}, out\_file=None, \seqsplit{feature\_names=iris.feature\_names}, \seqsplit{class\_names=iris.target\_names)} \tn % Row Count 36 (+ 7) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Trees and Forests (cont)}} \tn % Row 12 \SetRowColor{LightBackground} Draw graph & graph = \seqsplit{pydotplus.graph\_from\_dot\_data(dot\_data)} \tn % Row Count 3 (+ 3) % Row 13 \SetRowColor{white}
Show graph & \seqsplit{Image(graph.create\_png())} \tn % Row Count 5 (+ 2) % Row 14 \SetRowColor{LightBackground} Create PDF & \seqsplit{graph.write\_pdf("iris.pdf")} \tn % Row Count 7 (+ 2) % Row 15 \SetRowColor{white} Create PNG & \seqsplit{graph.write\_png("iris.png")} \tn % Row Count 9 (+ 2) % Row 16 \SetRowColor{LightBackground} Training a Random Forest Classifier & from sklearn.ensemble import \seqsplit{RandomForestClassifier} \tn % Row Count 12 (+ 3) % Row 17 \SetRowColor{white} Create random forest classifier object & randomforest = \seqsplit{RandomForestClassifier(random\_state=0}, n\_jobs=-1) \tn % Row Count 16 (+ 4) % Row 18 \SetRowColor{LightBackground} Create random forest classifier object using entropy & \seqsplit{randomforest\_entropy} = \seqsplit{RandomForestClassifier(} criterion="entropy", random\_state=0) \tn % Row Count 21 (+ 5) % Row 19 \SetRowColor{white} Training a Random Forest Regressor & from sklearn.ensemble import \seqsplit{RandomForestRegressor} \tn % Row Count 24 (+ 3) % Row 20 \SetRowColor{LightBackground} Create random forest classifier object & randomforest = \seqsplit{RandomForestRegressor(random\_state=0}, n\_jobs=-1) \tn % Row Count 28 (+ 4) % Row 21 \SetRowColor{white} Identifying Important Features in Random Forests & from sklearn.ensemble import \seqsplit{RandomForestClassifier} \tn % Row Count 31 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Trees and Forests (cont)}} \tn % Row 22 \SetRowColor{LightBackground} Create random forest classifier object & randomforest = \seqsplit{RandomForestClassifier(random\_state=0}, n\_jobs=-1) \tn % Row Count 4 (+ 4) % Row 23 \SetRowColor{white} Calculate feature importances & importances = \seqsplit{model.feature\_importances\_} \tn % Row Count 6 (+ 2) % Row 24 \SetRowColor{LightBackground} Sort feature importances in descending order & indices = np.argsort(importances){[}::-1{]} \tn % Row Count 9 (+ 3) % Row 25 \SetRowColor{white} Rearrange feature names so they match the sorted feature importances & names = {[}iris.feature\_names{[}i{]} for i in indices{]} \tn % Row Count 13 (+ 4) % Row 26 \SetRowColor{LightBackground} Create plot & plt.figure() \tn % Row Count 14 (+ 1) % Row 27 \SetRowColor{white} Create plot title & plt.title("Feature Importance") \tn % Row Count 16 (+ 2) % Row 28 \SetRowColor{LightBackground} Add bars & \seqsplit{plt.bar(range(features}.shape{[}1{]}), importances{[}indices{]}) \tn % Row Count 19 (+ 3) % Row 29 \SetRowColor{white} Add feature names as x-axis labels & \seqsplit{plt.xticks(range(features}.shape{[}1{]}), names, rotation=90) \tn % Row Count 22 (+ 3) % Row 30 \SetRowColor{LightBackground} Show plot & plt.show() \tn % Row Count 23 (+ 1) % Row 31 \SetRowColor{white} Selecting Important Features in Random Forests & from \seqsplit{sklearn.feature\_selection} import SelectFromModel \tn % Row Count 26 (+ 3) % Row 32 \SetRowColor{LightBackground} Create random forest classifier & randomforest = \seqsplit{RandomForestClassifier(random\_state=0}, n\_jobs=-1) \tn % Row Count 30 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Trees and Forests (cont)}} \tn % Row 33 \SetRowColor{LightBackground} Create object that selects features with importance greater than or equal to a threshold & selector = 
\seqsplit{SelectFromModel(randomforest}, threshold=0.3) \tn % Row Count 5 (+ 5) % Row 34 \SetRowColor{white} Create new feature matrix using selector & features\_important = \seqsplit{selector.fit\_transform(features}, target) \tn % Row Count 9 (+ 4) % Row 35 \SetRowColor{LightBackground} Train random forest using most important features & model = \seqsplit{randomforest.fit(features\_important}, target) \tn % Row Count 12 (+ 3) % Row 36 \SetRowColor{white} Handling Imbalanced Classes & Train a decision tree or random forest model with \seqsplit{class\_weight="balanced"} \tn % Row Count 16 (+ 4) % Row 37 \SetRowColor{LightBackground} Create random forest classifier object & randomforest = \seqsplit{RandomForestClassifier(} random\_state=0, n\_jobs=-1, \seqsplit{class\_weight="balanced")} \tn % Row Count 21 (+ 5) % Row 38 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{Controlling Tree Size} \tn % Row Count 22 (+ 1) % Row 39 \SetRowColor{LightBackground} Create decision tree classifier object & decisiontree = \seqsplit{DecisionTreeClassifier(random\_state=0}, max\_depth=None, \seqsplit{min\_samples\_split=2}, \seqsplit{min\_samples\_leaf=1}, \seqsplit{min\_weight\_fraction\_leaf=0}, \seqsplit{max\_leaf\_nodes=None}, \seqsplit{min\_impurity\_decrease=0)} \tn % Row Count 32 (+ 10) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Trees and Forests (cont)}} \tn % Row 40 \SetRowColor{LightBackground} Improving Performance Through Boosting & from sklearn.ensemble import AdaBoostClassifier \tn % Row Count 3 (+ 3) % Row 41 \SetRowColor{white} Create adaboost tree classifier object & adaboost = \seqsplit{AdaBoostClassifier(random\_state=0)} \tn % Row Count 6 (+ 3) % Row 42 \SetRowColor{LightBackground} Evaluating Random Forests with Out-of-Bag Errors & You need to evaluate a random forest model without using cross-validation \tn % Row Count 10 (+ 4) % Row 43 \SetRowColor{white} Create random forest classifier object & randomforest = \seqsplit{RandomForestClassifier(} random\_state=0, n\_estimators=1000, oob\_score=True, n\_jobs=-1) \tn % Row Count 15 (+ 5) % Row 44 \SetRowColor{LightBackground} OOB score of a random forest & \seqsplit{randomforest.oob\_score\_} \tn % Row Count 17 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Trees and Forests}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Training a Decision Tree Classifier} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} Load libraries & from sklearn.tree import \seqsplit{DecisionTreeClassifier} \tn % Row Count 4 (+ 3) % Row 2 \SetRowColor{LightBackground} Create decision tree classifier object & decisiontree = \seqsplit{DecisionTreeClassifier(random\_state=0)} \tn % Row Count 7 (+ 3) % Row 3 \SetRowColor{white} Train model & model = \seqsplit{decisiontree.fit(features}, target) \tn % Row Count 10 (+ 3) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Training a Decision Tree Regressor} \tn % Row Count 11 (+ 1) % Row 5 \SetRowColor{white} Use scikit-learn's \seqsplit{DecisionTreeRegressor} & from sklearn.tree import \seqsplit{DecisionTreeRegressor} \tn % Row Count 14 (+ 3) % Row 6 \SetRowColor{LightBackground} Create decision tree regressor object & decisiontree =
\seqsplit{DecisionTreeRegressor(random\_state=0)} \tn % Row Count 17 (+ 3) % Row 7 \SetRowColor{white} Train model & model = \seqsplit{decisiontree.fit(features}, target) \tn % Row Count 20 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear Regression}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Fitting a Line} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} Load libraries & from \seqsplit{sklearn.linear\_model} import LinearRegression \tn % Row Count 4 (+ 3) % Row 2 \SetRowColor{LightBackground} Create linear regression & regression = LinearRegression() \tn % Row Count 6 (+ 2) % Row 3 \SetRowColor{white} Fit the linear regression & model = \seqsplit{regression.fit(features}, target) \tn % Row Count 8 (+ 2) % Row 4 \SetRowColor{LightBackground} Handling Interactive Effects & You have a feature whose effect on the target variable depends on another feature. \tn % Row Count 13 (+ 5) % Row 5 \SetRowColor{white} Load libraries & from \seqsplit{sklearn.preprocessing} import PolynomialFeatures \tn % Row Count 16 (+ 3) % Row 6 \SetRowColor{LightBackground} Create interaction term & interaction = PolynomialFeatures( degree=3, include\_bias=False, \seqsplit{interaction\_only=True)} \tn % Row Count 21 (+ 5) % Row 7 \SetRowColor{white} & \seqsplit{features\_interaction} = \seqsplit{interaction.fit\_transform(features)} \tn % Row Count 24 (+ 3) % Row 8 \SetRowColor{LightBackground} Create linear regression & regression = LinearRegression() \tn % Row Count 26 (+ 2) % Row 9 \SetRowColor{white} Fit the linear regression & model = \seqsplit{regression.fit(features\_interaction}, target) \tn % Row Count 29 (+ 3) % Row 10 \SetRowColor{LightBackground} Fitting a Nonlinear Relationship & Create a polynomial regression by including polynomial features in a linear regression model \tn % Row Count 34 (+ 5) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear Regression (cont)}} \tn % Row 11 \SetRowColor{LightBackground} Load library & from \seqsplit{sklearn.preprocessing} import PolynomialFeatures \tn % Row Count 3 (+ 3) % Row 12 \SetRowColor{white} Create polynomial features x\textasciicircum{}2 and x\textasciicircum{}3 & polynomial = \seqsplit{PolynomialFeatures(degree=3}, \seqsplit{include\_bias=False)} \tn % Row Count 7 (+ 4) % Row 13 \SetRowColor{LightBackground} & \seqsplit{features\_polynomial} = \seqsplit{polynomial.fit\_transform(features)} \tn % Row Count 10 (+ 3) % Row 14 \SetRowColor{white} Create linear regression & regression = LinearRegression() \tn % Row Count 12 (+ 2) % Row 15 \SetRowColor{LightBackground} Fit the linear regression & model = \seqsplit{regression.fit(features\_polynomial}, target) \tn % Row Count 15 (+ 3) % Row 16 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{Reducing Variance with Regularization} \tn % Row Count 16 (+ 1) % Row 17 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Use a learning algorithm that includes a shrinkage penalty (also called regularization) like ridge regression and lasso regression:} \tn % Row Count 19 (+ 3) % Row 18 \SetRowColor{white} Load libraries & from \seqsplit{sklearn.linear\_model} import Ridge \tn % Row Count 21 (+ 2) % Row 19 \SetRowColor{LightBackground} Create ridge regression 
with an alpha value & regression = Ridge(alpha=0.5) \tn % Row Count 24 (+ 3) % Row 20 \SetRowColor{white} Fit the linear regression & model = \seqsplit{regression.fit(features\_standardized}, target) \tn % Row Count 27 (+ 3) % Row 21 \SetRowColor{LightBackground} Load library & from \seqsplit{sklearn.linear\_model} import RidgeCV \tn % Row Count 29 (+ 2) % Row 22 \SetRowColor{white} Create ridge regression with three alpha values & regr\_cv = RidgeCV(alphas={[}0.1, 1.0, 10.0{]}) \tn % Row Count 32 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear Regression (cont)}} \tn % Row 23 \SetRowColor{LightBackground} Fit the linear regression & model\_cv = \seqsplit{regr\_cv.fit(features\_standardized}, target) \tn % Row Count 3 (+ 3) % Row 24 \SetRowColor{white} View coefficients & model\_cv.coef\_ \tn % Row Count 4 (+ 1) % Row 25 \SetRowColor{LightBackground} View alpha & model\_cv.alpha\_ \tn % Row Count 5 (+ 1) % Row 26 \SetRowColor{white} Reducing Features with Lasso Regression & You want to simplify your linear regression model by reducing the number of features. \tn % Row Count 10 (+ 5) % Row 27 \SetRowColor{LightBackground} Load library & from \seqsplit{sklearn.linear\_model} import Lasso \tn % Row Count 12 (+ 2) % Row 28 \SetRowColor{white} Create lasso regression with alpha value & regression = Lasso(alpha=0.5) \tn % Row Count 14 (+ 2) % Row 29 \SetRowColor{LightBackground} Fit the linear regression & model = \seqsplit{regression.fit(features\_standardized}, target) \tn % Row Count 17 (+ 3) % Row 30 \SetRowColor{white} Create lasso regression with a high alpha & regression\_a10 = Lasso(alpha=10) \tn % Row Count 20 (+ 3) % Row 31 \SetRowColor{LightBackground} & model\_a10 = \seqsplit{regression\_a10.fit(features\_standardized}, target) \tn % Row Count 24 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{interaction\_only=True tells PolynomialFeatures to only return interaction terms \newline PolynomialFeatures will add a feature containing ones called a bias. 
We can prevent that with include\_bias=False \newline Polynomial regression is an extension of linear regression to allow us to model nonlinear relationships.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Loading Data}} \tn % Row 0 \SetRowColor{LightBackground} Loading a Sample Dataset & from sklearn import datasets \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} & digits = \seqsplit{datasets.load\_digits()} \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} & features = digits.data \tn % Row Count 6 (+ 2) % Row 3 \SetRowColor{white} & target = digits.target \tn % Row Count 8 (+ 2) % Row 4 \SetRowColor{LightBackground} Creating a Simulated Dataset for regression & from sklearn.datasets import make\_regression \tn % Row Count 11 (+ 3) % Row 5 \SetRowColor{white} & features, target, coefficients = \seqsplit{make\_regression(n\_samples} = 100, n\_features = 3, n\_informative = 3, n\_targets = 1, noise = 0.0, coef = True, random\_state = 1) \tn % Row Count 19 (+ 8) % Row 6 \SetRowColor{LightBackground} Creating a Simulated Dataset for classification & from sklearn.datasets import \seqsplit{make\_classification} \tn % Row Count 22 (+ 3) % Row 7 \SetRowColor{white} & features, target = \seqsplit{make\_classification(n\_samples} = 100, n\_features = 3, n\_informative = 3, n\_redundant = 0, n\_classes = 2, weights = {[}.25, .75{]}, random\_state = 1) \tn % Row Count 31 (+ 9) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Loading Data (cont)}} \tn % Row 8 \SetRowColor{LightBackground} Creating a Simulated Dataset for clustering & from sklearn.datasets import make\_blobs \tn % Row Count 3 (+ 3) % Row 9 \SetRowColor{white} & features, target = \seqsplit{make\_blobs(n\_samples} = 100, n\_features = 2, centers = 3, cluster\_std = 0.5, shuffle = True, random\_state = 1) \tn % Row Count 10 (+ 7) % Row 10 \SetRowColor{LightBackground} Loading a CSV File & dataframe = pd.read\_csv(data,sep=',') \tn % Row Count 12 (+ 2) % Row 11 \SetRowColor{white} Loading an Excel File & pd.read\_excel(url, sheetname=0, header=1) \tn % Row Count 15 (+ 3) % Row 12 \SetRowColor{LightBackground} & If we need to load multiple sheets, include them as a list. \tn % Row Count 18 (+ 3) % Row 13 \SetRowColor{white} Loading a JSON File & pd.read\_json(url, orient='columns') \tn % Row Count 20 (+ 2) % Row 14 \SetRowColor{LightBackground} & The key difference is the orient parameter, which indicates to pandas how the JSON file is structured. However, it might take some experimenting to figure out which argument (split, records, index, columns, and values) is the right one. 
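\newline For example (illustrative): pd.read\_json('data.json', orient='records') reads a file laid out as a list of records.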
\tn % Row Count 32 (+ 12) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Loading Data (cont)}} \tn % Row 15 \SetRowColor{LightBackground} convert semistructured JSON data into a pandas DataFrame & json\_normalize \tn % Row Count 3 (+ 3) % Row 16 \SetRowColor{white} Querying a SQL Database & from sqlalchemy import create\_engine \tn % Row Count 5 (+ 2) % Row 17 \SetRowColor{LightBackground} & \seqsplit{database\_connection} = \seqsplit{create\_engine('sqlite:///sample}.db') \tn % Row Count 8 (+ 3) % Row 18 \SetRowColor{white} & \seqsplit{pd.read\_sql\_query('SELECT} * FROM data', \seqsplit{database\_connection)} \tn % Row Count 11 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{In addition, make\_classification contains a weights parameter that allows us to simulate datasets with imbalanced classes. For example, weights = {[}.25,.75{]} \newline For make\_blobs, the centers parameter determines the number of clusters \newline generated.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Naive Bayes}} \tn % Row 0 \SetRowColor{LightBackground} Training a Classifier for Continuous Features & Use a Gaussian naive Bayes classifier \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} Load libraries & from \seqsplit{sklearn.naive\_bayes} import GaussianNB \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} Create Gaussian Naive Bayes object & classifer = GaussianNB() \tn % Row Count 8 (+ 2) % Row 3 \SetRowColor{white} Train model & model = \seqsplit{classifer.fit(features}, target) \tn % Row Count 10 (+ 2) % Row 4 \SetRowColor{LightBackground} Create Gaussian Naive Bayes object with prior probabilities of each class & clf = GaussianNB(priors={[}0.25, 0.25, 0.5{]}) \tn % Row Count 14 (+ 4) % Row 5 \SetRowColor{white} Training a Classifier for Discrete and Count Features & Given discrete or count data \tn % Row Count 17 (+ 3) % Row 6 \SetRowColor{LightBackground} Load libraries & from \seqsplit{sklearn.naive\_bayes} import MultinomialNB \tn % Row Count 20 (+ 3) % Row 7 \SetRowColor{white} & from \seqsplit{sklearn.feature\_extraction.text} import CountVectorizer \tn % Row Count 23 (+ 3) % Row 8 \SetRowColor{LightBackground} Create bag of words & count = CountVectorizer() \tn % Row Count 25 (+ 2) % Row 9 \SetRowColor{white} & bag\_of\_words = \seqsplit{count.fit\_transform(text\_data)} \tn % Row Count 28 (+ 3) % Row 10 \SetRowColor{LightBackground} Create feature matrix & features = \seqsplit{bag\_of\_words.toarray()} \tn % Row Count 30 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Naive Bayes (cont)}} \tn % Row 11 \SetRowColor{LightBackground} Create multinomial naive Bayes object with prior probabilities of each class & classifer = MultinomialNB(class\_prior={[}0.25, 0.5{]}) \tn % Row Count 4 (+ 4) % Row 12 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{Training a Naive Bayes Classifier for Binary Features} \tn % Row Count 6 (+ 2) % Row 13 \SetRowColor{LightBackground} Load libraries & from \seqsplit{sklearn.naive\_bayes} import BernoulliNB \tn % Row Count 9 (+ 3) % 
Row 14 \SetRowColor{white} Create Bernoulli Naive Bayes object with prior probabilities of each class & classifer = BernoulliNB(class\_prior={[}0.25, 0.5{]}) \tn % Row Count 13 (+ 4) % Row 15 \SetRowColor{LightBackground} Calibrating Predicted Probabilities & You want to calibrate the predicted probabilities from naive Bayes classifiers so they are interpretable. \tn % Row Count 19 (+ 6) % Row 16 \SetRowColor{white} Load libraries & from sklearn.calibration import \seqsplit{CalibratedClassifierCV} \tn % Row Count 22 (+ 3) % Row 17 \SetRowColor{LightBackground} Create calibrated cross-validation with sigmoid calibration & classifer\_sigmoid = \seqsplit{CalibratedClassifierCV(classifer}, cv=2, method='sigmoid') \tn % Row Count 26 (+ 4) % Row 18 \SetRowColor{white} Calibrate probabilities & \seqsplit{classifer\_sigmoid.fit(features}, target) \tn % Row Count 28 (+ 2) % Row 19 \SetRowColor{LightBackground} View calibrated probabilities & \seqsplit{classifer\_sigmoid.predict\_proba(new\_observation)} \tn % Row Count 31 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{If class\_prior is not specified, prior probabilities are learned using the data. However, if we want a uniform distribution to be used as the prior, we can set fit\_prior=False.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Logistic Regression}} \tn % Row 0 \SetRowColor{LightBackground} Training a Binary Classifier & from \seqsplit{sklearn.linear\_model} import LogisticRegression \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} & from \seqsplit{sklearn.preprocessing} import StandardScaler \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} Create logistic regression object & \seqsplit{logistic\_regression} = \seqsplit{LogisticRegression(random\_state=0)} \tn % Row Count 9 (+ 3) % Row 3 \SetRowColor{white} View predicted probabilities & \seqsplit{model.predict\_proba(new\_observation)} \tn % Row Count 11 (+ 2) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Training a Multiclass Classifier} \tn % Row Count 12 (+ 1) % Row 5 \SetRowColor{white} Create one-vs-rest logistic regression object & \seqsplit{logistic\_regression} = \seqsplit{LogisticRegression(random\_state=0}, multi\_class="ovr") \tn % Row Count 16 (+ 4) % Row 6 \SetRowColor{LightBackground} Reducing Variance Through Regularization & Tune the regularization strength hyperparameter, C \tn % Row Count 19 (+ 3) % Row 7 \SetRowColor{white} Create cross-validated logistic regression object & \seqsplit{logistic\_regression} = \seqsplit{LogisticRegressionCV(} penalty='l2', Cs=10, random\_state=0, n\_jobs=-1) \tn % Row Count 24 (+ 5) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Training a Classifier on Very Large Data} \tn % Row Count 25 (+ 1) % Row 9 \SetRowColor{white} Create logistic regression object & \seqsplit{logistic\_regression} = \seqsplit{LogisticRegression(random\_state=0}, solver="sag") \tn % Row Count 29 (+ 4) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Handling Imbalanced Classes} \tn % Row Count 30 (+ 1) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Logistic Regression (cont)}} \tn % Row 11 \SetRowColor{LightBackground}
Create target vector marking class 0 as 0 and all other classes as 1 & target = np.where((target == 0), 0, 1) \tn % Row Count 3 (+ 3) % Row 12 \SetRowColor{white} Create logistic regression object & \seqsplit{logistic\_regression} = \seqsplit{LogisticRegression(random\_state=0}, \seqsplit{class\_weight="balanced")} \tn % Row Count 8 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{K-Nearest Neighbors}} \tn % Row 0 \SetRowColor{LightBackground} Finding an Observation's Nearest Neighbors & from sklearn.neighbors import NearestNeighbors \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} Create standardizer & standardizer = StandardScaler() \tn % Row Count 5 (+ 2) % Row 2 \SetRowColor{LightBackground} Standardize features & \seqsplit{features\_standardized} = \seqsplit{standardizer.fit\_transform(features)} \tn % Row Count 8 (+ 3) % Row 3 \SetRowColor{white} Two nearest neighbors & nearest\_neighbors = \seqsplit{NearestNeighbors(n\_neighbors=2)}.fit(features\_standardized) \tn % Row Count 12 (+ 4) % Row 4 \SetRowColor{LightBackground} Create an observation & new\_observation = {[} 1, 1, 1, 1{]} \tn % Row Count 14 (+ 2) % Row 5 \SetRowColor{white} Find distances and indices of the observation's nearest neighbors & distances, indices = nearest\_neighbors.kneighbors({[}new\_observation{]}) \tn % Row Count 18 (+ 4) % Row 6 \SetRowColor{LightBackground} View the nearest neighbors & features\_standardized{[}indices{]} \tn % Row Count 20 (+ 2) % Row 7 \SetRowColor{white} Find two nearest neighbors based on Euclidean distance & \seqsplit{nearestneighbors\_euclidean} = NearestNeighbors( n\_neighbors=2, \seqsplit{metric='euclidean').fit(features\_standardized)} \tn % Row Count 26 (+ 6) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Create a matrix indicating each observation's nearest neighbors} \tn % Row Count 28 (+ 2) % Row 9 \SetRowColor{white} Find each observation's three nearest neighbors based on Euclidean distance (including itself) & \seqsplit{nearestneighbors\_euclidean} = NearestNeighbors( n\_neighbors=3, \seqsplit{metric="euclidean").fit(features\_standardized)} \tn % Row Count 34 (+ 6) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{K-Nearest Neighbors (cont)}} \tn % Row 10 \SetRowColor{LightBackground} List of lists indicating each observation's 3 nearest neighbors & \seqsplit{nearest\_neighbors\_with\_self} = \seqsplit{nearestneighbors\_euclidean}.kneighbors\_graph( \seqsplit{features\_standardized)}.toarray() \tn % Row Count 6 (+ 6) % Row 11 \SetRowColor{white} Remove 1s marking each observation as a nearest neighbor of itself & for i, x in \seqsplit{enumerate(nearest\_neighbors\_with\_self):} \tn % Row Count 10 (+ 4) % Row 12 \SetRowColor{LightBackground} & x{[}i{]} = 0 \tn % Row Count 11 (+ 1) % Row 13 \SetRowColor{white} View first observation's two nearest neighbors & nearest\_neighbors\_with\_self{[}0{]} \tn % Row Count 14 (+ 3) % Row 14 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Creating a K-Nearest Neighbor Classifier} \tn % Row Count 15 (+ 1) % Row 15 \SetRowColor{white} Train a KNN classifier with 5 neighbors & knn = \seqsplit{KNeighborsClassifier(n\_neighbors=5}, \seqsplit{n\_jobs=-1).fit(X\_std}, y) \tn % Row Count 19 (+ 4) % Row 16
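\SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{A minimal prediction sketch (assuming knn, X\_std, and y from the row above): knn.predict(X\_std{[}0:2{]}) returns the predicted classes for the first two observations, and knn.predict\_proba(X\_std{[}0:2{]}) returns their class probabilities.} \tn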
\SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Identifying the Best Neighborhood Size} \tn % Row Count 20 (+ 1) % Row 17 \SetRowColor{white} Load libraries & from sklearn.pipeline import Pipeline, FeatureUnion \tn % Row Count 23 (+ 3) % Row 18 \SetRowColor{LightBackground} & from \seqsplit{sklearn.model\_selection} import GridSearchCV \tn % Row Count 26 (+ 3) % Row 19 \SetRowColor{white} Create a pipeline & pipe = Pipeline({[}("standardizer", standardizer), ("knn", knn){]}) \tn % Row Count 30 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{K-Nearest Neighbors (cont)}} \tn % Row 20 \SetRowColor{LightBackground} Create space of candidate values & search\_space = {[}\{"knn\_\_n\_neighbors": {[}1, 2, 3, 4, 5, 6, 7, 8, 9, 10{]}\}{]} \tn % Row Count 4 (+ 4) % Row 21 \SetRowColor{white} Create grid search & classifier = GridSearchCV( pipe, search\_space, cv=5, \seqsplit{verbose=0).fit(features\_standardized}, target) \tn % Row Count 9 (+ 5) % Row 22 \SetRowColor{LightBackground} Best neighborhood size (k) & \seqsplit{classifier.best\_estimator\_}.get\_params(){[}"knn\_\_n\_neighbors"{]} \tn % Row Count 12 (+ 3) % Row 23 \SetRowColor{white} Creating a Radius-Based Nearest Neighbor Classifier & from sklearn.neighbors import \seqsplit{RadiusNeighborsClassifier} \tn % Row Count 15 (+ 3) % Row 24 \SetRowColor{LightBackground} Train a radius neighbors classifier & rnn = \seqsplit{RadiusNeighborsClassifier(} radius=.5, \seqsplit{n\_jobs=-1).fit(features\_standardized}, target) \tn % Row Count 20 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Selection}} \tn % Row 0 \SetRowColor{LightBackground} Selecting Best Models Using Exhaustive Search & from \seqsplit{sklearn.model\_selection} import GridSearchCV \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} Create range of candidate penalty hyperparameter values & penalty = {[}'l1', 'l2'{]} \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} Create range of candidate regularization hyperparameter values & C = np.logspace(0, 4, 10) \tn % Row Count 10 (+ 4) % Row 3 \SetRowColor{white} & \seqsplit{numpy.logspace(start}, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0) \tn % Row Count 15 (+ 5) % Row 4 \SetRowColor{LightBackground} Create dictionary hyperparameter candidates & hyperparameters = dict(C=C, penalty=penalty) \tn % Row Count 18 (+ 3) % Row 5 \SetRowColor{white} Create grid search & gridsearch = \seqsplit{GridSearchCV(logistic}, hyperparameters, cv=5, verbose=0) \tn % Row Count 22 (+ 4) % Row 6 \SetRowColor{LightBackground} Fit grid search & best\_model = \seqsplit{gridsearch.fit(features}, target) \tn % Row Count 25 (+ 3) % Row 7 \SetRowColor{white} Predict target vector & \seqsplit{best\_model.predict(features)} \tn % Row Count 27 (+ 2) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Selecting Best Models Using Randomized Search} \tn % Row Count 28 (+ 1) % Row 9 \SetRowColor{white} Load libraries & from \seqsplit{sklearn.model\_selection} import RandomizedSearchCV \tn % Row Count 31 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} 
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Selection (cont)}} \tn % Row 10 \SetRowColor{LightBackground} Create range of candidate regularization penalty hyperparameter values & penalty = {[}'l1', 'l2'{]} \tn % Row Count 4 (+ 4) % Row 11 \SetRowColor{white} Create distribution of candidate regularization hyperparameter values & from scipy.stats import uniform \tn % Row Count 8 (+ 4) % Row 12 \SetRowColor{LightBackground} & C = uniform(loc=0, scale=4) \tn % Row Count 10 (+ 2) % Row 13 \SetRowColor{white} Create hyperparameter options & hyperparameters = dict(C=C, penalty=penalty) \tn % Row Count 13 (+ 3) % Row 14 \SetRowColor{LightBackground} Create randomized search & randomizedsearch = RandomizedSearchCV( logistic, hyperparameters, random\_state=1, n\_iter=100, cv=5, verbose=0, n\_jobs=-1) \tn % Row Count 20 (+ 7) % Row 15 \SetRowColor{white} Fit randomized search & best\_model = \seqsplit{randomizedsearch.fit(features}, target) \tn % Row Count 23 (+ 3) % Row 16 \SetRowColor{LightBackground} Predict target vector & \seqsplit{best\_model.predict(features)} \tn % Row Count 25 (+ 2) % Row 17 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{Selecting Best Models from Multiple} \tn % Row Count 26 (+ 1) % Row 18 \SetRowColor{LightBackground} Load libraries & from \seqsplit{sklearn.model\_selection} import GridSearchCV \tn % Row Count 29 (+ 3) % Row 19 \SetRowColor{white} & from sklearn.pipeline import Pipeline \tn % Row Count 31 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Selection (cont)}} \tn % Row 20 \SetRowColor{LightBackground} Create a pipeline & pipe = Pipeline({[}("classifier", RandomForestClassifier()){]}) \tn % Row Count 3 (+ 3) % Row 21 \SetRowColor{white} Create dictionary with candidate learning algorithms and their hyperparameters & search\_space = {[}\{"classifier": {[}LogisticRegression(){]}, \seqsplit{"classifier\_\_penalty":} {[}'l1', 'l2'{]}, "classifier\_\_C": np.logspace(0, 4, 10)\}, \{"classifier": {[}RandomForestClassifier(){]}, \seqsplit{"classifier\_\_n\_estimators":} {[}10, 100, 1000{]}, \seqsplit{"classifier\_\_max\_features":} {[}1, 2, 3{]}\}{]} \tn % Row Count 16 (+ 13) % Row 22 \SetRowColor{LightBackground} Create grid search & gridsearch = GridSearchCV(pipe, search\_space, cv=5, verbose=0) \tn % Row Count 20 (+ 4) % Row 23 \SetRowColor{white} Fit grid search & best\_model = \seqsplit{gridsearch.fit(features}, target) \tn % Row Count 23 (+ 3) % Row 24 \SetRowColor{LightBackground} View best model & \seqsplit{best\_model.best\_estimator\_}.get\_params(){[}"classifier"{]} \tn % Row Count 26 (+ 3) % Row 25 \SetRowColor{white} Predict target vector & \seqsplit{best\_model.predict(features)} \tn % Row Count 28 (+ 2) % Row 26 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Selecting Best Models When Preprocessing} \tn % Row Count 29 (+ 1) % Row 27 \SetRowColor{white} Load libraries & from sklearn.pipeline import Pipeline, FeatureUnion \tn % Row Count 32 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Selection (cont)}} \tn % Row 28 \SetRowColor{LightBackground} Create a preprocessing object that includes StandardScaler features and PCA & preprocess = FeatureUnion({[}("std", StandardScaler()), ("pca", PCA()){]}) \tn % Row Count 4 (+ 4) % 
% Row 29
\SetRowColor{white}
Create a pipeline & pipe = Pipeline({[}("preprocess", preprocess), ("classifier", LogisticRegression()){]}) \tn
% Row Count 9 (+ 5)
% Row 30
\SetRowColor{LightBackground}
Create space of candidate values & search\_space = {[}\{"preprocess\_\_pca\_\_n\_components": {[}1, 2, 3{]}, \seqsplit{"classifier\_\_penalty":} {[}"l1", "l2"{]}, "classifier\_\_C": np.logspace(0, 4, 10)\}{]} \tn
% Row Count 16 (+ 7)
% Row 31
\SetRowColor{white}
Create grid search & clf = GridSearchCV(pipe, search\_space, cv=5, verbose=0, n\_jobs=-1) \tn
% Row Count 20 (+ 4)
% Row 32
\SetRowColor{LightBackground}
Fit grid search & best\_model = clf.fit(features, target) \tn
% Row Count 22 (+ 2)
% Row 33
\SetRowColor{white}
Speeding Up Model Selection with Parallelization & Use all the cores in your machine by setting n\_jobs=-1 \tn
% Row Count 25 (+ 3)
% Row 34
\SetRowColor{LightBackground}
 & gridsearch = \seqsplit{GridSearchCV(logistic}, hyperparameters, cv=5, n\_jobs=-1, verbose=1) \tn
% Row Count 29 (+ 4)
% Row 35
\SetRowColor{white}
Speeding Up Model Selection Using Algorithm-Specific Methods & If you are using a select number of learning algorithms, use scikit-learn's model-specific cross-validation hyperparameter tuning. \tn
% Row Count 36 (+ 7)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Selection (cont)}} \tn
% Row 36
\SetRowColor{LightBackground}
Create cross-validated logistic regression & logit = \seqsplit{linear\_model.LogisticRegressionCV(Cs=100)} \tn
% Row Count 3 (+ 3)
% Row 37
\SetRowColor{white}
Train model & logit.fit(features, target) \tn
% Row Count 5 (+ 2)
% Row 38
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Evaluating Performance After Model Selection} \tn
% Row Count 6 (+ 1)
% Row 39
\SetRowColor{white}
Load libraries & from \seqsplit{sklearn.model\_selection} import GridSearchCV, cross\_val\_score \tn
% Row Count 10 (+ 4)
% Row 40
\SetRowColor{LightBackground}
Conduct nested cross-validation and output the average score & \seqsplit{cross\_val\_score(gridsearch}, features, target).mean() \tn
% Row Count 13 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{In scikit-learn, many learning algorithms (e.g., ridge, \newline lasso, and elastic net regression) have an algorithm-specific cross-validation \newline method to take advantage of this.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
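To tie the grid-search rows above together, the following minimal sketch standardizes the features in a pipeline and searches over candidate regularization strengths; it assumes scikit-learn is available and uses the iris data purely for illustration (it is not code from the book).

{\footnotesize
\begin{verbatim}
# Minimal grid-search sketch (assumed setup, not the book's code)
import numpy as np
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

features, target = load_iris(return_X_y=True)

# Standardize, then fit a logistic regression
pipe = Pipeline([("standardizer", StandardScaler()),
                 ("classifier", LogisticRegression(max_iter=500))])

# Candidate regularization strengths for the classifier step
search_space = [{"classifier__C": np.logspace(0, 4, 10)}]

# Exhaustive search with 5-fold cross-validation
best_model = GridSearchCV(pipe, search_space,
                          cv=5).fit(features, target)
print(best_model.best_params_)
\end{verbatim}
}
\par\addvspace{1.3em}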
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Dates and Times}} \tn
% Row 0
\SetRowColor{LightBackground}
Create strings & date\_strings = np.array({[}'03-04-2005 11:35 PM', '23-05-2010 12:01 AM', '04-09-2009 09:09 PM'{]}) \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
Convert to datetimes & {[}pd.to\_datetime(date, format='\%d-\%m-\%Y \%I:\%M \%p', errors="coerce") for date in date\_strings{]} \tn
% Row Count 10 (+ 5)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Handling Time Zones} \tn
% Row Count 11 (+ 1)
% Row 3
\SetRowColor{white}
Create datetime & \seqsplit{pd.Timestamp('2017-05-01} 06:00:00', tz='Europe/London') \tn
% Row Count 14 (+ 3)
% Row 4
\SetRowColor{LightBackground}
We can add a time zone to a previously created datetime & date\_in\_london = \seqsplit{date.tz\_localize('Europe/London')} \tn
% Row Count 17 (+ 3)
% Row 5
\SetRowColor{white}
Convert to a different time zone & \seqsplit{date\_in\_london.tz\_convert('Africa/Abidjan')} \tn
% Row Count 20 (+ 3)
% Row 6
\SetRowColor{LightBackground}
Apply tz\_localize and tz\_convert to every element & \seqsplit{dates.dt.tz\_localize('Africa/Abidjan')} \tn
% Row Count 23 (+ 3)
% Row 7
\SetRowColor{white}
Import all\_timezones & from pytz import all\_timezones \tn
% Row Count 25 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Create datetimes range & dataframe{[}'date'{]} = \seqsplit{pd.date\_range('1/1/2001'}, periods=100000, freq='H') \tn
% Row Count 29 (+ 4)
% Row 9
\SetRowColor{white}
Select observations between two datetimes & dataframe{[}(dataframe{[}'date'{]} \textgreater{} '2002-1-1 01:00:00') \& (dataframe{[}'date'{]} \textless{}= '2002-1-1 04:00:00'){]} \tn
% Row Count 34 (+ 5)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Dates and Times (cont)}} \tn
% Row 10
\SetRowColor{LightBackground}
Breaking Up Date Data into Multiple Features & dataframe{[}'year'{]} = dataframe{[}'date'{]}.dt.year \tn
% Row Count 3 (+ 3)
% Row 11
\SetRowColor{white}
 & dataframe{[}'month'{]} = dataframe{[}'date'{]}.dt.month \tn
% Row Count 6 (+ 3)
% Row 12
\SetRowColor{LightBackground}
 & dataframe{[}'day'{]} = dataframe{[}'date'{]}.dt.day \tn
% Row Count 9 (+ 3)
% Row 13
\SetRowColor{white}
 & dataframe{[}'hour'{]} = dataframe{[}'date'{]}.dt.hour \tn
% Row Count 12 (+ 3)
% Row 14
\SetRowColor{LightBackground}
 & dataframe{[}'minute'{]} = dataframe{[}'date'{]}.dt.minute \tn
% Row Count 15 (+ 3)
% Row 15
\SetRowColor{white}
Calculate duration between features & \seqsplit{pd.Series(delta.days} for delta in (dataframe{[}'Left'{]} - dataframe{[}'Arrived'{]})) \tn
% Row Count 19 (+ 4)
% Row 16
\SetRowColor{LightBackground}
Show days of the week & \seqsplit{dates.dt.weekday\_name} \tn
% Row Count 21 (+ 2)
% Row 17
\SetRowColor{white}
Show days of the week as numbers (Monday is 0) & dates.dt.weekday \tn
% Row Count 24 (+ 3)
% Row 18
\SetRowColor{LightBackground}
Creating a Lagged Feature (Lagged values by one row) & dataframe{[}"previous\_days\_stock\_price"{]} = dataframe{[}"stock\_price"{]}.shift(1) \tn
% Row Count 28 (+ 4)
% Row 19
\SetRowColor{white}
Calculate rolling mean or moving average & \seqsplit{dataframe.rolling(window=2).mean()} \tn
% Row Count 30 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Dates and Times (cont)}} \tn
% Row 20
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Handling Missing Data in Time Series} \tn
% Row Count 1 (+ 1)
% Row 21
\SetRowColor{white}
Interpolate missing values & \seqsplit{dataframe.interpolate()} \tn
% Row Count 3 (+ 2)
% Row 22
\SetRowColor{LightBackground}
Replace missing values with the last known value (i.e., forward-filling) & dataframe.ffill() \tn
% Row Count 7 (+ 4)
% Row 23
\SetRowColor{white}
Replace missing values with the next known value (i.e., back-filling) & dataframe.bfill() \tn
% Row Count 11 (+ 4)
% Row 24
\SetRowColor{LightBackground}
If we believe the line between the two known points is nonlinear & \seqsplit{dataframe.interpolate(method="quadratic")} \tn
% Row Count 15 (+ 4)
% Row 25
\SetRowColor{white}
Interpolate, filling at most one missing value, forward only & \seqsplit{dataframe.interpolate(limit=1},
\seqsplit{limit\_direction="forward")} \tn
% Row Count 18 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Numerical Data}} \tn
% Row 0
\SetRowColor{LightBackground}
Min Max scaler & from sklearn import preprocessing \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Create scaler & minmax\_scale = \seqsplit{preprocessing.MinMaxScaler(feature\_range=(0}, 1)) \tn
% Row Count 6 (+ 4)
% Row 2
\SetRowColor{LightBackground}
Scale feature & \seqsplit{minmax\_scale.fit\_transform(feature)} \tn
% Row Count 8 (+ 2)
% Row 3
\SetRowColor{white}
Standardizing a Feature & from sklearn import preprocessing \tn
% Row Count 10 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Create scaler & scaler = \seqsplit{preprocessing.StandardScaler()} \tn
% Row Count 12 (+ 2)
% Row 5
\SetRowColor{white}
Transform the feature & standardized = \seqsplit{scaler.fit\_transform(x)} \tn
% Row Count 14 (+ 2)
% Row 6
\SetRowColor{LightBackground}
Normalizing Observations (rescale each observation to unit norm, i.e., a total length of one) & from \seqsplit{sklearn.preprocessing} import Normalizer \tn
% Row Count 18 (+ 4)
% Row 7
\SetRowColor{white}
Create normalizer & normalizer = \seqsplit{Normalizer(norm="l2")} \tn
% Row Count 20 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Transform feature matrix & \seqsplit{normalizer.transform(features)} \tn
% Row Count 22 (+ 2)
% Row 9
\SetRowColor{white}
 & This type of rescaling is often used when we have many equivalent features (e.g., text classification) \tn
% Row Count 28 (+ 6)
% Row 10
\SetRowColor{LightBackground}
Generating Polynomial and Interaction Features & from \seqsplit{sklearn.preprocessing} import PolynomialFeatures \tn
% Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Numerical Data (cont)}} \tn
% Row 11
\SetRowColor{LightBackground}
Create PolynomialFeatures object & \seqsplit{polynomial\_interaction} = \seqsplit{PolynomialFeatures(degree=2}, \seqsplit{interaction\_only=True}, \seqsplit{include\_bias=False)} \tn
% Row Count 5 (+ 5)
% Row 12
\SetRowColor{white}
Create polynomial features & \seqsplit{polynomial\_interaction}.fit\_transform(features) \tn
% Row Count 8 (+ 3)
% Row 13
\SetRowColor{LightBackground}
Transforming Features & from \seqsplit{sklearn.preprocessing} import FunctionTransformer \tn
% Row Count 11 (+ 3)
% Row 14
\SetRowColor{white}
 & FunctionTransformer does the same as pandas apply \tn
% Row Count 13 (+ 2)
% Row 15
\SetRowColor{LightBackground}
Detecting Outliers & from sklearn.covariance import EllipticEnvelope \tn
% Row Count 16 (+ 3)
% Row 16
\SetRowColor{white}
Create detector & outlier\_detector = \seqsplit{EllipticEnvelope(contamination=}.1) \tn
% Row Count 19 (+ 3)
% Row 17
\SetRowColor{LightBackground}
Fit detector & \seqsplit{outlier\_detector.fit(features)} \tn
% Row Count 21 (+ 2)
% Row 18
\SetRowColor{white}
Predict outliers & \seqsplit{outlier\_detector.predict(features)} \tn
% Row Count 23 (+ 2)
% Row 19
\SetRowColor{LightBackground}
IQR for outlier detection & def \seqsplit{indices\_of\_outliers(x):} \tn
% Row Count 25 (+ 2)
% Row 20
\SetRowColor{white}
 & q1, q3 = np.percentile(x, {[}25, 75{]}) \tn
% Row Count 27 (+ 2)
% Row 21
\SetRowColor{LightBackground}
 & iqr = q3 - q1 \tn
% Row Count 28 (+ 1)
% Row 22
\SetRowColor{white}
 & lower\_bound = q1 - (iqr * 1.5) \tn
% Row Count 30 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Numerical Data (cont)}} \tn
% Row 23
\SetRowColor{LightBackground}
 & upper\_bound = q3 + (iqr * 1.5) \tn
% Row Count 2 (+ 2)
% Row 24
\SetRowColor{white}
 & return np.where((x \textgreater{} upper\_bound) | (x \textless{} lower\_bound)) \tn
% Row Count 5 (+ 3)
% Row 25
\SetRowColor{LightBackground}
Handling Outliers & houses{[}houses{[}'Bathrooms'{]} \textless{} 20{]} \tn
% Row Count 7 (+ 2)
% Row 26
\SetRowColor{white}
Create feature based on boolean condition to detect outliers & houses{[}"Outlier"{]} = np.where(houses{[}"Bathrooms"{]} \textless{} 20, 0, 1) \tn
% Row Count 10 (+ 3)
% Row 27
\SetRowColor{LightBackground}
Transform the feature to dampen the effect of the outlier & houses{[}"Log\_Of\_Square\_Feet"{]} = {[}np.log(x) for x in houses{[}"Square\_Feet"{]}{]} \tn
% Row Count 14 (+ 4)
% Row 28
\SetRowColor{white}
Standardization if we have outliers & RobustScaler \tn
% Row Count 16 (+ 2)
% Row 29
\SetRowColor{LightBackground}
Discretizing Features (binning) & from \seqsplit{sklearn.preprocessing} import Binarizer \tn
% Row Count 19 (+ 3)
% Row 30
\SetRowColor{white}
Create binarizer & binarizer = Binarizer(18) \tn
% Row Count 21 (+ 2)
% Row 31
\SetRowColor{LightBackground}
Transform feature & \seqsplit{binarizer.fit\_transform(age)} \tn
% Row Count 24 (+ 3)
% Row 32
\SetRowColor{white}
Break up numerical features according to multiple thresholds (right=True closes the right interval instead of the left) & np.digitize(age, bins={[}20,30,64{]}, right=True) \tn
% Row Count 29 (+ 5)
% Row 33
\SetRowColor{LightBackground}
Grouping Observations Using Clustering & from sklearn.cluster import KMeans \tn
% Row Count 31 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Numerical Data (cont)}} \tn
% Row 34
\SetRowColor{LightBackground}
Make k-means clusterer & clusterer = KMeans(3, random\_state=0) \tn
% Row Count 2 (+ 2)
% Row 35
\SetRowColor{white}
Fit clusterer & \seqsplit{clusterer.fit(features)} \tn
% Row Count 4 (+ 2)
% Row 36
\SetRowColor{LightBackground}
Predict values & dataframe{[}"group"{]} = \seqsplit{clusterer.predict(features)} \tn
% Row Count 7 (+ 3)
% Row 37
\SetRowColor{white}
Keep only observations that are not (denoted by \textasciitilde{}) missing & features{[}\textasciitilde{}np.isnan(features).any(axis=1){]} \tn
% Row Count 10 (+ 3)
% Row 38
\SetRowColor{LightBackground}
Drop missing observations using pandas & dataframe.dropna() \tn
% Row Count 12 (+ 2)
% Row 39
\SetRowColor{white}
Predict the missing values in the feature matrix & \seqsplit{features\_knn\_imputed} = KNN(k=5, \seqsplit{verbose=0).complete(standardized\_features)} \tn
% Row Count 16 (+ 4)
% Row 40
\SetRowColor{LightBackground}
Imputer module to fill in missing values & from \seqsplit{sklearn.preprocessing} import Imputer \tn
% Row Count 19 (+ 3)
% Row 41
\SetRowColor{white}
Create imputer & mean\_imputer = \seqsplit{Imputer(strategy="mean"}, axis=0) \tn
% Row Count 22 (+ 3)
% Row 42
\SetRowColor{LightBackground}
Impute values & \seqsplit{features\_mean\_imputed} = \seqsplit{mean\_imputer.fit\_transform(features)} \tn
% Row Count 25 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{One option is to use fit to calculate the minimum and maximum values of the feature, then \newline use transform to rescale the feature. The second option is to use fit\_transform to do both operations at once. There is no mathematical difference between the two options, but there is sometimes a practical benefit to keeping the operations separate because it allows us to apply the same \newline transformation to different sets of the data.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
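The following minimal sketch combines the rescaling and IQR-outlier rows above into one runnable snippet; it assumes NumPy and scikit-learn are installed, and the tiny example array is illustrative rather than data from the book.

{\footnotesize
\begin{verbatim}
# Minimal rescaling / outlier sketch (illustrative data)
import numpy as np
from sklearn import preprocessing

feature = np.array([[-500.5], [-100.1], [0.0], [100.1], [900.9]])

# Rescale the feature to the [0, 1] range
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
print(minmax_scale.fit_transform(feature))

# Flag values outside 1.5 * IQR as outliers
def indices_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

print(indices_of_outliers(feature.flatten()))
\end{verbatim}
}
\par\addvspace{1.3em}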
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Deep learning}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Preprocessing Data for Neural Networks} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Load libraries & from sklearn import preprocessing \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Create scaler & scaler = \seqsplit{preprocessing.StandardScaler()} \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
Transform the feature & \seqsplit{features\_standardized} = \seqsplit{scaler.fit\_transform(features)} \tn
% Row Count 12 (+ 7)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Designing a Neural Network} \tn
% Row Count 13 (+ 1)
% Row 5
\SetRowColor{white}
Load libraries & from keras import models \tn
% Row Count 15 (+ 2)
% Row 6
\SetRowColor{LightBackground}
 & from keras import layers \tn
% Row Count 17 (+ 2)
% Row 7
\SetRowColor{white}
Start neural network & network = models.Sequential() \tn
% Row Count 19 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Add fully connected layer with a ReLU activation function & \seqsplit{network.add(layers.Dense(units=16}, activation="relu", input\_shape=(10,))) \tn
% Row Count 23 (+ 4)
% Row 9
\SetRowColor{white}
Add fully connected layer with a ReLU activation function & \seqsplit{network.add(layers.Dense(units=16}, activation="relu")) \tn
% Row Count 26 (+ 3)
% Row 10
\SetRowColor{LightBackground}
Add fully connected layer with a sigmoid activation function & \seqsplit{network.add(layers.Dense(units=1}, \seqsplit{activation="sigmoid"))} \tn
% Row Count 29 (+ 3)
% Row 11
\SetRowColor{white}
Compile neural network & \seqsplit{network.compile(loss="binary\_crossentropy"}, \# Cross-entropy optimizer="rmsprop", \# Root Mean Square Propagation metrics={[}"accuracy"{]}) \# Accuracy performance metric \tn
% Row Count 38 (+ 9)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Deep learning (cont)}} \tn
% Row 12
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Training a Binary Classifier} \tn
% Row Count 1 (+ 1)
% Row 13
\SetRowColor{white}
Load libraries & from keras.datasets import imdb \tn
% Row Count 3 (+ 2)
% Row 14
\SetRowColor{LightBackground}
 & from \seqsplit{keras.preprocessing.text} import Tokenizer \tn
% Row Count 6 (+ 3)
% Row 15
\SetRowColor{white}
 & from keras import models \tn
% Row Count 8 (+ 2)
% Row 16
\SetRowColor{LightBackground}
 & from keras import layers \tn
% Row Count 10 (+ 2)
% Row 17
\SetRowColor{white}
Set the number of features we want & \seqsplit{number\_of\_features} = 1000 \tn
% Row Count 12 (+ 2)
% Row 18
\SetRowColor{LightBackground}
Start neural network & network = models.Sequential() \tn
% Row Count 14 (+ 2)
% Row 19
\SetRowColor{white}
Add fully connected layer with a ReLU activation function & \seqsplit{network.add(layers.Dense(units=16}, activation="relu", input\_shape=( \seqsplit{number\_of\_features},))) \tn
% Row Count 19 (+ 5)
% Row 20
\SetRowColor{LightBackground}
Add fully connected layer with a ReLU activation function & \seqsplit{network.add(layers.Dense(units=16}, activation="relu")) \tn
% Row Count 22 (+ 3)
% Row 21
\SetRowColor{white}
Add fully connected layer with a sigmoid activation function & \seqsplit{network.add(layers.Dense(units=1}, \seqsplit{activation="sigmoid"))} \tn
% Row Count 25 (+ 3)
% Row 22
\SetRowColor{LightBackground}
Compile neural network & \seqsplit{network.compile(loss="binary\_crossentropy"}, \# Cross-entropy optimizer="rmsprop", \# Root Mean Square Propagation metrics={[}"accuracy"{]}) \tn
% Row Count 32 (+ 7)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Deep learning (cont)}} \tn
% Row 23
\SetRowColor{LightBackground}
Train neural network & history = \seqsplit{network.fit(features\_train}, \# Features target\_train, \# Target vector epochs=3, \# Number of epochs verbose=1, \# Print description after each epoch batch\_size=100, \# Number of observations per batch \seqsplit{validation\_data=(features\_test}, target\_test)) \# Test data \tn
% Row Count 14 (+ 14)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
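As a self-contained version of the network rows above, here is a minimal sketch; it assumes the standalone keras package used in the book is installed (tensorflow.keras also works) and trains on small random arrays purely for illustration.

{\footnotesize
\begin{verbatim}
# Minimal binary-classifier sketch (illustrative random data)
import numpy as np
from keras import models
from keras import layers

number_of_features = 10
features = np.random.random((200, number_of_features))
target = np.random.randint(2, size=(200, 1))

# Two hidden ReLU layers, one sigmoid output unit
network = models.Sequential()
network.add(layers.Dense(units=16, activation="relu",
                         input_shape=(number_of_features,)))
network.add(layers.Dense(units=16, activation="relu"))
network.add(layers.Dense(units=1, activation="sigmoid"))

network.compile(loss="binary_crossentropy",
                optimizer="rmsprop",
                metrics=["accuracy"])

history = network.fit(features, target, epochs=3,
                      verbose=1, batch_size=100)
\end{verbatim}
}
\par\addvspace{1.3em}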
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation}} \tn
% Row 0
\SetRowColor{LightBackground}
Cross-Validating Models & from \seqsplit{sklearn.model\_selection} import KFold, cross\_val\_score \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
 & from sklearn.pipeline import make\_pipeline \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
Create a pipeline that standardizes, then runs logistic regression & pipeline = \seqsplit{make\_pipeline(standardizer}, logit) \tn
% Row Count 10 (+ 4)
% Row 3
\SetRowColor{white}
Create k-Fold cross-validation & kf = KFold(n\_splits=10, shuffle=True, random\_state=1) \tn
% Row Count 13 (+ 3)
% Row 4
\SetRowColor{LightBackground}
Conduct k-fold cross-validation & cv\_results = \seqsplit{cross\_val\_score(pipeline}, \# Pipeline features, \# Feature matrix target, \# Target vector cv=kf, \# Cross-validation technique scoring="accuracy", \# Loss function n\_jobs=-1) \# Use all CPU cores \tn
% Row Count 24 (+ 11)
% Row 5
\SetRowColor{white}
Calculate mean & cv\_results.mean() \tn
% Row Count 25 (+ 1)
% Row 6
\SetRowColor{LightBackground}
View score for all 10 folds & cv\_results \tn
% Row Count 27 (+ 2)
% Row 7
\SetRowColor{white}
Fit standardizer to training set & \seqsplit{standardizer.fit(features\_train)} \tn
% Row Count 29 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Apply to both training and test sets & \seqsplit{features\_train\_std} = \seqsplit{standardizer.transform(features\_train)} \tn
% Row Count 32 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn
% Row 9
\SetRowColor{LightBackground}
 & features\_test\_std = \seqsplit{standardizer.transform(features\_test)} \tn
% Row Count 3 (+ 3)
% Row 10
\SetRowColor{white}
Creating a Baseline Regression Model & from sklearn.dummy import DummyRegressor \tn
% Row Count 5 (+ 2)
% Row 11
\SetRowColor{LightBackground}
Create a dummy regressor & dummy = \seqsplit{DummyRegressor(strategy='mean')} \tn
% Row Count 7 (+ 2)
% Row 12
\SetRowColor{white}
"Train" dummy regressor & \seqsplit{dummy.fit(features\_train}, target\_train) \tn
% Row Count 9 (+ 2)
% Row 13
\SetRowColor{LightBackground}
Get R-squared score & \seqsplit{dummy.score(features\_test}, target\_test) \tn
% Row Count 11 (+ 2)
% Row 14
\SetRowColor{white}
Regression & from \seqsplit{sklearn.linear\_model} import LinearRegression \tn
% Row Count 14 (+ 3)
% Row 15
\SetRowColor{LightBackground}
Train simple linear regression model & ols = LinearRegression() \tn
% Row Count 16 (+ 2)
% Row 16
\SetRowColor{white}
 & \seqsplit{ols.fit(features\_train}, target\_train) \tn
% Row Count 18 (+ 2)
% Row 17
\SetRowColor{LightBackground}
Get R-squared score & \seqsplit{ols.score(features\_test}, target\_test) \tn
% Row Count 20 (+ 2)
% Row 18
\SetRowColor{white}
Create dummy regressor that predicts 20's for everything & clf = \seqsplit{DummyRegressor(strategy='constant'}, constant=20) \tn
% Row Count 23 (+ 3)
% Row 19
\SetRowColor{LightBackground}
 & \seqsplit{clf.fit(features\_train}, target\_train) \tn
% Row Count 25 (+ 2)
% Row 20
\SetRowColor{white}
Creating a Baseline Classification Model & from sklearn.dummy import DummyClassifier \tn
% Row Count 28 (+ 3)
% Row 21
\SetRowColor{LightBackground}
Create dummy classifier & dummy = \seqsplit{DummyClassifier(strategy='uniform'}, random\_state=1) \tn
% Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn
% Row 22
\SetRowColor{LightBackground}
"Train" model & \seqsplit{dummy.fit(features\_train}, target\_train) \tn
% Row Count 2 (+ 2)
% Row 23
\SetRowColor{white}
Get accuracy score & \seqsplit{dummy.score(features\_test}, target\_test) \tn
% Row Count 4 (+ 2)
% Row 24
\SetRowColor{LightBackground}
Evaluating Binary Classifier Predictions & from \seqsplit{sklearn.model\_selection} import cross\_val\_score \tn
% Row Count 7 (+ 3)
% Row 25
\SetRowColor{white}
 & from sklearn.datasets import \seqsplit{make\_classification} \tn
% Row Count 10 (+ 3)
% Row 26
\SetRowColor{LightBackground}
Cross-validate model using accuracy & \seqsplit{cross\_val\_score(logit}, X, y, scoring="accuracy") \tn
% Row Count 13 (+ 3)
% Row 27
\SetRowColor{white}
Cross-validate model using precision & \seqsplit{cross\_val\_score(logit}, X, y, \seqsplit{scoring="precision")} \tn
% Row Count 16 (+ 3)
% Row 28
\SetRowColor{LightBackground}
Cross-validate model using recall & \seqsplit{cross\_val\_score(logit}, X, y, scoring="recall") \tn
% Row Count 19 (+ 3)
% Row 29
\SetRowColor{white}
Cross-validate model using f1 & \seqsplit{cross\_val\_score(logit}, X, y, scoring="f1") \tn
% Row Count 22 (+ 3)
% Row 30
\SetRowColor{LightBackground}
Calculate metrics like accuracy and recall directly & from sklearn.metrics import accuracy\_score \tn
% Row Count 25 (+ 3)
% Row 31
\SetRowColor{white}
Calculate accuracy & \seqsplit{accuracy\_score(y\_test}, y\_hat) \tn
% Row Count 27 (+ 2)
% Row 32
\SetRowColor{LightBackground}
Evaluating Binary Classifier Thresholds & from sklearn.metrics import roc\_curve, roc\_auc\_score \tn
% Row Count 30 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn
% Row 33
\SetRowColor{LightBackground}
Get predicted probabilities & \seqsplit{target\_probabilities} = logit.predict\_proba(features\_test){[}:,1{]} \tn
% Row Count 4 (+ 4)
% Row 34
\SetRowColor{white}
Create true and false positive rates & \seqsplit{false\_positive\_rate}, \seqsplit{true\_positive\_rate}, threshold = \seqsplit{roc\_curve(target\_test}, \seqsplit{target\_probabilities)} \tn
% Row Count 9 (+ 5)
% Row 35
\SetRowColor{LightBackground}
Plot ROC curve & plt.title("Receiver Operating Characteristic") \tn
% Row Count 12 (+ 3)
% Row 36
\SetRowColor{white}
 & \seqsplit{plt.plot(false\_positive\_rate}, \seqsplit{true\_positive\_rate)} \tn
% Row Count 15 (+ 3)
% Row 37
\SetRowColor{LightBackground}
 & plt.plot({[}0, 1{]}, ls="-{}-") \tn
% Row Count 17 (+ 2)
% Row 38
\SetRowColor{white}
 & plt.plot({[}0, 0{]}, {[}1, 0{]} , c=".7"), plt.plot({[}1, 1{]} , c=".7") \tn
% Row Count 20 (+ 3)
% Row 39
\SetRowColor{LightBackground}
 & plt.ylabel("True Positive Rate") \tn
% Row Count 22 (+ 2)
% Row 40
\SetRowColor{white}
 & plt.xlabel("False Positive Rate") \tn
% Row Count 24 (+ 2)
% Row 41
\SetRowColor{LightBackground}
 & plt.show() \tn
% Row Count 25 (+ 1)
% Row 42
\SetRowColor{white}
Evaluating Multiclass Classifier Predictions & \seqsplit{cross\_val\_score(logit}, features, target, \seqsplit{scoring='f1\_macro')} \tn
% Row Count 28 (+ 3)
% Row 43
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Visualizing a Classifier's Performance} \tn
% Row Count 29 (+ 1)
% Row 44
\SetRowColor{white}
Load libraries & import matplotlib.pyplot as plt \tn
% Row Count 31 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn
% Row 45
\SetRowColor{LightBackground}
 & import seaborn as sns \tn
% Row Count 2 (+ 2)
% Row 46
\SetRowColor{white}
 & from sklearn.metrics import confusion\_matrix \tn
% Row Count 5 (+ 3)
% Row 47
\SetRowColor{LightBackground}
Create confusion matrix & matrix = \seqsplit{confusion\_matrix(target\_test}, target\_predicted) \tn
% Row Count 8 (+ 3)
% Row 48
\SetRowColor{white}
Create pandas dataframe & dataframe = pd.DataFrame(matrix, index=class\_names, \seqsplit{columns=class\_names)} \tn
% Row Count 12 (+ 4)
% Row 49
\SetRowColor{LightBackground}
Create heatmap & \seqsplit{sns.heatmap(dataframe}, annot=True, cbar=None, cmap="Blues") \tn
% Row Count 15 (+ 3)
% Row 50
\SetRowColor{white}
 & \seqsplit{plt.title("Confusion} Matrix"), plt.tight\_layout() \tn
% Row Count 18 (+ 3)
% Row 51
\SetRowColor{LightBackground}
 & plt.ylabel("True Class"), \seqsplit{plt.xlabel("Predicted} Class") \tn
% Row Count 21 (+ 3)
% Row 52
\SetRowColor{white}
 & plt.show() \tn
% Row Count 22 (+ 1)
% Row 53
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Evaluating Regression Models} \tn
% Row Count 23 (+ 1)
% Row 54
\SetRowColor{white}
Cross-validate the linear regression using (negative) MSE & \seqsplit{cross\_val\_score(ols}, features, target, \seqsplit{scoring='neg\_mean\_squared\_error')} \tn
% Row Count 30 (+ 7)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885
cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn % Row 55 \SetRowColor{LightBackground} Cross-validate the linear regression using R-squared & \seqsplit{cross\_val\_score(ols}, features, target, scoring='r2') \tn % Row Count 3 (+ 3) % Row 56 \SetRowColor{white} Evaluating Clustering Models & from sklearn.metrics import silhouette\_score \tn % Row Count 6 (+ 3) % Row 57 \SetRowColor{LightBackground} & from sklearn.cluster import KMeans \tn % Row Count 8 (+ 2) % Row 58 \SetRowColor{white} Cluster data using k-means to predict classes & model = \seqsplit{KMeans(n\_clusters=2}, \seqsplit{random\_state=1).fit(features)} \tn % Row Count 11 (+ 3) % Row 59 \SetRowColor{LightBackground} Get predicted classes & target\_predicted = model.labels\_ \tn % Row Count 13 (+ 2) % Row 60 \SetRowColor{white} Evaluate model & \seqsplit{silhouette\_score(features}, target\_predicted) \tn % Row Count 16 (+ 3) % Row 61 \SetRowColor{LightBackground} Creating a Custom Evaluation Metric & from sklearn.metrics import make\_scorer, r2\_score \tn % Row Count 19 (+ 3) % Row 62 \SetRowColor{white} & from \seqsplit{sklearn.linear\_model} import Ridge \tn % Row Count 21 (+ 2) % Row 63 \SetRowColor{LightBackground} Create custom metric & def \seqsplit{custom\_metric(target\_test}, target\_predicted): \tn % Row Count 24 (+ 3) % Row 64 \SetRowColor{white} & r2 = \seqsplit{r2\_score(target\_test}, target\_predicted) \tn % Row Count 27 (+ 3) % Row 65 \SetRowColor{LightBackground} & return r2 \tn % Row Count 28 (+ 1) % Row 66 \SetRowColor{white} Make scorer and define that higher scores are better & score = \seqsplit{make\_scorer(custom\_metric}, \seqsplit{greater\_is\_better=True)} \tn % Row Count 31 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn % Row 67 \SetRowColor{LightBackground} Create ridge regression object & classifier = Ridge() \tn % Row Count 2 (+ 2) % Row 68 \SetRowColor{white} Apply custom scorer & score(model, features\_test, target\_test) \tn % Row Count 4 (+ 2) % Row 69 \SetRowColor{LightBackground} Visualizing the Effect of Training Set Size & from \seqsplit{sklearn.model\_selection} import learning\_curve \tn % Row Count 7 (+ 3) % Row 70 \SetRowColor{white} Draw lines & \seqsplit{plt.plot(train\_sizes}, train\_mean, '-{}-', color="\#111111", label="Training score") \tn % Row Count 11 (+ 4) % Row 71 \SetRowColor{LightBackground} & \seqsplit{plt.plot(train\_sizes}, test\_mean, color="\#111111", \seqsplit{label="Cross-validation} score") \tn % Row Count 16 (+ 5) % Row 72 \SetRowColor{white} Draw bands & \seqsplit{plt.fill\_between(train\_sizes}, train\_mean - train\_std, train\_mean + train\_std, color="\#DDDDDD") \tn % Row Count 21 (+ 5) % Row 73 \SetRowColor{LightBackground} & \seqsplit{plt.fill\_between(train\_sizes}, test\_mean - test\_std, test\_mean + test\_std, color="\#DDDDDD") \tn % Row Count 26 (+ 5) % Row 74 \SetRowColor{white} Create plot & plt.title("Learning Curve") \tn % Row Count 28 (+ 2) % Row 75 \SetRowColor{LightBackground} & \seqsplit{plt.xlabel("Training} Set Size"), \seqsplit{plt.ylabel("Accuracy} Score"), \tn % Row Count 32 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn % 
% Row 76
\SetRowColor{LightBackground}
 & \seqsplit{plt.legend(loc="best")} \tn
% Row Count 2 (+ 2)
% Row 77
\SetRowColor{white}
 & plt.tight\_layout() \tn
% Row Count 3 (+ 1)
% Row 78
\SetRowColor{LightBackground}
 & plt.show() \tn
% Row Count 4 (+ 1)
% Row 79
\SetRowColor{white}
Creating a Text Report of Evaluation Metrics & from sklearn.metrics import \seqsplit{classification\_report} \tn
% Row Count 7 (+ 3)
% Row 80
\SetRowColor{LightBackground}
Create a classification report & \seqsplit{print(classification\_report(target\_test}, target\_predicted, \seqsplit{target\_names=class\_names))} \tn
% Row Count 12 (+ 5)
% Row 81
\SetRowColor{white}
\mymulticolumn{2}{x{5.377cm}}{Visualizing the Effect of Hyperparameter Values} \tn
% Row Count 13 (+ 1)
% Row 82
\SetRowColor{LightBackground}
Plot the validation curve & from \seqsplit{sklearn.model\_selection} import validation\_curve \tn
% Row Count 16 (+ 3)
% Row 83
\SetRowColor{white}
Create range of values for parameter & param\_range = np.arange(1, 250, 2) \tn
% Row Count 18 (+ 2)
% Row 84
\SetRowColor{LightBackground}
Hyperparameter to examine & \seqsplit{param\_name="n\_estimators"}, \tn
% Row Count 20 (+ 2)
% Row 85
\SetRowColor{white}
Range of hyperparameter's values & param\_range = np.arange(1, 250, 2) \tn
% Row Count 22 (+ 2)
% Row 86
\SetRowColor{LightBackground}
Calculate accuracy on training and test set using range of parameter values & train\_scores, test\_scores = validation\_curve( \# Classifier \seqsplit{RandomForestClassifier()}, \# Feature matrix features, \# Target vector target, \# Hyperparameter to examine \seqsplit{param\_name="n\_estimators"}, \# Range of hyperparameter's values \seqsplit{param\_range=param\_range}, \# Number of folds cv=3, \# Performance metric scoring="accuracy", \# Use all computer cores n\_jobs=-1) \tn
% Row Count 40 (+ 18)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation (cont)}} \tn
% Row 87
\SetRowColor{LightBackground}
Plot mean accuracy scores for training and test sets & \seqsplit{plt.plot(param\_range}, train\_mean, label="Training score", color="black") \tn
% Row Count 4 (+ 4)
% Row 88
\SetRowColor{white}
 & \seqsplit{plt.plot(param\_range}, test\_mean, \seqsplit{label="Cross-validation} score", color="dimgrey") \tn
% Row Count 9 (+ 5)
% Row 89
\SetRowColor{LightBackground}
Plot accuracy bands for training and test sets & \seqsplit{plt.fill\_between(param\_range}, train\_mean - train\_std, train\_mean + train\_std, color="gray") \tn
% Row Count 14 (+ 5)
% Row 90
\SetRowColor{white}
 & \seqsplit{plt.fill\_between(param\_range}, test\_mean - test\_std, test\_mean + test\_std, color="gainsboro") \tn
% Row Count 19 (+ 5)
% Row 91
\SetRowColor{LightBackground}
Create plot & \seqsplit{plt.title("Validation} Curve With Random Forest") \tn
% Row Count 22 (+ 3)
% Row 92
\SetRowColor{white}
 & plt.xlabel("Number Of Trees") \tn
% Row Count 24 (+ 2)
% Row 93
\SetRowColor{LightBackground}
 & \seqsplit{plt.ylabel("Accuracy} Score") \tn
% Row Count 26 (+ 2)
% Row 94
\SetRowColor{white}
 & plt.tight\_layout() \tn
% Row Count 27 (+ 1)
% Row 95
\SetRowColor{LightBackground}
 & \seqsplit{plt.legend(loc="best")} \tn
% Row Count 29 (+ 2)
% Row 96
\SetRowColor{white}
 & plt.show() \tn
% Row Count 30 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
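The plotting rows above use train and test means and standard deviations that the table never computes. The following minimal sketch fills in that step; it assumes scikit-learn and matplotlib are installed and, for speed, uses the iris data and a reduced parameter range (both are illustrative assumptions, not the book's exact setup).

{\footnotesize
\begin{verbatim}
# Minimal validation-curve sketch (illustrative data and range)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve

features, target = load_iris(return_X_y=True)
param_range = np.arange(1, 50, 2)

train_scores, test_scores = validation_curve(
    RandomForestClassifier(), features, target,
    param_name="n_estimators", param_range=param_range,
    cv=3, scoring="accuracy", n_jobs=-1)

# Means and standard deviations across the folds
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, label="Training score")
plt.plot(param_range, test_mean, label="Cross-validation score")
plt.legend(loc="best")
plt.show()
\end{verbatim}
}
\par\addvspace{1.3em}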
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Dimensionality Reduction Using Feature Selection}} \tn
% Row 0
\SetRowColor{LightBackground}
Thresholding Numerical Feature Variance & from \seqsplit{sklearn.feature\_selection} import VarianceThreshold \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Create thresholder & thresholder = \seqsplit{VarianceThreshold(threshold=}.5) \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
Create high variance feature matrix & \seqsplit{features\_high\_variance} = \seqsplit{thresholder.fit\_transform(features)} \tn
% Row Count 9 (+ 3)
% Row 3
\SetRowColor{white}
View variances & \seqsplit{thresholder.fit(features).variances\_} \tn
% Row Count 11 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Features with low variance are likely less interesting (and less useful) than features with high variance. \newline Variance thresholding (VT) will not work when feature sets contain different units. \newline If the features have been standardized (to mean zero and unit variance), variance thresholding will not work correctly.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Text}} \tn
% Row 0
\SetRowColor{LightBackground}
Strip whitespace & strip\_whitespace = {[}string.strip() for string in text\_data{]} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Remove periods & remove\_periods = {[}string.replace(".", "") for string in strip\_whitespace{]} \tn
% Row Count 7 (+ 4)
% Row 2
\SetRowColor{LightBackground}
Parsing and Cleaning HTML & from bs4 import BeautifulSoup \tn
% Row Count 9 (+ 2)
% Row 3
\SetRowColor{white}
Parse HTML & soup = BeautifulSoup(html, "lxml") \tn
% Row Count 11 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Find the div with the class "full\_name", show text & soup.find("div", \{ "class" : "full\_name" \}).text \tn
% Row Count 14 (+ 3)
% Row 5
\SetRowColor{white}
Removing Punctuation & import unicodedata \tn
% Row Count 15 (+ 1)
% Row 6
\SetRowColor{LightBackground}
 & import sys \tn
% Row Count 16 (+ 1)
% Row 7
\SetRowColor{white}
Create a dictionary of punctuation characters & punctuation = dict.fromkeys(i for i in \seqsplit{range(sys.maxunicode)} if \seqsplit{unicodedata.category(chr(i)).startswith('P'))} \tn
% Row Count 22 (+ 6)
% Row 8
\SetRowColor{LightBackground}
For each string, remove any punctuation characters & {[}string.translate(punctuation) for string in text\_data{]} \tn
% Row Count 25 (+ 3)
% Row 9
\SetRowColor{white}
Tokenizing Text (You have text and want to break it up into individual words) & from nltk.tokenize import word\_tokenize \tn
% Row Count 29 (+ 4)
% Row 10
\SetRowColor{LightBackground}
Tokenize words (string can't have full stops) & \seqsplit{word\_tokenize(string)} \tn
% Row Count 32 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Text (cont)}} \tn
% Row 11
\SetRowColor{LightBackground}
Tokenize sentences (string has to have full stops) & \seqsplit{sent\_tokenize(string)} \tn
% Row Count 3 (+ 3)
% Row 12
\SetRowColor{white}
Removing Stop Words & from nltk.corpus import stopwords \tn
% Row Count 5 (+ 2)
% Row 13
\SetRowColor{LightBackground}
Load stop words & stop\_words =
\seqsplit{stopwords.words('english')} \tn
% Row Count 7 (+ 2)
% Row 14
\SetRowColor{white}
Remove stop words & {[}word for word in tokenized\_words if word not in stop\_words{]} \tn
% Row Count 10 (+ 3)
% Row 15
\SetRowColor{LightBackground}
Stemming Words & from nltk.stem.porter import PorterStemmer \tn
% Row Count 13 (+ 3)
% Row 16
\SetRowColor{white}
Create stemmer & porter = PorterStemmer() \tn
% Row Count 15 (+ 2)
% Row 17
\SetRowColor{LightBackground}
Apply stemmer & {[}porter.stem(word) for word in tokenized\_words{]} \tn
% Row Count 18 (+ 3)
% Row 18
\SetRowColor{white}
Tagging Parts of Speech & from nltk import pos\_tag \tn
% Row Count 20 (+ 2)
% Row 19
\SetRowColor{LightBackground}
Filter words & {[}word for word, tag in text\_tagged if tag in {[}'NN','NNS','NNP','NNPS'{]} {]} \tn
% Row Count 24 (+ 4)
% Row 20
\SetRowColor{white}
Tag each word and each tweet & for tweet in tweets: \tn
% Row Count 26 (+ 2)
% Row 21
\SetRowColor{LightBackground}
 & tweet\_tag = \seqsplit{nltk.pos\_tag(word\_tokenize(tweet))} \tn
% Row Count 29 (+ 3)
% Row 22
\SetRowColor{white}
 & tagged\_tweets.append({[}tag for word, tag in tweet\_tag{]}) \tn
% Row Count 32 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Text (cont)}} \tn
% Row 23
\SetRowColor{LightBackground}
Use one-hot encoding to convert the tags into features & one\_hot\_multi = \seqsplit{MultiLabelBinarizer()} \tn
% Row Count 3 (+ 3)
% Row 24
\SetRowColor{white}
 & \seqsplit{one\_hot\_multi.fit\_transform(tagged\_tweets)} \tn
% Row Count 6 (+ 3)
% Row 25
\SetRowColor{LightBackground}
To examine the accuracy of our tagger, we split our text data into two parts & from nltk.corpus import brown \tn
% Row Count 10 (+ 4)
% Row 26
\SetRowColor{white}
Looks at the word itself & from nltk.tag import UnigramTagger \tn
% Row Count 13 (+ 3)
% Row 27
\SetRowColor{LightBackground}
Takes into account the previous word & from nltk.tag import BigramTagger \tn
% Row Count 15 (+ 2)
% Row 28
\SetRowColor{white}
Takes into account the previous two words & from nltk.tag import TrigramTagger \tn
% Row Count 17 (+ 2)
% Row 29
\SetRowColor{LightBackground}
Get some text from the Brown Corpus, broken into sentences & sentences = \seqsplit{brown.tagged\_sents(categories='news')} \tn
% Row Count 20 (+ 3)
% Row 30
\SetRowColor{white}
Split into 4000 sentences for training and 623 for testing & train = sentences{[}:4000{]} \tn
% Row Count 23 (+ 3)
% Row 31
\SetRowColor{LightBackground}
 & test = sentences{[}4000:{]} \tn
% Row Count 25 (+ 2)
% Row 32
\SetRowColor{white}
Create backoff tagger & unigram = \seqsplit{UnigramTagger(train)} \tn
% Row Count 27 (+ 2)
% Row 33
\SetRowColor{LightBackground}
 & bigram = BigramTagger(train, backoff=unigram) \tn
% Row Count 30 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Text (cont)}} \tn
% Row 34
\SetRowColor{LightBackground}
 & trigram = TrigramTagger(train, backoff=bigram) \tn
% Row Count 3 (+ 3)
% Row 35
\SetRowColor{white}
Show accuracy & \seqsplit{trigram.evaluate(test)} \tn
% Row Count 5 (+ 2)
% Row 36
\SetRowColor{LightBackground}
Encoding Text as a Bag of Words & from \seqsplit{sklearn.feature\_extraction.text} import CountVectorizer \tn
% Row Count 8 (+ 3)
% Row 37
\SetRowColor{white}
Create the bag of words feature
matrix & count = CountVectorizer() \tn
% Row Count 10 (+ 2)
% Row 38
\SetRowColor{LightBackground}
Sparse matrix of bag of words & bag\_of\_words = \seqsplit{count.fit\_transform(text\_data)} \tn
% Row Count 13 (+ 3)
% Row 39
\SetRowColor{white}
Turn sparse matrix into an array & \seqsplit{bag\_of\_words.toarray()} \tn
% Row Count 15 (+ 2)
% Row 40
\SetRowColor{LightBackground}
Show feature (column) names & \seqsplit{count.get\_feature\_names()} \tn
% Row Count 17 (+ 2)
% Row 41
\SetRowColor{white}
Create feature matrix with arguments & count\_2gram = \seqsplit{CountVectorizer(ngram\_range=(1},2), \seqsplit{stop\_words="english"}, vocabulary={[}'brazil'{]}) \tn
% Row Count 21 (+ 4)
% Row 42
\SetRowColor{LightBackground}
 & bag = \seqsplit{count\_2gram.fit\_transform(text\_data)} \tn
% Row Count 24 (+ 3)
% Row 43
\SetRowColor{white}
View the 1-grams and 2-grams & \seqsplit{count\_2gram.vocabulary\_} \tn
% Row Count 26 (+ 2)
% Row 44
\SetRowColor{LightBackground}
Weighting Word Importance & from \seqsplit{sklearn.feature\_extraction.text} import TfidfVectorizer \tn
% Row Count 29 (+ 3)
% Row 45
\SetRowColor{white}
Create the tf-idf (term frequency-inverse document frequency) feature matrix & tfidf = TfidfVectorizer() \tn
% Row Count 33 (+ 4)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Handling Text (cont)}} \tn
% Row 46
\SetRowColor{LightBackground}
 & feature\_matrix = \seqsplit{tfidf.fit\_transform(text\_data)} \tn
% Row Count 3 (+ 3)
% Row 47
\SetRowColor{white}
Show feature names & tfidf.vocabulary\_ \tn
% Row Count 4 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{You will have to download the set of stop words the first time \newline import nltk \newline \seqsplit{nltk.download('stopwords')} \newline \newline Note that NLTK's stopwords assumes the tokenized words are all lowercased} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Support Vector Machines}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Training a Linear Classifier} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Load libraries & from sklearn.svm import LinearSVC \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Standardize features & scaler = StandardScaler() \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
 & \seqsplit{features\_standardized} = \seqsplit{scaler.fit\_transform(features)} \tn
% Row Count 8 (+ 3)
% Row 4
\SetRowColor{LightBackground}
Create support vector classifier & svc = LinearSVC(C=1.0) \tn
% Row Count 10 (+ 2)
% Row 5
\SetRowColor{white}
Train model & model = \seqsplit{svc.fit(features\_standardized}, target) \tn
% Row Count 13 (+ 3)
% Row 6
\SetRowColor{LightBackground}
Plot data points and color using their class & color = {[}"black" if c == 0 else "lightgrey" for c in target{]} \tn
% Row Count 16 (+ 3)
% Row 7
\SetRowColor{white}
 & plt.scatter(features\_standardized{[}:,0{]}, features\_standardized{[}:,1{]}, c=color) \tn
% Row Count 20 (+ 4)
% Row 8
\SetRowColor{LightBackground}
Create the hyperplane & w = svc.coef\_{[}0{]} \tn
% Row Count 22 (+ 2)
% Row 9
\SetRowColor{white}
 & a = -w{[}0{]} / w{[}1{]} \tn
% Row Count 23 (+ 1)
% Row 10
\SetRowColor{LightBackground}
Return evenly spaced numbers over a
specified interval. & xx = np.linspace(-2.5, 2.5) \tn % Row Count 26 (+ 3) % Row 11 \SetRowColor{white} & yy = a * xx - (svc.intercept\_{[}0{]}) / w{[}1{]} \tn % Row Count 28 (+ 2) % Row 12 \SetRowColor{LightBackground} Plot the hyperplane & plt.plot(xx, yy) \tn % Row Count 29 (+ 1) % Row 13 \SetRowColor{white} & plt.axis("off"), plt.show() \tn % Row Count 31 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Support Vector Machines (cont)}} \tn % Row 14 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Handling Linearly Inseparable Classes Using Kernels} \tn % Row Count 2 (+ 2) % Row 15 \SetRowColor{white} Create a support vector machine with a radial basis function kernel & svc = SVC(kernel="rbf", random\_state=0, gamma=1, C=1) \tn % Row Count 6 (+ 4) % Row 16 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Creating Predicted Probabilities} \tn % Row Count 7 (+ 1) % Row 17 \SetRowColor{white} View predicted probabilities & \seqsplit{model.predict\_proba(new\_observation)} \tn % Row Count 9 (+ 2) % Row 18 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{Identifying Support Vectors} \tn % Row Count 10 (+ 1) % Row 19 \SetRowColor{white} View support vectors & \seqsplit{model.support\_vectors\_} \tn % Row Count 12 (+ 2) % Row 20 \SetRowColor{LightBackground} Handling Imbalanced Classes & Increase the penalty for misclassifying the smaller class using class\_weight \tn % Row Count 16 (+ 4) % Row 21 \SetRowColor{white} Create support vector classifier & svc = SVC(kernel="linear", \seqsplit{class\_weight="balanced"}, C=1.0, random\_state=0) \tn % Row Count 20 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{visualization in page 321 \newline In scikit-learn, the predicted probabilities must be generated when the model is being trained. We can do this by setting SVC's probability to True. 
Then use the same method} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Wrangling}} \tn % Row 0 \SetRowColor{LightBackground} Creating a series & pd.Series({[}'Molly Mooney', 40, True{]}, index={[}'Name','Age','Driver'{]}) \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} Appending to a data frame & \seqsplit{dataframe.append(new\_person}, ignore\_index=True) \tn % Row Count 7 (+ 3) % Row 2 \SetRowColor{LightBackground} First lines of the data & dataframe.head(2) \tn % Row Count 9 (+ 2) % Row 3 \SetRowColor{white} descriptive statistics & \seqsplit{dataframe.describe()} \tn % Row Count 11 (+ 2) % Row 4 \SetRowColor{LightBackground} Return row by index & dataframe.iloc{[}0{]} \tn % Row Count 12 (+ 1) % Row 5 \SetRowColor{white} Return row by name & dataframe.loc{[}'Allen, Miss Elisabeth Walton'{]} \tn % Row Count 15 (+ 3) % Row 6 \SetRowColor{LightBackground} Set index & dataframe = dataframe.set\_index(dataframe{[}'Name'{]}) \tn % Row Count 18 (+ 3) % Row 7 \SetRowColor{white} Selecting Rows Based on Conditionals & dataframe{[}dataframe{[}'Sex'{]} == 'female'{]} \tn % Row Count 20 (+ 2) % Row 8 \SetRowColor{LightBackground} Replacing Values & dataframe{[}'Sex'{]}.replace("anterior", "posterior") \tn % Row Count 23 (+ 3) % Row 9 \SetRowColor{white} Replacing multiple values & dataframe{[}'Sex'{]}.replace({[}"female", "male"{]}, {[}"Woman", "Man"{]}) \tn % Row Count 27 (+ 4) % Row 10 \SetRowColor{LightBackground} Renaming Columns & dataframe.rename(columns=\{'PClass': 'Passenger Class'\}) \tn % Row Count 30 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Wrangling (cont)}} \tn % Row 11 \SetRowColor{LightBackground} Minimum, max, sum, count & dataframe{[}'Age'{]}.min() \tn % Row Count 2 (+ 2) % Row 12 \SetRowColor{white} Finding Unique Values & dataframe{[}'Sex'{]}.unique() \tn % Row Count 4 (+ 2) % Row 13 \SetRowColor{LightBackground} display all unique values with the number of times each value appears & dataframe{[}'Sex'{]}.value\_counts() \tn % Row Count 8 (+ 4) % Row 14 \SetRowColor{white} number of unique values & dataframe{[}'PClass'{]}.nunique() \tn % Row Count 10 (+ 2) % Row 15 \SetRowColor{LightBackground} return booleans indicating whether a value is missing & dataframe{[}dataframe{[}'Age'{]}.isnull(){]} \tn % Row Count 13 (+ 3) % Row 16 \SetRowColor{white} Replace missing values & dataframe{[}'Sex'{]} = dataframe{[}'Sex'{]}.replace('male', np.nan) \tn % Row Count 16 (+ 3) % Row 17 \SetRowColor{LightBackground} Load data, set missing values & dataframe = pd.read\_csv(url, na\_values={[}np.nan, 'NONE', -999{]}) \tn % Row Count 20 (+ 4) % Row 18 \SetRowColor{white} Filling missing values & \seqsplit{dataframe.fillna(value)} \tn % Row Count 22 (+ 2) % Row 19 \SetRowColor{LightBackground} Deleting a Column & dataframe.drop({[}'Age', 'Sex'{]}, axis=1).head(2) \tn % Row Count 25 (+ 3) % Row 20 \SetRowColor{white} Deleting a Row & dataframe{[}dataframe{[}'Sex'{]} != 'male'{]} \tn % Row Count 27 (+ 2) % Row 21 \SetRowColor{LightBackground} & or use drop \tn % Row Count 28 (+ 1) % Row 22 \SetRowColor{white} Dropping Duplicate Rows & \seqsplit{dataframe.drop\_duplicates()} \tn % Row Count 30 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak 
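Before the remaining wrangling rows, here is a minimal pandas sketch that strings several of the recipes above together; it assumes pandas is installed, and the tiny DataFrame and its column names are purely illustrative (they are not the book's Titanic data).

{\footnotesize
\begin{verbatim}
# Minimal pandas wrangling sketch (illustrative data)
import pandas as pd

dataframe = pd.DataFrame({
    "Name": ["Allen", "Bonnell", "Carter"],
    "Sex": ["female", "male", "female"],
    "Age": [29.0, 31.0, None]})

# Replace values and rename a column
dataframe["Sex"] = dataframe["Sex"].replace(["female", "male"],
                                            ["Woman", "Man"])
dataframe = dataframe.rename(columns={"Sex": "Gender"})

# Fill missing values, then aggregate by group
dataframe["Age"] = dataframe["Age"].fillna(dataframe["Age"].mean())
print(dataframe.groupby("Gender")["Age"].mean())
\end{verbatim}
}
\par\addvspace{1.3em}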
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Wrangling (cont)}} \tn
% Row 23
\SetRowColor{LightBackground}
Dropping Duplicate Rows, taking into account only a subset of columns & dataframe.drop\_duplicates(subset={[}'Sex'{]}, keep='last'); keep='last' is an optional argument that keeps the last observation instead of the first \tn
% Row Count 6 (+ 6)
% Row 24
\SetRowColor{white}
Grouping Rows by Values & \seqsplit{dataframe.groupby('Sex').mean()} \tn
% Row Count 8 (+ 2)
% Row 25
\SetRowColor{LightBackground}
 & dataframe.groupby({[}'Sex','Survived'{]}){[}'Age'{]}.mean() \tn
% Row Count 11 (+ 3)
% Row 26
\SetRowColor{white}
Creating a date range & \seqsplit{pd.date\_range('06/06/2017'}, periods=100000, freq='30S') \tn
% Row Count 14 (+ 3)
% Row 27
\SetRowColor{LightBackground}
Group rows by week & \seqsplit{dataframe.resample('W').sum()} \tn
% Row Count 16 (+ 2)
% Row 28
\SetRowColor{white}
Group by two weeks & \seqsplit{dataframe.resample('2W').mean()} \tn
% Row Count 18 (+ 2)
% Row 29
\SetRowColor{LightBackground}
Group by month & \seqsplit{dataframe.resample('M'}, label='left').count(); with label='left' the label returned is the first observation in the group \tn
% Row Count 24 (+ 6)
% Row 30
\SetRowColor{white}
Looping Over a Column & for name in dataframe{[}'Name'{]}{[}0:2{]}: \tn
% Row Count 26 (+ 2)
% Row 31
\SetRowColor{LightBackground}
Applying a Function Over All Elements in a Column & dataframe{[}'Name'{]}.apply(uppercase) \tn
% Row Count 29 (+ 3)
% Row 32
\SetRowColor{white}
Applying a Function to Groups & \seqsplit{dataframe.groupby('Sex').apply(lambda} x: x.count()) \tn
% Row Count 32 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Data Wrangling (cont)}} \tn
% Row 33
\SetRowColor{LightBackground}
Concatenating DataFrames by rows & pd.concat({[}dataframe\_a, dataframe\_b{]}, axis=0) \tn
% Row Count 3 (+ 3)
% Row 34
\SetRowColor{white}
Concatenating DataFrames by columns & pd.concat({[}dataframe\_a, dataframe\_b{]}, axis=1) \tn
% Row Count 6 (+ 3)
% Row 35
\SetRowColor{LightBackground}
Merging DataFrames & \seqsplit{pd.merge(dataframe\_employees}, dataframe\_sales, on='employee\_id', how='outer') \tn
% Row Count 10 (+ 4)
% Row 36
\SetRowColor{white}
 & how can also be 'left', 'right', or 'inner' \tn
% Row Count 12 (+ 2)
% Row 37
\SetRowColor{LightBackground}
If the tables have columns with different names & \seqsplit{pd.merge(dataframe\_employees}, dataframe\_sales, \seqsplit{left\_on='employee\_id'}, \seqsplit{right\_on='employee\_id')} \tn
% Row Count 17 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{replace also accepts regular expressions \newline To have full functionality with NaN we need to import the NumPy library first \newline groupby needs to be paired with some operation we want to apply to each group, such as calculating an aggregate statistic} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{x{1.89126 cm} x{3.08574 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Saving and Loading Trained Models}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Saving and Loading a scikit-learn Model} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Load libraries & from sklearn.externals import joblib \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Save model as pickle file & joblib.dump(model, "model.pkl") \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
Load model from file & classifier = \seqsplit{joblib.load("model.pkl")} \tn
% Row Count 7 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Get scikit-learn version & scikit\_version = joblib.\_\_version\_\_ \tn
% Row Count 9 (+ 2)
% Row 5
\SetRowColor{white}
Save model as pickle file & joblib.dump(model, "model\_\{version\}.pkl".format(version=scikit\_version)) \tn
% Row Count 12 (+ 3)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Saving and Loading a Keras Model} \tn
% Row Count 13 (+ 1)
% Row 7
\SetRowColor{white}
Load libraries & from keras.models import load\_model \tn
% Row Count 15 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Save neural network & \seqsplit{network.save("model.h5")} \tn
% Row Count 17 (+ 2)
% Row 9
\SetRowColor{white}
Load neural network & network = load\_model("model.h5") \tn
% Row Count 19 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{When saving scikit-learn models, be aware that saved models might not be compatible between versions of scikit-learn; therefore, it can be helpful to include the version of scikit-learn used to train the model in the filename.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}