\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{Remidy08} \pdfinfo{ /Title (scikit-learn.pdf) /Creator (Cheatography) /Author (Remidy08) /Subject (scikit-learn Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands \newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A32C1F} \definecolor{LightBackground}{HTML}{F9F1F1} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{scikit-learn Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{Remidy08} via \textcolor{DarkBackground}{\uline{cheatography.com/159206/cs/33799/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}Remidy08 \\ \uline{cheatography.com/remidy08} \\ 
\end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 6th September, 2022.\\ Page {\thepage} of \pageref{LastPage}. \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{3} \begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter}} \tn % Row 0 \SetRowColor{LightBackground} pip install jupyter & installs jupyter \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} jupyter notebook & starts jupyter notebook \tn % Row Count 3 (+ 2) % Row 2 \SetRowColor{LightBackground} Creating a notebook & go to new on the upper right and click on python \tn % Row Count 6 (+ 3) % Row 3 \SetRowColor{white} Run & shift + enter \tn % Row Count 7 (+ 1) % Row 4 \SetRowColor{LightBackground} File menu & can create a new Notebook or open a preexisting one. This is also where you would go to rename a Notebook. I think the most interesting menu item is the Save and Checkpoint option. This allows you to create checkpoints that you can roll back to if you need to. \tn % Row Count 20 (+ 13) % Row 5 \SetRowColor{white} Edit menu & Here you can cut, copy, and paste cells. This is also where you would go if you wanted to delete, split, or merge a cell. You can reorder cells here too. \tn % Row Count 28 (+ 8) % Row 6 \SetRowColor{LightBackground} View menu & useful for toggling the visibility of the header and toolbar. You can also toggle Line Numbers within cells on or off. This is also where you would go if you want to mess about with the cell's toolbar. \tn % Row Count 39 (+ 11) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter (cont)}} \tn % Row 7 \SetRowColor{LightBackground} Insert menu & just for inserting cells above or below the currently selected cell. \tn % Row Count 4 (+ 4) % Row 8 \SetRowColor{white} Cell menu & allows you to run one cell, a group of cells, or all the cells. You can also go here to change a cell's type, although the toolbar is more intuitive for that. The other handy feature in this menu is the ability to clear a cell's output. \tn % Row Count 16 (+ 12) % Row 9 \SetRowColor{LightBackground} Kernel cell & is for working with the kernel that is running in the background. Here you can restart the kernel, reconnect to it, shut it down, or even change which kernel your Notebook is using. \tn % Row Count 26 (+ 10) % Row 10 \SetRowColor{white} Widgets menu & is for saving and clearing widget state. Widgets are basically JavaScript widgets that you can add to your cells to make dynamic content using Python (or another Kernel). 
\tn
% Row Count 35 (+ 9)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter (cont)}} \tn
% Row 11
\SetRowColor{LightBackground}
Help menu & where you go to learn about the Notebook's keyboard shortcuts, take a user interface tour, and find reference material. \tn
% Row Count 7 (+ 7)
% Row 12
\SetRowColor{white}
Running tab & will tell you which Notebooks and Terminals you are currently running. \tn
% Row Count 11 (+ 4)
% Row 13
\SetRowColor{LightBackground}
cell types: Code & the cell where you write code \tn
% Row Count 13 (+ 2)
% Row 14
\SetRowColor{white}
cell types: Raw NBConvert & is only intended for special use cases when using the nbconvert command line tool. Basically it allows you to control the formatting in a very specific way when converting from a Notebook to another format. \tn
% Row Count 24 (+ 11)
% Row 15
\SetRowColor{LightBackground}
cell types: Heading & The Heading cell type is no longer supported and will display a dialog that says as much. Instead, you are supposed to use Markdown for your headings. \tn
% Row Count 32 (+ 8)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter (cont)}} \tn
% Row 16
\SetRowColor{LightBackground}
cell types: Markdown & Jupyter Notebook supports Markdown, which is a markup language that is a superset of HTML. The rows below show some of the Markdown syntax you can use in this type of cell. Once a Markdown cell is run, it is rendered as formatted text; double-click the cell to edit the source again. \tn
% Row Count 12 (+ 12)
% Row 17
\SetRowColor{white}
& \_italic\_ or *italic* \tn
% Row Count 13 (+ 1)
% Row 18
\SetRowColor{LightBackground}
& \# Header 1 \tn
% Row Count 14 (+ 1)
% Row 19
\SetRowColor{white}
& \#\# Header 2 \tn
% Row Count 15 (+ 1)
% Row 20
\SetRowColor{LightBackground}
& \#\#\# Header 3 \tn
% Row Count 16 (+ 1)
% Row 21
\SetRowColor{white}
& You can create a list (bullet points) by using dashes, plus signs, or asterisks. There needs to be a space between the marker and the text. To make sublists, indent with Tab first. \tn
% Row Count 25 (+ 9)
% Row 22
\SetRowColor{LightBackground}
& For inline code highlighting, just surround the code with backticks. If you want to insert a block of code, you can use triple backticks and also specify the programming language: \tn
% Row Count 34 (+ 9)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter (cont)}} \tn
% Row 23
\SetRowColor{LightBackground}
& ```python ... ``` in multiple lines \tn
% Row Count 2 (+ 2)
% Row 24
\SetRowColor{white}
Exporting notebooks & When you are working with Jupyter Notebooks, you will find that you need to share your results with non-technical people. When that happens, you can use the nbconvert tool which comes with Jupyter Notebook to convert or export your Notebook into one of the following formats: HTML, LaTeX, PDF, RevealJS, Markdown, reStructuredText, Executable script \tn
% Row Count 20 (+ 18)
% Row 25
\SetRowColor{LightBackground}
How to Use nbconvert & Open up a terminal and navigate to the folder that contains the Notebook you wish to convert.
The basic conversion command looks like this: jupyter nbconvert \textless{}input notebook\textgreater{} -{}-to \textless{}output format\textgreater{}. Example: jupyter nbconvert py\_examples.ipynb -{}-to pdf \tn
% Row Count 33 (+ 13)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter (cont)}} \tn
% Row 26
\SetRowColor{LightBackground}
& You can also export your currently running Notebook by going to the File menu and choosing the Download as option. This option allows you to download in all the formats that nbconvert supports. However, I recommend nbconvert, as it can export multiple Notebooks at once, which is something that the menu does not support. \tn
% Row Count 17 (+ 17)
% Row 27
\SetRowColor{white}
Extensions & A Notebook extension (nbextension) is a JavaScript module that you load in most of the views in the Notebook's frontend. \tn
% Row Count 24 (+ 7)
% Row 28
\SetRowColor{LightBackground}
Where Do I Get Extensions? & You can use Google to search for Jupyter Notebook extensions. \tn
% Row Count 28 (+ 4)
% Row 29
\SetRowColor{white}
How Do I Install Them? & jupyter nbextension install EXTENSION\_NAME \tn
% Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Jupyter (cont)}} \tn
% Row 30
\SetRowColor{LightBackground}
Enable an extension after installing it & jupyter nbextension enable EXTENSION\_NAME \tn
% Row Count 3 (+ 3)
% Row 31
\SetRowColor{white}
Installing Python packages & ! pip install package\_name -{}-user \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{If you see a greyed out menu item, try changing the cell's type and see if the item becomes available to use.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Evaluation Metrics and Scoring}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing & from sklearn.metrics import confusion\_matrix \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
& confusion = \seqsplit{confusion\_matrix(y\_test}, \seqsplit{LogisticRegression(C=0}.1).fit(X\_train, \seqsplit{y\_train).predict(X\_test))} \tn
% Row Count 9 (+ 6)
% Row 2
\SetRowColor{LightBackground}
Accuracy & \seqsplit{(TP+TN)/(TP+TN+FP+FN)} \tn
% Row Count 11 (+ 2)
% Row 3
\SetRowColor{white}
Precision (positive predictive value) & TP/(TP+FP) \tn
% Row Count 13 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Recall & TP/(TP+FN) \tn
% Row Count 14 (+ 1)
% Row 5
\SetRowColor{white}
f-score & \seqsplit{2*(precision*recall)/(precision+recall)} \tn
% Row Count 16 (+ 2)
% Row 6
\SetRowColor{LightBackground}
Importing f-score & from sklearn.metrics import f1\_score \tn
% Row Count 18 (+ 2)
% Row 7
\SetRowColor{white}
f1\_score & f1\_score(y\_test, \seqsplit{pred\_most\_frequent)} \tn
% Row Count 20 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Importing classification report & from sklearn.metrics import \seqsplit{classification\_report} \tn
% Row Count 23 (+ 3)
% Row 9
\SetRowColor{white}
& \seqsplit{classification\_report(y\_test}, pred, target\_names={[}"not nine", "nine"{]}) \tn
% Row Count 27 (+ 4)
% Row 10
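% Added example row (a minimal sketch): unpack the binary confusion matrix computed above into the counts used by the formulas above
\SetRowColor{LightBackground}
Unpack counts (sketch) & TN, FP, FN, TP = \seqsplit{confusion.ravel()} \tn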
\SetRowColor{LightBackground} Prediction threshold & \seqsplit{y\_pred\_lower\_threshold} = \seqsplit{svc.decision\_function(X\_test)} \textgreater{} -.8 \tn % Row Count 30 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Evaluation Metrics and Scoring (cont)}} \tn % Row 11 \SetRowColor{LightBackground} Classification report & \seqsplit{classification\_report(y\_test}, \seqsplit{y\_pred\_lower\_threshold)} \tn % Row Count 3 (+ 3) % Row 12 \SetRowColor{white} Importing \seqsplit{precison\_recall\_curve} & from sklearn.metrics import \seqsplit{precision\_recall\_curve} \tn % Row Count 6 (+ 3) % Row 13 \SetRowColor{LightBackground} using the curve & precision, recall, thresholds = \seqsplit{precision\_recall\_curve(} y\_test, \seqsplit{svc.decision\_function(X\_test))} \tn % Row Count 11 (+ 5) % Row 14 \SetRowColor{white} find threshold closest to zero & close\_zero = \seqsplit{np.argmin(np.abs(thresholds))} \tn % Row Count 14 (+ 3) % Row 15 \SetRowColor{LightBackground} & plt.plot(precision{[}close\_zero{]}, recall{[}close\_zero{]}, 'o', markersize=10, label="threshold zero", fillstyle="none", c='k', mew=2) \tn % Row Count 21 (+ 7) % Row 16 \SetRowColor{white} for random forest & precision\_rf, recall\_rf, thresholds\_rf = \seqsplit{precision\_recall\_curve(} y\_test, rf.predict\_proba(X\_test){[}:, 1{]}) \tn % Row Count 27 (+ 6) % Row 17 \SetRowColor{LightBackground} & plt.plot(precision\_rf{[}close\_default\_rf{]}, recall\_rf{[}close\_default\_rf{]}, '\textasciicircum{}', c='k', markersize=10, label="threshold 0.5 rf", fillstyle="none", mew=2) \tn % Row Count 35 (+ 8) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Evaluation Metrics and Scoring (cont)}} \tn % Row 18 \SetRowColor{LightBackground} & \seqsplit{plt.xlabel("Precision")} plt.ylabel("Recall") \seqsplit{plt.legend(loc="best")} \tn % Row Count 4 (+ 4) % Row 19 \SetRowColor{white} \seqsplit{average\_precision\_score} (area under the curve) & from sklearn.metrics import \seqsplit{average\_precision\_score} \tn % Row Count 7 (+ 3) % Row 20 \SetRowColor{LightBackground} & ap\_rf = \seqsplit{average\_precision\_score(y\_test}, rf.predict\_proba(X\_test){[}:, 1{]}) \tn % Row Count 11 (+ 4) % Row 21 \SetRowColor{white} & ap\_svc = \seqsplit{average\_precision\_score(y\_test}, \seqsplit{svc.decision\_function(X\_test))} \tn % Row Count 15 (+ 4) % Row 22 \SetRowColor{LightBackground} ROC curve & from sklearn.metrics import roc\_curve \tn % Row Count 17 (+ 2) % Row 23 \SetRowColor{white} & fpr, tpr, thresholds = roc\_curve(y\_test, \seqsplit{svc.decision\_function(X\_test))} \tn % Row Count 21 (+ 4) % Row 24 \SetRowColor{LightBackground} & plt.plot(fpr, tpr, label="ROC Curve") \tn % Row Count 23 (+ 2) % Row 25 \SetRowColor{white} & close\_zero = \seqsplit{np.argmin(np.abs(thresholds))} \tn % Row Count 26 (+ 3) % Row 26 \SetRowColor{LightBackground} & plt.plot(fpr{[}close\_zero{]}, tpr{[}close\_zero{]}, 'o', markersize=10, label="threshold zero", fillstyle="none", c='k', mew=2) \tn % Row Count 32 (+ 6) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Evaluation Metrics and Scoring (cont)}} \tn % Row 27 
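% Added example row (a minimal sketch): axis labels and legend for the ROC plot above, mirroring the precision-recall plot row
\SetRowColor{white}
& plt.xlabel("FPR") plt.ylabel("TPR") \seqsplit{plt.legend(loc="best")} \tn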
\SetRowColor{LightBackground}
ROC curve's AUC & from sklearn.metrics import roc\_auc\_score \tn
% Row Count 3 (+ 3)
% Row 28
\SetRowColor{white}
& rf\_auc = \seqsplit{roc\_auc\_score(y\_test}, rf.predict\_proba(X\_test){[}:, 1{]}) \tn
% Row Count 7 (+ 4)
% Row 29
\SetRowColor{LightBackground}
& svc\_auc = \seqsplit{roc\_auc\_score(y\_test}, \seqsplit{svc.decision\_function(X\_test))} \tn
% Row Count 11 (+ 4)
% Row 30
\SetRowColor{white}
Micro average & computes the total number of false positives, false negatives, and true positives over all classes, and then computes precision, recall, and f-score using these counts. \tn
% Row Count 20 (+ 9)
% Row 31
\SetRowColor{LightBackground}
& f1\_score(y\_test, pred, average="micro") \tn
% Row Count 22 (+ 2)
% Row 32
\SetRowColor{white}
Macro average & computes the unweighted per-class f-scores. This gives equal weight to all classes, no matter what their size is. \tn
% Row Count 28 (+ 6)
% Row 33
\SetRowColor{LightBackground}
& f1\_score(y\_test, pred, average="macro") \tn
% Row Count 30 (+ 2)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Evaluation Metrics and Scoring (cont)}} \tn
% Row 34
\SetRowColor{LightBackground}
To change the metric used to evaluate models in cross-validation and grid search, add the scoring argument to functions such as cross\_val\_score & scoring="accuracy" \tn
% Row Count 7 (+ 7)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{If you do set a threshold, you need to be careful not to do so using \newline the test set. As with any other parameter, setting a decision threshold \newline on the test set is likely to yield overly optimistic results. Use a \newline validation set or cross-validation instead.
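For example (a sketch, where X\_valid stands for a split held out from the training data), pick the cutoff with \seqsplit{svc.decision\_function(X\_valid)} \textgreater{} -.8 and evaluate that fixed cutoff on the test set only once.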
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Iris data set}} \tn
% Row 0
\SetRowColor{LightBackground}
importing data set & from sklearn.datasets import load\_iris \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
& iris\_dataset = load\_iris() \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
data set keys & \seqsplit{iris\_dataset.keys()} \tn
% Row Count 5 (+ 1)
% Row 3
\SetRowColor{white}
Split the data into training and testing & from \seqsplit{sklearn.model\_selection} import train\_test\_split \tn
% Row Count 8 (+ 3)
% Row 4
\SetRowColor{LightBackground}
& X\_train, X\_test, y\_train, y\_test = train\_test\_split( iris\_dataset{[}'data'{]}, iris\_dataset{[}'target'{]}, train\_size=0.n, test\_size=0.n, random\_state=0, shuffle=True (default, shuffles the data), stratify=None (default)) \tn
% Row Count 19 (+ 11)
% Row 5
\SetRowColor{white}
scatter matrix & \seqsplit{pd.plotting.scatter\_matrix(iris\_dataframe}, c=y\_train, figsize=(15, 15), marker='o', hist\_kwds=\{'bins': 20\}, s=60, alpha=.8 (transparency), cmap=mglearn.cm3) \tn
% Row Count 27 (+ 8)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.33919 cm} x{2.63781 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Supervised Learning}} \tn
% Row 0
\SetRowColor{LightBackground}
classification & the goal is to predict a class label, which is a choice from a predefined list of possibilities \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
regression & the goal is to predict a continuous number, or a floating-point number in programming terms (or real number in mathematical terms) \tn
% Row Count 12 (+ 7)
% Row 2
\SetRowColor{LightBackground}
graphic that shows nearest neighbor & \seqsplit{mglearn.plots.plot\_knn\_classification(n\_neighbors=1)} \tn
% Row Count 15 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Preprocessing and Scaling}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing & from \seqsplit{sklearn.preprocessing} import MinMaxScaler \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Shifts the data such that all features are exactly between 0 and 1 & scaler = \seqsplit{MinMaxScaler(copy=True}, feature\_range=(0, 1)) \tn
% Row Count 7 (+ 4)
% Row 2
\SetRowColor{LightBackground}
& scaler.fit(X\_train) \tn
% Row Count 8 (+ 1)
% Row 3
\SetRowColor{white}
To apply the transformation that we just learned—that is, to actually scale the training data—we use the transform method of the scaler & \seqsplit{scaler.transform(X\_train)} \tn
% Row Count 15 (+ 7)
% Row 4
\SetRowColor{LightBackground}
To apply the SVM to the scaled data, we also need to transform the test set.
& X\_test\_scaled = \seqsplit{scaler.transform(X\_test)} \tn % Row Count 19 (+ 4) % Row 5 \SetRowColor{white} learning an SVM on the scaled training data & svm = SVC(C=100) \tn % Row Count 22 (+ 3) % Row 6 \SetRowColor{LightBackground} & \seqsplit{svm.fit(X\_train\_scaled}, y\_train) \tn % Row Count 24 (+ 2) % Row 7 \SetRowColor{white} Importing & from \seqsplit{sklearn.preprocessing} import StandardScaler \tn % Row Count 27 (+ 3) % Row 8 \SetRowColor{LightBackground} preprocessing using zero mean and unit variance scaling & scaler = StandardScaler() \tn % Row Count 30 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Ridge regression}} \tn % Row 0 \SetRowColor{LightBackground} Ridge regression & is a model tuning method that is used to analyse any data that suffers from multicollinearity. This method performs L2 regularization. When the issue of multicollinearity occurs, least-squares are unbiased, and variances are large, this results in predicted values being far away from the actual values. \tn % Row Count 16 (+ 16) % Row 1 \SetRowColor{white} Importing & from \seqsplit{sklearn.linear\_model} import Ridge \tn % Row Count 18 (+ 2) % Row 2 \SetRowColor{LightBackground} Train & ridge = \seqsplit{Ridge().fit(X\_train}, y\_train) \tn % Row Count 20 (+ 2) % Row 3 \SetRowColor{white} R\textasciicircum{}2 & \seqsplit{ridge.score(X\_train}, y\_train) \tn % Row Count 22 (+ 2) % Row 4 \SetRowColor{LightBackground} \seqsplit{plt.hlines(y-indexes} where to plot the lines=0, xmin=0, \seqsplit{xmax=len(lr.coef\_))} & Plot horizontal lines at each y from xmin to xmax. \tn % Row Count 26 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{The Ridge model makes a trade-off between the simplicity of the model (near-zero \newline coefficients) and its performance on the training set. How much importance the \newline model places on simplicity versus training set performance can be specified by the \newline user, using the alpha parameter. 
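For example (a minimal sketch reusing X\_train and y\_train from above), Ridge(alpha=10).fit(X\_train, y\_train) regularizes more strongly than Ridge(alpha=0.1).fit(X\_train, y\_train); the default is alpha=1.0.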
Increasing alpha forces coefficients to move more toward zero, which decreases \newline training set performance but might help generalization.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.09034 cm} x{2.88666 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear models for classification}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing logistic regression & from sklearn.linear\_model import LogisticRegression \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Train & \seqsplit{LogisticRegression(C=100)}.fit(X\_train, y\_train) \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
Score & logreg.score(X\_train, y\_train) \tn
% Row Count 8 (+ 2)
% Row 3
\SetRowColor{white}
Predict & y\_pred = \seqsplit{LogisticRegression().fit(X\_train}, \seqsplit{y\_train).predict(X\_test)} \tn
% Row Count 11 (+ 3)
% Row 4
\SetRowColor{LightBackground}
Importing SVM & from sklearn.svm import LinearSVC \tn
% Row Count 13 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Using low values of C \newline will cause the algorithms to try to adjust to the "majority" of data points, while using \newline a higher value of C stresses the importance that each individual data point be classified \newline correctly.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Grid Search}} \tn
% Row 0
\SetRowColor{LightBackground}
validation set & X\_trainval, X\_test, y\_trainval, y\_test = train\_test\_split( iris.data, iris.target, random\_state=0) \tn
% Row Count 5 (+ 5)
% Row 1
\SetRowColor{white}
& X\_train, X\_valid, y\_train, y\_valid = train\_test\_split( X\_trainval, y\_trainval, random\_state=1) \tn
% Row Count 10 (+ 5)
% Row 2
\SetRowColor{LightBackground}
Grid Search with Cross-Validation & from \seqsplit{sklearn.model\_selection} import GridSearchCV \tn
% Row Count 13 (+ 3)
% Row 3
\SetRowColor{white}
Training & grid\_search = GridSearchCV(SVC(), param\_grid, cv=5) \tn
% Row Count 16 (+ 3)
% Row 4
\SetRowColor{LightBackground}
Find best parameters & \seqsplit{grid\_search.best\_params\_} \tn
% Row Count 18 (+ 2)
% Row 5
\SetRowColor{white}
Return best score & \seqsplit{grid\_search.best\_score\_} \tn
% Row Count 20 (+ 2)
% Row 6
\SetRowColor{LightBackground}
best\_estimator\_ & access the model with the best parameters trained on the whole training set \tn
% Row Count 24 (+ 4)
% Row 7
\SetRowColor{white}
Results of a grid search can be found in & \seqsplit{grid\_search.cv\_results\_} \tn
% Row Count 27 (+ 3)
% Row 8
\SetRowColor{LightBackground}
CV grid search & GridSearchCV(SVC(), param\_grid, cv=5) \tn
% Row Count 29 (+ 2)
% Row 9
\SetRowColor{white}
& param\_grid = {[}\{'kernel': {[}'rbf'{]}, 'C': {[}0.001, 0.01, 0.1, 1, 10, 100{]}, 'gamma': {[}0.001, 0.01, 0.1, 1, 10, 100{]}\}, \{'kernel': {[}'linear'{]}, 'C': {[}0.001, 0.01, 0.1, 1, 10, 100{]}\}{]} \tn
% Row Count 38 (+ 9)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.43873 cm} x{2.53827 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Grid Search (cont)}} \tn
% Row 10
\SetRowColor{LightBackground}
nested cross-validation & scores = \seqsplit{cross\_val\_score(GridSearchCV(SVC()}, param\_grid, cv=5), iris.data, iris.target, cv=5)
\tn
% Row Count 5 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Grid search is a tuning technique that attempts to compute the optimum values of hyperparameters. It is an exhaustive search performed on the specified parameter values of a model.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.23965 cm} x{2.73735 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Decision trees}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing & from sklearn.tree import \seqsplit{DecisionTreeClassifier} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Tree & tree = \seqsplit{DecisionTreeClassifier(random\_state=0)} \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
Train & tree.fit(X\_train, y\_train) \tn
% Row Count 8 (+ 2)
% Row 3
\SetRowColor{white}
Score & tree.score(X\_train, y\_train) \tn
% Row Count 10 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Pre-pruning & Argument in \seqsplit{DecisionTreeClassifier:} max\_depth=4 \tn
% Row Count 13 (+ 3)
% Row 5
\SetRowColor{white}
Other arguments & max\_leaf\_nodes, or min\_samples\_leaf \tn
% Row Count 15 (+ 2)
% Row 6
\SetRowColor{LightBackground}
Import tree diagram & from sklearn.tree import export\_graphviz \tn
% Row Count 17 (+ 2)
% Row 7
\SetRowColor{white}
Build tree diagram & export\_graphviz(tree, out\_file="tree.dot", class\_names={[}"malignant", "benign"{]}, \seqsplit{feature\_names=cancer.feature\_names}, impurity=False, filled=True) \tn
% Row Count 24 (+ 7)
% Row 8
\SetRowColor{LightBackground}
Feature importance & \seqsplit{tree.feature\_importances\_} \tn
% Row Count 26 (+ 2)
% Row 9
\SetRowColor{white}
Predict & tree.predict(X\_all) \tn
% Row Count 27 (+ 1)
% Row 10
\SetRowColor{LightBackground}
Decision tree regressor importing & from sklearn.tree import DecisionTreeRegressor \tn
% Row Count 30 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.23965 cm} x{2.73735 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Decision trees (cont)}} \tn
% Row 11
\SetRowColor{LightBackground}
Train & \seqsplit{DecisionTreeRegressor()}.fit(X\_train, y\_train) \tn
% Row Count 3 (+ 3)
% Row 12
\SetRowColor{white}
log & y\_train = \seqsplit{np.log(data\_train.price)} \tn
% Row Count 5 (+ 2)
% Row 13
\SetRowColor{LightBackground}
exponential & np.exp(pred\_tree) \tn
% Row Count 6 (+ 1)
% Row 14
\SetRowColor{white}
Random Forest import & from sklearn.ensemble import \seqsplit{RandomForestClassifier} \tn
% Row Count 9 (+ 3)
% Row 15
\SetRowColor{LightBackground}
Random Forest & forest = \seqsplit{RandomForestClassifier(n\_estimators=5}, random\_state=2) \tn
% Row Count 12 (+ 3)
% Row 16
\SetRowColor{white}
Train & forest.fit(X\_train, y\_train) \tn
% Row Count 14 (+ 2)
% Row 17
\SetRowColor{LightBackground}
Gradient boosted trees import & from sklearn.ensemble import \seqsplit{GradientBoostingClassifier} \tn
% Row Count 17 (+ 3)
% Row 18
\SetRowColor{white}
Gradient boost & gbrt = \seqsplit{GradientBoostingClassifier(random\_state=0)} \tn
% Row Count 20 (+ 3)
% Row 19
\SetRowColor{LightBackground}
Train & gbrt.fit(X\_train, y\_train) \tn
% Row Count 22 (+ 2)
% Row 20
\SetRowColor{white}
Score & gbrt.score(X\_test, y\_test) \tn
% Row Count 24 (+ 2)
% Row 21
\SetRowColor{LightBackground}
Arguments & max\_depth, learning\_rate \tn
% Row Count 26 (+ 2)
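% Added example row (a minimal sketch): the arguments listed above applied to the gbrt model
\SetRowColor{white}
Example using those arguments (sketch) & gbrt = \seqsplit{GradientBoostingClassifier(random\_state=0}, max\_depth=1, learning\_rate=0.01) \tn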
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Often the default parameters of the random forest already work quite well. \newline You can set n\_jobs=-1 to use all the cores in \newline your computer in the random forest. \newline In general, it's a good rule of thumb to use \newline the default values: \seqsplit{max\_features=sqrt(n\_features)} for classification and \seqsplit{max\_features=log2(n\_features)} for regression. \newline Gradient boosted trees are frequently the winning entries in machine learning competitions, and are widely used in industry. \newline Try random forests first, then gradient boosting.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.63781 cm} x{2.33919 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Uncertainty Estimates from Classifiers}} \tn
% Row 0
\SetRowColor{LightBackground}
Evaluate the decision function for the samples in X. & model.decision\_function(X\_test){[}:6{]} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Return the predicted probabilities for all classes & model.predict\_proba(X\_test{[}:6{]}) \tn
% Row Count 6 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{A model is called calibrated if the \newline reported uncertainty actually matches how correct it is—in a calibrated model, a prediction \newline made with 70\% certainty would be correct 70\% of the time. \newline To summarize, predict\_proba and decision\_function always have shape (n\_samples, n\_classes)—apart from decision\_function in the special binary case. In the \newline binary case, decision\_function only has one column, corresponding to the "positive" \newline class, classes\_{[}1{]}.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Feature selection}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing variance threshold & from \seqsplit{sklearn.feature\_selection} import VarianceThreshold \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Removing features with low variance & sel = \seqsplit{VarianceThreshold(threshold=(}.8 * (1 - .8))) \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
& \seqsplit{sel.fit\_transform(X)} \tn
% Row Count 7 (+ 1)
% Row 3
\SetRowColor{white}
SelectKBest & removes all but the k highest scoring features \tn
% Row Count 10 (+ 3)
% Row 4
\SetRowColor{LightBackground}
SelectPercentile & removes all but a user-specified highest scoring percentage of features using common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe. \tn
% Row Count 22 (+ 12)
% Row 5
\SetRowColor{white}
\seqsplit{GenericUnivariateSelect} & allows you to perform univariate feature selection with a configurable strategy.
\tn
% Row Count 26 (+ 4)
% Row 6
\SetRowColor{LightBackground}
Importing SelectKBest & from \seqsplit{sklearn.feature\_selection} import SelectKBest \tn
% Row Count 29 (+ 3)
% Row 7
\SetRowColor{white}
Importing chi2 & from \seqsplit{sklearn.feature\_selection} import chi2 \tn
% Row Count 32 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Feature selection (cont)}} \tn
% Row 8
\SetRowColor{LightBackground}
& X\_new = SelectKBest(chi2, \seqsplit{k=2).fit\_transform(X}, y) \tn
% Row Count 3 (+ 3)
% Row 9
\SetRowColor{white}
Recursive feature elimination & from \seqsplit{sklearn.feature\_selection} import RFE \tn
% Row Count 6 (+ 3)
% Row 10
\SetRowColor{LightBackground}
& rfe = RFE(estimator=svc, \seqsplit{n\_features\_to\_select=1}, step=1) \tn
% Row Count 9 (+ 3)
% Row 11
\SetRowColor{white}
& rfe.fit(X, y) \tn
% Row Count 10 (+ 1)
% Row 12
\SetRowColor{LightBackground}
Recursive feature elimination with cross-validation & from \seqsplit{sklearn.feature\_selection} import RFECV \tn
% Row Count 13 (+ 3)
% Row 13
\SetRowColor{white}
& rfecv = RFECV( estimator=svc, step=1, \seqsplit{cv=StratifiedKFold(2)}, scoring="accuracy", \seqsplit{min\_features\_to\_select=min\_features\_to\_select}, ) \tn
% Row Count 21 (+ 8)
% Row 14
\SetRowColor{LightBackground}
Import StratifiedKFold & from \seqsplit{sklearn.model\_selection} import StratifiedKFold \tn
% Row Count 24 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation and Improvement}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing cross-validation & from \seqsplit{sklearn.model\_selection} import cross\_val\_score \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Cross-validation & scores = \seqsplit{cross\_val\_score(model}, data, target, cv=5) (pass the model unfitted) \tn
% Row Count 7 (+ 4)
% Row 2
\SetRowColor{LightBackground}
Summarizing cross-validation scores & scores.mean() \tn
% Row Count 9 (+ 2)
% Row 3
\SetRowColor{white}
stratified k-fold cross-validation & In stratified cross-validation, we split the data such that the proportions between classes are the same in each fold as they are in the whole dataset \tn
% Row Count 17 (+ 8)
% Row 4
\SetRowColor{LightBackground}
Provides train/test indices to split data in train/test sets.
& KFold(n\_splits=5, *, shuffle=False, random\_state=None) \tn
% Row Count 21 (+ 4)
% Row 5
\SetRowColor{white}
& \seqsplit{cross\_val\_score(logreg}, iris.data, iris.target, cv=kfold) \tn
% Row Count 24 (+ 3)
% Row 6
\SetRowColor{LightBackground}
Importing Leave-one-out cross-validation & from \seqsplit{sklearn.model\_selection} import LeaveOneOut \tn
% Row Count 27 (+ 3)
% Row 7
\SetRowColor{white}
Leave-one-out cross-validation & loo = LeaveOneOut() \tn
% Row Count 29 (+ 2)
% Row 8
\SetRowColor{LightBackground}
& scores = \seqsplit{cross\_val\_score(logreg}, iris.data, iris.target, cv=loo) \tn
% Row Count 33 (+ 4)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Model Evaluation and Improvement (cont)}} \tn
% Row 9
\SetRowColor{LightBackground}
shuffle-split cross-validation & each split samples train\_size many points for the training set and test\_size many (disjoint) points for the test set \tn
% Row Count 6 (+ 6)
% Row 10
\SetRowColor{white}
import shuffle-split & from \seqsplit{sklearn.model\_selection} import ShuffleSplit \tn
% Row Count 9 (+ 3)
% Row 11
\SetRowColor{LightBackground}
& shuffle\_split = \seqsplit{ShuffleSplit(test\_size=}.5, train\_size=.5, n\_splits=10) \tn
% Row Count 13 (+ 4)
% Row 12
\SetRowColor{white}
& scores = \seqsplit{cross\_val\_score(logreg}, iris.data, iris.target, cv=shuffle\_split) \tn
% Row Count 17 (+ 4)
% Row 13
\SetRowColor{LightBackground}
Takes an array of groups as an argument & GroupKFold \tn
% Row Count 20 (+ 3)
% Row 14
\SetRowColor{white}
Import GroupKFold & from \seqsplit{sklearn.model\_selection} import GroupKFold \tn
% Row Count 23 (+ 3)
% Row 15
\SetRowColor{LightBackground}
& scores = \seqsplit{cross\_val\_score(logreg}, X, y, groups, \seqsplit{cv=GroupKFold(n\_splits=3))} \tn
% Row Count 27 (+ 4)
% Row 16
\SetRowColor{white}
Predicting with cross-validation & \seqsplit{sklearn.model\_selection.cross\_val\_predict(estimator}, X, y=None, *, groups=None, cv=None, n\_jobs=None, verbose=0, fit\_params=None, pre\_dispatch='2*n\_jobs', method='predict') \tn
% Row Count 36 (+ 9)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{0.89586 cm} x{4.08114 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Multilayer perceptrons (MLPs) or neural networks}} \tn
% Row 0
\SetRowColor{LightBackground}
\seqsplit{Importing} & from sklearn.neural\_network import MLPClassifier \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Train & mlp = \seqsplit{MLPClassifier(solver='lbfgs'}, activation='tanh', random\_state=0, hidden\_layer\_sizes={[}10,10{]}).fit(X\_train, y\_train) \tn
% Row Count 6 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{There can be more than one hidden layer; to use several, pass a list to hidden\_layer\_sizes. \newline If we want a smoother decision boundary, we could add more hidden units, add a second hidden layer, or use the tanh nonlinearity} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{1.4931 cm} x{3.4839 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Naive Bayes Classifiers}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing & from sklearn.naive\_bayes import GaussianNB \tn
% Row Count 2 (+ 2)
% Row 1
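% Added example row (a minimal sketch): instantiate the gnb object used in the next row
\SetRowColor{LightBackground}
Create the classifier (the gnb used below) & gnb = GaussianNB() \tn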
\SetRowColor{white} Train and predict & y\_pred = gnb.fit(X\_train, \seqsplit{y\_train).predict(X\_test)} \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} Function & class \seqsplit{sklearn.naive\_bayes.GaussianNB(*}, priors=None, var\_smoothing=1e-09) \tn % Row Count 7 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{There are three kinds of naive Bayes classifiers implemented in scikit-learn: GaussianNB, BernoulliNB, and MultinomialNB. GaussianNB can be applied to \newline any continuous data, while BernoulliNB assumes binary data and MultinomialNB \newline assumes count data (that is, that each feature represents an integer count of something} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear models for multiclass classification}} \tn % Row 0 \SetRowColor{LightBackground} Importing & from sklearn.svm import LinearSVC \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} Train linear SVC & linear\_svm = LinearSVC().fit(X, y) \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} Import SVC & from sklearn.svm import SVC \tn % Row Count 6 (+ 2) % Row 3 \SetRowColor{white} train & svm = SVC(kernel='rbf' (function to use with the kernel trick), C=10 (regularization parameter) , gamma=0.1 (controls the width of the Gaussian kernel)).fit(X, y) \tn % Row Count 15 (+ 9) % Row 4 \SetRowColor{LightBackground} plot support vectors & sv= \seqsplit{svm.support\_vectors\_} \tn % Row Count 17 (+ 2) % Row 5 \SetRowColor{white} class labels of support vectors are given by the sign of the dual coefficients & sv\_labels = \seqsplit{svm.dual\_coef\_.ravel()} \textgreater{} 0 \tn % Row Count 21 (+ 4) % Row 6 \SetRowColor{LightBackground} Rescaling method for kernel SVMs & min\_on\_training = \seqsplit{X\_train.min(axis=0)} \tn % Row Count 23 (+ 2) % Row 7 \SetRowColor{white} & range\_on\_training = (X\_train - \seqsplit{min\_on\_training).max(axis=0)} \tn % Row Count 26 (+ 3) % Row 8 \SetRowColor{LightBackground} & X\_train\_scaled = (X\_train - min\_on\_training) / range\_on\_training \tn % Row Count 30 (+ 4) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear models for multiclass classification (cont)}} \tn % Row 9 \SetRowColor{LightBackground} & X\_test\_scaled = (X\_test - min\_on\_training) / range\_on\_training \tn % Row Count 4 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{common \newline technique to extend a binary classification algorithm to a multiclass classification \newline algorithm is the one-vs.-rest approach. 
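As a quick check with the LinearSVC trained above (a sketch), linear\_svm.coef\_.shape is (n\_classes, n\_features) and linear\_svm.intercept\_.shape is (n\_classes,).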
In the one-vs.-rest approach, a binary model is \newline learned for each class that tries to separate that class from all of the other classes, \newline resulting in as many binary models as there are classes.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{1.4931 cm} x{3.4839 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Lasso}} \tn
% Row 0
\SetRowColor{LightBackground}
Lasso & Using the lasso also restricts coefficients to be close to zero, but in a slightly different way, called L1 regularization. The consequence of L1 regularization is that when using the lasso, some coefficients are exactly zero. This means some features are entirely ignored by the model. \tn
% Row Count 11 (+ 11)
% Row 1
\SetRowColor{white}
Importing & from sklearn.linear\_model import Lasso \tn
% Row Count 13 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Train & lasso = Lasso(alpha=0.01, \seqsplit{max\_iter=100000).fit(X\_train}, y\_train) \tn
% Row Count 16 (+ 3)
% Row 3
\SetRowColor{white}
R\textasciicircum{}2 & lasso.score(X\_train, y\_train) \tn
% Row Count 18 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\seqsplit{Coefficients} used & np.sum(lasso.coef\_ != 0) \tn
% Row Count 20 (+ 2)
% Row 5
\SetRowColor{white}
Figure legend & plt.legend() \tn
% Row Count 22 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{In practice, ridge regression is usually the first choice between these two models. \newline However, if you have a large number of features and expect only a few of them to be \newline important, Lasso might be a better choice. \newline Note: There is a class called ElasticNet, which combines the penalties of Lasso and Ridge.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Linear models for regression}} \tn
% Row 0
\SetRowColor{LightBackground}
Importing & from \seqsplit{sklearn.linear\_model} import LinearRegression \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Split data set (from \seqsplit{sklearn.model\_selection} import train\_test\_split) & X\_train, X\_test, y\_train, y\_test = \seqsplit{train\_test\_split(X}, y, random\_state=42) \tn
% Row Count 7 (+ 4)
% Row 2
\SetRowColor{LightBackground}
Linear regression & lr = \seqsplit{LinearRegression().fit(X\_train}, y\_train) \tn
% Row Count 10 (+ 3)
% Row 3
\SetRowColor{white}
Slope & lr.coef\_ \tn
% Row Count 11 (+ 1)
% Row 4
\SetRowColor{LightBackground}
Intercept & lr.intercept\_ \tn
% Row Count 12 (+ 1)
% Row 5
\SetRowColor{white}
R\textasciicircum{}2 & lr.score(X\_train, y\_train) \tn
% Row Count 14 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{scikit-learn always stores anything \newline that is derived from the training data in attributes that end with a \newline trailing underscore.
That is to separate them from parameters that \newline are set by the user.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.33919 cm} x{2.63781 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{k-nearest neighbors}} \tn % Row 0 \SetRowColor{LightBackground} Importing & from sklearn.neighbors import KNeighborsClassifier \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} k-nearest neighbors & knn = \seqsplit{KNeighborsClassifier(n\_neighbors=1(number} of neighbors)) \tn % Row Count 6 (+ 3) % Row 2 \SetRowColor{LightBackground} Building a model on the training set & knn.fit(X\_train, y\_train) \tn % Row Count 8 (+ 2) % Row 3 \SetRowColor{white} & The fit method returns the knn object itself (and modifies it in place), so we get a string representation of our classifier. The representation shows us which parameters were used in creating the model. \tn % Row Count 18 (+ 10) % Row 4 \SetRowColor{LightBackground} Predictions & prediction = knn.predict(data) \tn % Row Count 20 (+ 2) % Row 5 \SetRowColor{white} Accuracy & np.mean(y\_pred == y\_test)) \tn % Row Count 22 (+ 2) % Row 6 \SetRowColor{LightBackground} & knn.score(X\_test, y\_test) \tn % Row Count 24 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}--} \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{The k-nearest neighbors classification algorithm \newline is implemented in the KNeighborsClassifier class in the neighbors module.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}