\documentclass[10pt,a4paper]{article} % Packages \usepackage{fancyhdr} % For header and footer \usepackage{multicol} % Allows multicols in tables \usepackage{tabularx} % Intelligent column widths \usepackage{tabulary} % Used in header and footer \usepackage{hhline} % Border under tables \usepackage{graphicx} % For images \usepackage{xcolor} % For hex colours %\usepackage[utf8x]{inputenc} % For unicode character support \usepackage[T1]{fontenc} % Without this we get weird character replacements \usepackage{colortbl} % For coloured tables \usepackage{setspace} % For line height \usepackage{lastpage} % Needed for total page number \usepackage{seqsplit} % Splits long words. %\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely. \usepackage[normalem]{ulem} % For underlining links % Most of the following are not required for the majority % of cheat sheets but are needed for some symbol support. \usepackage{amsmath} % Symbols \usepackage{MnSymbol} % Symbols \usepackage{wasysym} % Symbols %\usepackage[english,german,french,spanish,italian]{babel} % Languages % Document Info \author{elhamsh} \pdfinfo{ /Title (supervised-learning-with-scikit-learn.pdf) /Creator (Cheatography) /Author (elhamsh) /Subject (Supervised Learning with scikit-learn Cheat Sheet) } % Lengths and widths \addtolength{\textwidth}{6cm} \addtolength{\textheight}{-1cm} \addtolength{\hoffset}{-3cm} \addtolength{\voffset}{-2cm} \setlength{\tabcolsep}{0.2cm} % Space between columns \setlength{\headsep}{-12pt} % Reduce space between header and content \setlength{\headheight}{85pt} % If less, LaTeX automatically increases it \renewcommand{\footrulewidth}{0pt} % Remove footer line \renewcommand{\headrulewidth}{0pt} % Remove header line \renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit % This two commands together give roughly % the right line height in the tables \renewcommand{\arraystretch}{1.3} \onehalfspacing % Commands 
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour \newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols \newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns \newcommand{\tn}{\tabularnewline} % Required as custom column type in use % Font and Colours \definecolor{HeadBackground}{HTML}{333333} \definecolor{FootBackground}{HTML}{666666} \definecolor{TextColor}{HTML}{333333} \definecolor{DarkBackground}{HTML}{A3A3A3} \definecolor{LightBackground}{HTML}{F3F3F3} \renewcommand{\familydefault}{\sfdefault} \color{TextColor} % Header and Footer \pagestyle{fancy} \fancyhead{} % Set header to blank \fancyfoot{} % Set footer to blank \fancyhead[L]{ \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{C} \SetRowColor{DarkBackground} \vspace{-7pt} {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}} } \end{tabulary} \columnbreak \begin{tabulary}{11cm}{L} \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Supervised Learning with scikit-learn Cheat Sheet}}}} \\ \normalsize{by \textcolor{DarkBackground}{elhamsh} via \textcolor{DarkBackground}{\uline{cheatography.com/31327/cs/14694/}}} \end{tabulary} \end{multicols}} \fancyfoot[L]{ \footnotesize \noindent \begin{multicols}{3} \begin{tabulary}{5.8cm}{LL} \SetRowColor{FootBackground} \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}} \\ \vspace{-2pt}elhamsh \\ \uline{cheatography.com/elhamsh} \\ \end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}} \\ \vspace{-2pt}Not Yet Published.\\ Updated 13th February, 2018.\\ Page {\thepage} of \pageref{LastPage}. 
\end{tabulary} \vfill \columnbreak \begin{tabulary}{5.8cm}{L} \SetRowColor{FootBackground} \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}} \\ \SetRowColor{white} \vspace{-5pt} %\includegraphics[width=48px,height=48px]{dave.jpeg} Measure your website readability!\\ www.readability-score.com \end{tabulary} \end{multicols}} \begin{document} \raggedright \raggedcolumns % Set font size to small. Switch to any value % from this page to resize cheat sheet text: % www.emerson.emory.edu/services/latex/latex_169.html \footnotesize % Small font. \begin{multicols*}{3} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Initial Data Processing}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{df.info()} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{df.shape} \tn % Row Count 2 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{df.head()} \tn % Row Count 3 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{df.describe()} \tn % Row Count 4 (+ 1) % Row 4 \SetRowColor{LightBackground} plt.figure() \seqsplit{sns.countplot(x='education'}, hue='party', data=df, palette='RdBu') plt.xticks({[}0,1{]}, {[}'No', 'Yes'{]}) plt.show() & In sns.countplot(), we specify the x-axis data to be 'education', and hue to be 'party'. Recall that 'party' is also our target variable. So the resulting plot shows the difference in voting behavior between the two parties for the 'education' bill, with each party colored differently. We manually specified the color to be 'RdBu', as the Republican party has been traditionally associated with red, and the Democratic party with blue. 
\tn % Row Count 26 (+ 22) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Unsupervised}} \tn % Row 0 \SetRowColor{LightBackground} from sklearn.cluster import KMeans & \# Import KMeans \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} model = \seqsplit{KMeans(n\_clusters=3)} & \# Create a KMeans instance with 3 clusters: model \tn % Row Count 5 (+ 3) % Row 2 \SetRowColor{LightBackground} model.fit(points) & \# Fit model to points \tn % Row Count 7 (+ 2) % Row 3 \SetRowColor{white} labels = \seqsplit{model.predict(new\_points)} & \# Determine the cluster labels of new\_points: labels \tn % Row Count 10 (+ 3) % Row 4 \SetRowColor{LightBackground} centroids = \seqsplit{model.cluster\_centers\_} & Assign the cluster centers: centroids. Note that model was \seqsplit{KMeans(n\_clusters=k)} \tn % Row Count 15 (+ 5) % Row 5 \SetRowColor{white} df = pd.DataFrame(\{'NameOfArray1': array1, 'NameOfArray2': array2\}) & Create a DataFrame with arrays as columns: df \tn % Row Count 19 (+ 4) % Row 6 \SetRowColor{LightBackground} pd.crosstab(df{[}'NameOfArray1'{]}, df{[}'NameOfArray2'{]}) & It is a table that contains the counts of the number of times each array2 label coincides with each array1 label. 
\tn % Row Count 25 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Classification}} \tn % Row 0 \SetRowColor{LightBackground} X = \seqsplit{df.drop('targetvariable'}, axis=1).values & Note the use of .drop() to drop the target variable from the feature array X as well as the use of the .values attribute to ensure X are NumPy arrays \tn % Row Count 8 (+ 8) % Row 1 \SetRowColor{white} knn = \seqsplit{KNeighborsClassifier(n\_neighbors=6)} & Instantiate a \seqsplit{KNeighborsClassifier} called knn with 6 neighbors by specifying the n\_neighbors parameter. \tn % Row Count 14 (+ 6) % Row 2 \SetRowColor{LightBackground} knn.fit(X, y) & Fit the classifier to the data using the .fit() method. X is the features, y is the target variable \tn % Row Count 19 (+ 5) % Row 3 \SetRowColor{white} from sklearn.neighbors import \seqsplit{KNeighborsClassifier} & Import \seqsplit{KNeighborsClassifier} from sklearn.neighbors \tn % Row Count 22 (+ 3) % Row 4 \SetRowColor{LightBackground} knn.predict(X\_new) & Predict for the new data point X\_new \tn % Row Count 24 (+ 2) % Row 5 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{from sklearn.model\_selection import train\_test\_split} \tn % Row Count 26 (+ 2) % Row 6 \SetRowColor{LightBackground} X\_train, X\_test, y\_train, y\_test = \seqsplit{train\_test\_split(X}, y, test\_size = .2, random\_state=42, stratify=y) & Create stratified training and test sets using 0.2 for the size of the test set. Use a random state of 42. Stratify the split according to the labels so that they are distributed in the training and test sets as they are in the original dataset. 
\tn % Row Count 39 (+ 13) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Classification (cont)}} \tn % Row 7 \SetRowColor{LightBackground} knn.score(X\_test, y\_test) & Compute and print the accuracy of the classifier's predictions using the .score() method. \tn % Row Count 5 (+ 5) % Row 8 \SetRowColor{white} np.arange(1, 9) & numpy array from 1 to 8 = np.arange(1, 9) \tn % Row Count 7 (+ 2) % Row 9 \SetRowColor{LightBackground} for counter, value in \seqsplit{enumerate(some\_list):} print(counter, value) & Enumerate is a built-in function of Python. Its usefulness cannot be summarized in a single line. Yet most of the newcomers and even some advanced programmers are unaware of it. It allows us to loop over something and have an automatic counter. \tn % Row Count 20 (+ 13) % Row 10 \SetRowColor{white} my\_list = {[}'apple', 'banana', 'grapes', 'pear'{]} for c, value in enumerate(my\_list, 1): print(c, value) & Output: \# 1 apple \# 2 banana \# 3 grapes \# 4 pear \tn % Row Count 26 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Regression}} \tn % Row 0 \SetRowColor{LightBackground} df{[}'ColName1'{]}.corr(df{[}'ColName2'{]}) & Calculate the correlation between ColName1 and ColName2 in dataframe df \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} \seqsplit{numpy.linspace(start}, stop, num = 50, endpoint = True, retstep = False, dtype = None) & Returns number spaces evenly w.r.t interval. Similar to arange but instead of step it uses sample number. Parameters : -\textgreater{} start : {[}optional{]} start of interval range. By default start = 0 -\textgreater{} stop : end of interval range -\textgreater{} retstep : If True, return (samples, step). 
By default retstep = False -\textgreater{} num : {[}int, optional{]} No. of samples to generate -\textgreater{} dtype : type of output array \tn % Row Count 24 (+ 20) % Row 2 \SetRowColor{LightBackground} from \seqsplit{sklearn.linear\_model} import LinearRegression & Import LinearRegression \tn % Row Count 27 (+ 3) % Row 3 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{from sklearn.metrics import mean\_squared\_error} \tn % Row Count 28 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{from sklearn.metrics import mean\_squared\_error} \tn % Row Count 29 (+ 1) % Row 5 \SetRowColor{white} \seqsplit{mean\_squared\_error(y\_true}, y\_pred, sample\_weight=None, \seqsplit{multioutput='uniform\_average')} & Mean squared error regression loss \tn % Row Count 34 (+ 5) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Regression (cont)}} \tn % Row 6 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{from sklearn.model\_selection import cross\_val\_score} \tn % Row Count 2 (+ 2) % Row 7 \SetRowColor{white} reg = LinearRegression() & Create a linear regression object: reg \tn % Row Count 4 (+ 2) % Row 8 \SetRowColor{LightBackground} cv\_scores = \seqsplit{cross\_val\_score(reg}, X, y, cv=5) & Compute 5-fold cross-validation scores: cv\_scores \tn % Row Count 7 (+ 3) % Row 9 \SetRowColor{white} from \seqsplit{sklearn.linear\_model} import Lasso & Import Lasso \tn % Row Count 9 (+ 2) % Row 10 \SetRowColor{LightBackground} lasso = Lasso(alpha=0.4, normalize=True) & \# Instantiate a lasso regressor: lasso \tn % Row Count 11 (+ 2) % Row 11 \SetRowColor{white} lasso.fit(X, y) & \# Fit the regressor to the data \tn % Row Count 13 (+ 2) % Row 12 \SetRowColor{LightBackground} lasso\_coef = lasso.coef\_ & \# Compute and print the coefficients \tn % Row Count 15 (+ 2) % Row 13 \SetRowColor{white} from 
\seqsplit{sklearn.linear\_model} import Ridge & \# Import necessary modules \tn % Row Count 17 (+ 2) % Row 14 \SetRowColor{LightBackground} def \seqsplit{display\_plot(cv\_scores}, cv\_scores\_std): fig = plt.figure() ax = fig.add\_subplot(1,1,1) \seqsplit{ax.plot(alpha\_space}, cv\_scores) std\_error = cv\_scores\_std / np.sqrt(10) \seqsplit{ax.fill\_between(alpha\_space}, cv\_scores + std\_error, cv\_scores - std\_error, alpha=0.2) ax.set\_ylabel('CV Score +/- Std Error') \seqsplit{ax.set\_xlabel('Alpha')} \seqsplit{ax.axhline(np.max(cv\_scores)}, linestyle='-{}-', color='.5') ax.set\_xlim({[}alpha\_space{[}0{]}, alpha\_space{[}-1{]}{]}) \seqsplit{ax.set\_xscale('log')} plt.show() & You will practice fitting ridge regression models over a range of different alphas, and plot cross-validated R2 scores for each, using this function that we have defined for you, which plots the R2 score as well as standard error for each alpha: \tn % Row Count 42 (+ 25) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{Regression (cont)}} \tn % Row 15 \SetRowColor{LightBackground} \seqsplit{cross\_val\_score(Ridge(normalize=True)}, X, y, cv=10) & Perform 10-fold CV for Ridge Regression. \tn % Row Count 3 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}