\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{Anoikis}
\pdfinfo{
  /Title (scikit-learn.pdf)
  /Creator (Cheatography)
  /Author (Anoikis)
  /Subject (scikit-learn Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}   % Space between columns
\setlength{\headsep}{-12pt}     % Reduce space between header and content
\setlength{\headheight}{85pt}   % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{AAAAAA}
\definecolor{LightBackground}{HTML}{F4F4F4}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{scikit-learn Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{Anoikis} via \textcolor{DarkBackground}{\uline{cheatography.com/74320/cs/18910/}}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}Anoikis \\
  \uline{cheatography.com/anoikis} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 25th November, 2019.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Logistic regression}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{import numpy as np \newline from sklearn.linear\_model import LogisticRegression, LogisticRegressionCV \newline from sklearn.pipeline import make\_pipeline \newline from sklearn.model\_selection import StratifiedKFold, GridSearchCV \newline from sklearn.preprocessing import PolynomialFeatures \newline from \seqsplit{sklearn.feature\_extraction.text} import CountVectorizer \newline \newline \# Create classifier \newline logit = \seqsplit{LogisticRegression(solver='lbfgs'}, n\_jobs=-1, random\_state=7) \newline \newline \# Fit and predict right away... \newline logit.fit(X\_train, y\_train) \newline logit.score(X\_test, y\_test) \newline \newline \# ... or use cross-validation for parameter tuning \newline \#\# solution 1 \newline skf = \seqsplit{StratifiedKFold(n\_splits=5}, shuffle=True, random\_state=17) \newline c\_values = np.logspace(-2, 3, 500) \newline grid\_logit = \seqsplit{LogisticRegressionCV(Cs=c\_values}, cv=skf, verbose=1, n\_jobs=-1) \newline grid\_logit.fit(X\_poly, y) \#\# X\_poly built below \newline \#\# solution 2: grid search over a pipeline (text\_pipe\_logit is built below) \newline param\_grid\_logit = \{'logisticregression\_\_C': np.logspace(-5, 0, 6)\} \newline grid\_logit = \seqsplit{GridSearchCV(text\_pipe\_logit}, param\_grid\_logit, \seqsplit{return\_train\_score=True}, cv=3, n\_jobs=-1) \newline grid\_logit.fit(text\_train, y\_train) \newline \newline \# Check accuracy and model parameters \newline logit.score(X\_test, y\_test) \newline grid\_logit.best\_params\_, \seqsplit{grid\_logit.best\_score\_} \newline \newline \# Add polynomial features in case of non-linear boundaries \newline poly = \seqsplit{PolynomialFeatures(degree=2)} \#\# try degrees 1 to 7 \newline X\_poly = poly.fit\_transform(X) \newline \newline \# Special case of text processing \newline \#\# the estimator's n\_jobs \textgreater{} 1 won't work together with GridSearchCV's n\_jobs \textgreater{} 1, hence n\_jobs=1 here \newline text\_pipe\_logit = \seqsplit{make\_pipeline(CountVectorizer()}, \seqsplit{LogisticRegression(solver='lbfgs'}, n\_jobs=1, random\_state=7)) \newline \#\# then run the usual GridSearchCV on text\_pipe\_logit} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Decision Trees}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{from sklearn.tree import \seqsplit{DecisionTreeClassifier/Regressor} \newline from sklearn.model\_selection import GridSearchCV, cross\_val\_score \newline from sklearn.metrics import accuracy\_score \newline from sklearn.tree import export\_graphviz \newline import pydotplus \#\# pip install pydotplus \newline \newline \# Create classifier \newline tree = \seqsplit{DecisionTreeClassifier(criterion='gini'}, splitter='best', max\_depth=None, min\_samples\_leaf=, random\_state=, ...) \newline \newline \# Fit and predict right away... \newline tree.fit(X\_train, y\_train) \newline pred\_holdout = tree.predict(X\_holdout) \newline \newline \# ... or use cross-validation for parameter tuning \newline tree\_params = \{'max\_depth': range(1, 11), \newline 'max\_features': range(4, 19)\} \newline tree\_grid = GridSearchCV(tree, tree\_params, cv=5, n\_jobs=-1, verbose=True) \newline tree\_grid.fit(X\_train, y\_train) \newline pred\_holdout = \seqsplit{tree\_grid.predict(X\_holdout)} \newline \newline \# Check accuracy and model parameters \newline accuracy\_score(y\_holdout, pred\_holdout) \newline tree\_grid.best\_params\_, tree\_grid.best\_score\_ \newline \newline \# Export Decision Tree as png \newline tree\_str = export\_graphviz(tree, feature\_names={[}{\emph{feature\_names}}{]}, filled=True, out\_file=None) \newline graph = \seqsplit{pydotplus.graph\_from\_dot\_data(tree\_str)} \newline graph.write\_png({\emph{file}})} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
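\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Decision Trees: plotting without graphviz (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet: since scikit-learn 0.21, \seqsplit{sklearn.tree.plot\_tree} draws a fitted tree with matplotlib alone, so the pydotplus/graphviz route above becomes optional. It assumes the fitted tree and a feature\_names list from the block above.}} \newline \newline from sklearn.tree import plot\_tree \newline import matplotlib.pyplot as plt \newline \newline plt.figure(figsize=(12, 8)) \newline \# same options as export\_graphviz: feature names and colour fill \newline plot\_tree(tree, \seqsplit{feature\_names=feature\_names}, filled=True) \newline plt.savefig('tree.png')} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}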
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Unsupervised Learning}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\# KMeans \newline import numpy as np \newline from sklearn import metrics \newline from sklearn.cluster import KMeans \newline kmeans = KMeans(n\_clusters=k, random\_state=1) \newline kmeans.fit(X) \newline kmeans.labels\_ \newline \newline \#\# elbow method to choose the number of clusters \newline inertia = {[}{]} \newline for k in range(1, 8): \newline kmeans = KMeans(n\_clusters=k, random\_state=1).fit(X) \newline \seqsplit{inertia.append(np.sqrt(kmeans.inertia\_))} \newline \newline \# Accuracy measures \newline \newline \#\# Need the true labels; adjusted for chance \newline \#\#\# ARI \newline ari = \seqsplit{metrics.adjusted\_rand\_score(true\_labels}, predicted\_labels) \newline \#\#\# AMI \newline ami = \seqsplit{metrics.adjusted\_mutual\_info\_score(true\_labels}, predicted\_labels, \seqsplit{average\_method='arithmetic')} \newline \newline \#\# Need the true labels; not adjusted for chance \newline \#\#\# Homogeneity \newline h = \seqsplit{metrics.homogeneity\_score(y}, algo.labels\_) \newline \#\#\# Completeness \newline c = \seqsplit{metrics.completeness\_score(y}, algo.labels\_) \newline \#\#\# V-measure \newline v = \seqsplit{metrics.v\_measure\_score(y}, algo.labels\_) \newline \newline \#\# No true labels needed \newline \#\#\# Silhouette \newline s = \seqsplit{metrics.silhouette\_score(X}, algo.labels\_)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
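\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{KMeans: plotting the elbow (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet: plots the inertia list computed in the elbow loop above; the number of clusters is read off where the curve stops dropping sharply.}} \newline \newline import matplotlib.pyplot as plt \newline \newline plt.plot(range(1, 8), inertia, marker='s') \newline plt.xlabel('number of clusters k') \newline \seqsplit{plt.ylabel('sqrt(inertia)')} \newline plt.show() \newline \#\# pick k at the ``elbow'' of the curve} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}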
\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Subsampling, cross-validation, pipelines}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{(Not yet written; see the sketch below.)} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
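\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Subsampling, cross-validation, pipelines (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet, of the standard idioms the empty section above refers to; X and y are assumed to be the usual feature matrix and labels.}} \newline \newline from sklearn.model\_selection import train\_test\_split, cross\_val\_score \newline from sklearn.pipeline import make\_pipeline \newline from sklearn.preprocessing import StandardScaler \newline from sklearn.linear\_model import LogisticRegression \newline \newline \# Hold out 30\% of the data, stratified on the labels \newline X\_train, X\_holdout, y\_train, y\_holdout = \seqsplit{train\_test\_split(X}, y, test\_size=0.3, stratify=y, random\_state=17) \newline \newline \# Chain scaling and model so CV folds never leak scaling statistics \newline pipe = \seqsplit{make\_pipeline(StandardScaler()}, \seqsplit{LogisticRegression(solver='lbfgs'))} \newline scores = \seqsplit{cross\_val\_score(pipe}, X\_train, y\_train, cv=5) \newline scores.mean(), scores.std()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}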
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{k Nearest Neighbors}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{from sklearn.neighbors import KNeighborsClassifier \newline from sklearn.preprocessing import StandardScaler \newline from sklearn.pipeline import Pipeline \newline from sklearn.model\_selection import GridSearchCV, cross\_val\_score \newline from sklearn.metrics import accuracy\_score \newline \newline \# Create classifier \newline knn = \seqsplit{KNeighborsClassifier(n\_neighbors=10)} \newline \newline \# Scale the features \newline scaler = StandardScaler() \newline X\_train\_scaled = \seqsplit{scaler.fit\_transform(X\_train)} \newline X\_holdout\_scaled = \seqsplit{scaler.transform(X\_holdout)} \newline \newline \# Fit and predict right away... \newline knn.fit(X\_train\_scaled, y\_train) \newline knn\_pred = \seqsplit{knn.predict(X\_holdout\_scaled)} \newline \newline \# ... or use cross-validation to tune the parameter \newline knn\_pipe = Pipeline({[}('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n\_jobs=-1)){]}) \newline knn\_params = \{'knn\_\_n\_neighbors': range(1, 10)\} \newline knn\_grid = GridSearchCV(knn\_pipe, knn\_params, cv=5, n\_jobs=-1, verbose=True) \newline knn\_grid.fit(X\_train, y\_train) \newline knn\_pred = \seqsplit{knn\_grid.predict(X\_holdout)} \#\# raw holdout: the pipeline scales internally \newline \newline \# Check accuracy and model parameters \newline accuracy\_score(y\_holdout, knn\_pred) \newline knn\_grid.best\_params\_, knn\_grid.best\_score\_} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Random Forests}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{from sklearn.ensemble import \seqsplit{RandomForestRegressor/Classifier} \newline \# if overfitting is a big problem, consider \seqsplit{ExtraTreesRegressor/Classifier} \newline from sklearn.model\_selection import cross\_val\_score, StratifiedKFold, GridSearchCV \newline from sklearn.metrics import accuracy\_score \newline import matplotlib.pyplot as plt \newline import numpy as np \newline \newline \# Create regressor/classifier \newline rf = \seqsplit{RandomForestRegressor(n\_estimators=100}, criterion='mse', max\_features=, min\_samples\_leaf=, max\_depth=, n\_jobs=-1, random\_state=42, oob\_score=True) \newline \#\# criterion: 'mse' for regression; 'gini' or 'entropy' for classification \newline \#\# n\_estimators: the number of trees in the forest \newline \#\# max\_features: number of features considered at each split; rule of thumb is sqrt(p) for classification, p/3 for regression \newline \#\# min\_samples\_leaf: minimal number of samples in a leaf; rule of thumb is 1 for classification, 5 for regression \newline \newline \# Fit and predict right away... \newline rf.fit(X\_train, y\_train) \newline y\_pred = rf.predict(X\_test) \newline \newline \# ... or use cross-validation for parameter tuning \newline skf = \seqsplit{StratifiedKFold(n\_splits=5}, shuffle=True, random\_state=42) \newline parameters = \{'max\_features': {[}4, 7, 10, 13{]}, 'min\_samples\_leaf': {[}1, 3, 5, 7{]}, 'max\_depth': {[}5, 10, 15, 20{]}\} \newline rfc = \seqsplit{RandomForestClassifier(n\_estimators=100}, random\_state=42, n\_jobs=-1, oob\_score=True) \newline gcv = GridSearchCV(rfc, parameters, n\_jobs=-1, cv=skf, verbose=1) \newline gcv.fit(X\_train, y\_train) \newline \#\# for n\_estimators, also consider plotting the test score as a function of the number of trees in {[}0, 1000{]} \newline \newline \# Check accuracy and model parameters \newline results = gcv.score(X\_test, y\_test) \newline gcv.best\_estimator\_, gcv.best\_score\_ \newline \newline \# Determine variable importance \newline importances = \seqsplit{rf.feature\_importances\_} \newline indices = np.argsort(importances){[}::-1{]} \newline num\_to\_plot = 10 \newline \#\# don't forget to link indices back to the variable names \newline bars = \seqsplit{plt.bar(range(num\_to\_plot)}, importances{[}indices{[}:num\_to\_plot{]}{]}, align="center")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{A word about bootstrap and bagging}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{(Not yet written; see the sketch below.)} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
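\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Bootstrap and bagging (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet, for the empty section above: bagging fits copies of a base estimator on bootstrap samples (drawn with replacement) and averages their predictions; BaggingClassifier is the generic scikit-learn wrapper. X\_train and y\_train are assumed.}} \newline \newline from sklearn.ensemble import BaggingClassifier \newline from sklearn.tree import DecisionTreeClassifier \newline \newline \# 100 trees, each fitted on a bootstrap sample of the training set \newline bag = \seqsplit{BaggingClassifier(DecisionTreeClassifier()}, n\_estimators=100, oob\_score=True, n\_jobs=-1, random\_state=42) \newline bag.fit(X\_train, y\_train) \newline bag.oob\_score\_ \#\# out-of-bag estimate of generalization accuracy} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}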
% That's all folks
\end{multicols*}
\end{document}