\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{Anoikis}
\pdfinfo{
  /Title (scikit-learn.pdf)
  /Creator (Cheatography)
  /Author (Anoikis)
  /Subject (scikit-learn Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}   % Space between columns
\setlength{\headsep}{-12pt}     % Reduce space between header and content
\setlength{\headheight}{85pt}   % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{AAAAAA}
\definecolor{LightBackground}{HTML}{F4F4F4}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{scikit-learn Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{Anoikis} via \textcolor{DarkBackground}{\uline{cheatography.com/74320/cs/18910/}}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}Anoikis \\
  \uline{cheatography.com/anoikis} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 25th November, 2019.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Logistic regression}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{import numpy as np \newline from sklearn.linear\_model import LogisticRegression, LogisticRegressionCV \newline from sklearn.pipeline import make\_pipeline \newline from sklearn.model\_selection import StratifiedKFold, GridSearchCV \newline from sklearn.preprocessing import PolynomialFeatures \newline from \seqsplit{sklearn.feature\_extraction.text} import CountVectorizer \newline \newline \# Create classifier \newline logit = \seqsplit{LogisticRegression(solver='lbfgs'}, n\_jobs=-1, random\_state=7) \newline \newline \# Fit and predict right away... \newline logit.fit(X\_train, y\_train) \newline logit.score(X\_test, y\_test) \newline \newline \# ... or use cross-validation for parameter tuning \newline \#\# solution 1 \newline skf = \seqsplit{StratifiedKFold(n\_splits=5}, shuffle=True, random\_state=17) \newline c\_values = np.logspace(-2, 3, 500) \newline grid\_logit = \seqsplit{LogisticRegressionCV(Cs=c\_values}, cv=skf, verbose=1, n\_jobs=-1) \newline grid\_logit.fit(X\_poly, y) \#\# X\_poly built below \newline \#\# solution 2: grid search over a pipeline (text\_pipe\_logit is built below) \newline param\_grid\_logit = \{'logisticregression\_\_C': np.logspace(-5, 0, 6)\} \newline grid\_logit = \seqsplit{GridSearchCV(text\_pipe\_logit}, param\_grid\_logit, \seqsplit{return\_train\_score=True}, cv=3, n\_jobs=-1) \newline grid\_logit.fit(text\_train, y\_train) \newline \newline \# Check accuracy and model parameters \newline logit.score(X\_test, y\_test) \newline grid\_logit.best\_params\_, \seqsplit{grid\_logit.best\_score\_} \newline \newline \# Add polynomial features in case of non-linear boundaries \newline poly = \seqsplit{PolynomialFeatures(degree=2)} \#\# try degrees 1 to 7 \newline X\_poly = poly.fit\_transform(X) \newline \newline \# Special case of text processing \newline \#\# the estimator's n\_jobs \textgreater{} 1 won't work together with GridSearchCV's n\_jobs \textgreater{} 1, hence n\_jobs=1 here \newline text\_pipe\_logit = \seqsplit{make\_pipeline(CountVectorizer()}, \seqsplit{LogisticRegression(solver='lbfgs'}, n\_jobs=1, random\_state=7)) \newline \#\# then run the usual GridSearchCV on text\_pipe\_logit} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Decision Trees}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{from sklearn.tree import \seqsplit{DecisionTreeClassifier/Regressor} \newline from sklearn.model\_selection import GridSearchCV, cross\_val\_score \newline from sklearn.metrics import accuracy\_score \newline from sklearn.tree import export\_graphviz \newline import pydotplus \#\# pip install pydotplus \newline \newline \# Create classifier \newline tree = \seqsplit{DecisionTreeClassifier(criterion='gini'}, splitter='best', max\_depth=None, min\_samples\_leaf=, random\_state=, ...) \newline \newline \# Fit and predict right away... \newline tree.fit(X\_train, y\_train) \newline pred\_holdout = tree.predict(X\_holdout) \newline \newline \# ... or use cross-validation for parameter tuning \newline tree\_params = \{'max\_depth': range(1, 11), \newline 'max\_features': range(4, 19)\} \newline tree\_grid = GridSearchCV(tree, tree\_params, cv=5, n\_jobs=-1, verbose=True) \newline tree\_grid.fit(X\_train, y\_train) \newline pred\_holdout = \seqsplit{tree\_grid.predict(X\_holdout)} \newline \newline \# Check accuracy and model parameters \newline accuracy\_score(y\_holdout, pred\_holdout) \newline tree\_grid.best\_params\_, tree\_grid.best\_score\_ \newline \newline \# Export Decision Tree as png \newline tree\_str = export\_graphviz(tree, feature\_names={[}{\emph{feature\_names}}{]}, filled=True, out\_file=None) \newline graph = \seqsplit{pydotplus.graph\_from\_dot\_data(tree\_str)} \newline graph.write\_png({\emph{file}})} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
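\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Decision Trees: plotting without graphviz (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet: since scikit-learn 0.21, \seqsplit{sklearn.tree.plot\_tree} draws a fitted tree with matplotlib alone, so the pydotplus/graphviz route above becomes optional. It assumes the fitted tree and a feature\_names list from the block above.}} \newline \newline from sklearn.tree import plot\_tree \newline import matplotlib.pyplot as plt \newline \newline plt.figure(figsize=(12, 8)) \newline \# same options as export\_graphviz: feature names and colour fill \newline plot\_tree(tree, \seqsplit{feature\_names=feature\_names}, filled=True) \newline plt.savefig('tree.png')} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}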
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Unsupervised Learning}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\# KMeans \newline import numpy as np \newline from sklearn import metrics \newline from sklearn.cluster import KMeans \newline kmeans = KMeans(n\_clusters=k, random\_state=1) \newline kmeans.fit(X) \newline kmeans.labels\_ \newline \newline \#\# elbow method to choose the number of clusters \newline inertia = {[}{]} \newline for k in range(1, 8): \newline kmeans = KMeans(n\_clusters=k, random\_state=1).fit(X) \newline \seqsplit{inertia.append(np.sqrt(kmeans.inertia\_))} \newline \newline \# Accuracy measures \newline \newline \#\# Need the true labels; adjusted for chance \newline \#\#\# ARI \newline ari = \seqsplit{metrics.adjusted\_rand\_score(true\_labels}, predicted\_labels) \newline \#\#\# AMI \newline ami = \seqsplit{metrics.adjusted\_mutual\_info\_score(true\_labels}, predicted\_labels, \seqsplit{average\_method='arithmetic')} \newline \newline \#\# Need the true labels; not adjusted for chance \newline \#\#\# Homogeneity \newline h = \seqsplit{metrics.homogeneity\_score(y}, algo.labels\_) \newline \#\#\# Completeness \newline c = \seqsplit{metrics.completeness\_score(y}, algo.labels\_) \newline \#\#\# V-measure \newline v = \seqsplit{metrics.v\_measure\_score(y}, algo.labels\_) \newline \newline \#\# No true labels needed \newline \#\#\# Silhouette \newline s = \seqsplit{metrics.silhouette\_score(X}, algo.labels\_)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
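\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{KMeans: plotting the elbow (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet: plots the inertia list computed in the elbow loop above; the number of clusters is read off where the curve stops dropping sharply.}} \newline \newline import matplotlib.pyplot as plt \newline \newline plt.plot(range(1, 8), inertia, marker='s') \newline plt.xlabel('number of clusters k') \newline \seqsplit{plt.ylabel('sqrt(inertia)')} \newline plt.show() \newline \#\# pick k at the ``elbow'' of the curve} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}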
\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Subsampling, cross-validation, pipelines}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{(Not yet written; see the sketch below.)} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
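\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Subsampling, cross-validation, pipelines (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet, of the standard idioms the empty section above refers to; X and y are assumed to be the usual feature matrix and labels.}} \newline \newline from sklearn.model\_selection import train\_test\_split, cross\_val\_score \newline from sklearn.pipeline import make\_pipeline \newline from sklearn.preprocessing import StandardScaler \newline from sklearn.linear\_model import LogisticRegression \newline \newline \# Hold out 30\% of the data, stratified on the labels \newline X\_train, X\_holdout, y\_train, y\_holdout = \seqsplit{train\_test\_split(X}, y, test\_size=0.3, stratify=y, random\_state=17) \newline \newline \# Chain scaling and model so CV folds never leak scaling statistics \newline pipe = \seqsplit{make\_pipeline(StandardScaler()}, \seqsplit{LogisticRegression(solver='lbfgs'))} \newline scores = \seqsplit{cross\_val\_score(pipe}, X\_train, y\_train, cv=5) \newline scores.mean(), scores.std()} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}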
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{k Nearest Neighbors}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{from sklearn.neighbors import KNeighborsClassifier \newline from sklearn.preprocessing import StandardScaler \newline from sklearn.pipeline import Pipeline \newline from sklearn.model\_selection import GridSearchCV, cross\_val\_score \newline from sklearn.metrics import accuracy\_score \newline \newline \# Create classifier \newline knn = \seqsplit{KNeighborsClassifier(n\_neighbors=10)} \newline \newline \# Scale the features \newline scaler = StandardScaler() \newline X\_train\_scaled = \seqsplit{scaler.fit\_transform(X\_train)} \newline X\_holdout\_scaled = \seqsplit{scaler.transform(X\_holdout)} \newline \newline \# Fit and predict right away... \newline knn.fit(X\_train\_scaled, y\_train) \newline knn\_pred = \seqsplit{knn.predict(X\_holdout\_scaled)} \newline \newline \# ... or use cross-validation to tune the parameter \newline knn\_pipe = Pipeline({[}('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n\_jobs=-1)){]}) \newline knn\_params = \{'knn\_\_n\_neighbors': range(1, 10)\} \newline knn\_grid = GridSearchCV(knn\_pipe, knn\_params, cv=5, n\_jobs=-1, verbose=True) \newline knn\_grid.fit(X\_train, y\_train) \newline knn\_pred = \seqsplit{knn\_grid.predict(X\_holdout)} \#\# raw holdout: the pipeline scales internally \newline \newline \# Check accuracy and model parameters \newline accuracy\_score(y\_holdout, knn\_pred) \newline knn\_grid.best\_params\_, knn\_grid.best\_score\_} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Random Forests}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{from sklearn.ensemble import \seqsplit{RandomForestRegressor/Classifier} \newline \# if overfitting is a big problem, consider \seqsplit{ExtraTreesRegressor/Classifier} \newline from sklearn.model\_selection import cross\_val\_score, StratifiedKFold, GridSearchCV \newline from sklearn.metrics import accuracy\_score \newline import matplotlib.pyplot as plt \newline import numpy as np \newline \newline \# Create regressor/classifier \newline rf = \seqsplit{RandomForestRegressor(n\_estimators=100}, criterion='mse', max\_features=, min\_samples\_leaf=, max\_depth=, n\_jobs=-1, random\_state=42, oob\_score=True) \newline \#\# criterion: 'mse' for regression; 'gini' or 'entropy' for classification \newline \#\# n\_estimators: the number of trees in the forest \newline \#\# max\_features: number of features considered at each split; rule of thumb is sqrt(p) for classification, p/3 for regression \newline \#\# min\_samples\_leaf: minimal number of samples in a leaf; rule of thumb is 1 for classification, 5 for regression \newline \newline \# Fit and predict right away... \newline rf.fit(X\_train, y\_train) \newline y\_pred = rf.predict(X\_test) \newline \newline \# ... or use cross-validation for parameter tuning \newline skf = \seqsplit{StratifiedKFold(n\_splits=5}, shuffle=True, random\_state=42) \newline parameters = \{'max\_features': {[}4, 7, 10, 13{]}, 'min\_samples\_leaf': {[}1, 3, 5, 7{]}, 'max\_depth': {[}5, 10, 15, 20{]}\} \newline rfc = \seqsplit{RandomForestClassifier(n\_estimators=100}, random\_state=42, n\_jobs=-1, oob\_score=True) \newline gcv = GridSearchCV(rfc, parameters, n\_jobs=-1, cv=skf, verbose=1) \newline gcv.fit(X\_train, y\_train) \newline \#\# for n\_estimators, also consider plotting the test score as a function of the number of trees in {[}0, 1000{]} \newline \newline \# Check accuracy and model parameters \newline results = gcv.score(X\_test, y\_test) \newline gcv.best\_estimator\_, gcv.best\_score\_ \newline \newline \# Determine variable importance \newline importances = \seqsplit{rf.feature\_importances\_} \newline indices = np.argsort(importances){[}::-1{]} \newline num\_to\_plot = 10 \newline \#\# don't forget to link indices back to the variable names \newline bars = \seqsplit{plt.bar(range(num\_to\_plot)}, importances{[}indices{[}:num\_to\_plot{]}{]}, align="center")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{p{0.8 cm} p{0.8 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{A word about bootstrap and bagging}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{(Not yet written; see the sketch below.)} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
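\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Bootstrap and bagging (sketch)}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{A minimal sketch, not from the original sheet, for the empty section above: bagging fits copies of a base estimator on bootstrap samples (drawn with replacement) and averages their predictions; BaggingClassifier is the generic scikit-learn wrapper. X\_train and y\_train are assumed.}} \newline \newline from sklearn.ensemble import BaggingClassifier \newline from sklearn.tree import DecisionTreeClassifier \newline \newline \# 100 trees, each fitted on a bootstrap sample of the training set \newline bag = \seqsplit{BaggingClassifier(DecisionTreeClassifier()}, n\_estimators=100, oob\_score=True, n\_jobs=-1, random\_state=42) \newline bag.fit(X\_train, y\_train) \newline bag.oob\_score\_ \#\# out-of-bag estimate of generalization accuracy} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}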
% That's all folks
\end{multicols*}
\end{document}