% Cheat sheet "test_ml": preamble, page header/footer and the first six
% content tables (problem types, regression, classification, clustering,
% general modelling steps, popular tools).
% The body is typeset in three columns; every content block is a tabularx
% table of width 5.377cm with a DarkBackground title row.
\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{lulu\_0012}
% \string_ puts a literal underscore into the PDF metadata strings;
% \_ is a typesetting command and does not belong inside \pdfinfo.
\pdfinfo{
  /Title (test-ml.pdf)
  /Creator (Cheatography)
  /Author (lulu\string_0012)
  /Subject (test\string_ml Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% and colours the current row with it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<cols>}{<column spec>}{<text>}: \multicolumn whose cell is
% filled with the colour last set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{0F45A3}
\definecolor{LightBackground}{HTML}{F0F3F9}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo in the first column, sheet title and author next to it.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{test\_ml Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{lulu\_0012} via \textcolor{DarkBackground}{\uline{cheatography.com/35807/cs/11266/}}
\end{tabulary}
\end{multicols}}
% Footer: author, publication/page information and sponsor, one per column.
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}lulu\_0012 \\
    \uline{cheatography.com/lulu-0012} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Published 27th March, 2017.\\
    Updated 30th April, 2017.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}
% ---- Three Common Types of Problem ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Three Common Types of Problem}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Regression} To find the relationship between a dependent variable
    and many independent variables} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Classification} To classify an observation to one of the
    several known categories} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Clustering} To group a set of objects into several unknown clusters} \tn
% Row Count 6 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Regression ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Regression}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/lulu-0012_1490640627_Linear_regression.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
% R^2^ in the sheet markup denotes a superscript, hence \textsuperscript.
\mymulticolumn{1}{x{5.377cm}}{\{\{fa-rocket\}\} \textbf{Model evaluation methods}: \newline
    R\textsuperscript{2}, Adjusted R\textsuperscript{2},
    MAE \emph{(mean absolute error)},
    MSE \emph{(mean square error)},
    RMSE \emph{(root mean square error)},
    AIC \emph{(Akaike information criterion)},
    BIC \emph{(Bayesian information criterion)},
    Residual analysis, Goodness-of-fit test, Cross validation} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Classification ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Classification}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/lulu-0012_1490639284_ml_classification.jpg}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\{\{fa-rocket\}\} \textbf{Model evaluation methods} \newline
    Accuracy, Confusion matrix, Sensitivity and specificity,
    ROC \emph{(receiver operating characteristic)},
    AUC \emph{(area under the curve)},
    Cross validation} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Clustering ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Clustering}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/lulu-0012_1490576647_220px-Cluster-2.svg.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\{\{fa-rocket\}\} \textbf{Model evaluation methods} \newline
    Models can be externally evaluated using data that are not used for clustering
    but with known class labels} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- General steps to build a model ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-key\}\} General steps to build a model}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{1. Collecting the data.} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{2. Preparing the data and fixing issues such as missing values and outliers.} \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{3. Use exploratory analysis to help study the content of your data
    and select a proper algorithm that suits your need.} \tn
% Row Count 6 (+ 3)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{4. Training a model using the algorithm you just selected.
    Start with a simple model that only uses the most important variables/features.} \tn
% Row Count 9 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{5. Check model performance using the evaluation methods.} \tn
% Row Count 11 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{6. If the model is not satisfactory, choose another algorithm
    or introduce different variables into the existing model.} \tn
% Row Count 14 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Popular tools of implementation ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-key\}\} Popular tools of implementation}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{R} ML libraries including \emph{stats}, \emph{glmnet}, \emph{caret}} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Python} popular packages for ML including
    \emph{scikit-learn}, \emph{statsmodels}} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Alteryx Designer} `drag-n-drop' and requires minimum coding} \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Microsoft Azure Machine Learning Studio} `drag-n-drop'
    and requires minimum coding} \tn
% Row Count 8 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
% Algorithm summary tables (one two-column tabularx per algorithm: learning
% style, problem type, use case, then a full-width description row),
% followed by two placeholder tables, the References table and the end of
% the document.

% ---- Linear Regression ----
\begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} Linear Regression}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\textbf{Problem} & Regression \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Revenue prediction \tn
% Row Count 3 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{Widely used for predicting numeric values (or quantities).
    It trains and predicts fast, but can be prone to overfitting
    so proper feature selection is often needed.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Logistic regression ----
\begin{tabularx}{5.377cm}{x{2.09034 cm} x{2.88666 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} Logistic regression}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\textbf{Problem} & Classification \tn
% Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Customer churn prediction \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
% The predictors are the independent variables; the binary outcome is the
% dependent one.
\mymulticolumn{2}{x{5.377cm}}{A generalized linear model with dependent variable being binary (0-1).
    Mostly used to predict whether an event is going to occur
    based on the independent variables.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Decision Tree ----
\begin{tabularx}{5.377cm}{x{2.09034 cm} x{2.88666 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} Decision Tree}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\textbf{Problem} & \seqsplit{Classification/Regression} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Targeted advertising \tn
% Row Count 5 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{It requires little data preparation and can handle both numeric
    and categorical data. Easy to interpret and visualize
    but susceptible to overfitting.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Random Forest ----
\begin{tabularx}{5.377cm}{x{1.9908 cm} x{2.9862 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} Random Forest}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\textbf{Problem} & \seqsplit{Classification/Regression} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Credit card fraud detection \tn
% Row Count 6 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{An ensemble method that combines many decision trees together.
    It has all pros that a basic decision tree has, can handle many features
    and usually has high accuracy.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- K-means ----
\begin{tabularx}{5.377cm}{x{2.23965 cm} x{2.73735 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} K-means}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Unsupervised \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\textbf{Problem} & Clustering \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Customer segmentation \tn
% Row Count 3 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{This method groups objects into \emph{k} clusters.
    The goal is to have the objects in one cluster more similar to each other
    than to any object in other clusters.
    When \emph{k} is not pre-determined, many methods can be used to find
    a good value of \emph{k}, such as the elbow method and silhouette method.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Placeholder table titled "pagebreak" (kept exactly as generated) ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{pagebreak}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Naive Bayes ----
\begin{tabularx}{5.377cm}{x{2.33919 cm} x{2.63781 cm} }
\SetRowColor{DarkBackground}
% \"{\i} instead of a raw non-ASCII character: inputenc is not loaded in the
% preamble, so the accent is written with an ASCII-only LaTeX command.
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} Na\"{\i}ve Bayes}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\textbf{Problem} & Classification \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Email spam filtering \tn
% Row Count 3 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{A conditional probability model that assumes all features are
    conditionally independent of each other.
    Trains and predicts fast but the precision is low for small datasets
    and can suffer from `zero-frequency' problem.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- K-nearest Neighbors (KNN) ----
\begin{tabularx}{5.377cm}{x{2.09034 cm} x{2.88666 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} K-nearest Neighbors (KNN)}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\textbf{Problem} & Classification \tn
% Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Bank credit risk analysis \tn
% Row Count 5 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{A lazy learning algorithm that doesn't require much in training,
    but can be slow in prediction if you have a large data set.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Support Vector Machine (SVM) ----
\begin{tabularx}{5.377cm}{x{2.09034 cm} x{2.88666 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{5.377cm}}{\bfseries\textcolor{white}{\{\{fa-plug\}\} Support Vector Machine (SVM)}} \tn
% Row 0
\SetRowColor{LightBackground}
\textbf{Learning style} & Supervised \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\textbf{Problem} & \seqsplit{Classification/Regression} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\textbf{Use case} & Text classification \tn
% Row Count 5 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{5.377cm}}{It uses some kernel function to map data points to a higher
    dimensional space and find a hyperplane to divide these points in that space.
    Ideal for very large data set with high dimensions,
    or if you know the decision boundary is not linear.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Placeholder table titled "column break here" (kept exactly as generated) ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{column break here}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{} \tn
% Row Count 0 (+ 0)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- References ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{References}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{Basics of machine learning \newline % Row Count 1 (+ 1)
\seqsplit{https://www.analyticsvidhya.com/blog/2015/06/machine-learning-basics/} \newline % Row Count 3 (+ 2)
A practical guide to exploratory analysis \newline % Row Count 4 (+ 1)
\seqsplit{https://www.analyticsvidhya.com/blog/2016/01/guide-data-exploration/} \newline % Row Count 6 (+ 2)
A cheat sheet of the libraries/modules of each algorithm in Python/R \newline % Row Count 8 (+ 2)
\seqsplit{http://www.dummies.com/programming/big-data/data-science/machine-learning-dummies-cheat-sheet/} \newline % Row Count 10 (+ 2)
A cheat sheet for using Microsoft Azure Machine Learning Studio \newline % Row Count 12 (+ 2)
\seqsplit{https://docs.microsoft.com/en-us/azure/machine-learning/machine-learning-algorithm-cheat-sheet} \newline % Row Count 14 (+ 2)
Tool sheet of Alteryx Designer \newline % Row Count 15 (+ 1)
\seqsplit{http://www.alteryx.com/sites/default/files/alteryx-designer-tools-sheet\_0.pdf}% Row Count 17 (+ 2)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}