\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}   % For header and footer
\usepackage{multicol}   % Allows multicols in tables
\usepackage{tabularx}   % Intelligent column widths
\usepackage{tabulary}   % Used in header and footer
\usepackage{hhline}     % Border under tables
\usepackage{graphicx}   % For images
\usepackage{xcolor}     % For hex colours
%\usepackage[utf8x]{inputenc}  % For unicode character support
\usepackage[T1]{fontenc}      % Without this we get weird character replacements
\usepackage{colortbl}   % For coloured tables
\usepackage{setspace}   % For line height
\usepackage{lastpage}   % Needed for total page number
\usepackage{seqsplit}   % Splits long words.
%\usepackage{opensans}  % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}  % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}    % Symbols
\usepackage{MnSymbol}   % Symbols
\usepackage{wasysym}    % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}  % Languages

% Document Info
\author{DarioPittera (aggialavura)}
\pdfinfo{
  /Title (python-k-nearest-neighbors-knn-model.pdf)
  /Creator (Cheatography)
  /Author (DarioPittera (aggialavura))
  /Subject (Python - K-Nearest\_Neighbors(KNN) model Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}  % Space between columns
\setlength{\headsep}{-12pt}    % Reduce space between header and content
\setlength{\headheight}{85pt}  % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt}  % Remove footer line
\renewcommand{\headrulewidth}{0pt}  % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi}  % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}}  % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}}  % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}}  % New column type for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline}  % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{6C6FA3}
\definecolor{LightBackground}{HTML}{F5F6F9}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{}  % Set header to blank
\fancyfoot{}  % Set footer to blank
\fancyhead[L]{
\noindent \begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
  \SetRowColor{DarkBackground}
  \vspace{-7pt}
  {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
    \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
  }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
  \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Python - K-Nearest\_Neighbors(KNN) model Cheat Sheet}}}} \\
  \normalsize{by \textcolor{DarkBackground}{DarioPittera (aggialavura)} via \textcolor{DarkBackground}{\uline{cheatography.com/83764/cs/19947/}}}
\end{tabulary}
\end{multicols}}
\fancyfoot[L]{
\footnotesize
\noindent \begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
\mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}DarioPittera (aggialavura) \\
  \uline{cheatography.com/aggialavura} \\
  \uline{\seqsplit{www}.dariopittera.com}
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 27th June, 2019.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{TO START}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\# IMPORT DATA LIBRARIES \newline import numpy as np \newline import pandas as pd \newline \newline \# IMPORT VIS LIBRARIES \newline import seaborn as sns \newline import matplotlib.pyplot as plt \newline \%matplotlib inline \newline \newline \# IMPORT MODELLING LIBRARIES \newline from sklearn.preprocessing import StandardScaler \newline from sklearn.model\_selection import train\_test\_split \newline from sklearn.neighbors import KNeighborsClassifier \newline from sklearn.metrics import classification\_report,confusion\_matrix}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{6.08 cm} x{1.92 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{PRELIMINARY OPERATIONS}}  \tn
% Row 0
\SetRowColor{LightBackground}
df = pd.read\_csv('data.csv') & read data \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{{\bf{STANDARDISE THE VARIABLES}}} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{scaler = StandardScaler()} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{scaler.fit(df.drop('y',axis=1))} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{scaled\_feat = \seqsplit{scaler.transform(df.drop('y'},axis=1))} \tn
% Row Count 6 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{\seqsplit{df\_new=pd.DataFrame(scaled\_feat},columns=df.columns{[}:-1{]})*} \tn
% Row Count 8 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{df.columns{[}:-1{]}}}: take all the columns but the last one.}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4.64 cm} x{3.36 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{TRAIN MODEL}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{CREATE X and y}}} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
X = df{[}{[}'col1','col2',etc.{]}{]} & create df features \tn
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
y = df{[}'col'{]} & create df var to predict \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{{\bf{SPLIT DATASET}}} \tn
% Row Count 6 (+ 1)
% Row 4
\SetRowColor{LightBackground}
X\_train, X\_test, y\_train, y\_test = \newline train\_test\_split(\newline ~~X,\newline ~~y,\newline ~~test\_size=0.3) & split df into train and test sets \tn
% Row Count 12 (+ 6)
% Row 5
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{{\bf{FIT THE MODEL}}} \tn
% Row Count 13 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{knn = \seqsplit{KNeighborsClassifier(n\_neighbors=1)*}} \tn
% Row Count 14 (+ 1)
% Row 7
\SetRowColor{white}
knn.fit(X\_train,y\_train) & train/fit the model \tn
% Row Count 16 (+ 2)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{MAKE PREDICTIONS}}} \tn
% Row Count 17 (+ 1)
% Row 9
\SetRowColor{white}
pred = knn.predict(X\_test) & make predictions \tn
% Row Count 19 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{n\_neighbors=1}}: we start by specifying K = 1 and then see how to choose a better K value (see the evaluation block in this cheat sheet). A full worked sketch of these steps follows below.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
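\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{WORKED EXAMPLE (SKETCH)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{A minimal end-to-end sketch chaining the blocks above, reusing the placeholder file 'data.csv' and label column 'y'. It assumes the imports from TO START have been run; here the scaled array is passed straight to train\_test\_split instead of building df\_new first. \newline \newline \# read data and standardise the features \newline df = pd.read\_csv('data.csv') \newline scaler = StandardScaler() \newline scaler.fit(df.drop('y',axis=1)) \newline scaled\_feat = \seqsplit{scaler.transform(df.drop('y'},axis=1)) \newline \newline \# split, fit with K = 1, predict \newline X\_train, X\_test, y\_train, y\_test = \newline ~~train\_test\_split(scaled\_feat, df{[}'y'{]}, \newline ~~~~~~test\_size=0.3) \newline knn = KNeighborsClassifier(n\_neighbors=1) \newline knn.fit(X\_train,y\_train) \newline pred = knn.predict(X\_test)}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}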
\begin{tabularx}{8.4cm}{x{3.52 cm} x{4.48 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{EVALUATION of the MODEL}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{EVALUATE MODEL}}} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{\seqsplit{print(confusion\_matrix(y\_test},pred))} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{\seqsplit{print(classification\_report(y\_test},pred))} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{{\bf{CHOOSING A BETTER K}}} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
error\_rate = {[}{]}* & create an empty list \tn
% Row Count 5 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{for i in range(1,40): \newline ~knn = KNeighborsClassifier(n\_neighbors=i) \newline ~knn.fit(X\_train,y\_train) \newline ~pred\_i = knn.predict(X\_test) \newline ~error\_rate.append(np.mean(pred\_i != y\_test))} \tn
% Row Count 10 (+ 5)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{ELBOW PLOT}}} \tn
% Row Count 11 (+ 1)
% Row 7
\SetRowColor{white}
\mymulticolumn{2}{x{8.4cm}}{plt.figure(figsize=(10,6)) \newline plt.plot(range(1,40),error\_rate) \newline plt.title('Error Rate vs. K Value') \newline plt.xlabel('K') \newline plt.ylabel('Error Rate')} \tn
% Row Count 15 (+ 4)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{Now we choose the K value where the error rate starts to drop and flatten out, and we repeat the model fitting and evaluation! This should give you better results.} \tn
% Row Count 19 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{{\bf{Explanation}}: \newline 1. we create an empty list. \newline 2. we loop over a range of possible K values, here 1 to 39. \newline 3. we create and fit a KNN model for each of these K values. \newline 4. we make predictions with each of these models. \newline 5. we calculate the mean error of each model and store it in the empty list from step 1. \newline We will then plot these errors to see which K value could be the best; a refit sketch follows below.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}
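\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{REFIT WITH THE CHOSEN K (SKETCH)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{A minimal sketch of the refit-and-re-evaluate step described above. K = 23 is a hypothetical placeholder; read the real value off your own elbow plot. \newline \newline \# refit with the chosen K and re-evaluate \newline knn = KNeighborsClassifier(n\_neighbors=23) \newline knn.fit(X\_train,y\_train) \newline pred = knn.predict(X\_test) \newline \seqsplit{print(confusion\_matrix(y\_test},pred)) \newline \seqsplit{print(classification\_report(y\_test},pred))}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}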
% That's all folks
\end{multicols*}
\end{document}