\documentclass[10pt,a4paper]{article}

% NOTE: every statement below sits on its own line on purpose. A `%` comment
% runs to the end of the physical line, so joining these lines would silently
% disable every package and setting that follows the first comment.

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % multicols environment (header, footer and body columns)
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{dganesh}
% \pdfinfo is a pdfTeX primitive: it writes these keys into the PDF metadata.
\pdfinfo{
  /Title (principal-component-analysis.pdf)
  /Creator (Cheatography)
  /Author (dganesh)
  /Subject (Principal Component Analysis Cheat Sheet)
}

% Lengths and widths
% Widen the text block by 6cm and shift it 3cm left so it stays centred;
% shorten it by 1cm and move it 2cm up.
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
% seqsplit break points: plain break in math mode, discretionary hyphen in text.
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName and
% colours the current table row with it. Must be used at the start of a row
% (it relies on \noalign).
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<span>}{<colspec>}{<content>}: \multicolumn whose cell is
% filled with the colour most recently stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
% x{<width>}: ragged-right paragraph column of the given width.
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
% \tn ends a table row; rows are ended with \tn instead of \\ throughout the body.
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{FF8317}
\definecolor{LightBackground}{HTML}{FFF7F0}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
% No blank lines inside the \fancyhead / \fancyfoot arguments: a blank line
% would insert a paragraph break and change the layout.
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
% Header column 1: logo.
% NOTE(review): the image path is absolute -- presumably only valid on the
% machine that generated this file; adjust it (or use \graphicspath) to build elsewhere.
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
% Header column 2: title and attribution.
% \large and \normalsize are switches, not commands with an argument; \textbf
% replaces the obsolete {\bf ...}.
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large \textbf{\textcolor{DarkBackground}{\textrm{Principal Component Analysis Cheat Sheet}}}
    \\
    \normalsize by \textcolor{DarkBackground}{dganesh} via \textcolor{DarkBackground}{\uline{cheatography.com/160314/cs/33726/}}
\end{tabulary}
\end{multicols}}
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
% Footer column 1: author.
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}}
    \\
    \vspace{-2pt}dganesh \\
    \uline{cheatography.com/dganesh} \\
\end{tabulary}
\vfill
\columnbreak
% Footer column 2: publication status and page counter.
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}}
    \\
    \vspace{-2pt}Not Yet Published.\\
    Updated 19th August, 2022.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
% Footer column 3: sponsor.
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}}
    \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}
%
% Every section below is one single-column tabularx: a DarkBackground title
% row, one or more content rows, and an \hhline rule in DarkBackground.
% The trailing "% Row Count" comments must stay at the end of their physical
% line, otherwise they would comment out the text that follows them.
%
% ---- Motivation ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Motivation}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Handle High Multicollinearity} \newline % Row Count 1 (+ 1)
    \emph{Existing Solutions:} \newline % Row Count 2 (+ 1)
    1. Variable Selection \seqsplit{(stepwise/forward/backward)} \newline % Row Count 3 (+ 1)
    \emph{Cons:} Each time dropping a variable, some information is lost \newline % Row Count 5 (+ 2)
    \textbf{Visualization of more features} \newline % Row Count 6 (+ 1)
    \emph{Existing Solutions:} \newline % Row Count 7 (+ 1)
    1. Pairwise scatter plots \({}^{p}C_{2} = p(p-1)/2\), where \(p\) is the number of variables \newline % Row Count 9 (+ 2)
    \emph{Cons:} if \(p = 20\), this would mean 190 plots!% Row Count 10 (+ 1)
}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{There must be a better way of doing this. Goal is to find an algorithm to reduce the number of variables without losing information, i.e.\ PCA}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- Usecases ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Usecases}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{1. \textbf{Dimensionality Reduction} without losing information. \newline % Row Count 2 (+ 2)
    2. Easy \textbf{Data Visualization and Exploratory Data Analysis} \newline % Row Count 4 (+ 2)
    3. Create \textbf{uncorrelated features/variables} that can be an input to a prediction model \newline % Row Count 6 (+ 2)
    4. Uncovering latent \seqsplit{variables/themes/concepts} \newline % Row Count 7 (+ 1)
    5. \textbf{Noise reduction} in dataset% Row Count 8 (+ 1)
}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- Prerequisite Knowledge ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Prerequisite Knowledge}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Building Blocks:} \newline % Row Count 1 (+ 1)
    1. \emph{The basis of a space:} \newline % Row Count 2 (+ 1)
    Set of linearly independent vectors/directions that span the entire space, i.e.\ any point in space can be represented as a combination of these vectors. \newline % Row Count 6 (+ 4)
    Ex: Each row of a dataset is a point in the space. Each column is a basis vector (representation of any point in terms of columns). \newline % Row Count 9 (+ 3)
    2. \emph{Basis transformation:} \newline % Row Count 10 (+ 1)
    The process of converting your information from one set of basis to another. (OR) Representing your data in new columns different from original. Often for convenience, efficiency or just for common sense. \newline % Row Count 15 (+ 5)
    Ex: Dropping or Adding a column to the dataset. \newline % Row Count 16 (+ 1)
    3. \emph{Variance as information:} \newline % Row Count 17 (+ 1)
    Variance = Information \newline % Row Count 18 (+ 1)
    If two variables are highly correlated, they together don't add a lot of information than they do individually. So one of them can be dropped.% Row Count 21 (+ 3)
}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{In 2D geometry, X and Y axes are dimensions. i (1,0) is a unit vector in X direction, j (0,1) is a unit vector in the Y direction. For point a: ax, ay are the units to move in `i' and `j' directions to reach `a' and also denoted as: ax i + ay j. Any point in 2D space can be represented in terms of `i' and `j'. The `i' and `j' vectors are the `basis of the space'. `i' and `j' are independent, i.e.\ `i' can't be expressed in terms of `j' and vice versa}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- What does it do? ----
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{What does it do?}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{PCA is one of a family of techniques for taking high-dimensional data, and using the dependencies between the variables to represent it in a more tractable, lower-dimensional basis, without losing too much information. \newline % Row Count 5 (+ 5)
    Given p features/variables in a dataset, PCA finds the principal components as \newline % Row Count 7 (+ 2)
    1. a \textbf{linear combination} of the original features. \newline % Row Count 9 (+ 2)
    2. the principal components capture \textbf{maximum variance} in the dataset.% Row Count 11 (+ 2)
}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- Mathematical Representation ----
% phi and Z-sub-1 are written in math mode: as raw Unicode characters they are
% not set up for pdfLaTeX with T1 encoding (inputenc is disabled in the preamble).
% NOTE(review): the image path is absolute -- presumably only valid on the
% machine that generated this file.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Mathematical Representation}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/dganesh_1660897698_pca1.png}}}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{The above equation represents the \textbf{First Principal Component}. PCA finds \(\phi\) values such that the variance of \(Z_1\) is maximum. The \textbf{Second Principal Component} is found as one that has maximal variance of all linear combinations that are uncorrelated with \(Z_1\). And like this, each additional component is capturing incremental variance. The algorithm calculates `k' principal components, where \(k \leq p\) (\(p\) is the number of variables in the dataset).}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- Workings of PCA ----
% A literal "{{bl}}" template tag used to precede item a); it was printed
% verbatim in the output and has been removed.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Workings of PCA}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{1. Find \textbf{Principal Components} \newline % Row Count 1 (+ 1)
    a) Using SVD (Singular Value Decomposition) \newline % Row Count 3 (+ 2)
    2. Choose \textbf{optimal number} of principal components (k).% Row Count 5 (+ 2)
}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- Singular Value Decomposition (SVD) ----
% NOTE(review): the image path is absolute -- presumably only valid on the
% machine that generated this file.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Singular Value Decomposition (SVD)}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{5.377cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/dganesh_1660900818_pca2.png}}}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\emph{`Decomposition'} because it breaks the original data matrix into 3 new matrices}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% ---- Code ----
% The Python snippet is set in the typewriter font so that hyphens, brackets
% and underscores read as code rather than as running text.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{Code}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\ttfamily from sklearn.decomposition import PCA \newline
    import numpy as np \newline
    \newline
    X = np.array({[}{[}-1, -1{]}, {[}-2, -1{]}, {[}-3, -2{]}, {[}1, 1{]}, {[}2, 1{]}, {[}3, 2{]}{]}) \newline
    pca = PCA(n\_components=2) \newline
    pca.fit(X)}
\tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
%
% That's all folks
\end{multicols*}

\end{document}