% Preamble for a two-column cheat sheet: loads the table, colour and
% header/footer packages, sets PDF metadata, widens the text block and
% tunes line spacing. One statement per line so that a trailing `%`
% comment never swallows the command that follows it.
\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Multi-column layout (multicols environments)
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline}  % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{jjanana (djjang2)}
\pdfinfo{
  /Title (diving-into-data.pdf)
  /Creator (Cheatography)
  /Author (jjanana (djjang2))
  /Subject (Diving into Data Cheat Sheet)
}

% Lengths and widths
% The text block is widened by 6cm and shifted left by 3cm, so the extra
% width is split evenly between both sides of the page.
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% and applies it to the current row, so \mymulticolumn can reuse it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<cols>}{<colspec>}{<text>}: \multicolumn whose cell is
% filled with the colour last set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{0C4BAB}
\definecolor{LightBackground}{HTML}{EFF3F9}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, title and author/URL line next to it.
% NOTE(review): the logo is loaded from an absolute path; presumably this
% only exists on the generating server -- confirm before building elsewhere.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Diving into Data Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{jjanana (djjang2)} via \textcolor{DarkBackground}{\uline{cheatography.com/130831/cs/26177/}}}
\end{tabulary}
\end{multicols}}

% Footer: three columns -- author, publication info with page counter, sponsor.
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
    \vspace{-2pt}jjanana (djjang2) \\
    \uline{cheatography.com/djjang2} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
    \vspace{-2pt}Published 20th January, 2021.\\
    Updated 20th January, 2021.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{BIG DATA}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\bf{Big Data}}: refers to the large, diverse sets of information that grow at ever-increasing rates. It encompasses the volume of information, the velocity or speed at which it is created and collected, and the variety or scope of the data points being covered (known as the "three v's" of big data).
Big data often comes from data mining and arrives in multiple formats.} \tn % Row Count 8 (+ 8) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Big data is a great quantity of diverse information that arrives in increasing volumes and with ever-higher velocity.} \tn % Row Count 11 (+ 3) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Big data can be structured (often numeric, easily formatted and stored) or unstructured (more free-form, less quantifiable).} \tn % Row Count 14 (+ 3) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Nearly every dept in a company can utilize findings from big data analysis, but handling its clutter and noise can pose problems.} \tn % Row Count 17 (+ 3) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Big data can be collected from publicly shared comments on social networks and websites, voluntarily gathered from personal electronics and apps, through questionnaires, product purchases, and electronic check-ins.} \tn % Row Count 22 (+ 5) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Big data is most often stored in computer databases and is analyzed using software specifically designed to handle large, complex data sets.} \tn % Row Count 25 (+ 3) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Data analysts look at the relationship between different types of data, such as demographic data and purchase history, to determine whether a correlation exists. Such assessments may be done in-house or externally by a third-party that focuses on processing big data into digestible formats. Businesses often use the assessment of big data by such experts to turn it into actionable information.} \tn % Row Count 33 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Volume}}: Quantity of data; Size determines the value \& potential insight, and if considered big data or not. 
\{\{nl\}\}{\bf{Variety}}: Type \& Nature of data. Change from structured to semi- or unstructured challenges the technologies. \{\{nl\}\}{\bf{Velocity}}: Speed of the data. Big data is often avail. in real-time. \{\{nl\}\}{\bf{Veracity}}: Completeness \& Accuracy of data. Quality can vary, affecting accurate analysis. \{\{nl\}\}{\bf{Value}}: Derived from results of big data analysis.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{DATA}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Economic Data}}: Data regarding Interest rates, Asset prices, Exchange rates, and the Consumer Price Index; and other info about the global, national, or regional economy.} \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Structured Data}}: Data organized into databases with defined fields, including links between databases.} \tn % Row Count 7 (+ 3) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Unstructured Data}}: Data that is not organized into predetermined formats, such as databases, and often consists of text, images, or other nontraditional media.} \tn % Row Count 11 (+ 4) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Internal Data}}: Is owned, captured, and stored by an organization. Includes: Master data identifying customers, vendors, prospects; HR records; Employee/Customer correspondence; and Files specific to the type of business, such as Mfr's inventory records; banks' customer financial records; and insurer's premium records \& rating factors.} \tn % Row Count 18 (+ 7) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{External Data}}: Facts and figures available in locations outside a company. 
Refers to published data from outside the business.} \tn % Row Count 21 (+ 3) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Exploratory Data Analysis (EDA)}}: an approach to analyzing data sets to summarize their main characteristics, often with visual methods. \{\{nl\}\}{\emph{A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task.}} \{\{nl\}\} Promoted to encourage statisticians to explore the data, and possibly formulate hypotheses that could lead to new data collection \& experiments.} \tn % Row Count 31 (+ 10) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{DATA (cont)}} \tn % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{EDA}} is different from initial data analysis (IDA), which focuses more narrowly on checking assumptions required for model fitting and hypothesis testing, and handling missing values and making transformations of variables as needed. EDA encompasses IDA. \{\{nl\}\}{\emph{EDA Techniques}} incl: Scatter Plot \& Bubble Plot} \tn % Row Count 7 (+ 7) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Initial Data Analysis (IDA)}}: most important distinction between the initial data analysis phase and the main analysis phase, is that during initial data analysis one refrains from any analysis that is aimed at answering the original research question. 
\{\{nl\}\} IDA phase is guided by the following (4) questions: Quality of Data, Quality of Measurements, Initial transformation, and did the implementation of the study fulfill the intentions of the research design.} \tn % Row Count 17 (+ 10) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Text Mining}}: Obtains info through language recognition; more difficult than w/ other models b/c there are no organized fields \& no numerical values.} \tn % Row Count 21 (+ 4) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Steps of the Text Mining Process}}: \{\{nl\}\}1. Retrieve \& prepare the text. \{\{nl\}\}2. Convert unstructured data into structured data. \{\{nl\}\}3. Create a data mining model to help the Org. achieve its objectives. \{\{nl\}\}4. Evaluate the model's effectiveness in multiple areas.} \tn % Row Count 27 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Examples of: \newline {\bf{External \& Unstructured Data}}: Social Media, News Reports, Internet Videos. \newline {\bf{Internal \& Structured Data}}: Policy Information, Claims History, Customer Data. \newline {\bf{External \& Structured Data}}: Telematics, Financial Data, Labor Statistics. \newline {\bf{Internal \& Unstructured Data}}: Adjustor's notes, Customer voice records, Surveillance videos.} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{CyberSecurity}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Data loss occurs when valuable or sensitive information on a computer is compromised due to theft, human error, viruses, malware, or power failure. 
It may also occur due to physical damage or mechanical failure or equipment of an edifice.} \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Data loss can be caused by external factors, such as a power outage, theft, or a broad-based phishing attack. \{\{nl\}\} Companies can protect themselves by using data loss prevention procedures in software and by having protocols in place for employees that enable them to safely work with and share business documents.} \tn % Row Count 12 (+ 7) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{ESSENTIALS}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Mining}}: is a process used by companies to turn raw data into useful information. By using software to look for patterns in large batches of data, businesses can learn more about their customers to develop more effective marketing strategies, increase sales and decrease costs. Data mining depends on effective data collection, warehousing, and computer processing.\{\{nl\}\} {\emph{Data mining programs break down patterns and connections in data based on what information users request or provide.}}} \tn % Row Count 10 (+ 10) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Science}}: provides meaningful information based on large amounts of complex data or big data. Data science, or data-driven science, combines different fields of work in statistics and computation to interpret data for decision-making purposes. 
\{\{nl\}\}{\emph{Data science uses techniques such as machine learning and artificial intelligence to extract meaningful information and to predict future patterns and behaviors.}}} \tn % Row Count 19 (+ 9) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Disruptive Innovation}}: Disruptive Innovation refers to a technology whose application significantly affects the way a market or industry functions. An example of modern disruptive innovation is the Internet, which significantly altered the way companies did business and which negatively impacted companies that were unwilling to adapt to it. \{\{nl\}\} {\emph{Disruptive innovation refers to a new development that dramatically changes the way a structure or industry functions.}}} \tn % Row Count 29 (+ 10) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Sequential Pattern Mining}}: a topic of data mining concerned with finding statistically relevant patterns between data examples where the values are delivered in a sequence; presumed that the values are discrete, and thus time series mining is closely related, but usually considered a different activity. {\emph{Sequential pattern mining is a special case of structured data mining.}}} \tn % Row Count 37 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Leaders}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Electronic Commerce (e-commerce)}}: Electronic commerce or e-commerce (sometimes written as eCommerce) is a business model that lets firms and individuals buy and sell things over the internet. 
E-commerce operates in all four of the following major market segments: *Business to business; * Business to consumer; *Consumer to consumer; and *Consumer to business} \tn % Row Count 8 (+ 8) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Statistics}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Scatter Plot}}: A graphed cluster of dots, each of which represents the values of two variables. The slope of the points suggests the direction of the relationship between the two variables. The amount of scatter suggests the strength of the correlation. \{\{nl\}\}{\emph{two dimensional plot of point values}}} \tn % Row Count 7 (+ 7) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Bubble Plot}}: A Scatter Plot in which the size of the bubble represents a 3rd attribute, such as average accident severity. \{\{nl\}\} {\emph{Best option for conveying the numerical relationship between three or four sets of values.}}} \tn % Row Count 12 (+ 5) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Correlation Matrix}}: A table that summarizes a series of correlations among several variables. \{\{nl\}\}{\emph{rectangular display of all the correlations between all pairs of data sets with a key (such as color coding) that indicates the strength of the correlation}}} \tn % Row Count 18 (+ 6) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Regression Model}}: Estimates relationships between or among variables. 
\{\{nl\}\}{\emph{Model uses mathematical functions of statistical regression to predict the numerical value of a target variable based on the values of the explanatory variables}}} \tn % Row Count 23 (+ 5) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Regression Analysis}}: A set of statistical processes for estimating the relationships between a dependent variable (often called the 'outcome/target variable') and one or more independent variables (often called 'predictors', 'covariates', or 'features'). \{\{nl\}\}{\bf{Primarily used for (2) conceptually distinct purposes.}} \{\{nl\}\}First, widely used for prediction and forecasting, where its use has substantial overlap with the field of machine learning; Second, in some situations, can be used to infer causal relationships between the independent and dependent variables.} \tn % Row Count 35 (+ 12) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Statistics (cont)}} \tn % Row 5 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Linear Regression}}: Statistical method that predicts the numerical value of a target variable based on the value of one or more attributes or explanatory variables. \{\{nl\}\}A linear approach to modelling the relationship between a scalar response and 1 or more explanatory variables (also known as dependent \& independent variables).} \tn % Row Count 7 (+ 7) % Row 6 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Linear Regression}}: Falls into 1 of 2 categories: \{\{nl\}\} If the goal is prediction, forecasting, or error reduction, linear regression can be used to fit a predictive model to an observed data set of values of the response and explanatory variables. 
After developing such a model, if additional values of the explanatory variables are collected without an accompanying response value, the fitted model can be used to make a prediction of the response. \{\{nl\}\}If the goal is to explain variation in the response variable that can be attributed to variation in the explanatory variables, linear regression analysis can be applied to quantify the strength of the relationship between the response and the explanatory variables, and in particular to determine whether some explanatory variables may have no linear relationship with the response at all, or to identify which subsets of explanatory variables may contain redundant information about the response.} \tn % Row Count 27 (+ 20) % Row 7 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Generalized Linear Model (GLM)}}: Removes the normality and constant variance assumption in a linear model and it names a link function which defines the relationship between the expected response variable and linear combination of the predictor variables. \{\{nl\}\}A flexible generalization of ordinary linear regression that allows for response variables that have error distribution models other than a normal distribution.} \tn % Row Count 36 (+ 9) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Statistics (cont)}} \tn % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{GLM consists of (3) elements}}:\{\{nl\}\}1. An exponential family of probability distributions. \{\{nl\}\}2. A linear predictor - the quantity which incorporates the information about the independent variables into the model. \{\{nl\}\}3. 
A link function - provides the relationship between the linear predictor and the mean of the distribution function.} \tn % Row Count 7 (+ 7) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Data-Driven Decision Making}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data-Driven Decision Making}}: gives reference to the collection and analysis of data to guide decisions that improve success. \{\{nl\}\} Data-Informed Decision Making (DIDM): (2) basic approaches: Descriptive \& Predictive approach.} \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Process for Data-driven Decision Making}}: \{\{nl\}\}1. {\bf{Define the Problem}} - provide a business context for using the data {\emph{this step is crucial because modeling and analyzing data is not effective without a business context}} \{\{nl\}\}2. {\bf{Prepare the Data}} - Identify the necessary data; Gather quality data; Verify its quality \{\{nl\}\}3. {\bf{Analyze \& Model}} - model the data using big data techniques. {\emph{use the appropriate descriptive or predictive approach}} \{\{nl\}\}4. {\bf{Develop Insights}} - identify trends, relationships, behaviors, and events \{\{nl\}\}5. {\bf{Make an Actionable Decision}} - develop and implement a solution to the problem} \tn % Row Count 18 (+ 13) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{FINANCIAL ANALYSIS}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Warehousing}}: the electronic storage of a large amount of information by a business or organization. Data warehousing is a vital component of business intelligence that employs analytical techniques on business data. 
\{\{nl\}\} A data warehouse is designed to run query and analysis on historical data derived from transactional sources for business intelligence and data mining purposes.} \tn % Row Count 8 (+ 8) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Analytics}}: Data analytics is the science of analyzing raw data in order to make conclusions about that information. Many of the techniques and processes of data analytics have been automated into mechanical processes and algorithms that work over raw data for human consumption. Data analytics techniques can reveal trends and metrics that would otherwise be lost in the mass of information. This information can then be used to optimize processes to increase the overall efficiency of a business or system.} \tn % Row Count 19 (+ 11) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Analytics Process}}: involves several different steps: \{\{nl\}\}1. The first step is to determine the data requirements or how the data is grouped. Data may be separated by age, demographic, income, or gender. Data values may be numerical or be divided by category. \{\{nl\}\} 2. The second step in data analytics is the process of collecting it. This can be done through a variety of sources such as computers, online sources, cameras, environmental sources, or through personnel. \{\{nl\}\} 3. Once the data is collected, it must be organized so it can be analyzed. Organization may take place on a spreadsheet or other form of software that can take statistical data. \{\{nl\}\} 4. The data is then cleaned up before analysis. This means it is scrubbed and checked to ensure there is no duplication or error, and that it is not incomplete. 
This step helps correct any errors before it goes on to a data analyst to be analyzed.} \tn % Row Count 38 (+ 19) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{FINANCIAL ANALYSIS (cont)}} \tn % Row 3 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Neural Network}}: A data analysis technique that operates similar to the human brain in its ability to infer rules from data patterns and construct logic to use for data analytics. \{\{nl\}\} A network or circuit of neurons, or in a modern sense, an artificial neural network, composed of artificial neurons or nodes. \{\{nl\}\}{\emph{Form of AI that enables a computer to learn as it accumulates more data (deep learning).}}} \tn % Row Count 9 (+ 9) % Row 4 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Neural Network: Disadvantages}}: The processes for developing the rules and logic may not be transparent. \{\{nl\}\}a neural network can be overtrained if it reviews data in such detail that it can not then operate in a larger framework with other types of data} \tn % Row Count 15 (+ 6) % Row 5 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{3 layers of Neural Network}}: \{\{nl\}\}1. {\bf{Input layer}} - provides data for the network to analyze \{\{nl\}\}2. {\bf{Hidden layer}} - uses mathematical functions to learn and recode input data \{\{nl\}\}3. {\bf{Output layer}} - provides results of the analysis} \tn % Row Count 20 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{SOCIAL NETWORK}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Social Network Analysis (Network analysis)}}: Studies the connections and relationships among people in a social network. 
\{\{nl\}\} {\emph{Useful tool for making predictions based on trends}} \{\{nl\}\}{\bf{Social Network}} - group of individuals or entities who share relationships and the flow of communication} \tn % Row Count 6 (+ 6) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Node}}: Each individual or entity is known as this \{\{nl\}\}{\emph{a basic unit used to build data structures}}} \tn % Row Count 9 (+ 3) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Centrality Measures}}: In a social network context, the quantification of a node's relationship to other nodes in the same network. \{\{nl\}\} Determines the efficiency of the flow btwn Social Network connections. \{\{nl\}\}{\emph{indicators of centrality identify the most important vertices within a graph.}}} \tn % Row Count 15 (+ 6) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{(3) Centrality measures}}: \{\{nl\}\}1. {\bf{Degree}} - the number of connections each node has \{\{nl\}\}2. {\bf{Closeness}} - the average distance or path length btwn a given node and other nodes in the network \{\{nl\}\}3. {\bf{Betweenness}} - how many times a given node is part of the shortest path btwn 2 other nodes in a network} \tn % Row Count 22 (+ 7) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Financial Technology \& Automated Investing}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Artificial Intelligence (AI)}}: Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. 
The term may also be applied to any machine that exhibits traits associated with a human mind such as learning and problem-solving.} \tn % Row Count 7 (+ 7) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Deep Learning}}: is an artificial intelligence (AI) function that imitates the workings of the human brain in processing data and creating patterns for use in decision making. Deep learning is a subset of machine learning in artificial intelligence that has networks capable of learning unsupervised from data that is unstructured or unlabeled. Also known as deep neural learning or deep neural network. \{\{nl\}\} *Deep learning AI is able to learn without human supervision, drawing from data that is both unstructured and unlabeled. Also a form of machine learning, can be used to help detect fraud or money laundering, among other functions.} \tn % Row Count 20 (+ 13) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Machine Learning}}: Machine learning is the concept that a computer program can learn and adapt to new data without human intervention. Machine learning is a field of artificial intelligence (AI) that keeps a computer's built-in algorithms current regardless of changes in the worldwide economy. \{\{nl\}\} {\emph{Machine learning is useful in parsing the immense amount of information that is consistently and readily available in the world to assist in decision making.}}} \tn % Row Count 30 (+ 10) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Financial Technology \& Automated Investing (cont)}} \tn % Row 3 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Algorithm}}: An algorithm is a set of instructions for solving a problem or accomplishing a task. One common example of an algorithm is a recipe, which consists of specific instructions for preparing a dish/meal. 
Every computerized device uses algorithms to perform its functions.} \tn % Row Count 6 (+ 6) % Row 4 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Disruptive Technology}}: Disruptive technology is an innovation that significantly alters the way that consumers, industries, or businesses operate. A disruptive technology sweeps away the systems or habits it replaces because it has attributes that are recognizably superior. Recent disruptive technology examples include e-commerce, online news sites, ride-sharing apps, and GPS systems. \{\{nl\}\}{\emph{A disruptive technology supersedes an older process, product, or habit.}}} \tn % Row Count 16 (+ 10) % Row 5 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Association Rule Learning}}: a rule-based machine learning method for discovering interesting relations between variables in large databases. It is intended to identify strong rules discovered in databases using some measures of interestingness. \{\{nl\}\}{\emph{association rules are employed today in many application areas including Web usage mining, intrusion detection, continuous production, and bioinformatics.}}} \tn % Row Count 25 (+ 9) % Row 6 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Process of {\bf{Association Rule Generation}} is usually split up into two separate steps: \{\{nl\}\} 1. A minimum support threshold is applied to find all frequent itemsets in a database. \{\{nl\}\} 2. A minimum confidence constraint is applied to these frequent itemsets in order to form rules.} \tn % Row Count 31 (+ 6) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Decision Tree Analysis: (5) Steps}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Decision Tree Analysis: (5) Steps}}:\{\{nl\}\}1. Define the problem with a statement of the decision being considered \{\{nl\}\}2. 
Create pathways (sequence of events) for each alternative, with each pathway leading to an outcome \newline 3. Assign a probability to each event on a pathway and estimate the value (cost or gain) of the outcome of each pathway \newline 4. Multiply the probability of each event by the value of its outcome to determine the expected value of each pathway \newline 5. Compare expected values to determine the pathway with the highest expected value} \tn % Row Count 12 (+ 12) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{DECISION TREE: Analysis, Use, Features, and Inputs}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Analysis}}: Analyzes the consequences, costs, and gains of decisions to compare alternative decisions.} \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Use}}: Decision tree analysis helps risk managers choose the best strategy to meet a goal.} \tn % Row Count 5 (+ 2) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Features}}: The process can be used to analyze both negative and positive consequences.} \tn % Row Count 7 (+ 2) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Inputs}}: The risk manager inputs the project plan with decision points and information on possible outcomes.} \tn % Row Count 10 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{DECISION TREE: Outputs, Advantages, Disadvantages}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Outputs}}: Decision tree analysis produces an analysis of risk for each pathway with options and an expected value for each pathway.} \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{{\bf{Advantages}}: Presents a visual portrayal, provides both quantitative and qualitative information, and offers a way to calculate the best pathway through a problem.} \tn % Row Count 7 (+ 4) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Disadvantages}}: Can be complicated and difficult to explain. \newline{\emph{Also susceptible to oversimplification, which can result in less accurate decision making.}}} \tn % Row Count 11 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{EVENT TREE Analysis: (6) Steps:}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{1. Identify the initiating event (first accidental event that could result in unwanted consequences) \newline 2. Determine consequences of events that could follow the accidental event \newline 3. Construct an event tree diagram that lists barriers in the sequence that would be activated if the designated event occurred \newline 4. Design each pathway to fork at each barrier depending on whether the barrier succeeds or fails \newline 5. Assign an estimated probability to the likelihood of success or failure of each barrier \newline 6.
Calculate the frequency of outcomes for each pathway} \tn % Row Count 12 (+ 12) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{EVENT TREE: Analysis, Use, and Features}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Event Tree Analysis}}: Analyzes the consequences of accidental events rather than decisions.} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Use}}: Risk managers use event tree analysis to evaluate risk treatment measures and identify, recommend, and justify improvements.} \tn % Row Count 5 (+ 3) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Features}}: Process typically analyzes only negative consequences.} \tn % Row Count 7 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{EVENT TREE: Outputs, Process, Procedures}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{- List of potential problems, with estimated values for outcomes and frequencies \newline - Recommendations regarding the effectiveness of barriers} \tn % Row Count 3 (+ 3) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{EVENT TREE: Adv \& Disadvantages}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Advantages}}: \newline - offers a visual portrayal of sequences of events following an accident \newline - shows the effectiveness of control systems \newline - provides both quantitative and qualitative information} \tn % Row Count 5 (+ 5) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Disadvantages}}: \newline - effective
only if all potential events are identified \newline - analysis considers only two options (success or failure of barriers) \newline - analysis may ignore dependencies that arise within a sequence} \tn % Row Count 10 (+ 5) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{CH.10 VOCAB}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Economic Data}}: Data regarding interest rates, asset prices, exchange rates, the Consumer Price Index, and other information about the global, the national, or a regional economy.} \tn % Row Count 4 (+ 4) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Classification Trees}}: A supervised learning technique that uses a structure similar to a tree to segment data according to known attributes to determine the value of a categorical target variable.} \tn % Row Count 8 (+ 4) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Cluster Analysis}}: A model that determines previously unknown groupings of data.} \tn % Row Count 10 (+ 2) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Mining}}: The process of extracting hidden patterns from data that is used in a wide variety of applications for research and fraud detection.} \tn % Row Count 13 (+ 3) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Centrality Measures}}: In a social network context, the quantification of a node's relationship to other nodes in the same network.} \tn % Row Count 16 (+ 3) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Big Data}}: Sets of data that are too large to be gathered and analyzed by traditional methods.} \tn % Row Count 18 (+ 2) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{{\bf{Data Science}}: An interdisciplinary field involving the design and the use of techniques to process very large
amounts of data from a variety of sources to provide knowledge based on data.} \tn % Row Count 22 (+ 4) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}