% R cheat sheet (Cheatography template).
% Layout: three-column multicols page, fancyhdr header/footer built from
% tabulary tables, and content tables whose rows are coloured via \SetRowColor.
% NOTE(review): the tables below contain CJK text, but only the T1 font
% encoding is loaded here and \pdfinfo is a pdfTeX primitive -- confirm which
% engine / CJK package this document is meant to be built with.
\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{cgeeeeh}
\pdfinfo{
  /Title (r.pdf)
  /Creator (Cheatography)
  /Author (cgeeeeh)
  /Subject (R Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% (so \mymulticolumn can reuse it) and colours the current row with it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose cell background
% is the colour last set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{A3A3A3}
\definecolor{LightBackground}{HTML}{F3F3F3}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, title and author/link line next to it.
% The logo is referenced by an absolute path, so it only resolves on a host
% that has the file at exactly that location.
\fancyhead[L]{
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
      \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
  \end{tabulary}
  \columnbreak
  \begin{tabulary}{11cm}{L}
    % \large / \normalsize are declarations, not one-argument commands
    \vspace{-2pt}\large\bfseries\textcolor{DarkBackground}{\textrm{R Cheat Sheet}} \\
    \normalsize by \textcolor{DarkBackground}{cgeeeeh} via \textcolor{DarkBackground}{\uline{cheatography.com/191000/cs/39875/}}
  \end{tabulary}
  \end{multicols}}
% Footer: three small tables (author, sheet status / page counter, sponsor).
\fancyfoot[L]{ \footnotesize
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}cgeeeeh \\
    \uline{cheatography.com/cgeeeeh} \\
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Not Yet Published.\\
    Updated 27th September, 2023.\\
    Page {\thepage} of \pageref{LastPage}.
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
  \end{tabulary}
  \end{multicols}}

\begin{document}
\raggedright
\raggedcolumns
% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.
\begin{multicols*}{3}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bfseries\textcolor{white}{R语言基础}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{NA: for missing or undefined \seqsplit{data,有这个数字,可是获得不了,比如我的头发(真的很难数嘛?)}} \tn
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{NULL: for empty object (e.g. null / empty \seqsplit{lists),没有的数字,压根找不到,比如海龟根本没有头发这个概念}} \tn
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{NaN: for results that cannot be reasonably defined} \tn
% Row Count 7 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{.libPaths() \# get library location; library() \# see all packages installed; search() \# see packages currently loaded} \tn
% Row Count 10 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\# \seqsplit{获取帮助:help.start();\#} help about function \seqsplit{foo:help(foo)/?foo;example();args()}} \tn
% Row Count 12 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\# list all functions containing string foo:apropos("foo") ;\# show an example of function foo:example(foo)} \tn
% Row Count 15 (+ 3)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{builtins()列出所有的内置函数}} \tn
% Row Count 16 (+ 1)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\%\% in R = \% in Python;\%/\% in R = // in
Python, 不然会返回float; TRUE/T in R = True in Python} \tn % Row Count 18 (+ 2) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{\%in\%是in的意思,但是要把list转化成vec。unlist()。再使用}} \tn % Row Count 20 (+ 2) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{data \seqsplit{type:integer(5L)/continuous/categorical} (norminal/ordinal)/ text} \tn % Row Count 22 (+ 2) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{matrix里面只能有一个datatype,dt和df没有这个限制}} \tn % Row Count 24 (+ 2) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{options('scipen',100)显示100条向量值} \tn % Row Count 25 (+ 1) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{定义函数:x=function} (x) \{ return (T)\}} \tn % Row Count 26 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{\seqsplit{探索性数据分析的心法}}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{1、商业问题是什么}} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{2、我需要知道数据的什么,来帮助理解并回答商业问题,找到商业机会}} \tn % Row Count 3 (+ 2) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{3、具体用R怎么实现}} \tn % Row Count 4 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{探索性数据分析的目的:}} \tn % Row Count 5 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{1、将数据与商业问题结合。数据充足吗?合适吗?比如没有预测性数值,类别变量,冗余}} \tn % Row Count 8 (+ 3) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{2、探测数据的问题。数据质量,异常值的监测}} \tn % Row Count 10 (+ 2) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{基本语法}} \tn % Row 0 \SetRowColor{LightBackground} 
\mymulticolumn{1}{x{5.377cm}}{c("a",1)会返回'a','1',因为char比numeric更高级,而vec会保证元素的类型全部一样} \tn % Row Count 3 (+ 3) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{c(1,2,3)*c(1,2,3)/c(1,2,3)\textasciicircum{}2=c(1,4,9)} \tn % Row Count 4 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{class(c(TRUE))返回的是logical}, \seqsplit{class(dt)返回datatable;} \seqsplit{length()返回变量数量,length("TTT")返回1}} \tn % Row Count 7 (+ 3) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{str() check classification of \seqsplit{viriables,检查数据框中有哪些数据,包括类型和数值}} \tn % Row Count 9 (+ 2) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{summary()可以用于检查数据错误:给出数值型变量的数值summary,min,max,median,mean;给出字符串向量的长度和class;factor向量会给出各个因子的count}} \tn % Row Count 13 (+ 4) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{向量,矩阵,df,dt初始化:c(),matrix(data=NA},nrow,ncol),data.frame("col":vec),data.table(vec)} \tn % Row Count 17 (+ 4) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{向量添加元素:vec \textless{}- append(vec,element);vec=c(a,a,a)会得到一个flat的向量vec} \tn % Row Count 19 (+ 2) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{重命名列:names(df1)} \textless{}-"col"/setnames(df\$col,old=c(xxx),new="xxx")/colnames(dt)=c('1','2','diff')} \tn % Row Count 22 (+ 3) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{删除列:col=NULL/df{[}-c(1,2){]}} \tn % Row Count 23 (+ 1) % Row 9 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{把两列或者两个别的什么东西粘在一起:col=paste(col1},col2, sep='-');paste(1,"a",T)} \tn % Row Count 26 (+ 3) % Row 10 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{独特的值:unique(), nrow()} \tn % Row Count 27 (+ 1) % Row 11 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{行列求和:colSums()/rowSums()}} \tn % Row Count 28 (+ 1) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{ifelse语句:ifelse(expression}, 0, 1), a=1; 
b=if(a==2) 'a' else 'b'} \tn % Row Count 30 (+ 2) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{基本语法 (cont)}} \tn % Row 13 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{a=1; if(a==2) b='a' else b='b'} \tn % Row Count 1 (+ 1) % Row 14 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{\#返回对应索引的值:switch(3}, "apple", "orange", "pear", "pineapple") / switch("beta", "alpha"="Big Rock", "beta"="Meteorite", "gamma"="Red Stones")} \tn % Row Count 6 (+ 5) % Row 15 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{切片索引:by address df{[}c(3:7),seq(1,ncol(df),2){]}} \tn % Row Count 8 (+ 2) % Row 16 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{by value:df{[}(df\textgreater{}=10) \& (df\textless{}=20) {]}} \tn % Row Count 9 (+ 1) % Row 17 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{赋值:df{[}2:4,1{]}=c(3,9,7), df{[}c(3,5), 2:4{]}=rbind(c(1,3,5), c(2,4,6), c(7,9,11)),df{[}1{]}=df{[},1{]}} \tn % Row Count 11 (+ 2) % Row 18 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{批量赋值:df{[}df\textless{}0{]}=-999。所有满足条件的值都被赋一个值} \tn % Row Count 13 (+ 2) % Row 19 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{R语言一直在原数据上更改,而python未必,这就是python会报错要求你去用iloc的原因}} \tn % Row Count 16 (+ 3) % Row 20 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{df{[},2{]}{[}df{[},2{]}\textless{}0{]}=-777;可以先选择列再选择行} \tn % Row Count 18 (+ 2) % Row 21 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{df{[}df{[},1{]}\textless{}0 \& df{[},2{]}\textgreater{}0, {]}=cbind(9,8)} \tn % Row Count 19 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{数据预处理}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{数据清洗的目的一是business} object,二是technical 
requirements} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{新建一列,清洗完之后和原列作比较}} \tn % Row Count 3 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{主要目的是防止清洗出错}} \tn % Row Count 4 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{\seqsplit{缺失值的处理(删除/填充)}}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{找到缺失的原因,这能决定如何填充缺失值}} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{分类值预测:整列的mode,subgroup的mode,cart,逻辑回归}} \tn % Row Count 4 (+ 2) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{连续值预测:随机缺失的值可以用mean填充;系统性缺失的值可以用subgroup的mean,或者cart/逻辑回归;离散化连续值的列,当成分类值处理}} \tn % Row Count 8 (+ 4) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{删除一整行只是最后的选择}} \tn % Row Count 9 (+ 1) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{logistic/linear模型会自动忽略缺失值的行}} \tn % Row Count 11 (+ 2) % Row 5 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{is.na(dt)会显示所有的格子,which是不是空值}} \tn % Row Count 13 (+ 2) % Row 6 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{sum(is.na())只会得到一个值}} \tn % Row Count 14 (+ 1) % Row 7 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{which(is.na(data))能得到空值在第几行}, \seqsplit{which(h\$counts==max(h\$counts))}} \tn % Row Count 16 (+ 2) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{na.omit(ins.dt)} \tn % Row Count 17 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{错误值}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{将错误值替换成正确的值}} \tn % Row Count 1 
(+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{预测错误值,让他们变的更正确}} \tn % Row Count 2 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{把它们变成na,交给cart去预测}} \tn % Row Count 3 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{删掉一整行} \tn % Row Count 4 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{\seqsplit{数据不一致/数据重复}}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{找到数据不一致的原因,尽量从源头解决问题}} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{数据重复的定义,往往取决于主键}} \tn % Row Count 3 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{data exploration R}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{\#rm(list=ls()),ls()列出所有的变量}} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{读数据:read.table("csv"}, skip=7, header=T, sep=",", row.names="id",nrows=4);} \tn % Row Count 3 (+ 2) % Row 2 \SetRowColor{LightBackground} read.csv("csv", stringsAsFactors = \seqsplit{TRUE)。stringsAsFactors把字符串转化为因子变量} & fread("csv",, na.strings = c("NA", "missing", "N/A", -99, "", "m", "M", "na", ".")) \tn % Row Count 8 (+ 5) % Row 3 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{写数据:write.csv(df,path, row.names=F); write.table(df,path)} \tn % Row Count 10 (+ 2) % Row 4 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{rbind(df,df2), cbind(df.df2), names(df){[}3:4{]}=c('a','b')} \tn % Row Count 12 (+ 2) % Row 5 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{查看导入的数据集:head(df)},view(df), 图形交互} \tn % Row Count 14 (+ 2) % Row 6 
\SetRowColor{LightBackground} \seqsplit{数据维度:dim(df)} & \seqsplit{数据列的细节,如有多少缺失值,各个类别有几个数:summary(df)} \tn % Row Count 19 (+ 5) % Row 7 \SetRowColor{white} \seqsplit{查看子集:subset(df}, is.na(df\$col, \seqsplit{select))。select选中了数据集中的某列} & train \textless{}- sample.split(Y = mtcars\$mpg, SplitRatio = 0.7) trainset \textless{}- subset(mtcars, train == T) testset \textless{}- subset(mtcars, train == F) \tn % Row Count 26 (+ 7) % Row 8 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{比较两个向量或者df是否相同:identical(df1},df2)} \tn % Row Count 28 (+ 2) % Row 9 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{为数据集增加噪音:jitter(df\$col),jitter只能给连续变量加噪音,logical和类别变量不行}} \tn % Row Count 31 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{data exploration R (cont)}} \tn % Row 10 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{\#生成sequence,重复数字,反转列表:seq(10},20,3),rep(10,5),rev(v),sort(v)升序,unique(v),} \tn % Row Count 3 (+ 3) % Row 11 \SetRowColor{white} vector(c, n) – returns vector with all values c of size n & names(vec)=c('a','b','c')可以给vec的元素取名字 \tn % Row Count 6 (+ 3) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{matrix(c, nrow=5, ncol=3) – returns a 53 matrix with all values c} \tn % Row Count 8 (+ 2) % Row 13 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{data.frame(v1, v2, v3…) – returns a data frame made up of column vectors v1, v2, v3,} \tn % Row Count 10 (+ 2) % Row 14 \SetRowColor{LightBackground} \seqsplit{所以要转换类别:as}.numeric(df\$col) & \seqsplit{as.integer()=int()in} python \tn % Row Count 13 (+ 3) % Row 15 \SetRowColor{white} \seqsplit{对每一列都执行某样操作:sapply(df}, func, \seqsplit{na.rm=TRUE)。Possible} functions used in sapply include mean, sd, var, min, max, median, range , and quantile & sapply(v, function(x) if (is.na(x)) 999 else x) \tn % Row Count 
22 (+ 9) % Row 16 \SetRowColor{LightBackground} lapply(my\_list, function(x) x == element)) & \seqsplit{lapply和sapply的区别是返回结构不同,lapply返回list,sapply返回向量和矩阵} \tn % Row Count 27 (+ 5) % Row 17 \SetRowColor{white} \seqsplit{\#见堆叠柱状图:table(deparse}.level=2)/prop.table(col1,col2,margin)能把两列数据组合成透视表 & \seqsplit{deparse.level=2会展示所有行和列的名字} \tn % Row Count 33 (+ 6) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{x{2.4885 cm} x{2.4885 cm} } \SetRowColor{DarkBackground} \mymulticolumn{2}{x{5.377cm}}{\bf\textcolor{white}{data exploration R (cont)}} \tn % Row 18 \SetRowColor{LightBackground} \seqsplit{所有的factor:levels(factorcol}, ordered=T, levels=c(xxx), labels=c(xxx)) & \seqsplit{labels把向量里的数值映射到一个新的空间,它和levels的区别是levels只涉及input,labels涉及到output} \tn % Row Count 7 (+ 7) % Row 19 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{relevel(x, \seqsplit{ref)将ref因子放在x的第一个}} \tn % Row Count 8 (+ 1) % Row 20 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{\seqsplit{切分左开右闭区间:limits=c(1,2,3,4,5)}} \tn % Row Count 10 (+ 2) % Row 21 \SetRowColor{white} \mymulticolumn{2}{x{5.377cm}}{生成新的一列,值是区间:dt{[}, new\_col := cut(col, breaks=limits, include.lowest=T){]}} \tn % Row Count 12 (+ 2) % Row 22 \SetRowColor{LightBackground} \seqsplit{cor计算相关系数,the} degree of the consistensy of the trend of the relationship & \seqsplit{解读时了一说,一个上升时,另一个倾向于xxx,但因果关系我们并不清楚} \tn % Row Count 17 (+ 5) % Row 23 \SetRowColor{white} \seqsplit{cor(dataframe)可以计算相关性矩阵,"corrplot";corrplot(cor(mtcars)}, type = "upper") & \seqsplit{cov计算协方差,与cor(相关系数不同)} \tn % Row Count 22 (+ 5) % Row 24 \SetRowColor{LightBackground} \mymulticolumn{2}{x{5.377cm}}{ceiling(2.5)=3} \tn % Row Count 23 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}--} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{随机数}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{var(), sd()(标准差), 
skewness(),kurtosis()} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{quantile(data,c(0.025,0.975))} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{sample(1:80,80,replace=F);第一个是参数,第二个是size,第三个是能否重复} \tn
% Row Count 4 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{dbinom(x,n,p),pbinom。x次成功from \seqsplit{n次,每次概率为p。x可以是1:10,成功1:10次}} \tn
% Row Count 6 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{dpois(x,lambda), ppois(x,lambda)} \tn
% Row Count 7 (+ 1)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{dnorm(x,mean,sd),pnorm(x,mean,sd)} \tn
% Row Count 8 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{runif(size, low, high), rnorm(size, mean, sd而不是方差), rbinom(size, times, probability)} \tn
% Row Count 10 (+ 2)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{set.seed(seed) fixes the result of random functions} \tn
% Row Count 12 (+ 2)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{curve(dnorm(x,mean=0,sd=1),col="red",from=-3,to=4,xlim=c(-3,4),ylim=c(0,1)); curve(dnorm(x,mean=1,sd=1),col="blue",add=T)} \tn
% Row Count 15 (+ 3)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{t.test(x, y = NULL, alternative = c("two.sided", "less", "greater"), mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)} \tn
% Row Count 18 (+ 3)
% Row 10
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{如果x也有值,y也有值,那就是计算x-y的置信区间}} \tn
% Row Count 20 (+ 2)
% Row 11
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{prop.test(x, n, p = NULL, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE)} \tn
% Row Count 23 (+ 3)
% Row 12
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{这里x可以是向量,两次trial的成功次数;n是总次数,同理。算出来就是做差值}} \tn
% Row Count 26 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{ggplot2}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{data层:ggplot(data,aes(x=colname,y=colname, fill=factorcolname))} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{图像层:geom\_point()} \tn
% Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{布局层:facet\_grid(.\textasciitilde{}fl)。.\textasciitilde{}fl是指分组的colname} \tn
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{将factor变量映射到颜色:scale\_fill\_manual(values=c("0"="dark} blue", "1"="orange"))} \tn
% Row Count 7 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{细节层:labs(title, xlab,ylab)} \tn
% Row Count 8 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{visualization with R}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{par(mfrow=c(2,2)),两行两列的子图} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{plot(xaxt="n",yaxt="n");抹掉所有坐标轴} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{axis(1,at=seq(0.5,length(),1),labels=names(),tick=F,col="red")1是x轴,at决定位置,labels决定具体显示,包括内容和间距,tick控制有没有坐标线,col决定标线颜色} \tn
% Row Count 6 (+ 4)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{label太多,可调整字体方向:par(las=0},1,2,3);0=parallel, 1=all horizontal, 2=all perpendicular to axis, 3=all vertical} \tn
% Row Count 9 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{图片太大,可调整图与周边margin:par(mar=c(5},4,4,2)+0.1),margin的顺序是下,左,上,右;单位是line} \tn
% Row Count 12 (+ 3)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{允许画图画到外面去:par(xpd=T)。}} \tn
% Row Count 13 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{添加legend:legend("topright"}, inset=c(0,0), fill=c("red","grey"), legend=rownames(counts), border="grey", cex=0.6)} \tn
% Row Count 16 (+ 3)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{第一个是大体位置,inset是具体位置,fill是对应的颜色,legend是图例名字,border是边框颜色,}} \tn
% Row Count 19 (+ 3)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{散点图和箱线图的区别是:散点图可以告诉你样本量的大小(比如50岁左右的人有保险的比没有的多),而箱线图不行}} \tn
% Row Count 23 (+ 4)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{plot(density(df\$col), xlab,ylab,main)} \tn
% Row Count 24 (+ 1)
% Row 10
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{hist(df\$col, ylim=c(0,220), breaks=c(-10,0,10,20), labels=T, col="light blue") \seqsplit{labels会给每个柱子加上数字;默认每个区间是左开右闭的}} \tn
% Row Count 28 (+ 4)
% Row 11
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{boxplot(df\$col \textasciitilde{} df\$catecol);use \seqsplit{\$stats查看两个箱线图计算出的几个critical数据,从上到下依次递增的数据}} \tn
% Row Count 31 (+ 3)
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{visualization with R (cont)}} \tn
% Row 12
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{箱线图的数据依次是:Q1-1}.5IQR,Q1,median,Q3,Q3+1.5IQR。inter-quartile \seqsplit{range简称IQR。在box-and-whisker} \seqsplit{method里,其余的都叫做异常值}} \tn
% Row Count 4 (+ 4)
% Row 13
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{柱状图,展示类别变量的分布:barplot(table(df\$col)},col=c("light blue","mistyrose","lightcyan","lavender"), horiz=T, cex.names=0.5)。} \tn
% Row Count 7 (+ 3)
% Row 14
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{堆叠柱状图:data=table(df\$ycol},df\$xcol)。row \seqsplit{index是ycol的各个数值,col} \seqsplit{index是xcol的各个值。barplot(data}, col=c("red","grey"))} \tn
% Row Count 10 (+ 3)
% Row 15
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{百分比柱状图(主要是数据预处理的不同):prop}.table(df\$ycol,df\$xcol, \seqsplit{margin=1/2)。margin=1是横着计算百分比,2是竖着计算百分比}} \tn
% Row Count 14 (+ 4)
% Row 16
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{散点图:plot(df\$xcol1},df\$ycol2); \seqsplit{在散点基础上加一根smooth} \seqsplit{curve。scatter.smooth(df\$col1},df\$col2, \seqsplit{col="grey");col决定散点的颜色}} \tn
% Row Count 18 (+ 4)
% Row 17
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{散点曲线矩阵图,查看各个变量间的关系,分辨哪个最先分析:pairs(\textasciitilde{} col1+age+sex+...., panel=panel.smooth, span=0.75, \seqsplit{data=df)。panel.smooth是加上平滑曲线,span越大,线性程度越高}} \tn
% Row Count 23 (+ 5)
% Row 18
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{png(),jpeg()+dev.off()能存储图片} \tn
% Row Count 24 (+ 1)
% Row 19
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{画完图,Sys.sleep(0.05)}} \tn
% Row Count 25 (+ 1)
% Row 20
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{绘制空白图} \tn
% Row Count 26 (+ 1)
% Row 21
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{plot(x = c(22, 28), y = c(1, 1000), type = "n", xlab = "", ylab = "") \# set up a blank plot with specified ranges} \tn
% Row Count 29 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{data.table}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{data.table中,j参数里,:=的结果within在dt中,=在out新建一个dt}} \tn
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{创建data.table:dt{[}, .(.N, colname=sum(col==2), \seqsplit{prop.uninsured=sum(col==2)/.N)}, keyby=colname{]}} \tn
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{keyby和by的区别是:keyby会sorting分组的结果。可以有多个分组标准}.(col1,col2)} \tn
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{.N是指列名是number,值是count}} \tn
% Row Count 7 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{线性回归}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{多重共线性不影响预测,只影响模型解读。同时解读模型时要假设其他变量不变。不能反写成x的等式,因为这不是代数,是统计模型}} \tn
% Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{assumption1:linear association between y and x} \tn
% Row Count 5 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{assumption2:error has a normal distribution with mean 0} \tn
% Row Count 7 (+ 2)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{assumption3:errors与x互相独立,并且有常数的standard} deviation} \tn
% Row Count 9 (+ 2)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{lm(y\textasciitilde{}x1+x2或者., data=data);m4 \textless{}- \seqsplit{step(m.full);赤池信息准则}} \tn
% Row Count 11 (+ 2)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{coef(model); confint(model)} \tn
% Row Count 12 (+ 1)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{abline(m1, col = "red");identify(x = mtcars\$wt, y = mtcars\$mpg)} \tn
% Row Count 14 (+ 2)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{win.graph();identify(x = x\_data, y = y\_data)} \tn
% Row Count 15 (+ 1)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{R \seqsplit{square代表了模型的解释力。about} xxx\% of the data can be explained by the \seqsplit{model。只要增加变量数,R方会一直上升,因此不能简单用R方来比较两个线性模型}} \tn
% Row Count 19 (+ 4)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{adjusted R \seqsplit{square:惩罚每一个被添加的变量。在多变量的前提下,用这个比r方要好}} \tn
% Row Count 22 (+ 3)
% Row 10
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{plot(lm函数的结果)来检验假设1,2,3}} \tn
% Row Count 24 (+ 2)
% Row 11
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{左上角残差图:test1,2。理想情况是y=0的一条红线。residuals(m5)}} \tn % Row Count 26 (+ 2) % Row 12 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{右上角qq图:test2.理想情况是沿着虚线}} \tn % Row Count 27 (+ 1) % Row 13 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{左下角经过标准化后的残差图:test3。理想情况是上下均匀地分布在一个矩形里,而不是随着x的增大而改变}} \tn % Row Count 30 (+ 3) \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{线性回归 (cont)}} \tn % Row 14 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{右下角:展示influential} \seqsplit{outliers。有木有influence主要指去掉这个点对拟合曲线的影响有多大}} \tn % Row Count 3 (+ 3) % Row 15 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{单变量离群点很好辨认,超过两个变量散点图就不能用了,所以得用cook统计量,点落到在虚线外就是influential}} \tn % Row Count 6 (+ 3) % Row 16 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{处理离散类别变量,要注意用r转换成factor。然后哪怕是有序变量,数字本身也没有意义,不能当成连续变量来处理}} \tn % Row Count 10 (+ 4) % Row 17 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{k个类,就有k-1个变量,其中一个会是baseline,baseline自动是字母顺序表里最早的那个,然后也可以用relevel()自定义}} \tn % Row Count 14 (+ 4) % Row 18 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{如何决定选择哪些变量:}} \tn % Row Count 15 (+ 1) % Row 19 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{1、专家,领域知识}} \tn % Row Count 16 (+ 1) % Row 20 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{2、统计知识:pvalue小于5\%},前向,后向选择,双向选择,降维方法,CRT等} \tn % Row Count 18 (+ 2) % Row 21 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{\seqsplit{多重共线性:一个x能被其他x线性表出,意味着这个x的信息被其他的包含进去了。因此dummy} variable要减去1} \tn % Row Count 21 (+ 3) % Row 22 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{vif=1/(1-Ri\textasciicircum{}2)。Ri\textasciicircum{}2就是以这个x为因变量,其他x为自变量回归得出的R2。vif(lm的return)} \tn % Row Count 24 (+ 3) % Row 23 \SetRowColor{white} 
\mymulticolumn{1}{x{5.377cm}}{一般vif\textgreater{}5或者10,有dummy variable的模型一般gvif\textgreater{}2。使用vif模型from package car} \tn
% Row Count 26 (+ 2)
% Row 24
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{预测未来值:predict}.m5.test \textless{}- predict(m5, newdata = testset, type='response')response 返回y=1的概率值,} \tn
% Row Count 29 (+ 3)
% Row 25
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{提取p值:summary(model)\$coefficients{[},4{]}} \tn
% Row Count 30 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{\seqsplit{逻辑回归(预测分类变量)}}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{glm(y\textasciitilde{}x, family=binomial, data=data)} \tn
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{1、建模预测分类变量}} \tn
% Row Count 2 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{2、如何辨别高风险因子}} \tn
% Row Count 3 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{3、双变量分类模型}} \tn
% Row Count 4 (+ 1)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{4、odds(胜利/失败概率,chances),在代数中等于e\textasciicircum{}z,是个function,odds ratio代表每个bk的作用,e\textasciicircum{}bk,是个常数} \tn
% Row Count 7 (+ 3)
% Row 5
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{连续变量xk增加一个单位,胜率会怎么增加,会乘以e\textasciicircum{}bk} \tn
% Row Count 9 (+ 2)
% Row 6
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{类别变量xk从baseline跳转到一个类,胜率也会乘以e\textasciicircum{}bk} \tn
% Row Count 11 (+ 2)
% Row 7
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{判断一个xk变化会怎么影响odds} \seqsplit{ratio,可以用置信区间,2.5\%-97.5\%的区间超过1且不包含1则大于1}} \tn
% Row Count 14 (+ 3)
% Row 8
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\seqsplit{5、multinomial(超过3个类别)}} \tn
% Row Count 15 (+ 1)
% Row 9
\SetRowColor{white}
\mymulticolumn{1}{x{5.377cm}}{multinom() function from nnet Rpackage} \tn % Row Count 16 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{5.377cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{5.377cm}}{\bf\textcolor{white}{cluster}} \tn % Row 0 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{library(cluster)} \tn % Row Count 1 (+ 1) % Row 1 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{km2=kmeans(pts,centers=center)\#初始的中心} \tn % Row Count 2 (+ 1) % Row 2 \SetRowColor{LightBackground} \mymulticolumn{1}{x{5.377cm}}{clus1=pts{[}km2\$cluster==1{]}} \tn % Row Count 3 (+ 1) % Row 3 \SetRowColor{white} \mymulticolumn{1}{x{5.377cm}}{agnes(rivers),plot(that)} \tn % Row Count 4 (+ 1) \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}