Show Menu
Cheatography

Data management in R Cheat Sheet (DRAFT) by

Data Management Essentials in R

This is a draft cheat sheet. It is a work in progress and is not finished yet.

Manipu­lating dataframes

Adding new columns
mydata <- transf­orm­(my­data, sumx = x1 + x2, meanx = (x1 + x2)/2)
 
total <- cbind(A,B)
- each object must have the same number of rows, sorted in the same order
 
merge(dfA, dfB, by = c("ID","Country"))
Adding new rows
total <- rbind(A, B)
- each object must have the same variables
Recoding variables
leadership <- within(leadership, { agecat <- NA; agecat[age > 75] <- "Elder"; agecat[age >= 55 & age <= 75] <- "Middle Aged"; agecat[age < 55] <- "Young" })
 
Other recoding functions:
car
package -
recode()
,
doBy
package -
recode­var()
,
cut()
in R
Renaming variables
reshape
package -
rename()
 
rename(dataframe, c(oldname="newname", oldname="newname", ...))
 
names()
-
names(­lea­der­shi­p)[6:8] <- c("i­tem­1","i­tem­2","i­tem­3")
Missing values
is.na()
,
na.rm=TRUE
,
na.omit()
- deletes any row with missing data
Date values
as.Date(x, "­inp­ut_­for­mat­")
 
Default format: yyyy-mm-dd
 
Sys.Date()
,
date()
,
diffti­me(­date1, date2, units=­"­wee­ks")
 
Converting character to dates:
help(a­s.Date)
,
help(s­trf­time)
 
Formatting dates and time:
help(I­SOd­ate­time)
 
lubridate
and
fCalendar
package
Sorting data
order()
: default ascending, prepend sorting variable with
-
for descending
 
e.g.
df2 <- df[ord­er(­df$­gender, -df$age),]

Date formats

%d
Day as a number (01-31)
01-31
%a
Abbrev­iated weekday
Mon
%A
Unabbr­eviated weekday
Monday
%m
Month (01-12)
01-12
%b
Abbrev­iated month
Jan
%B
Unabbr­eviated month
January
%y
2-digit year
07
%Y
4-digit year
2007

Type conver­sions

is.num­eric()
as.num­eric()
is.cha­rac­ter()
as.cha­rac­ter()
is.vec­tor()
as.vec­tor()
is.mat­rix()
as.mat­rix()
is.dat­a.f­rame()
as.dat­a.f­rame()
is.fac­tor()
as.fac­tor()
is.log­ical()
as.log­ical()

Subsetting datasets

Selecting variables
new <- df[, c(6:10)]
 
new <- df[c("q­1","q­2","q­3")]
 
myvars <- paste(­"­q", 1:3, sep="")
 
new <- df[myvars]
Excluding variables
myvars <- names(df) %in% c("q1","q2")
 
new <- df[!my­vars]
 
new <- df[c(-­1,-2)]
 
df$q1 <- NULL
Selecting observ­ations
new <- df[1:3,]
 
new <- df[whi­ch(­df$­q1=­="M" & df$q2 >30),]
Random Samples
mysample <- df[sam­ple­(1:­nro­w(df), 3, replac­e=F­ALSE),]
 
sampling
and
survey
package
subset()
function
e.g.
new <- subset(df, age >=35 | age < 24, select = c(q1, q2, q3))

new <- subset(df, gender == "­M" & age >25, select = gender:q3)
 

SQL in R

sqldf
package
librar­y(s­qldf)

new <- sqldf(­"­select * from mtcars where carb=1 order by mpg", row.na­mes­=TRUE)


sqldf(­"­select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear from mtcars where cyl in (4,6) group by gear")

Mathem­atical functions

abs(x)
Absolute value
sqrt(x)
Square root. Same as
x^(0.5)
.
ceiling(x)
Smallest integer not less than x
floor(x)
Largest integer not greater than x
trunc(x)
Integer formed by truncating values in
x
towards 0
round(x, digits=n)
Round
x
to the specified number of decimal places
signif(x, digits=n)
Round
x
to the specified number of signif­icant digits
cos(x), sin(x), tan(x)
Cosine, sine, and tangent
acos(x), asin(x), atan(x)
Arc-co­sine, arc-sine and arc-ta­ngent
cosh(x), sinh(x), tanh(x)
Hyperbolic cosine, sine, and tangent
acosh(x), asinh(x), atanh(x)
Hyperbolic arc-co­sine, arc-sine, and arc-ta­ngent
log(x, base=n)
Logarithm of
x
to the base
n
log(x)
Natural logarithm
log10(x)
Common logarithm
exp(x)
Expone­ntial function

Statis­tical functions

mean(x)
Mean
median(x)
Median
sd(x)
Standard deviation
var(x)
Variance
mad(x)
Median absolute deviation
quanti­le(x, probs)
Quantiles where
x
is the numeric vector whose quantiles are desired and
probs
is a numeric vector with probab­ilities in [0,1]
 
y <- quanti­le(x, c(.3,.84))
range(x)
Range
 
diff(r­ang­e(x))
returns difference between extreme values
sum(x)
Sum
diff(x, lag=n)
Lagged differ­ences, with
lag
indicating which lag to use. Default lag is 1.
min(x)
Minimum
max(x)
Maximum
scale(x, center­=TRUE, scale=­TRUE)
Column center (
center­=TRUE
) or standa­rdize (
center­=TRUE, scale=TRUE
) data object
x
, i.e. to a mean of 0 and std of 1
Trimmed mean - dropping top and lowest 5% and missing values
y <- mean(x, trim=0.05, na.rm=­TRUE)

Probab­ility functions

beta
Beta
binom
Binomial
cauchy
Cauchy
chisq
Chi-sq­uared (nonce­ntral)
exp
Expone­ntial
f
F
gamma
Gamma
geom
Geometric
hyper
Hyperg­eom­etric
lnorm
Lognormal
logis
Logistic
multinom
Multin­omial
nbinom
Negative binomial
norm
Normal
pois
Poisson
signrank
Wilcoxon Signed Rank
t
T
unif
Uniform
weibull
Weibull
wilcox
Wilcoxon Rank Sum
General form of probab­ility function:
[dpqr]­dis­tri­but­ion­_ab­bre­via­tion()

d
= density
p
= distri­bution function
q
= quantile function
r
= random generation (random deviates)
 

Character functions

nchar(x)
Counts the no. of characters of
x
substr(x, start, stop)
Extract or replace substrings in a character vector
 
x <- "­abc­def­"
 
substr(x, 2, 4)
returns
"­bcd­"
 
substr(x, 2, 4) <- "­222­22"
produces
"a222ef"
grep(p­attern, x, ignore.ca­se=­FALSE, fixed=­FALSE)
Search for pattern in
x
.
fixed=­FALSE
-
pattern
is regex.
fixed=TRUE
-
pattern
is text string. Returns matching indices.
sub(pa­ttern, replac­ement, x, ignore.ca­se=­FALSE, fixed=­FALSE)
Find
pattern
in
x
and substitute with
replac­ement
text
strspl­it(x, split, fixed=­FALSE)
Split the elements of
x
at
split
 
y <- strspl­it(­"­abc­", "­")
returns 1-comp­onent, 3-element list containing
"­a" "­b" "­c"
.
 
unlist­(y)[2]
and
sapply(y, "­[", 2)
both return "­b".
paste(..., sep="")
Concat­enate strings after using
sep
string to separate them
 
paste(­"­x", 1:3, sep="M")
returns
c("x­M1",­"­xM2­"­,"xM­3")
toupper(x)
Uppercase
tolower(x)
Lowercase

Other useful functions

length(x)
Length of object x
seq(from, to, by)
Generate a sequence
rep(x, n)
Repeat
x n
times
cut(x, n)
Divide continuous variable
x
into factor with
n
levels.
ordere­d_r­esult = TRUE
creates an ordered factor.
pretty(x, n)
Create pretty breakp­oints. Divide a continuous variable
x
into
n
intervals, by selecting
n+1
equally spaced rounded values. Often used in plotting.
cat(..., file="m­yfi­le", append­=FALSE)
Concat­enates the objects in ... and outputs them to the screen or to a file
apply(x, MARGIN, FUN, ...)
Apply function to data objects
Escape charac­ters:
\n
- new lines
\t
- tabs
\'
- single quote
\b
- backspace

Control flow

FOR
for (var in seq) statement
WHILE
while (cond) statement
IF-ELSE
if (cond) statement
 
if (cond) statement1 else statement2
IFELSE
ifelse­(cond, statem­ent1, statem­ent2)
SWITCH
switch­(expr, ...)
statement
- single R statement or compound statement enclosed in {} and separated by ;
cond
- expression that resolves to
TRUE
or
FALSE

expr
- statement that evaluates to number or character string
seq
- sequence of numbers or character strings

Example for switch

feelings <- c("sad","afraid")
for (i in feelings)
  print(
     switch(i,
         happy = "I am glad you are happy",
         afraid = "There is nothing to fear",
         sad = "Cheer up",
         angry = "Calm down now"
            )
        )

Aggreg­ation and restru­cturing

t()
Transpose
aggreg­ate(x, by, FUN)
Aggregate (
by
variables must be a list)
 
new <- aggregate(mtcars, by=list(Group.cyl=mtcars$cyl, Group.gears=mtcars$gear), FUN=mean, na.rm=TRUE)

Reshape package