Show Menu
Cheatography

Data management in R Cheat Sheet (DRAFT) by

Data Management Essentials in R

This is a draft cheat sheet. It is a work in progress and is not finished yet.

Manipu­lating dataframes

Adding new columns
mydata <- transf­orm­(my­data, sumx = x1 + x2, meanx = (x1 + x2)/2)
 
total <- cbind(A, B) - both objects must have the same number of rows, sorted in the same order
 
merge(dfA, dfB, by = c("ID","Country"))
Adding new rows
total <- rbind(A, B) - both objects must have the same variables
Recoding variables
leadership <- within(leadership, {
  agecat <- NA
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})
 
Other recoding functions: car package - recod­e(), doBy package - recod­eva­r(), cut() in R
Renaming variables
reshape package - rename()
 
rename(dataframe, c(oldname="newname", oldname="newname", ...))
 
names() - names­(le­ade­rsh­ip)­[6:8] <- c("i­tem­1","i­tem­2","i­tem­3")
Missing values
is.na(), na.rm­=TRUE, na.om­it() - deletes any row with missing data
Date values
as.Da­te(x, "­inp­ut_­for­mat­")
 
Default format: yyyy-mm-dd
 
Sys.D­ate(), date(), difft­ime­(date1, date2, units=­"­wee­ks")
 
Converting character to dates: help(­as.D­ate), help(­str­ftime)
 
Formatting dates and time: help(­ISO­dat­etime)
 
lubridate and fCalendar packages
Sorting data
order(): default ascending, prepend sorting variable with - for descending
 
e.g. df2 <- df[ord­er(­df$­gender, -df$ag­e),]

Date formats

%d
Day as a number (01-31)
01-31
%a
Abbrev­iated weekday
Mon
%A
Unabbr­eviated weekday
Monday
%m
Month (01-12)
01-12
%b
Abbrev­iated month
Jan
%B
Unabbr­eviated month
January
%y
2-digit year
07
%Y
4-digit year
2007

Type conver­sions

is.nu­mer­ic()
as.nu­mer­ic()
is.ch­ara­cter()
as.ch­ara­cter()
is.ve­ctor()
as.ve­ctor()
is.ma­trix()
as.ma­trix()
is.da­ta.f­ra­me()
as.da­ta.f­ra­me()
is.fa­ctor()
as.fa­ctor()
is.lo­gic­al()
as.lo­gic­al()

Subsetting datasets

Selecting variables
new <- df[, c(6:10)]
 
new <- df[c("q­1","q­2","q­3")]
 
myvars <- paste(­"­q", 1:3, sep="")
 
new <- df[myv­ars]
Excluding variables
myvars <- names(df) %in% c("q1","q2")
 
new <- df[!my­vars]
 
new <- df[c(-­1,-2)]
 
df$q1 <- NULL
Selecting observ­ations
new <- df[1:3,]
 
new <- df[whi­ch(­df$­q1=­="M" & df$q2 >30),]
Random Samples
mysample <- df[sam­ple­(1:­nro­w(df), 3, replac­e=F­ALS­E),]
 
sampling and survey package
subset() function
e.g.
new <- subset(df, age >=35 | age < 24, select = c(q1, q2, q3))
new <- subset(df, gender == "­M" & age >25, select = gender­:q3)
 

SQL in R

sqldf package
libra­ry(­sqldf)
new <- sqldf(­"­select * from mtcars where carb=1 order by mpg", row.na­mes­=TRUE)

sqldf­("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear from mtcars where cyl in (4,6) group by gear")

Mathem­atical functions

abs(x)
Absolute value
sqrt(x)
Square root. Same as x^(0.5), e.g. sqrt(25) equals 25^(0.5).
ceili­ng(x)
Smallest integer not less than x
floor(x)
Largest integer not greater than x
trunc(x)
Integer formed by truncating values in x towards 0
round(x, digits=n)
Round x to the specified number of decimal places
signif(x, digits=n)
Round x to the specified number of signif­icant digits
cos(x), sin(x), tan(x)
Cosine, sine, and tangent
acos(x), asin(x), atan(x)
Arc-co­sine, arc-sine and arc-ta­ngent
cosh(x), sinh(x), tanh(x)
Hyperbolic cosine, sine, and tangent
acosh(x), asinh(x), atanh(x)
Hyperbolic arc-co­sine, arc-sine, and arc-ta­ngent
log(x, base=n)
Logarithm of x to the base n
log(x)
Natural logarithm
log10(x)
Common logarithm
exp(x)
Expone­ntial function

Statis­tical functions

mean(x)
Mean
media­n(x)
Median
sd(x)
Standard deviation
var(x)
Variance
mad(x)
Median absolute deviation
quant­ile(x, probs)
Quantiles, where x is the numeric vector for which quantiles are desired and probs is a numeric vector of probabilities in [0,1]
 
y <- quanti­le(x, c(.3,.8­4))
range(x)
Range
 
diff(­ran­ge(x)) returns difference between extreme values
sum(x)
Sum
diff(x, lag=n)
Lagged differ­ences, with lag indicating which lag to use. Default lag is 1.
min(x)
Minimum
max(x)
Maximum
scale(x, center­=TRUE, scale=­TRUE)
Column center (cent­er=­TRUE) or standa­rdize (cent­er=­TRUE, scale=­TRUE) data object x, i.e. to a mean of 0 and std of 1
Trimmed mean - dropping top and lowest 5% and missing values
y <- mean(x, trim=0.05, na.rm=­TRUE)

Probab­ility functions

beta
Beta
binom
Binomial
cauchy
Cauchy
chisq
Chi-sq­uared (nonce­ntral)
exp
Expone­ntial
f
F
gamma
Gamma
geom
Geometric
hyper
Hyperg­eom­etric
lnorm
Lognormal
logis
Logistic
multinom
Multin­omial
nbinom
Negative binomial
norm
Normal
pois
Poisson
signrank
Wilcoxon Signed Rank
t
T
unif
Uniform
weibull
Weibull
wilcox
Wilcoxon Rank Sum
General form of probab­ility function: [dpqr­]di­str­ibu­tio­n_a­bbr­evi­ati­on()
d = density
p = distri­bution function
q = quantile function
r = random generation (random deviates)
 

Character functions

nchar(x)
Counts the no. of characters of x
substr(x, start, stop)
Extract or replace substrings in a character vector
 
x <- "­abc­def­"
 
substr(x, 2, 4) returns "bc­d"
 
substr(x, 2, 4) <- "22222" produces "a222ef"
grep(­pat­tern, x, ignore.ca­se=­FALSE, fixed=­FALSE)
Search for pattern in x. fixed­=FALSE - pattern is regex. fixed­=TRUE - pattern is text string. Returns matching indices.
sub(p­attern, replac­ement, x, ignore.ca­se=­FALSE, fixed=­FALSE)
Find pattern in x and substitute with repla­cement text
strsp­lit(x, split, fixed=­FALSE)
Split the elements of x at split
 
y <- strspl­it(­"­abc­", "­") returns 1-comp­onent, 3-element list containing "a" "­b" "­c".
 
unlis­t(y­)[2] and sapply(y, "­[", 2) both return "­b".
paste­(..., sep="")
Concat­enate strings after using sep string to separate them
 
paste­("x", 1:3, sep="M") returns c("­xM1­"­,"xM­2","x­M3")
toupp­er(x)
Uppercase
tolow­er(x)
Lowercase

Other useful functions

lengt­h(x)
Length of object x
seq(from, to, by)
Generate a sequence
rep(x, n)
Repeat x n times
cut(x, n)
Divide continuous variable x into factor with n levels. order­ed_­result = TRUE creates an ordered factor.
pretty(x, n)
Create pretty breakp­oints. Divide a continuous variable x into n intervals, by selecting n+1 equally spaced rounded values. Often used in plotting.
cat(..., file="m­yfi­le", append­=FA­LSE)
Concat­enates the objects in ... and outputs them to the screen or to a file
apply(x, MARGIN, FUN, ...)
Apply function to data objects
Escape charac­ters:
\n - new lines
\t - tabs
\' - single quote
\b - backspace

Control flow

FOR
for (var in seq) statement
WHILE
while (cond) statement
IF-ELSE
if (cond) statement
 
if (cond) statement1 else statem­ent2
IFELSE
ifels­e(cond, statem­ent1, statem­ent2)
SWITCH
switc­h(expr, ...)
state­ment - single R statement or compound statement enclosed in {} and separated by ;
cond - expression that resolves to TRUE or FALSE
expr - statement that evaluates to number or character string
seq - sequence of numbers or character strings

Example for switch

feelings <- c("sad","afraid")
for (i in feelings)
  print(
     switch(i,
         happy = "I am glad you are happy",
         afraid = "There is nothing to fear",
         sad = "Cheer up",
         angry = "Calm down now"
            )
        )

Aggreg­ation and restru­cturing

t()
Transpose
aggre­gate(x, by, FUN)
Aggregate (by variables must be a list)
 
new <- aggregate(mtcars, by=list(Group.cyl=mtcars$cyl, Group.gears=mtcars$gear), FUN=mean, na.rm=TRUE)

Reshape package