Manipulating dataframes
Adding new columns mydata <- transform(mydata, sumx = x1 + x2, meanx = (x1 + x2)/2)
|
total <- cbind(A,B)
- each object must have the same number of rows and be sorted in the same order
|
merge(dfA, dfB, by = c("ID","Country"))
|
Adding new rows total <- rbind(A, B)
- each object must have the same variables
|
Recoding variables leadership <- within(leadership, { agecat <- NA; agecat[age > 75] <- "Elder"; agecat[age >= 55 & age <= 75] <- "Middle Aged"; agecat[age < 55] <- "Young" })
|
Other recoding functions: car
package - recode()
, doBy
package - recodeVar()
, cut()
in R
|
Renaming variables reshape
package - rename()
|
rename(dataframe, c(oldname="newname", oldname="newname",...))
|
names()
- names(leadership)[6:8] <- c("item1","item2","item3")
|
Missing values is.na()
, na.rm=TRUE
, na.omit()
- deletes any row with missing data
|
Date values as.Date(x, "input_format")
|
Default format: yyyy-mm-dd
|
Sys.Date()
, date()
, difftime(date1, date2, units="weeks")
|
Converting character to dates: help(as.Date)
, help(strftime)
|
Formatting dates and time: help(ISOdatetime)
|
lubridate
and fCalendar
package
|
Sorting data order()
: default ascending, prepend sorting variable with -
for descending
|
e.g. df2 <- df[order(df$gender, -df$age),]
|
Date formats
|
Day as a number (01-31) |
01-31 |
|
Abbreviated weekday |
Mon |
|
Unabbreviated weekday |
Monday |
|
Month (01-12) |
01-12 |
|
Abbreviated month |
Jan |
|
Unabbreviated month |
January |
|
2-digit year |
07 |
|
4-digit year |
2007 |
Subsetting datasets
Selecting variables
|
new <- df[c("q1","q2","q3")]
|
myvars <- paste("q", 1:3, sep="")
|
|
Excluding variables myvars <- names(leadership) %in% c("q1","q2")
|
|
|
|
Selecting observations
|
new <- df[which(df$q1=="M" & df$q2 >30),]
|
Random Samples mysample <- df[sample(1:nrow(df), 3, replace=FALSE),]
|
sampling
and survey
package
|
subset()
function
e.g.
new <- subset(df, age >=35 | age < 24, select = c(q1, q2, q3))
new <- subset(df, gender == "M" & age >25, select = gender:q3)
|
|
SQL in R
sqldf
package
library(sqldf)
new <- sqldf("select * from mtcars where carb=1 order by mpg", row.names=TRUE)
sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear from mtcars where cyl in (4,6) group by gear")
|
Mathematical functions
|
Absolute value |
|
Square root. Same as 25^(0.5)
. |
|
Smallest integer not less than x |
|
Largest integer not greater than x |
|
Integer formed by truncating values in x
towards 0 |
|
Round x
to the specified number of decimal places |
|
Round x
to the specified number of significant digits |
|
Cosine, sine, and tangent |
acos(x), asin(x), atan(x)
|
Arc-cosine, arc-sine and arc-tangent |
cosh(x), sinh(x), tanh(x)
|
Hyperbolic cosine, sine, and tangent |
acosh(x), asinh(x), atanh(x)
|
Hyperbolic arc-cosine, arc-sine, and arc-tangent |
|
Logarithm of x
to the base n
|
|
Natural logarithm |
|
Common logarithm |
|
Exponential function |
Statistical functions
|
Mean |
|
Median |
|
Standard deviation |
|
Variance |
|
Median absolute deviation |
|
Quantiles where x
is the numeric vector for which quantiles are desired and probs
is a numeric vector with probabilities in [0,1] |
|
y <- quantile(x, c(.3,.84))
|
|
Range |
|
diff(range(x))
returns difference between extreme values |
|
Sum |
|
Lagged differences, with lag
indicating which lag to use. Default lag is 1. |
|
Minimum |
|
Maximum |
scale(x, center=TRUE, scale=TRUE)
|
Column center ( center=TRUE
) or standardize ( center=TRUE, scale=TRUE
) data object x
, i.e. to a mean of 0 and std of 1 |
Trimmed mean - dropping top and lowest 5% and missing values
y <- mean(x, trim=0.05, na.rm=TRUE)
Probability functions
|
Beta |
|
Binomial |
|
Cauchy |
|
Chi-squared (noncentral) |
|
Exponential |
|
F |
|
Gamma |
|
Geometric |
|
Hypergeometric |
|
Lognormal |
|
Logistic |
|
Multinomial |
|
Negative binomial |
|
Normal |
|
Poisson |
|
Wilcoxon Signed Rank |
|
T |
|
Uniform |
|
Weibull |
|
Wilcoxon Rank Sum |
General form of probability function: [dpqr]distribution_abbreviation()
d
= density
p
= distribution function
q
= quantile function
r
= random generation (random deviates)
|
|
Character functions
|
Counts the no. of characters of x
|
|
Extract or replace substrings in a character vector |
|
|
|
substr(x, 2, 4)
returns "bcd"
|
|
substr(x, 2, 4) <- "22222"
produces "a222ef"
|
grep(pattern, x, ignore.case=FALSE, fixed=FALSE)
|
Search for pattern in x
. fixed=FALSE
- pattern
is regex. fixed=TRUE
- pattern
is text string. Returns matching indices. |
sub(pattern, replacement, x, ignore.case=FALSE, fixed=FALSE)
|
Find pattern
in x
and substitute with replacement
text |
strsplit(x, split, fixed=FALSE)
|
Split the elements of x
at split
|
|
y <- strsplit("abc", "")
returns 1-component, 3-element list containing "a" "b" "c"
. |
|
unlist(y)[2]
and sapply(y, "[", 2)
both return "b". |
|
Concatenate strings after using sep
string to separate them |
|
paste("x", 1:3, sep="M")
returns c("xM1","xM2","xM3")
|
|
Uppercase |
|
Lowercase |
Other useful functions
|
Length of object x |
|
Generate a sequence |
|
|
|
Divide continuous variable x
into factor with n
levels. ordered_result = TRUE
creates an ordered factor. |
|
Create pretty breakpoints. Divide a continuous variable x
into n
intervals, by selecting n+1
equally spaced rounded values. Often used in plotting. |
cat(..., file="myfile", append=FALSE)
|
Concatenates the objects in ... and outputs them to the screen or to a file |
apply(x, MARGIN, FUN, ...)
|
Apply function to data objects |
Escape characters:
\n
- new lines
\t
- tabs
\'
- single quote
\b
- backspace
Control flow
FOR |
for (var in seq) statement
|
WHILE |
|
IF-ELSE |
|
|
if (cond) statement1 else statement2
|
IFELSE |
ifelse(cond, statement1, statement2)
|
SWITCH |
|
statement
- single R statement or compound statement enclosed in {} and separated by ;
cond
- expression that resolves to TRUE
or FALSE
expr
- statement that evaluates to number or character string
seq
- sequence of numbers or character strings
Example for switch
feelings <- c("sad","afraid")
for (i in feelings)
print(
switch(i,
happy = "I am glad you are happy",
afraid = "There is nothing to fear",
sad = "Cheer up",
angry = "Calm down now"
)
)
|
Aggregation and restructuring
|
Transpose |
|
Aggregate ( by
variables must be a list) |
|
new <- aggregate(mtcars, by=list(Group.cyl=mtcars$cyl, Group.gears=mtcars$gear), FUN=mean, na.rm=TRUE)
|
|