- R intro
- Data structures
- Control structures
- Functions
- Commonly used built in functions
- String manipulation
- Miscellaneous Tips and tricks
Ilan Man
Business Intelligence @ Squarespace
Rcpp
, ff
, snow
, parallel
?help
-> use this to get help. ?
is easily the most useful function in R.
install.packages('ggplot2') ## do this once only
require('ggplot2') ## do this every time you load up an R session
library() ## shows you every package in your standard package location
R Coding convention is another resource
Use <-
NOT =
for assignment
Spaces between operators like +
, %*%
, <
, >
and after closing brackets )
, }
Don't write functions named rep()
, sample()
, plot()
or any other built-in R names
c
should not be used for any variable names
i
and j
should only be used in loops, conditionals, etc...
Use camel case for functions: myFirstFunction()
is better than my.first.function()
Use 'hello'
or "hello"
for strings, but be consistent.
mode
in R. Known as atomic.
x <- c(1,2,3,4,5,6,7,8,9,10) ## vector from 1 to 10 - class numeric
x <- 1:10 ## alternative - class integer
x <- seq(from=1,to=10,by=1) ## alternative - class numeric
n <- 10
x <- numeric(n) ## preallocate x at its final length
for (i in 1:n) x[i] <- i ## preferred of the two loops: fills the preallocated vector
x <- numeric(0)
for (i in 1:n) x <- c(x,i) ## as n gets large, this is very slow: x is rebuilt by c() on every iteration
## to the extent possible, provide the size of your object when
## initializing it
logical
, character
, integer
, double
, complex
, raw
x <- seq(from = 1, to = 10, by = 1)
y <- 0
for (i in c(1:length(x))) y[i] <- x[i] * 5
print(y)
## [1] 5 10 15 20 25 30 35 40 45 50
## alternatively....
y <- x * 5
print(y)
## [1] 5 10 15 20 25 30 35 40 45 50
x <- 1:10
x[ c( 1:5 , 8:10 ) ]
[1] 1 2 3 4 5 8 9 10
x[ c(TRUE , FALSE) ] ## recycling - common R feature. R will not give you a warning!
[1] 1 3 5 7 9 ## very useful, but make sure you are comparing vectors of same length
x > 5 ## Boolean vector. mode = "logical"
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
any(x > 5)
[1] TRUE
all(x < 8)
[1] FALSE
## raise a to the power b (element-wise when given vectors)
f <- function(a, b) {
  a^b
}
f(x, 2)
## [1] 1 4 9 16 25 36 49 64 81 100
+
, -
, *
are functions
"*"(x,5) ## returns 5 * x[1], 5 * x[2], ...
'['(x, x > 5 ) ## returns vector of values where x[1] > 5, x[2] > 5, ..., x[10] > 5 is TRUE
ifelse(x < 5, x^2, 0) ## if (condition) { do something } else { do something else }
[1] 1 4 9 16 0 0 0 0 0 0
logsum <- 0
x <- seq(100,1000000,by=10)
for (i in 1:length(x)){
logsum <- logsum + log(x[i])
}
logsum
[1] 1281524 ## this calculation takes about 0.17 seconds
# R translation
logsum <- sum(log(x)) ## this calculation takes about 0.002 seconds.
[1] 1281524
sum
, max
, min
, ... are exceptions
mean(1,3,2)
[1] 1 ## huh??
mean(c(1,3,2))
[1] 2 ## that's better
max(1,3,2)
[1] 3
rowSums(x)
instead of apply(x,1,sum)
...more on this later!
NA
and NULL
NA
appears often in messy data, especially when a value doesn't exist
NA
, and therefore return NA
NULL
, it skips it. NULL
is nonexistent. Yet it exists as a NULL
. ?philosophy.
x <- c(5, 10, NA, 20, 25)
mean(x)
[1] NA
is.na(x) ## commonly used when cleaning data sets
[1] FALSE FALSE TRUE FALSE FALSE
mean(x,na.rm=TRUE) ## 15
x <- c(5,10,NULL,20,25)
mean(x) ## 15
length(NA) ## NA is a logical constant of length 1
[1] 1
length(NULL) ## NULL does not take any value. By definition, it's undefined
[1] 0 ## ?philosophy
x <- 1:10
x[ x > 5 ] ## What's happening here?
x > 5
is a function call to ">"(a,b)
which returns TRUE
or FALSE
on every element of vector x
. x > 5
is logical
vector. And when used as an index on x
...
x[c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE)]
## [1] 6 7 8 9 10
...returns elements of x
that are TRUE
Common filtering functions include:
subset(x, x > 5) ## [1] 6 7 8 9 10
which(x > 5) ## [1] 6 7 8 9 10
4%in%x ## [1] TRUE
atomic
seq()
, rep()
, sample()
, runif()
any()
, all()
, which()
, subset()
, %in%
Compute the following:
a) \(\large \sum_{i=1}^{500} \ln{(i^{2})} + \frac{2}{i}\)
b) \(\large \frac{1}{n}\sum_{i=1}^{n} (\bar{X} - X_{i})^{2}\), where X ~ Normal(5,100) and n = 1000
Hint:
?rnorm
c) \(\large \frac{1}{n}\sum_{i=1}^{n} (\bar{X} - X_{i})(\bar{Y} - Y_{i})\), where X ~ Poisson with lambda of 2, Y ~ Exponential with a rate of 1, and n = 1000
x <- matrix(seq(1, 6, by = 1), nrow = 3, ncol = 2) ## 3 by 2 matrix
print(x)
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
x <- matrix( seq(1,6,by=1), nrow=3) ## same as above
x <- matrix( seq(1,6,by=1), nrow=3, byrow=TRUE) ## row-major order
x <- matrix( seq(1,6,by=1), nrow=4) ## is this ok?
x <- matrix( seq(1,6,by=1), nrow=3, ncol=3) ## is this ok?
x <- matrix(seq(1,9),nrow=3,ncol=3)
x + 5
x * 2
t(x) ## transpose
x %*% x ## matrix product
crossprod(x,x) ## matrix cross product of x and x, i.e. t(x) %*% x
x * x ## element-wise product
diag(x) ## diagonal components of x (diag(3) would build a 3 by 3 identity matrix)
det(x) ## determinant
eigen(x) ## list of eigenvalues and eigenvectors
x[2,1] ## second row, first column
x[,1] ## all rows, first column. Vector form, not matrix.
x[,] ## all rows, all columns. Same as print(x), or just x.
x[-1,] ## remove first row. Negative indexing.
x <- matrix( c(1:9), nrow=3, ncol=3)
class(x) ## matrix
y <- x[1,] ## 3 element vector
class(y) ## integer
attributes(y) ## returns NULL
y <- x[1,, drop=FALSE]
class(y) ## matrix
attributes(y) ## 1 by 3 matrix
colnames(x) <- c( 'first col' , 'second col' , 'third col' )
rownames(x) <- c( 'row 1' , 'row 2' , 'row 3' )
arrays
a)
x <- matrix(rep(c(1,3,-1,2),5),ncol=4)
(i) What is returned by the following? Do it by hand before typing it in.
mean(x[ x[1,] > 1, c(1:2) ])
(ii) Find the column in x which has the largest total.
b)
y <- matrix(c(c(1,2,4,8),c(2,3,-1,-7),c(0,5,12,-4),c(3,4,5,0)),ncol=4)
(i) Calculate the trace of y.
(ii) Replace each element of the 3rd column with the median of the elements of the first, second and
fourth columns for the same row.
modes
.data.frames
x <- list(title = "R presentation", date = format(as.POSIXlt(Sys.time(), "EDT"),
"%m %d %Y"), num_attendees = 10)
print(x)
## $title
## [1] "R presentation"
##
## $date
## [1] "12 06 2013"
##
## $num_attendees
## [1] 10
list
components
## one bracket - [ - returns a list type
x[1]
## $title
## [1] "R presentation"
## two brackets - [[ - returns the actual element, in this case a character
x[[1]]
x$title
x[['title']]
[1] "R presentation"
list
components and values
names(x)
[1] "title" "date" "num_attendees"
unlist(x) ## flattens the list into a character vector
pres_1 <- format(as.POSIXlt(Sys.Date(),"EDT"),"%m %d %Y")
pres_2 <- format(as.POSIXlt(Sys.Date()+30,"EDT"),"%m %d %Y")
x <- list(title='1st R presentation', date=pres_1, num_attendees=10)
y <- list(title='2nd R presentation', date=pres_2, num_attendees=20)
z <- list(x,y) ## list of lists
## z[[1]] is equivalent to x; z[[1]][1] is equivalent to x[1]
list
n <- 100
x <- rnorm(n, mean = 0, sd = 1) ## sample of 100 random standard normal variables
y <- 1 - 2 * x + rnorm(n)
f <- y ~ x ## y ~ x is a formula object
r <- lm(f) ## r is linear model object, i.e. linear regression
## the function str() - "structure" - is VERY useful in exploratory data analysis
## structure of r is a bunch of lists
str(r)
r$coeff ## partial match for r$coefficients
r$residuals
data.frame
Every CSV or Text file you read in is a data.frame
, i.e. most real data comes in the form of a data.frame
Creating Data Frames
z <- data.frame() ## data frame with 0 columns and 0 rows
y <- data.frame(col1 = c(1, 2), col2 = c("a", "b"), row.names = c("row1", "row2"))
print(y)
## col1 col2
## row1 1 a
## row2 2 b
x <- data.frame(matrix( sample(c(50:100), size=12, replace=TRUE), nrow=6, ncol=2))
## return first column
x[,1] ## type is vector
x$X1 ## type is vector
x[1] ## type data.frame.
x['X1'] ## type data.frame
data.frame
functions
x <- x[-6,] ## remove rows or columns with a "-" sign. Like negative indexing.
y <- data.frame(names = c("dave","jenny","scott","mary","harry") )
z <- cbind(y, x) ## column bind. Can be used on matrices too.
## if you cbind two vectors you get a matrix, NOT data.frame
## alternatively you can create columns implicitly
x$names <- c("dave" ,"jenny" ,"scott" ,"mary" ,"harry")
w <- data.frame(names="megan", X1=82, X2=85)
z <- rbind(z, w) ## row bind
## make sure number of elements in row, column are consistent
## explicitly set columns names for z. Use rownames() for row names. Shocker.
names(z) <- c("names", "Exam 1","Exam 2")
## get dimensions
dim(z)
[1] 6 3
head(z) ## default to first 6 rows
tail(z) ## default to last 6 rows
data.frames
are more memory intensive than matrices
data.frame
, i.e. set size of data.frame
before using it
matrices
vector
with additional information - categories, or levels
data.frames
x <- factor(c("finance", "tech", "tech", "auto", "finance", "energy", "tech"))
print(x)
## [1] finance tech tech auto finance energy tech
## Levels: auto energy finance tech
y <- factor(x, levels = c(levels(x), "tv")) ## include new level, even though no tv data exists
print(y)
## [1] finance tech tech auto finance energy tech
## Levels: auto energy finance tech tv
levels
to order your levels. Helpful when sorting factors
wday <- c("mon", "tues", "mon", "wed", "fri", "wed")
wdayf <- factor(wday)
sort(wdayf) ## did this do what we expected?
## [1] fri mon mon tues wed wed
## Levels: fri mon tues wed
wdayf <- factor(wday, levels = c("mon", "tues", "wed", "thurs", "fri")) ## let's add Thursday as well
sort(wdayf)
## [1] mon mon tues wed wed fri
## Levels: mon tues wed thurs fri
factor
functions
z$names2 <- NULL ## NULL removes the object from the data.frame (or list)
z$gender <- c("m","f","m","f","m","f")
z$party <- c("D","D","R","R","D","D")
tbl <- table(z$gender,z$party) ## contingency table. class "table"
addmargins(tbl) ## marginal sums
## D R Sum
## f 2 1 3
## m 2 1 3
## Sum 4 2 6
x <- seq(5,20,by=5)
f <- factor(x)
print(f)
[1] 5 10 15 20
Levels: 5 10 15 20
as.numeric(f) ## huh??
[1] 1 2 3 4
as.numeric(as.character(f))
[1] 5 10 15 20 ## much better
## levels(f) holds each distinct value once; indexing by f maps them back to
## one value per element (as.numeric(levels(f)) alone returns only the levels)
as.numeric(levels(f))[f] ## more efficient due to less conversions
Vectors
- lifeblood of R
Matrices
- great for linear algebra and stats functions
Lists
- store and access elements of complex objects
Data.frames
- data analysis object of choice
Factors
- good for statistics and categorization of data into groups
for()
while()
repeat()
try()
if()
for()
x <- seq(0, 20, by=1) ## default increment is 1
## seq_along(x) is safer than 1:length(x): it yields an empty sequence for an
## empty vector, whereas 1:0 would iterate over c(1, 0)
for (i in seq_along(x)) {
  x[i] <- x[i] * 2
}
## can be written on one line - but careful to not make it too messy
for (i in seq_along(x)) x[i] <- x[i] * 2
while()
i <- 1 ## use <- for assignment, as recommended in the coding conventions
## visit every element of x; length(x) replaces the hard-coded 21
while (i <= length(x)) {
  x[i] <- x[i] * 2
  i <- i + 1
}
repeat()
x <- seq(0,20,by=1)
i <- 1
## repeat has no condition of its own: the body runs until break is reached
repeat {
  x[i] <- x[i] * 2
  i <- i + 1
  if (i > length(x)) break ## stop once every element has been doubled
}
try()
try("hello" + 1, silent = FALSE)
try("hello" + 1, silent = TRUE)
tryCatch("hello" + 1, error = function(e) print("don't be ridiculous"))
## [1] "don't be ridiculous"
if()
if (a == b) {
# do something
} else { ## the else statement MUST be on the same line as the
# do something else ## closing bracket of the if()
}
if (a == b) {
# do something
}
else { ## WRONG. returns lots of headaches.
# do something else
}
if (a == b) do something ## one-liners
ifelse (a == b, x, y) ## use ifelse() on vectors
(a) Write a loop to scan through an integer vector and return the index of the
largest value. The loop should terminate as soon as the index is found. Ignore ties.
(b) Redo the above using built-in R functions such as rank(), sort() and order().
function()
is a built-in R function whose job is to create functions...#mindblown
exponentiate <- function(x, y) {
return(x^y)
}
exponentiate(2, 4)
## [1] 16
exponentiate()
has two components: the parameters (formals) and the body
formals(exponentiate) # $x $y These are the arguments to exponentiate()
body(exponentiate) # { return (x^y) }
exponentiate # prints out the entire function - good if you forget what's in it!
## function(x, y) {
## return(x^y)
## }
sum()
, mean()
)
f <- function(x, y=3) { ... }
## Some functions have tons of parameters. You don't need to enter them all.
f <- function(x, ...) {
plot(x, ...)
}
sapply (x, function(x) x*2)
d <- 8
f <- function(y){
x <- 3 * y
h <- function (){
return(y*(x+d))
}
return(x+h())
}
f(2) ## [1] 34, i.e. x = 6 and h() = 2 * (6 + 8) = 28
# d is global to f()
# x is local to f() and global to h()
# h cannot be called at the "top level" since its environment is limited to f()'s
Write a function that finds the maximum value in corresponding indices for two vectors. For example:
x <- c(1,2,3,4)
y <- c(0,3,5,4)
## output should be
[1] 1 3 5 4
ls() # returns all the variables in the environment
# good to know when you've created a ton and are starting to lose track
rm(x) # rm(x) removes x...rm(list=ls()) is usually not a great idea!
## user-defined infix operator: a %powerUp% b raises a to the power b
`%powerUp%` <- function(a, b) {
  a^b
}
3 %powerUp% 2
## [1] 9
## in Python
>>> x = [5,2,8]
>>> x.sort() ## this doesn't exist in R
>>> x
[2, 5 , 8]
## in R
x <- c(5,2,8)
sort(x)
[1] 2 5 8
x
[1] 5 2 8
x <- sort(x)
x
[1] 2 5 8
print()
, plot()
, summary()
. Concept of OOP.
data(cars) ## load built in dataset
fit <- lm(dist ~ speed, data=cars)
summary(fit)
## same function call, on a different object type
summary(c(1,2,3))
## lists out all the methods for the summary function
methods(summary)
summary.lm()
function and a summary.default()
function
a) Get the Adjusted R-squared from the regression of distance on speed in the cars dataset
b) Get the t-value of the X variable (i.e. speed)
c) Predict the braking distance if going 200 miles per hour
apply()
lapply()
, tapply()
, sapply()
mapply()
by()
, cut()
, aggregate()
, split()
apply()
x <- matrix(sample(c(0:100),20,replace=TRUE),nrow=5,ncol=4)
apply(x,1,sum) ## sum rows
apply(x,1,function(x) x^2) ## apply function to every element
## What type of function is this?
lapply()
, tapply()
, sapply()
apply()
for other data structures
lapply(list(z$'Exam 1',z$'Exam 2'),mean) ## mean of Exam scores; returns a list
## like apply() but can be used on data.frames
sapply(list(z$'Exam 1',z$'Exam 2'),mean) ## mean of Exam scores; returns a vector
sapply(list(z$'Exam 1',z$'Exam 2'), mean, simplify=FALSE) ## same as lapply()
## find mean exam 1 scores, split by party
tapply(z$'Exam 1', z$party, FUN = mean, simplify=TRUE) ## simplify determines output type
## D R
## 82.25 84.50
mapply()
a<-c(1:5)
b<-c(6:10)
d<-c(11:15)
mapply(sum,a,b,d)
sum(a[1],b[1],d[1])
sum(a[2],b[2],d[2])
mapply(mean,a,b,d) ## What's happening here?
by()
, cut()
, aggregate()
, split()
apply()
, but used on data.frames
plyr
is very popular and useful, but important to learn Base R first
aggregate(z[,c(2:3)], by=list(z$party), mean) ## mean Exam score by party
## spell out z$party in full: z$part only resolves through partial matching of column names
aggregate(z[,c(2:3)], by=list(z$party, z$gender), sum) ## sum by party and gender
## same as tapply() but for data.frames (instead of arrays)
## returns class "by"
by(z$'Exam 1',z$party,sum)
## convert numeric column of data.frame into factor
## great for binning data
z$age <- c(21,29,38,41,26,50)
cut(z$age,breaks=c(20,30,40,50))
split(z,f = z$gender) ## split a dataframe according to a factor
a) Create a column for the average grade for each student. Label it.
b)
system.time()
returns timings for R operations. Examine the help documentation for this function.
for()
loop
apply()
function
example <- c("THIS IS AN EXAMPLE","and so is this","this is not","hello world","extra")
grep("an", example) ## return index of occurrence "an"
[1] 2
grep("an", example, ignore.case=TRUE)
[1] 1 2
grep("an", example, ignore.case=TRUE, value=TRUE)
[1] "THIS IS AN EXAMPLE" "and so is this"
nchar(example)
[1] 18 14 11 11 5
paste(example[1],example[2])
[1] "THIS IS AN EXAMPLE and so is this"
files <- c("ex1","ex2")
for (i in files){
save(filename = paste("Title",i,".pdf"))
}
# formatting strings
sprintf("%f",exp(1))
[1] "2.718282"
sprintf("%0.2f",exp(1))
[1] "2.72"
sprintf("Today's date is %s",format(Sys.Date(),"%d %b %Y"))
[1] "Today's date is 31 Oct 2013"
example2 <- "Substring takes a subset of...the string!...It's nuts!"
paste(substr(example2, 19, 21), substr(example2, 22, 24), sep = "")
## [1] "subset"
sp <- strsplit(example2,split="of")
sp
[[1]]
[1] "Substring takes a subset " "...the string!...It's nuts!"
length(sp)
[1] 1
length(unlist(sp))
[1] 2
regexpr("!",example2) ## first occurrence of "!" in example2
gregexpr("!",example2) ## all occurrences of "!" in example2
a) Convert the following character vector into a 3 column dataframe. Name each column.
b) Format the numbers to be percentages with 2 decimal places.
c) Find the total score for people with J-letter first names.
d) Find the most common weekday.
char_vec <- c("{'al' 'einst'} score:0.4503-[12302013]",
"{'isaac' 'knewt'} score:0.0007-[11202013]",
"{'ralph' 'emerson'} score:0.10321-[09122013]",
"{'james' 'dean'} score:0.84-[02032012]",
"{'jim' 'beam'} score:0.2-[10172013]",
"{'tommy' 'bahamas'} score:0.761-[05212013]",
"{'george' 'of the jungle'} score:0.9434-[01302013]",
"{'harry' 'henderson'} score:0.5456-[08112012]",
"{'johnny' 'walker'} score:0.309118-[08212011]")
e) Print out the following sentence with one word on each line:
y <- "This is a sentence."
x <- rnorm(1000,85,5)
y <- 2 * runif(1000,0,10)
mean(x) ## [1] 85.22434
median(x) ## [1] 85.24506
sd(x) ## [1] 4.869123
var(x) ## [1] 23.70836
cov(x,y) ## [1] 0.8053751
cor(x,y) ## [1] 0.02883522
1:n-1 ## wrong
1:(n-1) ## this is what you want
-2.4 ^ 2.5 ## nice
[1] -8.923354
x <- -2.4
x ^ 2.5 ## not so nice
[1] NaN
TRUE
x == 4 | 6 ## OR function - returns bogus result
x == 4 | TRUE ## weird
x == 4 | x == 6 ## better
x %in% c(4,6) ## best
read.csv()
returns a data.frame
. What if you want to do math? Matrices are better.
x <- data.frame(num=c(1,2,3,4))
mean(x) ## Nope
x <- as.matrix(x)
mean(x) ## Yup
as.numeric()
as.character()
as.factor()
as.data.frame()
print()
vs cat()
print()
is a generic function
cat()
is a concatenate function
x <- 2
print("One plus one is",x)
[1] "One plus one is"
## alternatively...
print("One plus one is");print(x);
[1] "One plus one is"
[1] 2
## even better...
cat(paste("One plus one is",x))
One plus one is 2
class
vs mode
mode
determines how it's stored in memory
class
determines its abstract type, a concept borrowed from OOP.
x <- data.frame(scores = c(80, 90, 70))
y <- as.Date("2013-11-05")
cat(paste(mode(x), mode(y)))
## list numeric
cat(paste(class(x), class(y)))
## data.frame Date
do.call()
mode
do.call
to combine the elements into a data.frame
a <- list(1.3, 2.5, "jeff")
b <- list(4.5, 2.8, "jerry")
d <- list(6.5, 0.8, "joe")
z <- list(a, b, d)
df <- data.frame(do.call(rbind, z))
df
## X1 X2 X3
## 1 1.3 2.5 jeff
## 2 4.5 2.8 jerry
## 3 6.5 0.8 joe
a)
## Create a new column called num2 for which each value is double the corresponding value in num
## Make sure num2 is also a factor
x <- data.frame(num=factor(c(1.0,0.03,8.0, 0.4)))
b)
## Find the letters in z corresponding to the indices of even numbers in y
y <- c(1,2,NA,4,5,8,5,2,3)
z <- c("f","g","e","i","l","o","p","u")