R examples
BASICS
# Setup some scalars
x <- 3
y <- 4
z <- sqrt(x*x + y*y)
use.dots.in.names <- z - 5
# Print out z
print(z)
# The default action is print()
z
# So you have a neat scientific calculator
sin(log(2.718281828)*pi)
SCALARS
# The basic scalar types
x <- 3
take.me <- FALSE
s <- "this is a string"
x
take.me
s
# Exploring objects using str() --
str(x)
str(take.me)
str(s)
DATES: A SPECIAL KIND OF SCALAR
s <- "2004-04-22"
d <- as.Date(s)
d
str(d)
s <- "22-Apr-2004"
d <- as.Date(s, format="%d-%b-%Y")
d
as.Date("22-04-2004", format="%d-%m-%Y")
as.Date("22-04-04", format="%d-%m-%y")
as.Date("22-Apr-04", format="%d-%b-%y")
VECTORS
x <- c(2,3,7, 9, 10, 11)
# the c() operator, for concatenation
x[1]
x[6]
length(x)
# Help about all R functions is online:
?length
# Arithmetic that gobbles a vector at a time
x
x+2
y <- log(x)
y
# Clever vector addressing
x[1]
x[6]
x[c(1,6)]
indexes <- c(1,3,5)
x[indexes]
# Another shorthand
1:4
x[1:4]
indexes <- 1:4
x[indexes]
# Another shorthand
switches <- c(TRUE,FALSE,FALSE,FALSE,FALSE,TRUE)
x[switches]
# Dropping some elements
x
x[-2]
x[c(-2,-3)]
# Bottom line: Very nice mechanisms for addressing.
# Let's stay in touch with how to explore objects --
str(x)
str(x[1])
FACTORS: A SPECIAL KIND OF SCALAR
names <- c("Payal", "Shraddha", "Aditi", "Kritika", "Diwakar")
attire <- c("Sari", "Salwar", "Sari", "Sari", "Kurta")
# attire is a "categorical variable"
attire <- factor(attire)
attire
table(attire)
table(names, attire)
# Internally, a factor is just stored as an integer
as.numeric(attire)
NOT AVAILABLE: A SPECIAL SCALAR
x <- c(2,3,NA,4,5,NA)
x
x + 2 # Automatic rules about NA arithmetic
MATRICES
X <- matrix(NA, nrow=7, ncol=3)
X
X[4,] <- c(2,3,4)
X
X[,3] <- 1:7
X
str(X)
nrow(X)
ncol(X)
# As with vectors, arithmetic that attacks a full matrix at a time!
X + 1
LISTS
# Ability to make a parcel of apparently unrelated materials.
results <- list(mu=0.2, sigma=0.9, x=c(1,2,3,9))
str(results)
# Accessing elements --
results$mu
results$sigma <- 9
results$new <- "Hello"
results
DAILY TIME-SERIES USING THE `ZOO' PACKAGE
# Zoo is a very powerful, high-level time-series system. To learn more:
library(help=zoo)
vignette("zoo-quickref")
# Let's do a daily time-series
dates <- as.Date(c("2004-04-01", "2004-04-02", "2004-04-03", "2004-04-05"))
values <- rnorm(4)
library(zoo)
z <- zoo(values, order.by=dates)
z
plot(z)
DATASET, TERMED `DATA FRAME' in R
# A dataset is a list of vectors, all of the same length.
names <- c("Payal", "Shraddha", "Aditi", "Kritika", "Diwakar")
age <- c(15, 17, 15, 19, 20)
iq <- c(90, 100, 110, 120, 160)
# By default, data.frame() forces the names to be factors...
D <- data.frame(names=names, age=age, iq=iq)
str(D)
D
nrow(D)
ncol(D)
summary(D)
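# An aside (a sketch): if you prefer to keep the names as plain character
# strings rather than a factor, data.frame() accepts stringsAsFactors --
D2 <- data.frame(names=names, age=age, iq=iq, stringsAsFactors=FALSE)
str(D2)    # `names' is now chr, not a factor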
# Accessing a column in a data frame
D$age
# Accessing an observation in a data frame - use matrix-like notation
D[3,]
# Accessing the 3rd value of age
D$age[3]
WRITING AND THEN READING FILES
D
write.csv(D, file="demo.csv")
system("cat demo.csv") # Say "type" if you're on DOS
E <- read.table("demo.csv", skip=1, sep=",",
col.names=c("obsnum","name","age","iq"))
E
E$obsnum <- NULL
E
# Compare against --
D
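# An aside (a sketch): the `obsnum' column above came from the row names
# that write.csv() stored. Writing with row.names=FALSE avoids that cleanup --
write.csv(D, file="demo.csv", row.names=FALSE)   # overwrites demo.csv
E2 <- read.csv("demo.csv")
E2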
REGRESSION
m <- lm(iq ~ age, D)
m
summary(m)
# Remember that shoe size is strongly correlated with IQ
plot(D$age, D$iq, xlab="Age (years)", ylab="IQ")
abline(h=100, lty=3)
lines(D$age, fitted.values(m), col="blue", lwd=2)
SIMULATION
runif(10)
runif(10)
set.seed(133); runif(10)
set.seed(133); runif(10)
summary(runif(100))
summary(runif(1000))
summary(runif(10000))
summary(runif(100000))
cat("See how large datasets stabilise sample statistics\n")
summary(runif(100)); summary(runif(100))
summary(runif(10000)); summary(runif(10000))
plot(density(runif(1000)))
plot(density(rnorm(1000)))
x <- seq(-4,4,0.01)
lines(x, dnorm(x), col="blue", lwd=2)
SIMPLE SUMMARY STATISTICS
x <- rnorm(10000)
summary(x)
quantile(x, probs=c(0.01, 0.025, 0.25, 0.5, 0.75, 0.975, 0.99))
IQR(x)
fivenum(x)
sd(x)
range(x)
mean(x)
mean(x, trim=0.1) # Delete 10% of data at each end first
REGRESSION ON A SIMULATED DATASET
x <- 2 + 3*runif(100)
summary(x)
y <- -7 + 4*x + rnorm(100)
summary(lm(y ~ x)) # Convince yourself the intercept is ok
# Most statistical functions give back rich objects containing many results.
res.1 <- lm(y ~ x)
str(res.1) # There's a lot of goodies here!
res.1$coefficients
# In this case, detailed calculations require summary() to get what
# I call "level 2 results".
res.2 <- summary(res.1)
str(res.2) # There's even more goodies here!!
res.2$coefficients
res.2$r.squared
res.2$sigma
# There are useful generic functions which work for a lot of models --
logLik(res.1)
AIC(res.1)
TIME SERIES ANALYSIS
# ACF of white noise
z <- rnorm(100)
acf(z)
z <- rnorm(1000)
acf(z)
# Simulate data from an AR(1) process
x <- arima.sim(model=list(ar=c(0.5)), n=1000)
arima(x, order=c(1,0,0))
acf(x)
# Let's try to estimate a wrong model --
arima(x, order=c(3,0,0))
A first look at R objects
- vectors, lists, matrices, data frames.
# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")
# Accessing the 1st and last elements of y --
y[1]
y[length(y)]
# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x
# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)
# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]
sorting
#
# The approach here needs to be explained. If `i' is a vector of
# integers, then the data frame D[i,] picks up rows from D based
# on the values found in `i'.
#
# The order() function makes an integer vector which is a correct
# ordering for the purpose of sorting.
D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2))
D
# Sort on x
indexes <- order(D$x)
D[indexes,]
# Print out sorted dataset, sorted in reverse by y
D[rev(order(D$y)),]
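# An aside (a sketch): order() accepts several keys, so ties in x can be
# broken by y --
D[order(D$x, D$y),]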
Prices and returns
# I like to multiply returns by 100 so as to have "units in percent".
# In other words, I like it for 5% to be a value like 5 rather than 0.05.
# I. Simulate random-walk prices, switch between prices & returns.
# Simulate a time-series of PRICES drawn from a random walk
# where one-period returns are i.i.d. N(mu, sigma^2).
ranrw <- function(mu, sigma, p0=100, T=100) {
cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))
}
prices2returns <- function(x) {
100*diff(log(x))
}
returns2prices <- function(r, p0=100) {
c(p0, p0 * exp(cumsum(r/100)))
}
cat("Simulate 25 points from a random walk starting at 1500 --\n")
p <- ranrw(0.05, 1.4, p0=1500, T=25)
# gives you a 25-long series, starting with a price of 1500, where
# one-period returns are N(0.05,1.4^2) percent.
print(p)
cat("Convert to returns--\n")
r <- prices2returns(p)
print(r)
cat("Go back from returns to prices --\n")
goback <- returns2prices(r, 1500)
print(goback)
# II. Plenty of powerful things you can do with returns....
summary(r); sd(r) # summary statistics
plot(density(r)) # kernel density plot
acf(r) # Autocorrelation function
ar(r) # Estimate a AIC-minimising AR model
Box.test(r, lag=2, type="Ljung") # Box-Ljung test
library(tseries)
runs.test(factor(sign(r))) # Runs test
bds.test(r) # BDS test.
# III. Visualisation and the random walk
# I want to obtain intuition into what kinds of price series can happen,
# given a starting price, a mean return, and a given standard deviation.
# This function simulates out 10000 days of a price time-series at a time,
# and waits for you to click in the graph window, after which a second
# series is painted, and so on. Make the graph window very big and
# sit back and admire.
# The point is to eyeball many series and thus obtain some intuition
# into what the random walk does.
visualisation <- function(p0, s, mu, labelstring) {
  N <- 10000
  x <- (1:(N+1))/250                  # Unit of years
  while (1) {
    plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",
         type="l", col="red", xlab="Time (years)",
         main=paste("40 years of a process much like", labelstring))
    grid()
    z <- locator(1)
  }
}
# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year
visualisation(2600, 1.4, 13/250, "Nifty")
# The numerical values here are used to think about what the INR/USD
# exchange rate would have looked like if it started from 31.37, had
# a mean depreciation of 5% per year, and had the daily vol of a floating
# exchange rate like EUR/USD.
visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")
# This is of course not like the INR/USD series in the real world -
# which is neither a random walk nor does it have a vol of 0.7% a day.
# The numerical values here are used to think about what the USD/EUR
# exchange rate, starting with 1, having no drift, and having the observed
# daily vol of 0.7. (This is about right).
visualisation(1, 0.7, 0, "USD/EUR with no drift")
# IV. A monte carlo experiment about the runs test
# Measure the effectiveness of the runs test when faced with an
# AR(1) process of length 100 with a coeff of 0.1
set.seed(101)
one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}
table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))
# We find that the runs test throws up a prob value of below 0.05
# for 91 out of 1000 experiments.
# Wow! :-)
# To understand this, you need to look up the man pages of:
# set.seed, arima.sim, sign, factor, runs.test, replicate, table.
# e.g. say ?replicate
write functions
# To write functions that send back multiple objects.
# FIRST LEARN ABOUT LISTS --
X = list(height=5.4, weight=54)
print("Use default printing --")
print(X)
print("Accessing individual elements --")
cat("Your height is ", X$height, " and your weight is ", X$weight, "\n")
# FUNCTIONS --
square <- function(x) {
return(x*x)
}
cat("The square of 3 is ", square(3), "\n")
# default value of the arg is set to 5.
cube <- function(x=5) {
return(x*x*x);
}
cat("Calling cube with 2 : ", cube(2), "\n") # will give 2^3
cat("Calling cube : ", cube(), "\n") # will default to 5^3.
# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS --
powers <- function(x) {
parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x);
return(parcel);
}
X = powers(3);
print("Showing powers of 3 --"); print(X);
# WRITING THIS COMPACTLY (4 lines instead of 7)
powerful <- function(x) {
return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x));
}
print("Showing powers of 3 --"); print(powerful(3));
# In R, the last expression in a function is, by default, what is
# returned. So you could equally just say:
powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)}
R vector notation
cat("EXAMPLE 1: sin(x) for a vector --\n")
# Suppose you have a vector x --
x = c(0.1,0.6,1.0,1.5)
# The bad way --
n = length(x)
r = numeric(n)
for (i in 1:n) {
r[i] = sin(x[i])
}
print(r)
# The good way -- don't use loops --
print(sin(x))
cat("\n\nEXAMPLE 2: Compute the mean of every row of a matrix --\n")
# Here's another example. It isn't really about R; it's about thinking in
# matrix notation. But still.
# Let me setup a matrix --
N=4; M=100;
r = matrix(runif(N*M), N, M)
# So I face an NxM matrix (N rows, M columns)
# [r11 r12 ... r1M]
# [r21 r22 ... r2M]
# [ ...            ]
# [rN1 rN2 ... rNM]
# My goal: each column needs to be reduced to a mean.
# Method 1 uses loops:
mean1 = numeric(M)
for (i in 1:M) {
mean1[i] = mean(r[,i])
}
# Alternatively, just say:
mean2 = rep(1/N, N) %*% r # Pretty!
# The two answers are the same --
all.equal(mean1,mean2[,])
#
# As an aside, I should say that you can do this directly by using
# the colMeans() function. But the above is more about pedagogy rather
# than showing you how to get column means.
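# A minimal sketch of that direct route -- colMeans() reduces each column
# of r to its mean, matching mean1 and mean2 above:
mean3 <- colMeans(r)
all.equal(mean1, mean3)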
cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n")
# Write this asif you're dealing with scalars --
# Nelson Siegel function
nsz <- function(b0, b1, b2, tau, t) {
tmp = t/tau
tmp2 = exp(-tmp)
return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2))
}
timepoints <- c(0.01,1:5)
# The bad way:
z <- numeric(length(timepoints))
for (i in 1:length(timepoints)) {
z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i])
}
print(z)
# The R way --
print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
cat("\n\nEXAMPLE 3: Making the NPV of a bond--\n")
# You know the bad way - sum over all cashflows, NPVing each.
# Now look at the R way.
C = rep(100, 6)
nsz(14.084,-3.4107,0.0015,1.8832,timepoints) # Print interest rates
C/((1.05)^timepoints) # Print cashflows discounted @ 5%
C/((1 + 0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints))^timepoints) # Using NS instead of 5%
# NPV in two different ways --
C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints
sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints)
# You can drop back to a flat yield curve at 5% easily --
sum(C * 1.05^-timepoints)
# Make a function for NPV --
npv <- function(C, timepoints, r) {
return(sum(C * (1 + (0.01*r))^-timepoints))
}
npv(C, timepoints, 5)
# Bottom line: Here's how you make the NPV of a bond with cashflows C
# at timepoints timepoints when the zero curve is a Nelson-Siegel curve --
npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
# Wow!
# ---------------------------------------------------------------------------
# Elegant vector notation is amazingly fast (in addition to being beautiful)
N <- 1e5
x <- runif(N, -3,3)
y <- runif(N)
method1 <- function(x,y) {
  tmp <- NULL
  for (i in 1:N) {
    if (x[i] < 0) tmp <- c(tmp, y[i])
  }
  tmp
}
method2 <- function(x,y) {
  y[x < 0]
}
s1 <- system.time(ans1 <- method1(x,y))
s2 <- system.time(ans2 <- method2(x,y))
all.equal(ans1,ans2)
s1/s2 # On my machine it's 2000x faster
indexing notation, and the use of is.na()
x <- c(2,7,9,2,NA,5) # An example vector to play with.
# Give me elems 1 to 3 --
x[1:3]
# Give me all but elem 1 --
x[-1]
# Odd numbered elements --
indexes <- seq(1,6,2)
x[indexes]
# or, more compactly,
x[seq(1,6,2)]
# Access elements by specifying "on" / "off" through booleans --
require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
x[require]
# Short vectors get reused! So, to get odd numbered elems --
x[c(TRUE,FALSE)]
# Locate missing data --
is.na(x)
# Replace missing data by 0 --
x[is.na(x)] <- 0
x
# Similar ideas work for matrices --
y <- matrix(c(2,7,9,2,NA,5), nrow=2)
y
# Make a matrix containing columns 1 and 3 --
y[,c(1,3)]
# Let us see what is.na(y) does --
is.na(y)
str(is.na(y))
# So is.na(y) gives back a matrix with the same structure as y.
# Hence I can say
y[is.na(y)] <- -1
y
latex tabular out of an R matrix
# Setup a nice R object:
m <- matrix(rnorm(8), nrow=2)
rownames(m) <- c("Age", "Weight")
colnames(m) <- c("Person1", "Person2", "Person3", "Person4")
m
# Translate it into a latex tabular:
library(xtable)
xtable(m, digits=rep(3,5))
# Production latex code that goes into a paper or a book --
print(xtable(m,
caption="String",
label="t:"),
type="latex",
file="blah.gen",
table.placement="tp",
latex.environments=c("center", "footnotesize"))
# Now you do \input{blah.gen} in your latex file.
# You're lazy, and want to use R to generate latex tables for you?
data <- cbind(
c(7,9,11,2),
c(2,4,19,21)
)
colnames(data) <- c("a","b")
rownames(data) <- c("x","y","z","a")
xtable(data)
# or you could do
data <- rbind(
c(7,2),
c(9,4),
c(11,19),
c(2,21)
)
# and the rest goes through identically.
Associative arrays (as in awk) or hashes (as in perl)
# Or, more generally, adventures in R addressing.
# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x
# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)
# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]
# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
  if (colour %in% rownames(m)) {
    cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
  } else {
    cat("Colour ", colour, " does not exist in the hash.\n")
  }
}
# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D) # The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America
Utilise matrix notation
# We use the problems of portfolio analysis as an example.
# Prices of 4 firms to play with, at weekly frequency (for calendar 2004) --
p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271, 292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94, 273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748, 197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75, 243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3, 307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1, 53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35, 35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95, 26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8, 43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35, 54.15, 58, 60.7, 62.7, 293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401, 248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508, 258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85, 249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75, 295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05, 345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4, 1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61, 1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54, 1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35, 1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6, 1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6, 2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7), .Dim = c(53, 4), .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys")))
# Shift from prices to returns --
r <- 100*diff(log(p))
# Historical expected returns --
colMeans(r)
# Historical correlation matrix --
cor(r)
# Historical covariance matrix --
S <- cov(r)
S
# Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% --
w <- c(.2, .2, .3, .3)
t(w) %*% S %*% w
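# A small sketch: the corresponding portfolio expected return, using the
# same weights and the historical mean returns --
t(w) %*% colMeans(r)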
# The portfolio optimisation function in tseries --
library(tseries)
optimised <- portfolio.optim(r) # This uses the historical facts from r
optimised$pw # Weights
optimised$pm # Expected return using these weights
optimised$ps # Standard deviation of optimised port.
price time-series
# A stock is traded on 2 exchanges.
# Price data is missing at random on both exchanges owing to non-trading.
# We want to make a single price time-series utilising information
# from both exchanges. I.e., missing data for exchange 1 will
# be replaced by information for exchange 2 (if observed).
# Let's create some example data for the problem.
e1 <- runif(15) # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15) # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)
# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1 # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.
panel of pictures
par(mfrow=c(3,2)) # 3 rows, 2 columns.
# Now the next 6 pictures will be placed on these 6 regions. :-)
# Let me take some pains on the 1st
plot(density(runif(100)), lwd=2)
text(x=0, y=0.2, "100 uniforms") # Showing you how to place text at will
abline(h=0, v=0)
# All these statements affect the 1st plot.
x=seq(0.01,1,0.01)
par(col="blue") # default colour to blue.
# 2 --
plot(x, sin(x), type="l")
lines(x, cos(x), type="l", col="red")
# 3 --
plot(x, exp(x), type="l", col="green")
lines(x, log(x), type="l", col="orange")
# 4 --
plot(x, tan(x), type="l", lwd=3, col="yellow")
# 5 --
plot(x, exp(-x), lwd=2)
lines(x, exp(x), col="green", lwd=3)
# 6 --
plot(x, sin(x*x), type="l")
lines(x, sin(1/x), col="pink")
Make pictures in PDF files
xpts <- seq(-3,3,.05)
# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()   # Close the pdf device so demo2.pdf is actually written out
# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()   # Close the pdf device so demo1.pdf is actually written out
histogram with tails shown in red
# This happened on the R mailing list on 7 May 2004.
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.
Visualisation of 3-dimensional (x,y,z) data using contour
# plots and using colour to represent the 3rd dimension.
# The specific situation is: On a grid of (x,y) points, you have
# evaluated f(x,y). Now you want a graphical representation of
# the resulting list of (x,y,z) points that you have.
# Setup an interesting data matrix of (x,y,z) points:
points <- structure(
c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 
140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0.998, 0.124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0.998, 0.71, 0.068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0.998, 0.898, 0.396, 0.058, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.998, 0.97, 0.726, 0.268, 0.056, 0.006, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.996, 0.88, 0.546, 0.208, 0.054, 0.012, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.964, 0.776, 0.418, 0.18, 0.054, 0.014, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.906, 0.664, 0.342, 0.166, 0.056, 0.018, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986, 0.862, 0.568, 0.29, 0.15, 0.056, 0.022, 0.008, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.954, 0.778, 0.494, 0.26, 0.148, 0.056, 0.024, 0.012, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.906, 0.712, 0.43, 0.242, 0.144, 0.058, 0.028, 0.012, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.878, 0.642, 0.38, 0.222, 0.142, 0.066, 0.034, 0.014, 0.008, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0.846, 0.586, 0.348, 0.208, 0.136, 0.068, 0.034, 0.016, 0.012, 0.006, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.538, 0.318, 0.204, 0.136, 0.07, 0.046, 0.024, 0.012, 0.008, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0, 0.762, 0.496, 0.294, 0.2, 0.138, 0.072, 0.05, 0.024, 0.014, 0.012, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0.704, 0.472, 0.286, 0.198, 0.138, 0.074, 0.054, 0.028, 0.016, 0.012, 0.008, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0.668, 0.438, 0.276, 0.196, 0.138, 0.078, 0.054, 0.032, 0.024, 0.014, 0.012, 0.008, 0.004, 0.004, 0.002, 0.002, 0, 0, 0, 0.634, 0.412, 0.27, 0.194, 0.14, 0.086, 0.056, 0.032, 0.024, 0.016, 0.012, 0.01, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0, 0.604, 0.388, 0.26, 0.19, 0.144, 0.088, 0.058, 0.048, 0.026, 0.022, 0.014, 0.012, 0.008, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0.586, 0.376, 0.256, 0.19, 0.146, 0.094, 0.062, 0.052, 0.028, 0.024, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002, 0.002, 0.566, 0.364, 0.254, 0.192, 0.148, 0.098, 0.064, 0.054, 0.032, 0.024, 0.022, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002),
.Dim = c(399, 3),
.Dimnames = list(NULL, c("x", "y", "z"))
)
summary(points)
# x is a grid from 0 to 1
# y is a grid from 20 to 200
# z is the interesting object which will be the 3rd dimension.
# Solution using contourplot() from package 'lattice'
library(lattice)
d3 <- data.frame(points)
contourplot(z ~ x+y, data=d3)
## or nicer
contourplot(z ~ x+y, data=d3, cuts=20, region = TRUE)
## or using logit - transformed z values:
contourplot(qlogis(z) ~ x+y, data=d3, pretty=TRUE, region = TRUE)
# An interesting alternative is levelplot()
levelplot(z ~ x+y, pretty=TRUE, contour=TRUE, data=d3)
# There is a contour() function in R. Even though it sounds obvious
# for the purpose, it is a bit hard to use.
# contour() wants 3 inputs: vectors of x and y values, and a matrix of
# z values, where the x values correspond to the rows of z, and the y
# values to the columns. A collection of points like `points' above
# needs to be turned into such a grid. It might sound odd, but contour(),
# image() and persp() have used this kind of input for the longest time.
#
# For irregular data, there's an interp function in the akima package
# that can convert from irregular data into the grid format.
#
# The `points' object that I have above - a list of (x,y,z) points -
# fits directly into the mentality of lattice::contourplot() but not
# into the requirements of contour()
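# A minimal sketch of that conversion for the gridded `d3' above, using
# tapply() to build the z matrix indexed by the unique x and y values --
zmat <- with(d3, tapply(z, list(x, y), mean))
contour(sort(unique(d3$x)), sort(unique(d3$y)), zmat, xlab="x", ylab="y")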
Display of a macroeconomic time-series, with a filled colour
# bar showing a recession.
years <- 1950:2000
timeseries <- cumsum(c(100, runif(50)*5))
hilo <- range(timeseries)
plot(years, timeseries, type="l", lwd=3)
# A recession from 1960 to 1965 --
polygon(x=c(1960,1960, 1965,1965),
y=c(hilo, rev(hilo)),
density=NA, col="orange", border=NA)
lines(years, timeseries, type="l", lwd=3) # paint again so line comes on top
# alternative method -- though not as good looking --
# library(plotrix)
# gradient.rect(1960, hilo[1], 1965, hilo[2],
# reds=c(0,1), greens=c(0,0), blues=c(0,0),
# gradient="y")
Display two series on one plot, one with a left y axis
# and another with a right y axis.
y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"),
legend=c("String 1 (left scale)", "String 2 (right scale)"))
Simulate a dataset from the OLS model and obtain
# OLS estimates for it.
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1
# You want to just look at OLS results?
summary(lm(y ~ x))
# Suppose x and y were packed together in a data frame --
D <- data.frame(x,y)
summary(lm(y ~ x, D))
# Full and elaborate steps --
d <- lm(y ~ x)
# Learn about this object by saying ?lm and str(d)
# Compact model results --
print(d)
# Pretty graphics for regression diagnostics --
par(mfrow=c(2,2))
plot(d)
d <- summary(d)
# Detailed model results --
print(d)
# Learn about this object by saying ?summary.lm and by saying str(d)
cat("OLS gave slope of ", d$coefficients[2,1],
"and a error sigma of ", d$sigma, "\n")
## I need to drop down to a smaller dataset now --
x <- runif(10)
y <- 2 + 3*x + rnorm(10)
m <- lm(y ~ x)
# Now R supplies a wide range of generic functions which extract
# useful things out of the result of estimation of many kinds of models.
residuals(m)
fitted(m)
AIC(m)
AIC(m, k=log(10)) # SBC
vcov(m)
logLik(m)
"Dummy variables" in regression
# Suppose you have this data:
people = data.frame(
age = c(21,62,54,49,52,38),
education = c("college", "school", "none", "school", "college", "none"),
education.code = c( 2, 1, 0, 1, 2, 0 )
)
# Here people$education is a string categorical variable and
# people$education.code is the same thing, with a numerical coding system.
people
# Note the structure of the dataset --
str(people)
# The strings supplied for `education' have been treated (correctly) as
# a factor, but education.code is being treated as an integer and not as
# a factor.
# We want to do a dummy variable regression. Normally you would have:
# 1 Chosen college as the omitted category
# 2 Made a dummy for "none" named educationnone
# 3 Made a dummy for "school" named educationschool
# 4 Ran a regression like lm(age ~ educationnone + educationschool, people)
# But this is R. Things are cool:
lm(age ~ education, people)
# ! :-)
# When you feed him an explanatory variable like education, he does all
# these steps automatically. (He chose college as the omitted category).
# If you use an integer coding, then the obvious thing goes wrong --
lm(age ~ education.code, people)
# because he's thinking that education.code is an integer explanatory
# variable. So you need to:
lm(age ~ factor(education.code), people)
# (he chose a different omitted category)
# Alternatively, fix up the dataset --
people$education.code <- factor(people$education.code)
lm(age ~ education.code, people)
#
# Bottom line:
# Once the dataset has categorical variables correctly represented as factors, i.e. as
str(people)
# doing OLS in R induces automatic generation of dummy variables while leaving one out:
lm(age ~ education, people)
lm(age ~ education.code, people)
# But what if you want the X matrix?
m <- lm(age ~ education, people)
model.matrix(m)
# This is the design matrix that went into the regression m.
latex table with results of an OLS regression
# Get an OLS --
x1 = runif(100)
x2 = runif(100, 0, 2)
y = 2 + 3*x1 + 4*x2 + rnorm(100)
m = lm(y ~ x1 + x2)
# and print it out prettily --
library(xtable)
# Bare --
xtable(m)
xtable(anova(m))
# Better --
print.xtable(xtable(m, caption="My regression",
label="t:mymodel",
digits=c(0,3,2,2,3)),
type="latex",
file="xtable_demo_ols.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
print.xtable(xtable(anova(m),
caption="ANOVA of my regression",
label="t:anova_mymodel"),
type="latex",
file="xtable_demo_anova.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
# Read the documentation of xtable. It actually knows how to generate
# pretty latex tables for a lot more R objects than just OLS results.
# It can be a workhorse for making tabular out of matrices, and
# can also generate HTML.
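# A small sketch: the same xtable object can also be rendered as HTML --
print(xtable(m, caption="My regression"),
      type="html", file="xtable_demo_ols.html")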
Simulate a dataset from a "fixed effects" model
# and obtain "least squares dummy variable" (LSDV) estimates.
#
# We do this in the context of a familiar "earnings function" -
# log earnings is quadratic in log experience, with parallel shifts by
# education category.
# Create an education factor with 4 levels --
education <- factor(sample(1:4,1000, replace=TRUE),
labels=c("none", "school", "college", "beyond"))
# Simulate an experience variable with a plausible range --
experience <- 30*runif(1000) # experience from 0 to 30 years
# Make the intercept vary by education category between 4 given values --
intercept <- c(0.5,1,1.5,2)[education]
# Simulate the log earnings --
log.earnings <- intercept +
2*experience - 0.05*experience*experience + rnorm(1000)
A <- data.frame(education, experience, e2=experience*experience, log.earnings)
summary(A)
# The OLS path to LSDV --
summary(lm(log.earnings ~ -1 + education + experience + e2, A))
Using data from Yahoo finance
# for weekly returns, estimate the beta of Sun Microsystems
# This is the `elaborate version' (36 lines), also see terse version (16 lines)
library(tseries)
# I know that the yahoo symbol for the common stock of Sun Microsystems
# is "SUNW" and for the S&P 500 index is "^GSPC".
prices <- cbind(get.hist.quote("SUNW", quote="Adj", start="2003-01-01", retclass="zoo"),
get.hist.quote("^GSPC", quote="Adj", start="2003-01-01", retclass="zoo"))
colnames(prices) <- c("SUNW", "SP500")
prices <- na.locf(prices) # Copy last traded price when NA
# To make weekly returns, you must have this incantation:
nextfri.Date <- function(x) 7 * ceiling(as.numeric(x - 1)/7) + as.Date(1)
# and then say
weekly.prices <- aggregate(prices, nextfri.Date,tail,1)
# Now we can make weekly returns --
r <- 100*diff(log(weekly.prices))
# Now shift out of zoo to become an ordinary matrix --
r <- coredata(r)
rj <- r[,1]
rM <- r[,2]
d <- lm(rj ~ rM) # Market model estimation.
print(summary(d))
# Make a pretty picture
big <- max(abs(c(rj, rM)))
range <- c(-big, big)
plot(rM, rj, xlim=range, ylim=range,
xlab="S&P 500 weekly returns (%)", ylab="SUNW weekly returns (%)")
grid()
abline(h=0, v=0)
lines(rM, d$fitted.values, col="blue")
Terse version of estimating the beta
# using weekly returns and data from Yahoo finance.
# By Gabor Grothendieck.
library(tseries)
getstock <- function(x)
c(get.hist.quote(x, quote = "Adj", start = "2003-01-01", compress = "w"))
r <- diff(log(cbind(sp500 = getstock("^gspc"), sunw = getstock("sunw"))))
mm <- lm(sunw ~ ., r)
print(summary(mm))
range <- range(r, -r)
plot(r[,1], r[,2], xlim = range, ylim = range,
xlab = "S&P 500 weekly returns (%)", ylab = "SUNW weekly returns (%)")
grid()
abline(mm, h = 0, v = 0, col = "blue")
nonlinear regression, in three ways
# By just supplying the function to be fit,
# By also supplying the analytical derivatives, and
# By having him analytically differentiate the function to be fit.
#
# John Fox has a book "An R and S+ companion to applied regression"
# (abbreviated CAR).
# An appendix associated with this book, titled
# "Nonlinear regression and NLS"
# is up on the web, and I strongly recommend that you go read it.
#
# This file is essentially from there (I have made slight changes).
# First take some data - from the CAR book --
library(car)
data(US.pop)
attach(US.pop)
plot(year, population, type="l", col="blue")
# So you see, we have a time-series of the US population. We want to
# fit a nonlinear model to it.
library(stats) # Contains nonlinear regression
time <- 0:20
pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)),
start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE)
# You just write in the formula that you want to fit, and supply
# starting values. "trace=TRUE" makes him show iterations go by.
summary(pop.mod)
# Add in predicted values into the plot
lines(year, fitted.values(pop.mod), lwd=3, col="red")
# Look at residuals
plot(year, residuals(pop.mod), type="b")
abline(h=0, lty=2)
# Using analytical derivatives:
model <- function(beta1, beta2, beta3, time) {
m <- beta1/(1+exp(beta2+beta3*time))
term <- exp(beta2 + beta3*time)
gradient <- cbind((1+term)^-1,
-beta1*(1+term)^-2 * term,
-beta1*(1+term)^-2 * term * time)
attr(m, 'gradient') <- gradient
return(m)
}
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))
# Using analytical derivatives, using automatic differentiation (!!!):
model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)), # rhs of model
c('beta1', 'beta2', 'beta3'), # parameter names
function(beta1, beta2, beta3, time){} # arguments for result
)
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))
Some of the standard tests
# A classical setting --
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma is 1
d <- lm(y ~ x)
# CLS results --
summary(d)
library(sandwich)
library(lmtest)
# Durbin-Watson test --
dwtest(d, alternative="two.sided")
# Breusch-Pagan test --
bptest(d)
# Heteroscedasticity and autocorrelation consistent (HAC) tests
coeftest(d, vcov=kernHAC)
# Transplant the HAC values back in --
library(xtable)
sum.d <- summary(d)
xtable(sum.d)
sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4]
xtable(sum.d)
Experiment with fitting nonlinear functional forms
# in OLS, using orthogonal polynomials to avoid difficulties with
# near-singular design matrices that occur with ordinary polynomials.
# Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006.
# We will deal with noisy data from the d.g.p. y = sin(x) + e
x <- seq(0, 2*pi, length.out=50)
set.seed(101)
y <- sin(x) + 0.3*rnorm(50)
basicplot <- function(x, y, minx=0, maxx=3*pi, title="") {
plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title)
lines(x, sin(x), col="blue", lty=2, lwd=2)
abline(h=0, v=0)
}
x.outsample <- seq(0, 3*pi, length.out=100)
# Severe multicollinearity with ordinary polynomials
x2 <- x*x
x3 <- x2*x
x4 <- x3*x
cor(cbind(x, x2, x3, x4))
# and a perfect design matrix using orthogonal polynomials
m <- poly(x, 4)
all.equal(cor(m), diag(4)) # Correlation matrix is I.
par(mfrow=c(2,2))
# Ordinary polynomial regression --
p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(p)
basicplot(x, y, title="Polynomial, insample") # Data
lines(x, fitted(p), col="red", lwd=3) # In-sample
basicplot(x, y, title="Polynomial, out-of-sample")
predictions.p <- predict(p, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.p, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# As expected, polynomial fitting gives terrible results out of sample.
# Now the IDENTICAL thing, using orthogonal polynomials --
d <- lm(y ~ poly(x, 4))
summary(d)
basicplot(x, y, title="Orth. poly., insample") # Data
lines(x, fitted(d), col="red", lwd=3) # In-sample
basicplot(x, y, title="Orth. poly., out-of-sample")
predictions.op <- predict(d, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.op, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# predict(d) is magical! See ?SafePrediction
# The story runs at two levels. First, when you do an OLS model,
# predict()ion requires applying coefficients to an appropriate
# X matrix. But one level deeper, the polynomial or orthogonal-polynomial
# needs to be utilised for computing the X matrix based on the
# supplied x.outsample data.
# If you say p <- poly(x, n)
# then you can say predict(p, new) where predict.poly() gets invoked.
# And when you say predict(lm()), the full steps are worked out for
# you automatically: predict.poly() is used to make an X matrix and
# then prediction based on the regression results is done.
all.equal(predictions.p, predictions.op) # Both paths are identical for this
# (tame) problem.
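# A minimal sketch of the lower level: predict() on a poly() object
# re-evaluates the orthogonal basis at new x values --
pbasis <- poly(x, 4)
newbasis <- predict(pbasis, newdata=x.outsample)
str(newbasis)    # a 100 x 4 matrix of basis columns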
R syntax where model specification is an argument to a function
# Invent a dataset
x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100)
D <- data.frame(x=x, y=y, z=z)
amodel <- function(modelstring) {
summary(lm(modelstring, D))
}
amodel(z ~ x)
amodel(z ~ y)
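# A small variant (a sketch): if the specification arrives as a character
# string, convert it with as.formula() before handing it to lm() --
amodel2 <- function(modelstring) {
  summary(lm(as.formula(modelstring), D))
}
amodel2("z ~ x + y")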
Show the efficiency of the mean when compared with the median
# using a large simulation where both estimators are applied on
# a sample of U(0,1) uniformly distributed random numbers.
one.simulation <- function(N=100) { # N defaults to 100 if not supplied
x <- runif(N)
return(c(mean(x), median(x)))
}
# Simulation --
results <- replicate(100000, one.simulation(20)) # Gives back a 2x100000 matrix
# Two kernel densities --
k1 <- density(results[1,]) # results[1,] is the 1st row
k2 <- density(results[2,])
# A pretty picture --
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean", "Median"))
Simulation to study size and power in a simple problem
set.seed(101)
# The data generating process: a simple uniform distribution with stated mean
dgp <- function(N,mu) {runif(N)-0.5+mu}
# Simulate one FIXED hypothesis test for H0:mu=0, given a true mu for a sample size N
one.test <- function(N, truemu) {
x <- dgp(N,truemu)
muhat <- mean(x)
s <- sd(x)/sqrt(N)
# Under the null, the distribution of the mean has standard error s
threshold <- 1.96*s
(muhat < -threshold) || (muhat > threshold)
} # Return of TRUE means reject the null
# Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with a sample size N.
# We return only one number: the fraction of the time that H0 is rejected.
experiment <- function(Nexperiments, N, truemu) {
sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments
}
# Measure the size of a test, i.e. rejections when H0 is true
experiment(10000, 50, 0)
# Measurement with sample size of 50, and true mu of 0.
# Power study: I.e. Pr(rejection) when H0 is false
# (one special case in here is when the H0 is actually true)
muvalues <- seq(-.15,.15,.01)
# When true mu < -0.15 and when true mu > 0.15,
# the Pr(rejection) veers to 1 (full power) and it's not interesting.
# First do this with sample size of 50
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 50, truth))
}
par(mai=c(.8,.8,.2,.2))
plot(muvalues, results, type="l", lwd=2, ylim=c(0,1),
xlab="True mu", ylab="Pr(Rejection of H0:mu=0)")
abline(h=0.05, lty=2)
# Now repeat this with sample size of 100 (should yield a higher power)
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 100, truth))
}
lines(muvalues, results, lwd=2, col="blue")
legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8,
col=c("black","black","blue"), bty="n",
legend=c("N=50", "Size, 0.05", "N=100"))
Do bootstrap inference, as an example, for a sample median
library(boot)
samplemedian <- function(x, d) { # d is a vector of integer indexes
return(median(x[d])) # The genius is in the x[d] notation
}
data <- rnorm(50) # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000) # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)
# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")
OLS by MLE
# OLS likelihood function
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
if (sigma2 <= 0) return(NA)
n <- nrow(X)
e <- y - X%*%beta                  # residual vector
logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2)) # t() = matrix transpose
return(-logl) # since optim() does minimisation by default.
}
# Analytical derivatives
ols.gradient <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
e <- y - X%*%beta
n <- nrow(X)
g <- numeric(length(theta))
g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2) # d logl / d sigma2
g[-1] <- (t(X) %*% e)/sigma2 # d logl / d beta
return(-g)
}
X <- cbind(1, runif(1000))
theta.true <- c(2,4,6) # error variance = 2, intercept = 4, slope = 6.
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)
# Estimation by OLS --
d <- summary(lm(y ~ X[,2]))
theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1])
cat("OLS theta = ", theta.ols, "\n\n")
cat("\nGradient-free (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\nUsing the gradient (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\n\nYou say you want a covariance matrix?\n")
p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), hessian=TRUE,
y=y, X=X)
inverted <- solve(p$hessian)
results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted)))
colnames(results) <- c("Coefficient", "Std. Err.", "t")
rownames(results) <- c("Sigma", "Intercept", "X")
cat("MLE results --\n")
print(results)
cat("Compare with the OLS results --\n")
d$coefficients
# Picture of how the loglikelihood changes if you perturb the sigma
theta <- theta.ols
delta.values <- seq(-1.5, 1.5, .01)
logl.values <- as.numeric(lapply(delta.values,
function(x) {-ols.lf1(theta+c(x,0,0),y,X)}))
plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue",
xlab="Sigma", ylab="Log likelihood")
grid()
To do `moving window volatility' of returns
library(zoo)
# Some data to play with (Nifty on all fridays for calendar 2004) --
p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6, 1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95, 1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2, 1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609, 1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9, 1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5), index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468, 12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545, 12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622, 12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699, 12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776, 12783), class = "Date"), frequency = 0.142857142857143, class = c("zooreg", "zoo"))
# Shift to returns --
r <- 100*diff(log(p))
head(r)
summary(r)
sd(r)
# Compute the moving window vol --
vol <- sqrt(52) * rollapply(r, 20, sd, align = "right") # sqrt(52) since r is weekly
# A pretty plot --
plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)),
lwd=2, col="purple", xlab="2004",
ylab=paste("Annualised sigma, 20-week window"))
grid()
legend(x="bottomleft", col=c("purple", "darkgreen"),
lwd=c(2,2), bty="n", cex=0.8,
legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)"))
par(new=TRUE)
plot(p, type="l", lwd=2, col="darkgreen",
xaxt="n", yaxt="n", xlab="", ylab="")
axis(4)
Joint distributions, marginal distributions, useful tables
# First let me invent some fake data
set.seed(102) # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20,max=60)
D <- data.frame(age, gender, education)
rm(x,age,gender,education)
print(D)
# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)
# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)
# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
digits=c(0,2,2,2,2)) # You have to do | and \hline manually.
# Study age by gender --
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)
# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))
Get the data in place
load(file="demo.rda")
summary(firms)
# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m=lm(log(spread) ~ log(mktcap), firms)
summary(m)
# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap,means$spread))
# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
col=c("blue", "green", "red"),
lty=c(0, 1, 1), lwd=c(0,2,2),
pch=c(0,19,19),
legend=c("firm", "Mean spread in size deciles",
"Size weighted mean spread in size deciles"))
# Within group standard deviations --
aggregate(firms, list(size.category), sd)
# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)
# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)
# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)
Scare the hell out of children with the Cauchy distribution
# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
  if (distribution == "normal") {
    x <- rnorm(N)
  } else {
    x <- rcauchy(N)
  }
  mean(x)
}
k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.
# Here's more scary stuff --
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of normal - ",
sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}
# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.
example of simulation-based inference
# This is in the context of testing for time-series dependence in
# stock market returns data.
# The code here does the idea of Kim, Nelson, Startz (1991).
# We want to use the distribution of realworld returns data, without
# needing assumptions about normality.
# The null is lack of dependence (i.e. an efficient market).
# So repeatedly, the data is permuted, and the sample ACF is computed.
# This gives us the distribution of the ACF under H0: independence, but
# while using the empirical distribution of the returns data.
# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.
r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)
# The 1st autocorrelation from the sample:
acf(r, 1, plot=FALSE)$acf[2]
# Obtain 1000 draws from the distribution of the 1st autocorrelation
# under the null of independence:
set.seed(101)
simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])
# At 95% --
quantile(simulated, probs=c(.025,.975))
# At 99% --
quantile(simulated, probs=c(.005,.995))
# So we can reject the null at 95% but not at 99%.
# A pretty picture.
plot(density(simulated), col="blue")
abline(v=0)
abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")
Standard computations with well-studied distributions
# The normal distribution is named "norm". With this, we have:
# Normal density
dnorm(c(-1.96,0,1.96))
# Cumulative normal distribution function
pnorm(c(-1.96,0,1.96))
# Inverse of this
qnorm(c(0.025,.5,.975))
pnorm(qnorm(c(0.025,.5,.975)))
# 1000 random numbers from the normal distribution
summary(rnorm(1000))
# Here are the same ideas, for the chi-squared distribution with 10 degrees
# of freedom.
dchisq(c(0,5,10), df=10)
# Cumulative chi-squared distribution function
pchisq(c(0,5,10), df=10)
# Inverse of this
qchisq(c(0.025,.5,.975), df=10)
# 1000 random numbers from this chi-squared distribution
summary(rchisq(1000, df=10))
Two-sample Kolmogorov-Smirnov test
# Goal: Given two vectors of data,
# superpose their CDFs
# and show the results of the two-sample Kolmogorov-Smirnov test
# The function consumes two vectors x1 and x2.
# You have to provide a pair of labels as `legendstrings'.
# If you supply an xlab, it's used
# If you specify log - e.g. log="x" - this is passed on to plot.
# The remaining args that you specify are sent on into ks.test()
two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) {
  stopifnot(length(x1) > 0,
            length(x2) > 0,
            length(legendstrings) == 2)
  hilo <- range(c(x1, x2))
  par(mai=c(.8, .8, .2, .2))
  plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0,
       xlab=xlab, log=log, ylab="Cum. distribution", main="")
  grid()
  plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3)
  legend(x="bottomright", lwd=c(1,3), lty=1, bty="n",
         legend=legendstrings)
  k <- ks.test(x1, x2, ...)
  text(x=hilo[1], y=c(.9, .85), pos=4, cex=.8,
       labels=c(paste("KS test statistic: ", sprintf("%.3g", k$statistic)),
                paste("Prob value: ", sprintf("%.3g", k$p.value))))
  k
}
x1 <- rnorm(100, mean=7, sd=1)
x2 <- rnorm(100, mean=9, sd=1)
# Check error detection --
two.cdfs.plot(x1,x2)
# Typical use --
two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x")
# Send args into ks.test() --
two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less")
To read in a simple data file, and look around its contents
# Suppose you have a file "x.data" which looks like this:
# 1997,3.1,4
# 1998,7.2,19
# 1999,1.7,2
# 2000,1.1,13
# To read it in --
A <- read.table("x.data", sep=",",
col.names=c("year", "my1", "my2"))
nrow(A) # Count the rows in A
summary(A$year) # The column "year" in data frame A
# is accessed as A$year
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Removes the column "my1"
# You might find these useful, to "look around" a dataset --
str(A)
summary(A)
library(Hmisc) # This requires that you've installed the Hmisc package
contents(A)
describe(A)
To read in a simple data file where date data is present
# Suppose you have a file "x.data" which looks like this:
# 1997-07-04,3.1,4
# 1997-07-05,7.2,19
# 1997-07-07,1.7,2
# 1997-07-08,1.1,13
A <- read.table("x.data", sep=",",
col.names=c("date", "my1", "my2"))
A$date <- as.Date(A$date, format="%Y-%m-%d")
# Say ?strptime to learn how to use "%" to specify
# other date formats. Two examples --
# "15/12/2002" needs "%d/%m/%Y"
# "03 Jun 1997" needs "%d %b %Y"
# Actually, if you're using the ISO 8601 date format, i.e.
# "%Y-%m-%d", that's the default setting and you don't need to
# specify the format.
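# Quick illustrative checks of the formats mentioned above (note that "%b"
# relies on an English locale for month names):
as.Date("15/12/2002", format="%d/%m/%Y")
as.Date("03 Jun 1997", format="%d %b %Y")
as.Date("1997-07-04")            # ISO 8601 -- no format string needed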
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Delete the `my1' column
summary(A) # Makes summary statistics
To read in files produced by CMIE's "Business Beacon"
# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
  # Generic to all BB files --
  sep="|",                     # CMIE's .txt file is pipe delimited
  skip=3,                      # Skip the 1st 3 lines
  na.strings=c("N.A.","Err"),  # The ways they encode missing data
  # Specific to your immediate situation --
  file="bb_data.text",
  col.names=c("junk", "date", "M3", "M0")
)
A$junk <- NULL # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")
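# To see what that paste() trick does on a single value (illustrative only;
# use "%y" instead of "%Y" if your file carries two-digit years, and note
# that "%b" relies on an English locale):
as.Date(paste("1", "Apr 2004"), format="%d %b %Y")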
Reading and writing ascii files, reading and writing binary files
# And, to measure how much faster it is to work with binary files.
# First manufacture a tall data frame:
# FYI -- runif(n) yields n U(0,1) random numbers.
B <- data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)
# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
# Read in this resulting ascii file:
C <- read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names = 1)
# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C) # should yield the same results
# as summary(B) above.
# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
  B <- data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})
cat("Time writing an ascii file out of dataset B:\n")
system.time(
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
)
cat("Time reading an ascii file into dataset C:\n")
system.time({
  C <- read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names = 1)
})
cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))
cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary")) # and then read it in from binary file
Sending an R object to a friend, or embedding data into an email
# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.
# First I invent a little dataset --
set.seed(101) # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A <- data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)
# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend
load("/tmp/my_data_file.rda")
# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130,
0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363
), x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627,
0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555,
0.224118480458856, 0.411666829371825, 0.0386105608195066), x3 = c(0.700711545301601,
0.956837461562827, 0.213352001970634, 0.661061500199139, 0.923318882007152,
0.795719761401415, 0.0712125543504953, 0.389407767681405, 0.406451216200367,
0.659355078125373)), .Names = c("x1", "x2", "x3"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
# Verify that A and B are near-identical --
A-B
# or,
all.equal(A,B)
Make a time-series object using the "zoo" package
A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
x=runif(4),
y=runif(4))
A$date <- as.Date(A$date) # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().
# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)
# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky
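# A few of those cool things, as a quick illustration --
window(z, start=as.Date("1995-01-02"))   # subset by date
diff(z)                                  # first differences
merge(z, lag(z, k=-1))                   # line a series up against a shifted copy of itself
rollmean(z, 2)                           # 2-period rolling mean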
# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
# z <- read.zoo(filename, format="%d %b %Y")
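# For instance, a self-contained sketch: write the zoo object out with
# write.zoo() and read it straight back --
write.zoo(z, file="/tmp/z.txt")
z2 <- read.zoo("/tmp/z.txt", header=TRUE, format="%Y-%m-%d")
str(z2)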
All manner of import and export of datasets
# Invent a dataset --
A <- data.frame(
name=c("a","b","c"),
ownership=c("Case 1","Case 1","Case 2"),
listed.at=c("NSE",NA,"BSE"),
# Firm "b" is unlisted.
is.listed=c(TRUE,FALSE,TRUE),
# R convention - boolean variables are named "is.something"
x=c(2.2,3.3,4.4),
date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
)
# To a spreadsheet through a CSV file --
write.table(A, file="demo.csv", sep=",", col.names=NA, qmethod="double")
B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)
# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")
# To the Open XML standard for transport for statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")
# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.
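# A small sketch of that pathway: write.foreign() emits a plain data file plus
# a script that SPSS can run to read it. The file names below are made up, and
# I first flatten the date and logical columns into plain types, which travel
# more safely through this route.
A2 <- A
A2$date <- as.character(A2$date)
A2$is.listed <- as.integer(A2$is.listed)
write.foreign(A2, datafile="/tmp/demo_spss.txt", codefile="/tmp/demo_spss.sps",
              package="SPSS")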
Special cases in reading files
# Reading in a .bz2 file --
read.table(bzfile("file.text.bz2")) # Requires you have ./file.text.bz2
# Reading in a .gz file --
read.table(gzfile("file.text.gz")) # Requires you have ./file.text.gz
# Reading from a pipe --
mydata <- read.table(pipe("awk -f filter.awk input.txt"))
# Reading from a URL --
read.table(url("http://www.mayin.org/ajayshah/A/demo.text"))
# This also works --
read.table("http://www.mayin.org/ajayshah/A/demo.text")
# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about:
read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2"))
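# Another route (a sketch): download the .bz2 file to a temporary location
# first, and then hand that to bzfile() --
tmp <- tempfile()
download.file("http://www.mayin.org/ajayshah/A/demo.text.bz2", tmp, mode="wb")
read.table(bzfile(tmp))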
# Reading binary files from a URL --
load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda"))
Reading in a Microsoft .xls file directly
library(gdata)
a <- read.xls("file.xls", sheet=2) # This reads in the 2nd sheet
# Look at what the cat dragged in
str(a)
# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL
# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.
ARMA modeling - estimation, diagnostics, forecasting
# 0. SETUP DATA
rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67)
# Make an R time-series out of the rawdata: specify frequency & start date
gIIP <- ts(rawdata, frequency=12, start=c(1991,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Full data")
grid()
# Based on this, I decide that 4/1995 is the start of the sensible period.
gIIP <- window(gIIP, start=c(1995,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Estimation subset")
grid()
# Descriptive statistics about gIIP
mean(gIIP); sd(gIIP); summary(gIIP);
plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")
acf(gIIP)
# 1. ARMA ESTIMATION
m.ar2 <- arima(gIIP, order = c(2,0,0))
print(m.ar2) # Print it out
# 2. ARMA DIAGNOSTICS
tsdiag(m.ar2) # A pretty picture of diagnostics
## Time series structure in errors
print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box"));
## Sniff for ARCH
print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box"));
## Eyeball distribution of residuals
plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8),
main=paste("Residuals of AR(2)"))
# 3. FORECASTING
## Make a picture of the residuals
plot.ts(m.ar2$residuals, ylab="Innovations", col="blue", lwd=2)
s <- sqrt(m.ar2$sigma2)
abline(h=c(-s,s), lwd=2, col="lightGray")
p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions.
print(p)
## Watch the forecastability decay away from large values towards 0.
## sd(x) is the naive sigma. p$se is the prediction se.
gain <- 100*(1-p$se/sd(gIIP))
plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",
col="blue", lwd=2)
## Make a pretty picture that puts it all together
ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se,
gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),
ylab="IIP growth (%)", col=c("blue","red", "red", "red")))
grid()
abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")
legend(x="bottomleft", cex=0.8, bty="n",
lty=c(1,1,2,2), lwd=c(2,1,1,2),
col=c("blue", "red", "red", "lightGray"),
legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))