R examples


BASICS

# Setup some scalars
x <- 3
y <- 4
z <- sqrt(x*x + y*y)
use.dots.in.names <- z - 5
# Print out z
print(z)
# The default action is print()
z
# So you have a neat scientific calculator
sin(log(2.718281828)*pi)

SCALARS

# The basic scalar types
x <- 3
take.me <- FALSE
s <- "this is a string"
x
take.me
s
# Exploring objects using str() --
str(x)
str(take.me)
str(s)

DATES: A SPECIAL KIND OF SCALAR

s <- "2004-04-22"
d <- as.Date(s)
d
str(d)
s <- "22-Apr-2004"
d <- as.Date(s, format="%d-%b-%Y")
d
as.Date("22-04-2004", format="%d-%m-%Y")
as.Date("22-04-04", format="%d-%m-%y")
as.Date("22-Apr-04", format="%d-%b-%y")
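# A small further sketch: Date objects support arithmetic, and format()
# turns them back into strings.
d <- as.Date("2004-04-22")
d + 7                                   # One week later
as.numeric(d - as.Date("2004-01-01"))   # Days elapsed since 1 Jan 2004
format(d, "%d %b %Y")                   # Back to a string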

VECTORS

x <- c(2,3,7, 9, 10, 11)       # the c() operator, for concatenation
x[1]
x[6]
length(x)
# Help about all R functions is online:
?length
# Arithmetic that gobbles a vector at a time
x
x+2
y <- log(x)
y
# Clever vector addressing
x[1]
x[6]
x[c(1,6)]
indexes <- c(1,3,5)
x[indexes]
# Another shorthand
1:4
x[1:4]
indexes <- 1:4
x[indexes]
# Another shorthand
switches <- c(TRUE,FALSE,FALSE,FALSE,FALSE,TRUE)
x[switches]
# Dropping some elements
x
x[-2]
x[c(-2,-3)]
# Bottom line: Very nice mechanisms for addressing.
# Let's stay in touch with how to explore objects --
str(x)
str(x[1])

FACTORS: A SPECIAL KIND OF SCALAR

names <- c("Payal", "Shraddha", "Aditi", "Kritika", "Diwakar")
attire <- c("Sari", "Salwar", "Sari", "Sari", "Kurta")
# attire is a "categorical variable"
attire <- factor(attire)
attire
table(attire)
table(names, attire)
# Internally, a factor is just stored as an integer
as.numeric(attire)
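# Going the other way: the integer codes map back to labels. levels() shows
# the label set and as.character() recovers the original strings.
levels(attire)
as.character(attire)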

NOT AVAILABLE: A SPECIAL SCALAR

x <- c(2,3,NA,4,5,NA)
x
x + 2           # Automatic rules about NA arithmetic
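# A small further sketch: most summary functions propagate NA by default and
# take an na.rm argument to ignore missing values.
mean(x)                 # NA
mean(x, na.rm=TRUE)     # Mean of the observed values
sum(is.na(x))           # How many values are missing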

MATRICES

X <- matrix(NA, nrow=7, ncol=3)
X
X[4,] <- c(2,3,4)
X
X[,3] <- 1:7
X
str(X)
nrow(X)
ncol(X)
# As with vectors, arithmetic that attacks a full matrix at a time!
X + 1

LISTS

# Ability to make a parcel of apparently unrelated materials.
results <- list(mu=0.2, sigma=0.9, x=c(1,2,3,9))
str(results)
# Accessing elements --
results$mu
results$sigma <- 9
results$new <- "Hello"
results

DAILY TIME-SERIES USING THE `ZOO' PACKAGE

# Zoo is a very powerful, high-level time-series system. To learn more:
library(help=zoo)
vignette("zoo-quickref")
# Let's do a daily time-series
dates <- as.Date(c("2004-04-01", "2004-04-02", "2004-04-03", "2004-04-05"))
values <- rnorm(4)
library(zoo)
z <- zoo(values, order.by=dates)
z
plot(z)
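# A small further sketch: two zoo series align on their dates, so merge()
# and arithmetic line observations up by index rather than by position.
z2 <- zoo(rnorm(3), order.by=dates[1:3])
merge(z, z2)     # Union of the two date sets, with NA where a series is absent
z + z2           # Arithmetic on the common dates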

DATASET, TERMED `DATA FRAME' in R

# A dataset is a list of vectors, all of the same length.
names <- c("Payal", "Shraddha", "Aditi", "Kritika", "Diwakar")
age <- c(15, 17, 15, 19, 20)
iq <- c(90, 100, 110, 120, 160)
# By default, data.frame() forces the names to be factors...
D <- data.frame(names=names, age=age, iq=iq)
str(D)
D
nrow(D)
ncol(D)
summary(D)
# Accessing a column in a data frame
D$age
# Accessing an observation in a data frame - use matrix-like notation
D[3,]
# Accessing the 3rd value of age
D$age[3]
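# A note on the factor conversion mentioned above: whether character columns
# become factors depends on the stringsAsFactors setting (and from R 4.0.0
# onwards the default keeps them as character). To be explicit either way:
D2 <- data.frame(names=names, age=age, iq=iq, stringsAsFactors=FALSE)
str(D2)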

WRITING AND THEN READING FILES

D
write.csv(D, file="demo.csv")
system("cat demo.csv")          # Say "type" if you're on DOS
E <- read.table("demo.csv", skip=1, sep=",",
                col.names=c("obsnum","name","age","iq"))
E
E$obsnum <- NULL
E
# Compare against --
D
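# A possible shortcut, assuming the file was written by write.csv() above:
# read.csv() understands the header line, and row.names=1 absorbs the leading
# column of row names instead of it becoming a data column.
E2 <- read.csv("demo.csv", row.names=1)
E2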

REGRESSION

m <- lm(iq ~ age, D)
m
summary(m)
# Remember that shoe size is strongly correlated with IQ
plot(D$age, D$iq, xlab="Age (years)", ylab="IQ")
abline(h=100, lty=3)
lines(D$age, fitted.values(m), col="blue", lwd=2)

SIMULATION

runif(10)
runif(10)
set.seed(133); runif(10)
set.seed(133); runif(10)
summary(runif(100))
summary(runif(1000))
summary(runif(10000))
summary(runif(100000))
cat("See how large datasets stabilise sample statistics\n")
summary(runif(100)); summary(runif(100))
summary(runif(10000)); summary(runif(10000))
plot(density(runif(1000)))
plot(density(rnorm(1000)))
x <- seq(-4,4,0.01)
lines(x, dnorm(x), col="blue", lwd=2)

SIMPLE SUMMARY STATISTICS

x <- rnorm(10000)
summary(x)
quantile(x, probs=c(0.01, 0.025, 0.25, 0.5, 0.75, 0.975, 0.99))
IQR(x)
fivenum(x)
sd(x)
range(x)
mean(x)
mean(x, trim=0.1)       # Delete 10% of data at each end first

REGRESSION ON A SIMULATED DATASET

x <- 2 + 3*runif(100)
summary(x)
y <- -7 + 4*x + rnorm(100)
summary(lm(y ~ x))              # Convince yourself the intercept is ok
# Most statistical functions give back results of results.
res.1 <- lm(y ~ x)
str(res.1)                      # There's a lot of goodies here!
res.1$coefficients
# In this case, detailed calculations require summary() to get what
# I call "level 2 results".
res.2 <- summary(res.1)
str(res.2)                      # There's even more goodies here!!
res.2$coefficients
res.2$r.squared
res.2$sigma
# There are useful generic functions which work for a lot of models --
logLik(res.1)
AIC(res.1)

TIME SERIES ANALYSIS

# ACF of white noise
z <- rnorm(100)
acf(z)
z <- rnorm(1000)
acf(z)
# Simulate data from an AR(1) process
x <- arima.sim(model=list(ar=c(0.5)), n=1000)
arima(x, order=c(1,0,0))
acf(x)
# Let's try to estimate a wrong model --
arima(x, order=c(3,0,0))

A first look at R objects

# - vectors, lists, matrices, data frames.
# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")
# Accessing the 1st and last elements of y --
y[1]
y[length(y)]
# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x
# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)
# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]

sorting

# The approach here needs to be explained. If `i' is a vector of
# integers, then the data frame D[i,] picks up rows from D based
# on the values found in `i'.
#
# The order() function makes an integer vector which is a correct
# ordering for the purpose of sorting.
D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2))
D
# Sort on x
indexes <- order(D$x)
D[indexes,]
# Print out sorted dataset, sorted in reverse by y
D[rev(order(D$y)),]
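# A small further sketch: order() accepts several keys and a decreasing
# argument, so multi-column sorts need no reversal trick.
D[order(D$x, D$y),]                  # Sort on x, ties broken by y
D[order(D$y, decreasing=TRUE),]      # Same result as the rev(order(...)) idiom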

Prices and returns

# I like to multiply returns by 100 so as to have "units in percent".
# In other words, I like it for 5% to be a value like 5 rather than 0.05.

# I. Simulate random-walk prices, switch between prices & returns.

# Simulate a time-series of PRICES drawn from a random walk
# where one-period returns are i.i.d. N(mu, sigma^2).
ranrw <- function(mu, sigma, p0=100, T=100) {
  cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))
}
prices2returns <- function(x) {
  100*diff(log(x))
}
returns2prices <- function(r, p0=100) {
  c(p0, p0 * exp(cumsum(r/100)))
}
cat("Simulate 25 points from a random walk starting at 1500 --\n")
p <- ranrw(0.05, 1.4, p0=1500, T=25)
# gives you a 25-long series, starting with a price of 1500, where
# one-period returns are N(0.05,1.4^2) percent.
print(p)
cat("Convert to returns--\n")
r <- prices2returns(p)
print(r)
cat("Go back from returns to prices --\n")
goback <- returns2prices(r, 1500)
print(goback)

# II. Plenty of powerful things you can do with returns....
summary(r); sd(r)                   # summary statistics
plot(density(r))                    # kernel density plot
acf(r)                              # Autocorrelation function
ar(r)                               # Estimate an AIC-minimising AR model
Box.test(r, lag=2, type="Ljung")    # Box-Ljung test
library(tseries)
runs.test(factor(sign(r)))          # Runs test
bds.test(r)                         # BDS test.

# III. Visualisation and the random walk
# I want to obtain intuition into what kinds of price series can happen,
# given a starting price, a mean return, and a given standard deviation.
# This function simulates out 10000 days of a price time-series at a time,
# and waits for you to click in the graph window, after which a second
# series is painted, and so on. Make the graph window very big and
# sit back and admire.
# The point is to eyeball many series and thus obtain some intuition
# into what the random walk does.
visualisation <- function(p0, s, mu, labelstring) {
  N <- 10000
  x <- (1:(N+1))/250                # Unit of years
  while (1) {
    plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",
         type="l", col="red", xlab="Time (years)",
         main=paste("40 years of a process much like", labelstring))
    grid()
    z=locator(1)
  }
}
# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year
visualisation(2600, 1.4, 13/250, "Nifty")

# The numerical values here are used to think about what the INR/USD
# exchange rate would have looked like if it started from 31.37, had
# a mean depreciation of 5% per year, and had the daily vol of a floating
# exchange rate like EUR/USD.
visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")
# This is of course not like the INR/USD series in the real world -
# which is neither a random walk nor does it have a vol of 0.7% a day.

# The numerical values here are used to think about what the USD/EUR
# exchange rate would look like, starting at 1, with no drift, and with the
# observed daily vol of 0.7. (This is about right).
visualisation(1, 0.7, 0, "USD/EUR with no drift")

# IV. A monte carlo experiment about the runs test
# Measure the effectiveness of the runs test when faced with an
# AR(1) process of length 100 with a coeff of 0.1
set.seed(101)
one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}
table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))
# We find that the runs test throws up a prob value of below 0.05
# for 91 out of 1000 experiments.
# Wow! :-)
# To understand this, you need to look up the man pages of:
# set.seed, arima.sim, sign, factor, runs.test, replicate, table.
# e.g. say ?replicate

write functions

# To write functions that send back multiple objects.

# FIRST LEARN ABOUT LISTS --
X = list(height=5.4, weight=54)
print("Use default printing --")
print(X)
print("Accessing individual elements --")
cat("Your height is ", X$height, " and your weight is ", X$weight, "\n")

# FUNCTIONS --
square <- function(x) {
  return(x*x)
}
cat("The square of 3 is ", square(3), "\n")

# default value of the arg is set to 5.
cube <- function(x=5) {
  return(x*x*x);
}
cat("Calling cube with 2 : ", cube(2), "\n")    # will give 2^3
cat("Calling cube : ", cube(), "\n")            # will default to 5^3.

# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS --
powers <- function(x) {
  parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x);
  return(parcel);
}
X = powers(3);
print("Showing powers of 3 --"); print(X);

# WRITING THIS COMPACTLY (4 lines instead of 7)
powerful <- function(x) {
  return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x));
}
print("Showing powers of 3 --"); print(powerful(3));

# In R, the last expression in a function is, by default, what is
# returned. So you could equally just say:
powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)}

R vector notation

cat("EXAMPLE 1: sin(x) for a vector --\n")
# Suppose you have a vector x --
x = c(0.1,0.6,1.0,1.5)
# The bad way --
n = length(x)
r = numeric(n)
for (i in 1:n) {
  r[i] = sin(x[i])
}
print(r)
# The good way -- don't use loops --
print(sin(x))

cat("\n\nEXAMPLE 2: Compute the mean of every column of a matrix --\n")
# Here's another example. It isn't really about R; it's about thinking in
# matrix notation. But still.
# Let me setup a matrix --
N=4; M=100;
r = matrix(runif(N*M), N, M)
# So I face a NxM matrix
#    [r11 r12 ... r1M]
#    [r21 r22 ... r2M]
#    [r31 r32 ... r3M]
# My goal: each column needs to be reduced to a mean.
# Method 1 uses loops:
mean1 = numeric(M)
for (i in 1:M) {
  mean1[i] = mean(r[,i])
}
# Alternatively, just say:
mean2 = rep(1/N, N) %*% r     # Pretty!
# The two answers are the same --
all.equal(mean1,mean2[,])
#
# As an aside, I should say that you can do this directly by using
# the colMeans() function. But the above is more about pedagogy rather
# than showing you how to get column means.

cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n")
# Write this as if you're dealing with scalars --
# Nelson Siegel function
nsz <- function(b0, b1, b2, tau, t) {
  tmp = t/tau
  tmp2 = exp(-tmp)
  return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2))
}
timepoints <- c(0.01,1:5)
# The bad way:
z <- numeric(length(timepoints))
for (i in 1:length(timepoints)) {
  z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i])
}
print(z)
# The R way --
print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints))

cat("\n\nEXAMPLE 4: Making the NPV of a bond--\n")
# You know the bad way - sum over all cashflows, NPVing each.
# Now look at the R way.
C = rep(100, 6)
nsz(14.084,-3.4107,0.0015,1.8832,timepoints)    # Print interest rates
C/((1.05)^timepoints)                           # Print cashflows discounted @ 5%
# Using NS instead of 5% --
C/((1 + 0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints))^timepoints)
# NPV in two different ways --
C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints
sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints)
# You can drop back to a flat yield curve at 5% easily --
sum(C * 1.05^-timepoints)
# Make a function for NPV --
npv <- function(C, timepoints, r) {
  return(sum(C * (1 + (0.01*r))^-timepoints))
}
npv(C, timepoints, 5)
# Bottom line: Here's how you make the NPV of a bond with cashflows C
# at timepoints timepoints when the zero curve is a Nelson-Siegel curve --
npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
# Wow!

# ---------------------------------------------------------------------------
# Elegant vector notation is amazingly fast (in addition to being beautiful)
N <- 1e5
x <- runif(N, -3,3)
y <- runif(N)
method1 <- function(x,y) {
  tmp <- NULL
  for (i in 1:N) {
    if (x[i] < 0) tmp <- c(tmp, y[i])
  }
  tmp
}
method2 <- function(x,y) {
  y[x < 0]
}
s1 <- system.time(ans1 <- method1(x,y))
s2 <- system.time(ans2 <- method2(x,y))
all.equal(ans1,ans2)
s1/s2       # On my machine it's 2000x faster

indexing notation, and the use of is.na()

x <- c(2,7,9,2,NA,5)    # An example vector to play with.
# Give me elems 1 to 3 --
x[1:3]
# Give me all but elem 1 --
x[-1]
# Odd numbered elements --
indexes <- seq(1,6,2)
x[indexes]
# or, more compactly,
x[seq(1,6,2)]
# Access elements by specifying "on" / "off" through booleans --
require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
x[require]
# Short vectors get reused! So, to get odd numbered elems --
x[c(TRUE,FALSE)]
# Locate missing data --
is.na(x)
# Replace missing data by 0 --
x[is.na(x)] <- 0
x
# Similar ideas work for matrices --
y <- matrix(c(2,7,9,2,NA,5), nrow=2)
y
# Make a matrix containing columns 1 and 3 --
y[,c(1,3)]
# Let us see what is.na(y) does --
is.na(y)
str(is.na(y))
# So is.na(y) gives back a matrix with the identical structure as that of y.
# Hence I can say
y[is.na(y)] <- -1
y

latex tabular out of an R matrix

# Setup a nice R object:
m <- matrix(rnorm(8), nrow=2)
rownames(m) <- c("Age", "Weight")
colnames(m) <- c("Person1", "Person2", "Person3", "Person4")
m
# Translate it into a latex tabular:
library(xtable)
xtable(m, digits=rep(3,5))
# Production latex code that goes into a paper or a book --
print(xtable(m, caption="String", label="t:"),
      type="latex", file="blah.gen",
      table.placement="tp",
      latex.environments=c("center", "footnotesize"))
# Now you do \input{blah.gen} in your latex file.

# You're lazy, and want to use R to generate latex tables for you?
data <- cbind(
  c(7,9,11,2),
  c(2,4,19,21)
)
colnames(data) <- c("a","b")
rownames(data) <- c("x","y","z","a")
xtable(data)
# or you could do
data <- rbind(
  c(7,2),
  c(9,4),
  c(11,19),
  c(2,21)
)
# and the rest goes through identically.

Associative arrays (as in awk) or hashes (as in perl)

# Or, more generally, adventures in R addressing.

# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x

# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)
# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]
# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
  if (colour %in% rownames(m)) {
    cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
  } else {
    cat("Colour ", colour, " does not exist in the hash.\n")
  }
}

# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D)
# The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America

Utilise matrix notation

# We use the problems of portfolio analysis as an example.
# Prices of 4 firms to play with, at weekly frequency (for calendar 2004) --
p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271,
  292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94,
  273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748,
  197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75,
  243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3,
  307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1,
  53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35,
  35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95,
  26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8,
  43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35,
  54.15, 58, 60.7, 62.7,
  293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401,
  248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508,
  258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85,
  249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75,
  295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05,
  345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4,
  1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61,
  1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54,
  1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35,
  1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6,
  1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6,
  2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7),
  .Dim = c(53, 4),
  .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys")))
# Shift from prices to returns --
r <- 100*diff(log(p))
# Historical expected returns --
colMeans(r)
# Historical correlation matrix --
cor(r)
# Historical covariance matrix --
S <- cov(r)
S
# Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% --
w <- c(.2, .2, .3, .3)
t(w) %*% S %*% w
# The portfolio optimisation function in tseries --
library(tseries)
optimised <- portfolio.optim(r)       # This uses the historical facts from r
optimised$pw                          # Weights
optimised$pm                          # Expected return using these weights
optimised$ps                          # Standard deviation of optimised port.

price time-series

# A stock is traded on 2 exchanges.
# Price data is missing at random on both exchanges owing to non-trading.
# We want to make a single price time-series utilising information
# from both exchanges. I.e., missing data for exchange 1 will
# be replaced by information for exchange 2 (if observed).

# Let's create some example data for the problem.
e1 <- runif(15)                  # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15)        # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)
# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1                   # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.
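# The same replacement can be written in one line with ifelse(), which picks
# elementwise from e1 or e2 depending on whether e1 is missing.
combined2 <- ifelse(is.na(e1), e2, e1)
identical(combined, combined2)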

panel of pictures

par(mfrow=c(3,2))       # 3 rows, 2 columns.
# Now the next 6 pictures will be placed on these 6 regions. :-)

# Let me take some pains on the 1st
plot(density(runif(100)), lwd=2)
text(x=0, y=0.2, "100 uniforms")   # Showing you how to place text at will
abline(h=0, v=0)
# All these statements affect the 1st plot.

x=seq(0.01,1,0.01)
par(col="blue")         # default colour to blue.
# 2 --
plot(x, sin(x), type="l")
lines(x, cos(x), type="l", col="red")
# 3 --
plot(x, exp(x), type="l", col="green")
lines(x, log(x), type="l", col="orange")
# 4 --
plot(x, tan(x), type="l", lwd=3, col="yellow")
# 5 --
plot(x, exp(-x), lwd=2)
lines(x, exp(x), col="green", lwd=3)
# 6 --
plot(x, sin(x*x), type="l")
lines(x, sin(1/x), col="pink")

Make pictures in PDF files

xpts <- seq(-3,3,.05)
# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
     xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()     # Close the device so the PDF is actually written out

# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
     xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()

histogram with tails shown in red

# This happened on the R mailing list on 7 May 2004.
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.

Visualisation of 3-dimensional (x,y,z) data using contour

# plots and using colour to represent the 3rd dimension.
# The specific situation is: On a grid of (x,y) points, you have
# evaluated f(x,y). Now you want a graphical representation of
# the resulting list of (x,y,z) points that you have.

# Setup an interesting data matrix of (x,y,z) points:
points <- structure(c(
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05,
  0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
  0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15,
  0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
  0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25,
  0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3,
  0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35,
  0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4,
  0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45,
  0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
  0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55,
  0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6,
  0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65,
  0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7,
  0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
  0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8,
  0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85,
  0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9,
  0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
  1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 0.998, 0.124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 0.998, 0.71, 0.068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 0.998, 0.898, 0.396, 0.058, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 0.998, 0.97, 0.726, 0.268, 0.056, 0.006, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 0.996, 0.88, 0.546, 0.208, 0.054, 0.012, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.998, 0.964, 0.776, 0.418, 0.18, 0.054, 0.014, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.998, 0.906, 0.664, 0.342, 0.166, 0.056, 0.018, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.986, 0.862, 0.568, 0.29, 0.15, 0.056, 0.022, 0.008, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.954, 0.778, 0.494, 0.26, 0.148, 0.056, 0.024, 0.012, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.906, 0.712, 0.43, 0.242, 0.144, 0.058, 0.028, 0.012, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0.878, 0.642, 0.38, 0.222, 0.142, 0.066, 0.034, 0.014, 0.008, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0,
  0.846, 0.586, 0.348, 0.208, 0.136, 0.068, 0.034, 0.016, 0.012, 0.006, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0,
  0.8, 0.538, 0.318, 0.204, 0.136, 0.07, 0.046, 0.024, 0.012, 0.008, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0,
  0.762, 0.496, 0.294, 0.2, 0.138, 0.072, 0.05, 0.024, 0.014, 0.012, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0,
  0.704, 0.472, 0.286, 0.198, 0.138, 0.074, 0.054, 0.028, 0.016, 0.012, 0.008, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0,
  0.668, 0.438, 0.276, 0.196, 0.138, 0.078, 0.054, 0.032, 0.024, 0.014, 0.012, 0.008, 0.004, 0.004, 0.002, 0.002, 0, 0, 0,
  0.634, 0.412, 0.27, 0.194, 0.14, 0.086, 0.056, 0.032, 0.024, 0.016, 0.012, 0.01, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0,
  0.604, 0.388, 0.26, 0.19, 0.144, 0.088, 0.058, 0.048, 0.026, 0.022, 0.014, 0.012, 0.008, 0.006, 0.004, 0.004, 0.002, 0.002, 0,
  0.586, 0.376, 0.256, 0.19, 0.146, 0.094, 0.062, 0.052, 0.028, 0.024, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002, 0.002,
  0.566, 0.364, 0.254, 0.192, 0.148, 0.098, 0.064, 0.054, 0.032, 0.024, 0.022, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002),
  .Dim = c(399, 3), .Dimnames = list(NULL, c("x", "y", "z")))
summary(points)
# x is a grid from 0 to 1
# y is a grid from 20 to 200
# z is the interesting object which will be the 3rd dimension.

# Solution using contourplot() from package 'lattice'
library(lattice)
d3 <- data.frame(points)
contourplot(z ~ x+y, data=d3)
## or nicer
contourplot(z ~ x+y, data=d3, cuts=20, region = TRUE)
## or using logit - transformed z values:
contourplot(qlogis(z) ~ x+y, data=d3, pretty=TRUE, region = TRUE)
# An interesting alternative is levelplot()
levelplot(z ~ x+y, pretty=TRUE, contour=TRUE, data=d3)

# There is a contour() function in R. Even though it sounds obvious
# for the purpose, it is a bit hard to use.
# contour() wants 3 inputs: vectors of x and y values, and a matrix of
# z values, where the x values correspond to the rows of z, and the y
# values to the columns. A collection of points like `points' above
# needs to be turned into such a grid. It might sound odd, but contour()
# image() and persp() have used this kind of input for the longest time.
#
# For irregular data, there's an interp function in the akima package
# that can convert from irregular data into the grid format.
#
# The `points' object that I have above - a list of (x,y,z) points -
# fits directly into the mentality of lattice::contourplot() but not
# into the requirements of contour().
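# A small sketch of the contour() route described above, using interp() from
# the akima package (assuming akima is installed) to put the (x,y,z) triples
# onto the grid layout that contour(), image() and persp() expect:
library(akima)
g <- interp(d3$x, d3$y, d3$z)    # g$x and g$y are grid vectors; g$z is a matrix
contour(g$x, g$y, g$z, xlab="x", ylab="y")
image(g$x, g$y, g$z)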

Display of a macroeconomic time-series, with a filled colour

# bar showing a recession.
years <- 1950:2000
timeseries <- cumsum(c(100, runif(50)*5))
hilo <- range(timeseries)
plot(years, timeseries, type="l", lwd=3)
# A recession from 1960 to 1965 --
polygon(x=c(1960,1960, 1965,1965), y=c(hilo, rev(hilo)),
        density=NA, col="orange", border=NA)
lines(years, timeseries, type="l", lwd=3)   # paint again so line comes on top
# alternative method -- though not as good looking --
# library(plotrix)
# gradient.rect(1960, hilo[1], 1965, hilo[2],
#               reds=c(0,1), greens=c(0,0), blues=c(0,0),
#               gradient="y")

Display two series on one plot, one with a left y axis

# and another with a right y axis.
y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n",
       lty=c(1,1), col=c("blue","black"),
       legend=c("String 1 (left scale)", "String 2 (right scale)"))

Simulate a dataset from the OLS model and obtain

# Simulate a dataset from the OLS model and obtain OLS estimates for it.
x <- runif(100, 0, 10)        # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100)     # beta = [2, 3] and sigma = 1
# You want to just look at OLS results?
summary(lm(y ~ x))
# Suppose x and y were packed together in a data frame --
D <- data.frame(x,y)
summary(lm(y ~ x, D))

# Full and elaborate steps --
d <- lm(y ~ x)
# Learn about this object by saying ?lm and str(d)
# Compact model results --
print(d)
# Pretty graphics for regression diagnostics --
par(mfrow=c(2,2))
plot(d)
d <- summary(d)
# Detailed model results --
print(d)
# Learn about this object by saying ?summary.lm and by saying str(d)
cat("OLS gave slope of ", d$coefficients[2,1],
    "and an error sigma of ", d$sigma, "\n")

## I need to drop down to a smaller dataset now --
x <- runif(10)
y <- 2 + 3*x + rnorm(10)
m <- lm(y ~ x)
# Now R supplies a wide range of generic functions which extract
# useful things out of the result of estimation of many kinds of models.
residuals(m)
fitted(m)
AIC(m)
AIC(m, k=log(10))       # SBC
vcov(m)
logLik(m)

"Dummy variables" in regression

# Suppose you have this data:
people = data.frame(
  age = c(21,62,54,49,52,38),
  education = c("college", "school", "none", "school", "college", "none"),
  education.code = c( 2, 1, 0, 1, 2, 0 )
)
# Here people$education is a string categorical variable and
# people$education.code is the same thing, with a numerical coding system.
people
# Note the structure of the dataset --
str(people)
# The strings supplied for `education' have been treated (correctly) as
# a factor, but education.code is being treated as an integer and not as
# a factor.

# We want to do a dummy variable regression. Normally you would have:
#   1 Chosen college as the omitted category
#   2 Made a dummy for "none" named educationnone
#   3 Made a dummy for "school" named educationschool
#   4 Ran a regression like lm(age ~ educationnone + educationschool, people)
# But this is R. Things are cool:
lm(age ~ education, people)
# ! :-)
# When you feed him an explanatory variable like education, he does all
# these steps automatically. (He chose college as the omitted category).

# If you use an integer coding, then the obvious thing goes wrong --
lm(age ~ education.code, people)
# because he's thinking that education.code is an integer explanatory
# variable. So you need to:
lm(age ~ factor(education.code), people)
# (he chose a different omitted category)
# Alternatively, fix up the dataset --
people$education.code <- factor(people$education.code)
lm(age ~ education.code, people)

# Bottom line:
# Once the dataset has categorical variables correctly represented as
# factors, i.e. as
str(people)
# doing OLS in R induces automatic generation of dummy variables while
# leaving one out:
lm(age ~ education, people)
lm(age ~ education.code, people)

# But what if you want the X matrix?
m <- lm(age ~ education, people)
model.matrix(m)
# This is the design matrix that went into the regression m.

latex table with results of an OLS regression

# Get an OLS --
x1 = runif(100)
x2 = runif(100, 0, 2)
y = 2 + 3*x1 + 4*x2 + rnorm(100)
m = lm(y ~ x1 + x2)
# and print it out prettily --
library(xtable)
# Bare --
xtable(m)
xtable(anova(m))
# Better --
print.xtable(xtable(m, caption="My regression",
                    label="t:mymodel", digits=c(0,3,2,2,3)),
             type="latex", file="xtable_demo_ols.tex",
             table.placement = "tp",
             latex.environments=c("center", "footnotesize"))
print.xtable(xtable(anova(m), caption="ANOVA of my regression",
                    label="t:anova_mymodel"),
             type="latex", file="xtable_demo_anova.tex",
             table.placement = "tp",
             latex.environments=c("center", "footnotesize"))
# Read the documentation of xtable. It actually knows how to generate
# pretty latex tables for a lot more R objects than just OLS results.
# It can be a workhorse for making tabular out of matrices, and
# can also generate HTML.

Simulate a dataset from a "fixed effects" model

# and obtain "least squares dummy variable" (LSDV) estimates.
#
# We do this in the context of a familiar "earnings function" -
# log earnings is quadratic in experience, with parallel shifts by
# education category.

# Create an education factor with 4 levels --
education <- factor(sample(1:4,1000, replace=TRUE),
                    labels=c("none", "school", "college", "beyond"))
# Simulate an experience variable with a plausible range --
experience <- 30*runif(1000)         # experience from 0 to 30 years
# Make the intercept vary by education category between 4 given values --
intercept <- c(0.5,1,1.5,2)[education]
# Simulate the log earnings --
log.earnings <- intercept + 2*experience - 0.05*experience*experience +
  rnorm(1000)
A <- data.frame(education, experience, e2=experience*experience, log.earnings)
summary(A)
# The OLS path to LSDV --
summary(lm(log.earnings ~ -1 + education + experience + e2, A))

Using data from Yahoo finance

# for weekly returns, estimate the beta of Sun Microsystems.
# This is the `elaborate version' (36 lines); also see the terse version (16 lines).
library(tseries)
# I know that the yahoo symbol for the common stock of Sun Microsystems
# is "SUNW" and for the S&P 500 index is "^GSPC".
prices <- cbind(get.hist.quote("SUNW", quote="Adj", start="2003-01-01",
                               retclass="zoo"),
                get.hist.quote("^GSPC", quote="Adj", start="2003-01-01",
                               retclass="zoo"))
colnames(prices) <- c("SUNW", "SP500")
prices <- na.locf(prices)       # Copy last traded price when NA
# To make weekly returns, you must have this incantation:
nextfri.Date <- function(x) 7 * ceiling(as.numeric(x - 1)/7) + as.Date(1)
# and then say
weekly.prices <- aggregate(prices, nextfri.Date, tail, 1)
# Now we can make weekly returns --
r <- 100*diff(log(weekly.prices))
# Now shift out of zoo to become an ordinary matrix --
r <- coredata(r)
rj <- r[,1]
rM <- r[,2]
d <- lm(rj ~ rM)                # Market model estimation.
print(summary(d))
# Make a pretty picture
big <- max(abs(c(rj, rM)))
range <- c(-big, big)
plot(rM, rj, xlim=range, ylim=range,
     xlab="S&P 500 weekly returns (%)", ylab="SUNW weekly returns (%)")
grid()
abline(h=0, v=0)
lines(rM, d$fitted.values, col="blue")

Terse version of estimating the beta

# using weekly returns and data from Yahoo finance.
# By Gabor Grothendieck.
library(tseries)
getstock <- function(x)
  c(get.hist.quote(x, quote = "Adj", start = "2003-01-01", compress = "w"))
r <- diff(log(cbind(sp500 = getstock("^gspc"), sunw = getstock("sunw"))))
mm <- lm(sunw ~ ., r)
print(summary(mm))
range <- range(r, -r)
plot(r[,1], r[,2], xlim = range, ylim = range,
     xlab = "S&P 500 weekly returns (%)", ylab = "SUNW weekly returns (%)")
grid()
abline(mm, h = 0, v = 0, col = "blue")

nonlinear regression, in three ways

# By just supplying the function to be fit,
# By also supplying the analytical derivatives, and
# By having him analytically differentiate the function to be fit.
#
# John Fox has a book "An R and S+ companion to applied regression"
# (abbreviated CAR).
# An appendix associated with this book, titled
# "Nonlinear regression and NLS"
# is up on the web, and I strongly recommend that you go read it.
#
# This file is essentially from there (I have made slight changes).

# First take some data - from the CAR book --
library(car)
data(US.pop)
attach(US.pop)
plot(year, population, type="l", col="blue")
# So you see, we have a time-series of the US population. We want to
# fit a nonlinear model to it.
library(stats)          # Contains nonlinear regression
time <- 0:20
pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)),
               start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE)
# You just write in the formula that you want to fit, and supply
# starting values. "trace=TRUE" makes him show iterations go by.
summary(pop.mod)
# Add in predicted values into the plot
lines(year, fitted.values(pop.mod), lwd=3, col="red")
# Look at residuals
plot(year, residuals(pop.mod), type="b")
abline(h=0, lty=2)

# Using analytical derivatives:
model <- function(beta1, beta2, beta3, time) {
  m <- beta1/(1+exp(beta2+beta3*time))
  term <- exp(beta2 + beta3*time)
  gradient <- cbind((1+term)^-1,
                    -beta1*(1+term)^-2 * term,
                    -beta1*(1+term)^-2 * term * time)
  attr(m, 'gradient') <- gradient
  return(m)
}
summary(nls(population ~ model(beta1, beta2, beta3, time),
            start=list(beta1=350, beta2=4.5, beta3=-0.3)))

# Using analytical derivatives, using automatic differentiation (!!!):
model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)),    # rhs of model
               c('beta1', 'beta2', 'beta3'),           # parameter names
               function(beta1, beta2, beta3, time){}   # arguments for result
               )
summary(nls(population ~ model(beta1, beta2, beta3, time),
            start=list(beta1=350, beta2=4.5, beta3=-0.3)))

Some of the standard tests

# A classical setting --
x <- runif(100, 0, 10)          # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100)       # beta = [2, 3] and sigma is 1
d <- lm(y ~ x)
# Classical OLS results --
summary(d)

library(sandwich)
library(lmtest)
# Durbin-Watson test --
dwtest(d, alternative="two.sided")
# Breusch-Pagan test --
bptest(d)
# Heteroscedasticity and autocorrelation consistent (HAC) tests
coeftest(d, vcov=kernHAC)

# Transplant the HAC values back in --
library(xtable)
sum.d <- summary(d)
xtable(sum.d)
sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4]
xtable(sum.d)

Experiment with fitting nonlinear functional forms

# in OLS, using orthogonal polynomials to avoid difficulties with
# near-singular design matrices that occur with ordinary polynomials.
# Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006.

# We will deal with noisy data from the d.g.p. y = sin(x) + e
x <- seq(0, 2*pi, length.out=50)
set.seed(101)
y <- sin(x) + 0.3*rnorm(50)
basicplot <- function(x, y, minx=0, maxx=3*pi, title="") {
  plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title)
  lines(x, sin(x), col="blue", lty=2, lwd=2)
  abline(h=0, v=0)
}
x.outsample <- seq(0, 3*pi, length.out=100)

# Severe multicollinearity with ordinary polynomials
x2 <- x*x
x3 <- x2*x
x4 <- x3*x
cor(cbind(x, x2, x3, x4))
# and a perfect design matrix using orthogonal polynomials
m <- poly(x, 4)
all.equal(cor(m), diag(4))      # Correlation matrix is I.

par(mfrow=c(2,2))
# Ordinary polynomial regression --
p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(p)
basicplot(x, y, title="Polynomial, insample")              # Data
lines(x, fitted(p), col="red", lwd=3)                      # In-sample
basicplot(x, y, title="Polynomial, out-of-sample")
predictions.p <- predict(p, list(x = x.outsample))         # Out-of-sample
lines(x.outsample, predictions.p, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# As expected, polynomial fitting gives terrible results out of sample.

# The IDENTICAL thing, now using orthogonal polynomials --
d <- lm(y ~ poly(x, 4))
summary(d)
basicplot(x, y, title="Orth. poly., insample")             # Data
lines(x, fitted(d), col="red", lwd=3)                      # In-sample
basicplot(x, y, title="Orth. poly., out-of-sample")
predictions.op <- predict(d, list(x = x.outsample))        # Out-of-sample
lines(x.outsample, predictions.op, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)

# predict(d) is magical! See ?SafePrediction
# The story runs at two levels. First, when you do an OLS model,
# predict()ion requires applying coefficients to an appropriate
# X matrix. But one level deeper, the polynomial or orthogonal-polynomial
# needs to be utilised for computing the X matrix based on the
# supplied x.outsample data.
# If you say p <- poly(x, n)
# then you can say predict(p, new) where predict.poly() gets invoked.
# And when you say predict(lm()), the full steps are worked out for
# you automatically: predict.poly() is used to make an X matrix and
# then prediction based on the regression results is done.
all.equal(predictions.p, predictions.op)   # Both paths are identical for this
                                           # (tame) problem.

R syntax where model specification is an argument to a function

# Invent a dataset
x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100)
D <- data.frame(x=x, y=y, z=z)
amodel <- function(modelstring) {
  summary(lm(modelstring, D))
}
amodel(z ~ x)
amodel(z ~ y)

Show the efficiency of the mean when compared with the median

# using a large simulation where both estimators are applied on
# a sample of U(0,1) uniformly distributed random numbers.
one.simulation <- function(N=100) {     # N defaults to 100 if not supplied
  x <- runif(N)
  return(c(mean(x), median(x)))
}
# Simulation --
results <- replicate(100000, one.simulation(20))  # Gives back a 2x100000 matrix
# Two kernel densities --
k1 <- density(results[1,])              # results[1,] is the 1st row
k2 <- density(results[2,])
# A pretty picture --
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
       lty=c(1,1), col=c("black", "red"),
       legend=c("Mean", "Median"))

Simulation to study size and power in a simple problem

set.seed(101)
# The data generating process: a simple uniform distribution with stated mean
dgp <- function(N,mu) {runif(N)-0.5+mu}

# Simulate one FIXED hypothesis test for H0:mu=0, given a true mu, for a
# sample size N
one.test <- function(N, truemu) {
  x <- dgp(N,truemu)
  muhat <- mean(x)
  s <- sd(x)/sqrt(N)
  # Under the null, the distribution of the mean has standard error s
  threshold <- 1.96*s
  (muhat < -threshold) || (muhat > threshold)
}       # Return of TRUE means reject the null

# Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with
# a sample size N. We return only one number: the fraction of the time that
# H0 is rejected.
experiment <- function(Nexperiments, N, truemu) {
  sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments
}

# Measure the size of a test, i.e. rejections when H0 is true
experiment(10000, 50, 0)   # Measurement with sample size of 50, and true mu of 0.

# Power study: I.e. Pr(rejection) when H0 is false
# (one special case in here is when the H0 is actually true)
muvalues <- seq(-.15,.15,.01)
# When true mu < -0.15 and when true mu > 0.15,
# the Pr(rejection) veers to 1 (full power) and it's not interesting.

# First do this with sample size of 50
results <- NULL
for (truth in muvalues) {
  results <- c(results, experiment(10000, 50, truth))
}
par(mai=c(.8,.8,.2,.2))
plot(muvalues, results, type="l", lwd=2, ylim=c(0,1),
     xlab="True mu", ylab="Pr(Rejection of H0:mu=0)")
abline(h=0.05, lty=2)
# Now repeat this with sample size of 100 (should yield a higher power)
results <- NULL
for (truth in muvalues) {
  results <- c(results, experiment(10000, 100, truth))
}
lines(muvalues, results, lwd=2, col="blue")
legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8,
       col=c("black","black","blue"), bty="n",
       legend=c("N=50", "Size, 0.05", "N=100"))

Do bootstrap inference, as an example, for a sample median

library(boot)
samplemedian <- function(x, d) {   # d is a vector of integer indexes
  return(median(x[d]))             # The genius is in the x[d] notation
}
data <- rnorm(50)                  # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000)   # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)
# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")

OLS by MLE

# OLS likelihood function
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
  beta <- theta[-1]
  sigma2 <- theta[1]
  if (sigma2 <= 0) return(NA)
  n <- nrow(X)
  e <- y - X%*%beta
  # t() = matrix transpose
  logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2))
  return(-logl)          # since optim() does minimisation by default.
}

# Analytical derivatives
ols.gradient <- function(theta, y, X) {
  beta <- theta[-1]
  sigma2 <- theta[1]
  e <- y - X%*%beta
  n <- nrow(X)
  g <- numeric(length(theta))
  g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2)   # d logl / d sigma2
  g[-1] <- (t(X) %*% e)/sigma2                             # d logl / d beta
  return(-g)
}

X <- cbind(1, runif(1000))
theta.true <- c(2,4,6)     # error variance = 2, intercept = 4, slope = 6.
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)

# Estimation by OLS --
d <- summary(lm(y ~ X[,2]))
theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1])
cat("OLS theta = ", theta.ols, "\n\n")

cat("\nGradient-free (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1,
      lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\nUsing the gradient (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
      lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)

cat("\n\nYou say you want a covariance matrix?\n")
p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
           lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3),
           hessian=TRUE, y=y, X=X)
inverted <- solve(p$hessian)
results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted)))
colnames(results) <- c("Coefficient", "Std. Err.", "t")
rownames(results) <- c("Sigma", "Intercept", "X")
cat("MLE results --\n")
print(results)
cat("Compare with the OLS results --\n")
d$coefficients

# Picture of how the loglikelihood changes if you perturb the sigma
theta <- theta.ols
delta.values <- seq(-1.5, 1.5, .01)
logl.values <- as.numeric(lapply(delta.values,
                                 function(x) {-ols.lf1(theta+c(x,0,0),y,X)}))
plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue",
     xlab="Sigma", ylab="Log likelihood")
grid()

To do `moving window volatility' of returns

library(zoo)
# Some data to play with (Nifty on all fridays for calendar 2004) --
p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6,
  1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95,
  1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2,
  1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609,
  1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9,
  1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5),
  index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468,
    12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545,
    12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622,
    12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699,
    12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776,
    12783), class = "Date"),
  frequency = 0.142857142857143, class = c("zooreg", "zoo"))
# Shift to returns --
r <- 100*diff(log(p))
head(r)
summary(r)
sd(r)
# Compute the moving window vol --
# (the data is weekly, so annualise the 20-week rolling sigma with sqrt(52))
vol <- sqrt(52) * rollapply(r, 20, sd, align = "right")
# A pretty plot --
plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)), lwd=2, col="purple",
     xlab="2004", ylab="Annualised sigma, 20-week window")
grid()
legend(x="bottomleft", col=c("purple", "darkgreen"), lwd=c(2,2), bty="n", cex=0.8,
       legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)"))
par(new=TRUE)
plot(p, type="l", lwd=2, col="darkgreen", xaxt="n", yaxt="n", xlab="", ylab="")
axis(4)

Joint distributions, marginal distributions, useful tables

# First let me invent some fake data
set.seed(102)     # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20,max=60)
D <- data.frame(age, gender, education)
rm(x,age,gender,education)
print(D)

# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)
# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)
# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
       digits=c(0,2,2,2,2))
# You have to do | and \hline manually.

# Study age by gender --
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)
# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))

Get the data in place

load(file="demo.rda")
summary(firms)
# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
     xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m=lm(log(spread) ~ log(mktcap), firms)
summary(m)

# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap,means$spread))
# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
               function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
       col=c("blue", "green", "red"),
       lty=c(0, 1, 1), lwd=c(0,2,2), pch=c(0,19,19),
       legend=c("firm", "Mean spread in size deciles",
                "Size weighted mean spread in size deciles"))

# Within group standard deviations --
aggregate(firms, list(size.category), sd)

# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)
# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)
# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)

Scare the hell out of children with the Cauchy distribution

# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
  if (distribution == "normal") {
    x <- rnorm(N)
  } else {
    x <- rcauchy(N)
  }
  mean(x)
}
k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=0)         # Both distributions are centred at 0
legend(x="topleft", bty="n",
       lty=c(1,1), col=c("black", "red"),
       legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.

# Here's more scary stuff --
for (i in 1:10) {
  cat("Sigma of distribution of 1000 draws from mean of normal - ",
      sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
  cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
      sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}

# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.

example of simulation-based inference

# This is in the context of testing for time-series dependence in
# stock market returns data.
# The code here does the idea of Kim, Nelson, Startz (1991).
# We want to use the distribution of realworld returns data, without
# needing assumptions about normality.
# The null is lack of dependence (i.e. an efficient market).
# So repeatedly, the data is permuted, and the sample ACF is computed.
# This gives us the distribution of the ACF under H0: independence, but
# while using the empirical distribution of the returns data.

# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.
r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984,
  3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852,
  0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242,
  1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111,
  -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217,
  -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374,
  -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767,
  -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590,
  1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265,
  -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425,
  0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603,
  3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336,
  1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354,
  -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417,
  1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478,
  -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018,
  1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740,
  1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103,
  3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674,
  2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554,
  1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427,
  4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417,
  -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987,
  2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966,
  -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380,
  3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)

# The 1st autocorrelation from the sample:
acf(r, 1, plot=FALSE)$acf[2]
# Obtain 1000 draws from the distribution of the 1st autocorrelation
# under the null of independence:
set.seed(101)
simulated <- replicate(1000,
                       acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])
# At 95% --
quantile(simulated, probs=c(.025,.975))
# At 99% --
quantile(simulated, probs=c(.005,.995))
# So we can reject the null at 95% but not at 99%.

# A pretty picture.
plot(density(simulated), col="blue")
abline(v=0)
abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")

Standard computations with well-studied distributions

# The normal distribution is named "norm". With this, we have:
# Normal density
dnorm(c(-1.96,0,1.96))
# Cumulative normal distribution
pnorm(c(-1.96,0,1.96))
# Inverse of this
qnorm(c(0.025,.5,.975))
pnorm(qnorm(c(0.025,.5,.975)))
# 1000 random numbers from the normal distribution
summary(rnorm(1000))
# Here are the same ideas, for the chi-squared distribution with 10 degrees
# of freedom.
dchisq(c(0,5,10), df=10)
# Cumulative chi-squared distribution
pchisq(c(0,5,10), df=10)
# Inverse of this
qchisq(c(0.025,.5,.975), df=10)
# 1000 random numbers from the chi-squared distribution
summary(rchisq(1000, df=10))
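
# A small hedged sketch (my addition): the d/p/q/r naming convention works for
# every built-in distribution, e.g. Student's t, and the q* functions give
# critical values directly.
qt(c(0.025, 0.975), df=30)       # 95% critical values of t with 30 d.f.
2 * pnorm(-abs(1.96))            # two-sided normal p-value for z = 1.96
qnorm(0.975) * 1/sqrt(100)       # half-width of a 95% CI for a mean,
                                 # assuming sigma=1 and N=100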

Two-sample Kolmogorov-Smirnov test

# Goal: Given two vectors of data,
#   superpose their CDFs
#   and show the results of the two-sample Kolmogorov-Smirnov test.
# The function consumes two vectors x1 and x2.
# You have to provide a pair of labels as `legendstrings'.
# If you supply an xlab, it's used.
# If you specify log - e.g. log="x" - this is passed on to plot().
# The remaining args that you specify are sent on into ks.test().
two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) {
  stopifnot(length(x1)>0, length(x2)>0, length(legendstrings)==2)
  hilo <- range(c(x1,x2))
  par(mai=c(.8,.8,.2,.2))
  plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0,
       xlab=xlab, log=log, ylab="Cum. distribution", main="")
  grid()
  plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3)
  legend(x="bottomright", lwd=c(1,3), lty=1, bty="n", legend=legendstrings)
  k <- ks.test(x1, x2, ...)
  text(x=hilo[1], y=c(.9,.85), pos=4, cex=.8,
       labels=c(paste("KS test statistic: ", sprintf("%.3g", k$statistic)),
                paste("Prob value: ", sprintf("%.3g", k$p.value))))
  k
}
x1 <- rnorm(100, mean=7, sd=1)
x2 <- rnorm(100, mean=9, sd=1)
# Check error detection --
two.cdfs.plot(x1,x2)
# Typical use --
two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x")
# Send args into ks.test() --
two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less")
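
# A brief hedged aside (my addition): ks.test() also does one-sample tests
# against a named theoretical distribution, which is handy for checking a
# distributional assumption directly.
ks.test(x1, "pnorm", mean=7, sd=1)   # H0: x1 is drawn from N(7,1)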

To read in a simple data file, and look around its contents

# Suppose you have a file "x.data" which looks like this:
#   1997,3.1,4
#   1998,7.2,19
#   1999,1.7,2
#   2000,1.1,13
# To read it in --
A <- read.table("x.data", sep=",", col.names=c("year", "my1", "my2"))
nrow(A)                      # Count the rows in A
summary(A$year)              # The column "year" in data frame A
                             # is accessed as A$year
A$newcol <- A$my1 + A$my2    # Makes a new column in A
newvar <- A$my1 - A$my2      # Makes a new R object "newvar"
A$my1 <- NULL                # Removes the column "my1"
# You might find these useful, to "look around" a dataset --
str(A)
summary(A)
library(Hmisc)               # This requires that you've installed the Hmisc package
contents(A)
describe(A)
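
# If you want to try the snippet above without hunting for a file, here is a
# hedged sketch (my addition) that manufactures "x.data" in the current
# directory:
writeLines(c("1997,3.1,4",
             "1998,7.2,19",
             "1999,1.7,2",
             "2000,1.1,13"),
           con="x.data")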

To read in a simple data file where date data is present

# Suppose you have a file "x.data" which looks like this:
#   1997-07-04,3.1,4
#   1997-07-05,7.2,19
#   1997-07-07,1.7,2
#   1997-07-08,1.1,13
A <- read.table("x.data", sep=",", col.names=c("date", "my1", "my2"))
A$date <- as.Date(A$date, format="%Y-%m-%d")
# Say ?strptime to learn how to use "%" to specify other date formats.
# Two examples --
#   "15/12/2002" needs "%d/%m/%Y"
#   "03 Jun 1997" needs "%d %b %Y"
# Actually, if you're using the ISO 8601 date format, i.e. "%Y-%m-%d",
# that's the default setting and you don't need to specify the format.
A$newcol <- A$my1 + A$my2    # Makes a new column in A
newvar <- A$my1 - A$my2      # Makes a new R object "newvar"
A$my1 <- NULL                # Delete the `my1' column
summary(A)                   # Makes summary statistics
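
# A hedged shortcut (my addition): since ISO 8601 is the default Date format,
# read.table() can parse the column on the way in via colClasses.
A <- read.table("x.data", sep=",",
                col.names=c("date", "my1", "my2"),
                colClasses=c("Date", "numeric", "numeric"))
str(A$date)                  # already of class "Date"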

To read in files produced by CMIE's "Business Beacon"

# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
     # Generic to all BB files --
     sep="|",                       # CMIE's .txt file is pipe delimited
     skip=3,                        # Skip the 1st 3 lines
     na.strings=c("N.A.","Err"),    # The ways they encode missing data
     # Specific to your immediate situation --
     file="bb_data.text",
     col.names=c("junk", "date", "M3", "M0")
     )
A$junk <- NULL                      # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")

Reading and writing ascii files, reading and writing binary files

# Goal: read and write ascii and binary files, and measure how much faster it
# is to work with binary files.
# First manufacture a tall data frame:
# FYI -- runif(10) yields 10 U(0,1) random numbers.
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)
# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
# Read in this resulting ascii file:
C = read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1)
# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C)                          # should yield the same results
                                    # as summary(B) above.
# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
  B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})
cat("Time writing an ascii file out of dataset B:\n")
system.time(write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA))
cat("Time reading an ascii file into dataset C:\n")
system.time({
  C = read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1)
})
cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))
cat("Time reading a binary file + variable names from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary"))   # and then read it in from binary file
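
# A hedged aside (my addition): for a single object, saveRDS()/readRDS() is a
# convenient alternative to save()/load() -- the reader chooses the name on
# the way back in, instead of having the old name imposed.
saveRDS(B, file="/tmp/foo.rds")
D <- readRDS("/tmp/foo.rds")        # "D" is a name of my choosing
identical(B, D)                     # should be TRUE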

Give an R object to a friend, or embed data into an email

# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.
# First I invent a little dataset --
set.seed(101)       # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)
# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda")   # You can give this file to a friend
load("/tmp/my_data_file.rda")
# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
  0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130,
  0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363),
  x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627,
  0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555,
  0.224118480458856, 0.411666829371825, 0.0386105608195066),
  x3 = c(0.700711545301601, 0.956837461562827, 0.213352001970634,
  0.661061500199139, 0.923318882007152, 0.795719761401415, 0.0712125543504953,
  0.389407767681405, 0.406451216200367, 0.659355078125373)),
  .Names = c("x1", "x2", "x3"),
  row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
  class = "data.frame")
# Verify that A and B are near-identical --
A - B
# or,
all.equal(A, B)
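
# A hedged variant (my addition): dput() can also write straight to a file,
# and dget() reads it back -- another pure-ascii transport path.
dput(A, file="/tmp/my_data_file.R")
B2 <- dget("/tmp/my_data_file.R")   # "B2" is a name of my choosing
all.equal(A, B2)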

Make a time-series object using the "zoo" package

A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
                x=runif(4), y=runif(4))
A$date <- as.Date(A$date)   # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().
# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)
# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky
# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
#   z <- read.zoo(filename, format="%d %b %Y")
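
# A hedged sketch of a few everyday zoo operations (my addition) -- all of
# these respect the date index:
window(z, start=as.Date("1995-01-02"))   # subset by date
lag(z, -1)                               # previous observation, realigned
diff(z)                                  # first differences
merge(z, lag(z, -1))                     # align two series by date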

All manner of import and export of datasets

# Invent a dataset --
A <- data.frame(
     name=c("a","b","c"),
     ownership=c("Case 1","Case 1","Case 2"),
     listed.at=c("NSE",NA,"BSE"),    # Firm "b" is unlisted.
     is.listed=c(TRUE,FALSE,TRUE),   # R convention - boolean variables are named "is.something"
     x=c(2.2,3.3,4.4),
     date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
     )
# To a spreadsheet through a CSV file --
write.table(A, file="demo.csv", sep=",", col.names=NA, qmethod="double")
B <- read.table("demo.csv", header=TRUE, sep=",", row.names=1)
# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")
# To the Open XML standard for transport for statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")
# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.

Special cases in reading files

# Reading in a .bz2 file --
read.table(bzfile("file.text.bz2"))   # Requires you have ./file.text.bz2
# Reading in a .gz file --
read.table(gzfile("file.text.gz"))    # Requires you have ./file.text.gz
# Reading from a pipe --
mydata <- read.table(pipe("awk -f filter.awk input.txt"))
# Reading from a URL --
read.table(url("http://www.mayin.org/ajayshah/A/demo.text"))
# This also works --
read.table("http://www.mayin.org/ajayshah/A/demo.text")
# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about:
read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2"))
# Reading binary files from a URL --
load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda"))
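
# A hedged, shell-free alternative for the .bz2-over-URL case (my addition):
# download to a temporary file, then hand bzfile() the local copy.
tf <- tempfile()
download.file("http://www.mayin.org/ajayshah/A/demo.text.bz2", destfile=tf)
read.table(bzfile(tf))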

Reading in a Microsoft .xls file directly

library(gdata)
a <- read.xls("file.xls", sheet=2)   # This reads in the 2nd sheet
# Look at what the cat dragged in
str(a)
# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL
# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.

ARMA modeling - estimation, diagnostics, forecasting

# 0. SETUP DATA
rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,
             6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,
             -0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,
             -0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,
             15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,
             11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,
             4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,
             4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,
             4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,
             6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,
             2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,
             4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,
             4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,
             8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,
             8.44,10.92,11.67)
# Make a R timeseries out of the rawdata: specify frequency & startdate
gIIP <- ts(rawdata, frequency=12, start=c(1991,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
        main="Full data")
grid()
# Based on this, I decide that 4/1995 is the start of the sensible period.
gIIP <- window(gIIP, start=c(1995,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
        main="Estimation subset")
grid()
# Descriptive statistics about gIIP
mean(gIIP); sd(gIIP); summary(gIIP)
plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")
acf(gIIP)

# 1. ARMA ESTIMATION
m.ar2 <- arima(gIIP, order = c(2,0,0))
print(m.ar2)                        # Print it out

# 2. ARMA DIAGNOSTICS
tsdiag(m.ar2)                       # His pretty picture of diagnostics
## Time series structure in errors
print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box"))
## Sniff for ARCH
print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box"))
## Eyeball distribution of residuals
plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8),
     main=paste("Residuals of AR(2)"))

# 3. FORECASTING
## Make a picture of the residuals
plot.ts(m.ar2$residual, ylab="Innovations", col="blue", lwd=2)
s <- sqrt(m.ar2$sigma2)
abline(h=c(-s,s), lwd=2, col="lightGray")
p <- predict(m.ar2, n.ahead = 12)   # Make 12 predictions.
print(p)
## Watch the forecastability decay away from fat values to 0.
## sd(x) is the naive sigma. p$se is the prediction s.e.
gain <- 100*(1-p$se/sd(gIIP))
plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",
        col="blue", lwd=2)
## Make a pretty picture that puts it all together
ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se,
        gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),
                   ylab="IIP growth (%)",
                   col=c("blue","red", "red", "red")))
grid()
abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")
legend(x="bottomleft", cex=0.8, bty="n",
       lty=c(1,1,2,2), lwd=c(2,1,1,2),
       col=c("blue", "red", "red", "lightGray"),
       legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))
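
# A hedged sketch of basic model selection (my addition): compare AIC across a
# few candidate ARMA orders before settling on the AR(2).
for (p in 0:3) {
  for (q in 0:1) {
    m <- arima(gIIP, order=c(p,0,q))
    cat("ARMA(", p, ",", q, ")  AIC =", m$aic, "\n")
  }
}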