R examples
BASICS
# Setup some scalars
x <- 3
y <- 4
z <- sqrt(x*x + y*y)
use.dots.in.names <- z - 5
# Print out z
print(z)
# The default action is print()
z
# So you have a neat scientific calculator
sin(log(2.718281828)*pi)
SCALARS
# The basic scalar types
x <- 3
take.me <- FALSE
s <- "this is a string"
x
take.me
s
# Exploring objects using str() --
str(x)
str(take.me)
str(s)
DATES: A SPECIAL KIND OF SCALAR
s <- "2004-04-22"
d <- as.Date(s)
d
str(d)
s <- "22-Apr-2004"
d <- as.Date(s, format="%d-%b-%Y")
d
as.Date("22-04-2004", format="%d-%m-%Y")
as.Date("22-04-04", format="%d-%m-%y")
as.Date("22-Apr-04", format="%d-%b-%y")
VECTORS
x <- c(2,3,7, 9, 10, 11)
# the c() operator, for concatenation
x[1]
x[6]
length(x)
# Help about all R functions is online:
?length
# Arithmetic that gobbles a vector at a time
x
x+2
y <- log(x)
y
# Clever vector addressing
x[1]
x[6]
x[c(1,6)]
indexes <- c(1,3,5)
x[indexes]
# Another shorthand
1:4
x[1:4]
indexes <- 1:4
x[indexes]
# Another shorthand
switches <- c(TRUE,FALSE,FALSE,FALSE,FALSE,TRUE)
x[switches]
# Dropping some elements
x
x[-2]
x[c(-2,-3)]
# Bottom line: Very nice mechanisms for addressing.
# Let's stay in touch with how to explore objects --
str(x)
str(x[1])
FACTORS: A SPECIAL KIND OF SCALAR
names <- c("Payal", "Shraddha", "Aditi", "Kritika", "Diwakar")
attire <- c("Sari", "Salwar", "Sari", "Sari", "Kurta")
# attire is a "categorical variable"
attire <- factor(attire)
attire
table(attire)
table(names, attire)
# Internally, a factor is just stored as an integer
as.numeric(attire)
NOT AVAILABLE: A SPECIAL SCALAR
x <- c(2,3,NA,4,5,NA)
x
x + 2 # Automatic rules about NA arithmetic
MATRICES
X <- matrix(NA, nrow=7, ncol=3)
X
X[4,] <- c(2,3,4)
X
X[,3] <- 1:7
X
str(X)
nrow(X)
ncol(X)
# As with vectors, arithmetic that attacks a full matrix at a time!
X + 1
LISTS
# Ability to make a parcel of apparently unrelated materials.
results <- list(mu=0.2, sigma=0.9, x=c(1,2,3,9))
str(results)
# Accessing elements --
results$mu
results$sigma <- 9
results$new <- "Hello"
results
DAILY TIME-SERIES USING THE `ZOO' PACKAGE
# Zoo is a very powerful, high-level time-series system. To learn more:
library(help=zoo)
vignette("zoo-quickref")
# Let's do a daily time-series
dates <- as.Date(c("2004-04-01", "2004-04-02", "2004-04-03", "2004-04-05"))
values <- rnorm(4)
library(zoo)
z <- zoo(values, order.by=dates)
z
plot(z)
DATASET, TERMED `DATA FRAME' in R
# A dataset is a list of vectors, all of the same length.
names <- c("Payal", "Shraddha", "Aditi", "Kritika", "Diwakar")
age <- c(15, 17, 15, 19, 20)
iq <- c(90, 100, 110, 120, 160)
# By default, data.frame() forces the names to be factors...
D <- data.frame(names=names, age=age, iq=iq)
str(D)
D
nrow(D)
ncol(D)
summary(D)
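# An aside (a sketch): if you prefer to keep the names as plain character
# strings rather than a factor, data.frame() accepts stringsAsFactors --
D2 <- data.frame(names=names, age=age, iq=iq, stringsAsFactors=FALSE)
str(D2)    # `names' is now chr, not a factor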
# Accessing a column in a data frame
D$age
# Accessing an observation in a data frame - use matrix-like notation
D[3,]
# Accessing the 3rd value of age
D$age[3]
WRITING AND THEN READING FILES
D
write.csv(D, file="demo.csv")
system("cat demo.csv") # Say "type" if you're on DOS
E <- read.table("demo.csv", skip=1, sep=",",
col.names=c("obsnum","name","age","iq"))
E
E$obsnum <- NULL
E
# Compare against --
D
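# An aside (a sketch): the `obsnum' column above came from the row names
# that write.csv() stored. Writing with row.names=FALSE avoids that cleanup --
write.csv(D, file="demo.csv", row.names=FALSE)   # overwrites demo.csv
E2 <- read.csv("demo.csv")
E2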
REGRESSION
m <- lm(iq ~ age, D)
m
summary(m)
# Remember that shoe size is strongly correlated with IQ
plot(D$age, D$iq, xlab="Age (years)", ylab="IQ")
abline(h=100, lty=3)
lines(D$age, fitted.values(m), col="blue", lwd=2)
SIMULATION
runif(10)
runif(10)
set.seed(133); runif(10)
set.seed(133); runif(10)
summary(runif(100))
summary(runif(1000))
summary(runif(10000))
summary(runif(100000))
cat("See how large datasets stabilise sample statistics\n")
summary(runif(100)); summary(runif(100))
summary(runif(10000)); summary(runif(10000))
plot(density(runif(1000)))
plot(density(rnorm(1000)))
x <- seq(-4,4,0.01)
lines(x, dnorm(x), col="blue", lwd=2)
SIMPLE SUMMARY STATISTICS
x <- rnorm(10000)
summary(x)
quantile(x, probs=c(0.01, 0.025, 0.25, 0.5, 0.75, 0.975, 0.99))
IQR(x)
fivenum(x)
sd(x)
range(x)
mean(x)
mean(x, trim=0.1) # Delete 10% of data at each end first
REGRESSION ON A SIMULATED DATASET
x <- 2 + 3*runif(100)
summary(x)
y <- -7 + 4*x + rnorm(100)
summary(lm(y ~ x)) # Convince yourself the intercept is ok
# Most statistical functions give back rich objects containing many results.
res.1 <- lm(y ~ x)
str(res.1) # There's a lot of goodies here!
res.1$coefficients
# In this case, detailed calculations require summary() to get what
# I call "level 2 results".
res.2 <- summary(res.1)
str(res.2) # There's even more goodies here!!
res.2$coefficients
res.2$r.squared
res.2$sigma
# There are useful generic functions which work for a lot of models --
logLik(res.1)
AIC(res.1)
TIME SERIES ANALYSIS
# ACF of white noise
z <- rnorm(100)
acf(z)
z <- rnorm(1000)
acf(z)
# Simulate data from an AR(1) process
x <- arima.sim(model=list(ar=c(0.5)), n=1000)
arima(x, order=c(1,0,0))
acf(x)
# Let's try to estimate a wrong model --
arima(x, order=c(3,0,0))
A first look at R objects
- vectors, lists, matrices, data frames.
# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")
# Accessing the 1st and last elements of y --
y[1]
y[length(y)]
# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x
# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)
# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]
sorting
#
# The approach here needs to be explained. If `i' is a vector of
# integers, then the data frame D[i,] picks up rows from D based
# on the values found in `i'.
#
# The order() function makes an integer vector which is a correct
# ordering for the purpose of sorting.
D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2))
D
# Sort on x
indexes <- order(D$x)
D[indexes,]
# Print out sorted dataset, sorted in reverse by y
D[rev(order(D$y)),]
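# An aside (a sketch): order() accepts several keys, so ties in x can be
# broken by y --
D[order(D$x, D$y),]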
Prices and returns
# I like to multiply returns by 100 so as to have "units in percent".
# In other words, I like it for 5% to be a value like 5 rather than 0.05.
# I. Simulate random-walk prices, switch between prices & returns.
# Simulate a time-series of PRICES drawn from a random walk
# where one-period returns are i.i.d. N(mu, sigma^2).
ranrw <- function(mu, sigma, p0=100, T=100) {
cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100)))
}
prices2returns <- function(x) {
100*diff(log(x))
}
returns2prices <- function(r, p0=100) {
c(p0, p0 * exp(cumsum(r/100)))
}
cat("Simulate 25 points from a random walk starting at 1500 --\n")
p <- ranrw(0.05, 1.4, p0=1500, T=25)
# gives you a 25-long series, starting with a price of 1500, where
# one-period returns are N(0.05,1.4^2) percent.
print(p)
cat("Convert to returns--\n")
r <- prices2returns(p)
print(r)
cat("Go back from returns to prices --\n")
goback <- returns2prices(r, 1500)
print(goback)
# II. Plenty of powerful things you can do with returns....
summary(r); sd(r) # summary statistics
plot(density(r)) # kernel density plot
acf(r) # Autocorrelation function
ar(r) # Estimate a AIC-minimising AR model
Box.test(r, lag=2, type="Ljung") # Box-Ljung test
library(tseries)
runs.test(factor(sign(r))) # Runs test
bds.test(r) # BDS test.
# III. Visualisation and the random walk
# I want to obtain intuition into what kinds of price series can happen,
# given a starting price, a mean return, and a given standard deviation.
# This function simulates out 10000 days of a price time-series at a time,
# and waits for you to click in the graph window, after which a second
# series is painted, and so on. Make the graph window very big and
# sit back and admire.
# The point is to eyeball many series and thus obtain some intuition
# into what the random walk does.
visualisation <- function(p0, s, mu, labelstring) {
  N <- 10000
  x <- (1:(N+1))/250                  # Unit of years
  while (1) {
    plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y",
         type="l", col="red", xlab="Time (years)",
         main=paste("40 years of a process much like", labelstring))
    grid()
    z <- locator(1)
  }
}
# Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year
visualisation(2600, 1.4, 13/250, "Nifty")
# The numerical values here are used to think about what the INR/USD
# exchange rate would have looked like if it started from 31.37, had
# a mean depreciation of 5% per year, and had the daily vol of a floating
# exchange rate like EUR/USD.
visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7")
# This is of course not like the INR/USD series in the real world -
# which is neither a random walk nor does it have a vol of 0.7% a day.
# The numerical values here are used to think about what the USD/EUR
# exchange rate, starting with 1, having no drift, and having the observed
# daily vol of 0.7. (This is about right).
visualisation(1, 0.7, 0, "USD/EUR with no drift")
# IV. A monte carlo experiment about the runs test
# Measure the effectiveness of the runs test when faced with an
# AR(1) process of length 100 with a coeff of 0.1
set.seed(101)
one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)}
table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05))
# We find that the runs test throws up a prob value of below 0.05
# for 91 out of 1000 experiments.
# Wow! :-)
# To understand this, you need to look up the man pages of:
# set.seed, arima.sim, sign, factor, runs.test, replicate, table.
# e.g. say ?replicate
write functions
# To write functions that send back multiple objects.
# FIRST LEARN ABOUT LISTS --
X = list(height=5.4, weight=54)
print("Use default printing --")
print(X)
print("Accessing individual elements --")
cat("Your height is ", X$height, " and your weight is ", X$weight, "\n")
# FUNCTIONS --
square <- function(x) {
return(x*x)
}
cat("The square of 3 is ", square(3), "\n")
# default value of the arg is set to 5.
cube <- function(x=5) {
return(x*x*x);
}
cat("Calling cube with 2 : ", cube(2), "\n") # will give 2^3
cat("Calling cube : ", cube(), "\n") # will default to 5^3.
# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS --
powers <- function(x) {
parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x);
return(parcel);
}
X = powers(3);
print("Showing powers of 3 --"); print(X);
# WRITING THIS COMPACTLY (4 lines instead of 7)
powerful <- function(x) {
return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x));
}
print("Showing powers of 3 --"); print(powerful(3));
# In R, the last expression in a function is, by default, what is
# returned. So you could equally just say:
powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)}
R vector notation
cat("EXAMPLE 1: sin(x) for a vector --\n")
# Suppose you have a vector x --
x = c(0.1,0.6,1.0,1.5)
# The bad way --
n = length(x)
r = numeric(n)
for (i in 1:n) {
r[i] = sin(x[i])
}
print(r)
# The good way -- don't use loops --
print(sin(x))
cat("\n\nEXAMPLE 2: Compute the mean of every row of a matrix --\n")
# Here's another example. It isn't really about R; it's about thinking in
# matrix notation. But still.
# Let me setup a matrix --
N=4; M=100;
r = matrix(runif(N*M), N, M)
# So I face an NxM matrix (N rows, M columns)
# [r11 r12 ... r1M]
# [r21 r22 ... r2M]
# [ ...            ]
# [rN1 rN2 ... rNM]
# My goal: each column needs to be reduced to a mean.
# Method 1 uses loops:
mean1 = numeric(M)
for (i in 1:M) {
mean1[i] = mean(r[,i])
}
# Alternatively, just say:
mean2 = rep(1/N, N) %*% r # Pretty!
# The two answers are the same --
all.equal(mean1,mean2[,])
#
# As an aside, I should say that you can do this directly by using
# the colMeans() function. But the above is more about pedagogy rather
# than showing you how to get column means.
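# A minimal sketch of that direct route -- colMeans() reduces each column
# of r to its mean, matching mean1 and mean2 above:
mean3 <- colMeans(r)
all.equal(mean1, mean3)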
cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n")
# Write this asif you're dealing with scalars --
# Nelson Siegel function
nsz <- function(b0, b1, b2, tau, t) {
tmp = t/tau
tmp2 = exp(-tmp)
return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2))
}
timepoints <- c(0.01,1:5)
# The bad way:
z <- numeric(length(timepoints))
for (i in 1:length(timepoints)) {
z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i])
}
print(z)
# The R way --
print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
cat("\n\nEXAMPLE 3: Making the NPV of a bond--\n")
# You know the bad way - sum over all cashflows, NPVing each.
# Now look at the R way.
C = rep(100, 6)
nsz(14.084,-3.4107,0.0015,1.8832,timepoints) # Print interest rates
C/((1.05)^timepoints) # Print cashflows discounted @ 5%
C/((1 + 0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints))^timepoints) # Using NS instead of 5%
# NPV in two different ways --
C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints
sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints)
# You can drop back to a flat yield curve at 5% easily --
sum(C * 1.05^-timepoints)
# Make a function for NPV --
npv <- function(C, timepoints, r) {
return(sum(C * (1 + (0.01*r))^-timepoints))
}
npv(C, timepoints, 5)
# Bottom line: Here's how you make the NPV of a bond with cashflows C
# at timepoints timepoints when the zero curve is a Nelson-Siegel curve --
npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
# Wow!
# ---------------------------------------------------------------------------
# Elegant vector notation is amazingly fast (in addition to being beautiful)
N <- 1e5
x <- runif(N, -3,3)
y <- runif(N)
method1 <- function(x,y) {
  tmp <- NULL
  for (i in 1:N) {
    if (x[i] < 0) tmp <- c(tmp, y[i])
  }
  tmp
}
method2 <- function(x,y) {
  y[x < 0]
}
s1 <- system.time(ans1 <- method1(x,y))
s2 <- system.time(ans2 <- method2(x,y))
all.equal(ans1,ans2)
s1/s2 # On my machine it's 2000x faster
indexing notation, and the use of is.na()
x <- c(2,7,9,2,NA,5) # An example vector to play with.
# Give me elems 1 to 3 --
x[1:3]
# Give me all but elem 1 --
x[-1]
# Odd numbered elements --
indexes <- seq(1,6,2)
x[indexes]
# or, more compactly,
x[seq(1,6,2)]
# Access elements by specifying "on" / "off" through booleans --
require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
x[require]
# Short vectors get reused! So, to get odd numbered elems --
x[c(TRUE,FALSE)]
# Locate missing data --
is.na(x)
# Replace missing data by 0 --
x[is.na(x)] <- 0
x
# Similar ideas work for matrices --
y <- matrix(c(2,7,9,2,NA,5), nrow=2)
y
# Make a matrix containing columns 1 and 3 --
y[,c(1,3)]
# Let us see what is.na(y) does --
is.na(y)
str(is.na(y))
# So is.na(y) gives back a matrix with the same structure as y.
# Hence I can say
y[is.na(y)] <- -1
y
latex tabular out of an R matrix
# Setup a nice R object:
m <- matrix(rnorm(8), nrow=2)
rownames(m) <- c("Age", "Weight")
colnames(m) <- c("Person1", "Person2", "Person3", "Person4")
m
# Translate it into a latex tabular:
library(xtable)
xtable(m, digits=rep(3,5))
# Production latex code that goes into a paper or a book --
print(xtable(m,
caption="String",
label="t:"),
type="latex",
file="blah.gen",
table.placement="tp",
latex.environments=c("center", "footnotesize"))
# Now you do \input{blah.gen} in your latex file.
# You're lazy, and want to use R to generate latex tables for you?
data <- cbind(
c(7,9,11,2),
c(2,4,19,21)
)
colnames(data) <- c("a","b")
rownames(data) <- c("x","y","z","a")
xtable(data)
# or you could do
data <- rbind(
c(7,2),
c(9,4),
c(11,19),
c(2,21)
)
# and the rest goes through identically.
Associative arrays (as in awk) or hashes (as in perl)
# Or, more generally, adventures in R addressing.
# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x
# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)
# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]
# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
  if (colour %in% rownames(m)) {
    cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
  } else {
    cat("Colour ", colour, " does not exist in the hash.\n")
  }
}
# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D) # The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America
Utilise matrix notation
# We use the problems of portfolio analysis as an example.
# Prices of 4 firms to play with, at weekly frequency (for calendar 2004) --
p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271, 292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94, 273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748, 197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75, 243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3, 307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1, 53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35, 35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95, 26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8, 43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35, 54.15, 58, 60.7, 62.7, 293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401, 248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508, 258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85, 249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75, 295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05, 345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4, 1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61, 1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54, 1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35, 1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6, 1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6, 2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7), .Dim = c(53, 4), .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys")))
# Shift from prices to returns --
r <- 100*diff(log(p))
# Historical expected returns --
colMeans(r)
# Historical correlation matrix --
cor(r)
# Historical covariance matrix --
S <- cov(r)
S
# Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% --
w <- c(.2, .2, .3, .3)
t(w) %*% S %*% w
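# A small sketch: the corresponding portfolio expected return, using the
# same weights and the historical mean returns --
t(w) %*% colMeans(r)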
# The portfolio optimisation function in tseries --
library(tseries)
optimised <- portfolio.optim(r) # This uses the historical facts from r
optimised$pw # Weights
optimised$pm # Expected return using these weights
optimised$ps # Standard deviation of optimised port.
price time-series
# A stock is traded on 2 exchanges.
# Price data is missing at random on both exchanges owing to non-trading.
# We want to make a single price time-series utilising information
# from both exchanges. I.e., missing data for exchange 1 will
# be replaced by information for exchange 2 (if observed).
# Let's create some example data for the problem.
e1 <- runif(15) # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15) # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)
# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1 # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.
panel of pictures
par(mfrow=c(3,2)) # 3 rows, 2 columns.
# Now the next 6 pictures will be placed on these 6 regions. :-)
# Let me take some pains on the 1st
plot(density(runif(100)), lwd=2)
text(x=0, y=0.2, "100 uniforms") # Showing you how to place text at will
abline(h=0, v=0)
# All these statements affect the 1st plot.
x=seq(0.01,1,0.01)
par(col="blue") # default colour to blue.
# 2 --
plot(x, sin(x), type="l")
lines(x, cos(x), type="l", col="red")
# 3 --
plot(x, exp(x), type="l", col="green")
lines(x, log(x), type="l", col="orange")
# 4 --
plot(x, tan(x), type="l", lwd=3, col="yellow")
# 5 --
plot(x, exp(-x), lwd=2)
lines(x, exp(x), col="green", lwd=3)
# 6 --
plot(x, sin(x*x), type="l")
lines(x, sin(1/x), col="pink")
Make pictures in PDF files
xpts <- seq(-3,3,.05)
# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()   # Close the pdf device so demo2.pdf is actually written out
# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()   # Close the pdf device so demo1.pdf is actually written out
histogram with tails shown in red
# This happened on the R mailing list on 7 May 2004.
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.
Visualisation of 3-dimensional (x,y,z) data using contour
# plots and using colour to represent the 3rd dimension.
# The specific situation is: On a grid of (x,y) points, you have
# evaluated f(x,y). Now you want a graphical representation of
# the resulting list of (x,y,z) points that you have.
# Setup an interesting data matrix of (x,y,z) points:
points <- structure(
c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 
140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0.998, 0.124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0.998, 0.71, 0.068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0.998, 0.898, 0.396, 0.058, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.998, 0.97, 0.726, 0.268, 0.056, 0.006, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.996, 0.88, 0.546, 0.208, 0.054, 0.012, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.964, 0.776, 0.418, 0.18, 0.054, 0.014, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.906, 0.664, 0.342, 0.166, 0.056, 0.018, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986, 0.862, 0.568, 0.29, 0.15, 0.056, 0.022, 0.008, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.954, 0.778, 0.494, 0.26, 0.148, 0.056, 0.024, 0.012, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.906, 0.712, 0.43, 0.242, 0.144, 0.058, 0.028, 0.012, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.878, 0.642, 0.38, 0.222, 0.142, 0.066, 0.034, 0.014, 0.008, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0.846, 0.586, 0.348, 0.208, 0.136, 0.068, 0.034, 0.016, 0.012, 0.006, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.538, 0.318, 0.204, 0.136, 0.07, 0.046, 0.024, 0.012, 0.008, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0, 0.762, 0.496, 0.294, 0.2, 0.138, 0.072, 0.05, 0.024, 0.014, 0.012, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0.704, 0.472, 0.286, 0.198, 0.138, 0.074, 0.054, 0.028, 0.016, 0.012, 0.008, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0.668, 0.438, 0.276, 0.196, 0.138, 0.078, 0.054, 0.032, 0.024, 0.014, 0.012, 0.008, 0.004, 0.004, 0.002, 0.002, 0, 0, 0, 0.634, 0.412, 0.27, 0.194, 0.14, 0.086, 0.056, 0.032, 0.024, 0.016, 0.012, 0.01, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0, 0.604, 0.388, 0.26, 0.19, 0.144, 0.088, 0.058, 0.048, 0.026, 0.022, 0.014, 0.012, 0.008, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0.586, 0.376, 0.256, 0.19, 0.146, 0.094, 0.062, 0.052, 0.028, 0.024, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002, 0.002, 0.566, 0.364, 0.254, 0.192, 0.148, 0.098, 0.064, 0.054, 0.032, 0.024, 0.022, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002),
.Dim = c(399, 3),
.Dimnames = list(NULL, c("x", "y", "z"))
)
summary(points)
# x is a grid from 0 to 1
# y is a grid from 20 to 200
# z is the interesting object which will be the 3rd dimension.
# Solution using contourplot() from package 'lattice'
library(lattice)
d3 <- data.frame(points)
contourplot(z ~ x+y, data=d3)
## or nicer
contourplot(z ~ x+y, data=d3, cuts=20, region = TRUE)
## or using logit - transformed z values:
contourplot(qlogis(z) ~ x+y, data=d3, pretty=TRUE, region = TRUE)
# An interesting alternative is levelplot()
levelplot(z ~ x+y, pretty=TRUE, contour=TRUE, data=d3)
# There is a contour() function in R. Even though it sounds obvious
# for the purpose, it is a bit hard to use.
# contour() wants 3 inputs: vectors of x and y values, and a matrix of
# z values, where the x values correspond to the rows of z, and the y
# values to the columns. A collection of points like `points' above
# needs to be turned into such a grid. It might sound odd, but contour(),
# image() and persp() have used this kind of input for the longest time.
#
# For irregular data, there's an interp function in the akima package
# that can convert from irregular data into the grid format.
#
# The `points' object that I have above - a list of (x,y,z) points -
# fits directly into the mentality of lattice::contourplot() but not
# into the requirements of contour()
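# A minimal sketch of that conversion for the gridded `d3' above, using
# tapply() to build the z matrix indexed by the unique x and y values --
zmat <- with(d3, tapply(z, list(x, y), mean))
contour(sort(unique(d3$x)), sort(unique(d3$y)), zmat, xlab="x", ylab="y")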
Display of a macroeconomic time-series, with a filled colour
# bar showing a recession.
years <- 1950:2000
timeseries <- cumsum(c(100, runif(50)*5))
hilo <- range(timeseries)
plot(years, timeseries, type="l", lwd=3)
# A recession from 1960 to 1965 --
polygon(x=c(1960,1960, 1965,1965),
y=c(hilo, rev(hilo)),
density=NA, col="orange", border=NA)
lines(years, timeseries, type="l", lwd=3) # paint again so line comes on top
# alternative method -- though not as good looking --
# library(plotrix)
# gradient.rect(1960, hilo[1], 1965, hilo[2],
# reds=c(0,1), greens=c(0,0), blues=c(0,0),
# gradient="y")
Display two series on one plot, one with a left y axis
# and another with a right y axis.
y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n", lty=c(1,1), col=c("blue","black"),
legend=c("String 1 (left scale)", "String 2 (right scale)"))
Simulate a dataset from the OLS model and obtain
# OLS estimates for it.
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1
# You want to just look at OLS results?
summary(lm(y ~ x))
# Suppose x and y were packed together in a data frame --
D <- data.frame(x,y)
summary(lm(y ~ x, D))
# Full and elaborate steps --
d <- lm(y ~ x)
# Learn about this object by saying ?lm and str(d)
# Compact model results --
print(d)
# Pretty graphics for regression diagnostics --
par(mfrow=c(2,2))
plot(d)
d <- summary(d)
# Detailed model results --
print(d)
# Learn about this object by saying ?summary.lm and by saying str(d)
cat("OLS gave slope of ", d$coefficients[2,1],
"and a error sigma of ", d$sigma, "\n")
## I need to drop down to a smaller dataset now --
x <- runif(10)
y <- 2 + 3*x + rnorm(10)
m <- lm(y ~ x)
# Now R supplies a wide range of generic functions which extract
# useful things out of the result of estimation of many kinds of models.
residuals(m)
fitted(m)
AIC(m)
AIC(m, k=log(10)) # SBC
vcov(m)
logLik(m)
"Dummy variables" in regression
# Suppose you have this data:
people = data.frame(
age = c(21,62,54,49,52,38),
education = c("college", "school", "none", "school", "college", "none"),
education.code = c( 2, 1, 0, 1, 2, 0 )
)
# Here people$education is a string categorical variable and
# people$education.code is the same thing, with a numerical coding system.
people
# Note the structure of the dataset --
str(people)
# The strings supplied for `education' have been treated (correctly) as
# a factor, but education.code is being treated as an integer and not as
# a factor.
# We want to do a dummy variable regression. Normally you would have:
# 1 Chosen college as the omitted category
# 2 Made a dummy for "none" named educationnone
# 3 Made a dummy for "school" named educationschool
# 4 Ran a regression like lm(age ~ educationnone + educationschool, people)
# But this is R. Things are cool:
lm(age ~ education, people)
# ! :-)
# When you feed him an explanatory variable like education, he does all
# these steps automatically. (He chose college as the omitted category).
# If you use an integer coding, then the obvious thing goes wrong --
lm(age ~ education.code, people)
# because he's thinking that education.code is an integer explanatory
# variable. So you need to:
lm(age ~ factor(education.code), people)
# (he chose a different omitted category)
# Alternatively, fix up the dataset --
people$education.code <- factor(people$education.code)
lm(age ~ education.code, people)
#
# Bottom line:
# Once the dataset has categorical variables correctly represented as factors, i.e. as
str(people)
# doing OLS in R induces automatic generation of dummy variables while leaving one out:
lm(age ~ education, people)
lm(age ~ education.code, people)
# But what if you want the X matrix?
m <- lm(age ~ education, people)
model.matrix(m)
# This is the design matrix that went into the regression m.
latex table with results of an OLS regression
# Get an OLS --
x1 = runif(100)
x2 = runif(100, 0, 2)
y = 2 + 3*x1 + 4*x2 + rnorm(100)
m = lm(y ~ x1 + x2)
# and print it out prettily --
library(xtable)
# Bare --
xtable(m)
xtable(anova(m))
# Better --
print.xtable(xtable(m, caption="My regression",
label="t:mymodel",
digits=c(0,3,2,2,3)),
type="latex",
file="xtable_demo_ols.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
print.xtable(xtable(anova(m),
caption="ANOVA of my regression",
label="t:anova_mymodel"),
type="latex",
file="xtable_demo_anova.tex",
table.placement = "tp",
latex.environments=c("center", "footnotesize"))
# Read the documentation of xtable. It actually knows how to generate
# pretty latex tables for a lot more R objects than just OLS results.
# It can be a workhorse for making tabular out of matrices, and
# can also generate HTML.
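# A small sketch: the same xtable object can also be rendered as HTML --
print(xtable(m, caption="My regression"),
      type="html", file="xtable_demo_ols.html")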
Simulate a dataset from a "fixed effects" model
# and obtain "least squares dummy variable" (LSDV) estimates.
#
# We do this in the context of a familiar "earnings function" -
# log earnings is quadratic in log experience, with parallel shifts by
# education category.
# Create an education factor with 4 levels --
education <- factor(sample(1:4,1000, replace=TRUE),
labels=c("none", "school", "college", "beyond"))
# Simulate an experience variable with a plausible range --
experience <- 30*runif(1000) # experience from 0 to 30 years
# Make the intercept vary by education category between 4 given values --
intercept <- c(0.5,1,1.5,2)[education]
# Simulate the log earnings --
log.earnings <- intercept +
2*experience - 0.05*experience*experience + rnorm(1000)
A <- data.frame(education, experience, e2=experience*experience, log.earnings)
summary(A)
# The OLS path to LSDV --
summary(lm(log.earnings ~ -1 + education + experience + e2, A))
Using data from Yahoo finance
# for weekly returns, estimate the beta of Sun Microsystems
# This is the `elaborate version' (36 lines), also see terse version (16 lines)
library(tseries)
# I know that the yahoo symbol for the common stock of Sun Microsystems
# is "SUNW" and for the S&P 500 index is "^GSPC".
prices <- cbind(get.hist.quote("SUNW", quote="Adj", start="2003-01-01", retclass="zoo"),
get.hist.quote("^GSPC", quote="Adj", start="2003-01-01", retclass="zoo"))
colnames(prices) <- c("SUNW", "SP500")
prices <- na.locf(prices) # Copy last traded price when NA
# To make weekly returns, you must have this incantation:
nextfri.Date <- function(x) 7 * ceiling(as.numeric(x - 1)/7) + as.Date(1)
# and then say
weekly.prices <- aggregate(prices, nextfri.Date,tail,1)
# Now we can make weekly returns --
r <- 100*diff(log(weekly.prices))
# Now shift out of zoo to become an ordinary matrix --
r <- coredata(r)
rj <- r[,1]
rM <- r[,2]
d <- lm(rj ~ rM) # Market model estimation.
print(summary(d))
# Make a pretty picture
big <- max(abs(c(rj, rM)))
range <- c(-big, big)
plot(rM, rj, xlim=range, ylim=range,
xlab="S&P 500 weekly returns (%)", ylab="SUNW weekly returns (%)")
grid()
abline(h=0, v=0)
lines(rM, d$fitted.values, col="blue")
Terse version of estimating the beta
# using weekly returns and data from Yahoo finance.
# By Gabor Grothendieck.
library(tseries)
getstock <- function(x)
c(get.hist.quote(x, quote = "Adj", start = "2003-01-01", compress = "w"))
r <- diff(log(cbind(sp500 = getstock("^gspc"), sunw = getstock("sunw"))))
mm <- lm(sunw ~ ., r)
print(summary(mm))
range <- range(r, -r)
plot(r[,1], r[,2], xlim = range, ylim = range,
xlab = "S&P 500 weekly returns (%)", ylab = "SUNW weekly returns (%)")
grid()
abline(mm, h = 0, v = 0, col = "blue")
nonlinear regression, in three ways
# By just supplying the function to be fit,
# By also supplying the analytical derivatives, and
# By having him analytically differentiate the function to be fit.
#
# John Fox has a book "An R and S+ companion to applied regression"
# (abbreviated CAR).
# An appendix associated with this book, titled
# "Nonlinear regression and NLS"
# is up on the web, and I strongly recommend that you go read it.
#
# This file is essentially from there (I have made slight changes).
# First take some data - from the CAR book --
library(car)
data(US.pop)
attach(US.pop)
plot(year, population, type="l", col="blue")
# So you see, we have a time-series of the US population. We want to
# fit a nonlinear model to it.
library(stats) # Contains nonlinear regression
time <- 0:20
pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)),
start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE)
# You just write in the formula that you want to fit, and supply
# starting values. "trace=TRUE" makes him show iterations go by.
summary(pop.mod)
# Add in predicted values into the plot
lines(year, fitted.values(pop.mod), lwd=3, col="red")
# Look at residuals
plot(year, residuals(pop.mod), type="b")
abline(h=0, lty=2)
# Using analytical derivatives:
model <- function(beta1, beta2, beta3, time) {
m <- beta1/(1+exp(beta2+beta3*time))
term <- exp(beta2 + beta3*time)
gradient <- cbind((1+term)^-1,
-beta1*(1+term)^-2 * term,
-beta1*(1+term)^-2 * term * time)
attr(m, 'gradient') <- gradient
return(m)
}
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))
# Using analytical derivatives, using automatic differentiation (!!!):
model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)), # rhs of model
c('beta1', 'beta2', 'beta3'), # parameter names
function(beta1, beta2, beta3, time){} # arguments for result
)
summary(nls(population ~ model(beta1, beta2, beta3, time),
start=list(beta1=350, beta2=4.5, beta3=-0.3)))
Some of the standard tests
# A classical setting --
x <- runif(100, 0, 10) # 100 draws from U(0,10)
y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma is 1
d <- lm(y ~ x)
# CLS results --
summary(d)
library(sandwich)
library(lmtest)
# Durbin-Watson test --
dwtest(d, alternative="two.sided")
# Breusch-Pagan test --
bptest(d)
# Heteroscedasticity and autocorrelation consistent (HAC) tests
coeftest(d, vcov=kernHAC)
# Transplant the HAC values back in --
library(xtable)
sum.d <- summary(d)
xtable(sum.d)
sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4]
xtable(sum.d)
Experiment with fitting nonlinear functional forms
# in OLS, using orthogonal polynomials to avoid difficulties with
# near-singular design matrices that occur with ordinary polynomials.
# Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006.
# We will deal with noisy data from the d.g.p. y = sin(x) + e
x <- seq(0, 2*pi, length.out=50)
set.seed(101)
y <- sin(x) + 0.3*rnorm(50)
basicplot <- function(x, y, minx=0, maxx=3*pi, title="") {
plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title)
lines(x, sin(x), col="blue", lty=2, lwd=2)
abline(h=0, v=0)
}
x.outsample <- seq(0, 3*pi, length.out=100)
# Severe multicollinearity with ordinary polynomials
x2 <- x*x
x3 <- x2*x
x4 <- x3*x
cor(cbind(x, x2, x3, x4))
# and a perfect design matrix using orthogonal polynomials
m <- poly(x, 4)
all.equal(cor(m), diag(4)) # Correlation matrix is I.
par(mfrow=c(2,2))
# Ordinary polynomial regression --
p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(p)
basicplot(x, y, title="Polynomial, insample") # Data
lines(x, fitted(p), col="red", lwd=3) # In-sample
basicplot(x, y, title="Polynomial, out-of-sample")
predictions.p <- predict(p, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.p, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# As expected, polynomial fitting gives terrible results out of sample.
# Now the IDENTICAL thing, using orthogonal polynomials --
d <- lm(y ~ poly(x, 4))
summary(d)
basicplot(x, y, title="Orth. poly., insample") # Data
lines(x, fitted(d), col="red", lwd=3) # In-sample
basicplot(x, y, title="Orth. poly., out-of-sample")
predictions.op <- predict(d, list(x = x.outsample)) # Out-of-sample
lines(x.outsample, predictions.op, type="l", col="red", lwd=3)
lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2)
# predict(d) is magical! See ?SafePrediction
# The story runs at two levels. First, when you do an OLS model,
# predict()ion requires applying coefficients to an appropriate
# X matrix. But one level deeper, the polynomial or orthogonal-polynomial
# needs to be utilised for computing the X matrix based on the
# supplied x.outsample data.
# If you say p <- poly(x, n)
# then you can say predict(p, new) where predict.poly() gets invoked.
# And when you say predict(lm()), the full steps are worked out for
# you automatically: predict.poly() is used to make an X matrix and
# then prediction based on the regression results is done.
all.equal(predictions.p, predictions.op) # Both paths are identical for this
# (tame) problem.
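# A minimal sketch of the lower level: predict() on a poly() object
# re-evaluates the orthogonal basis at new x values --
pbasis <- poly(x, 4)
newbasis <- predict(pbasis, newdata=x.outsample)
str(newbasis)    # a 100 x 4 matrix of basis columns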
R syntax where model specification is an argument to a function
# Invent a dataset
x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100)
D <- data.frame(x=x, y=y, z=z)
amodel <- function(modelstring) {
summary(lm(modelstring, D))
}
amodel(z ~ x)
amodel(z ~ y)
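# A small variant (a sketch): if the specification arrives as a character
# string, convert it with as.formula() before handing it to lm() --
amodel2 <- function(modelstring) {
  summary(lm(as.formula(modelstring), D))
}
amodel2("z ~ x + y")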
Show the efficiency of the mean when compared with the median
# using a large simulation where both estimators are applied on
# a sample of U(0,1) uniformly distributed random numbers.
one.simulation <- function(N=100) { # N defaults to 100 if not supplied
x <- runif(N)
return(c(mean(x), median(x)))
}
# Simulation --
results <- replicate(100000, one.simulation(20)) # Gives back a 2x100000 matrix
# Two kernel densities --
k1 <- density(results[1,]) # results[1,] is the 1st row
k2 <- density(results[2,])
# A pretty picture --
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean", "Median"))
Simulation to study size and power in a simple problem
set.seed(101)
# The data generating process: a simple uniform distribution with stated mean
dgp <- function(N,mu) {runif(N)-0.5+mu}
# Simulate one FIXED hypothesis test for H0:mu=0, given a true mu for a sample size N
one.test <- function(N, truemu) {
x <- dgp(N,truemu)
muhat <- mean(x)
s <- sd(x)/sqrt(N)
# Under the null, the distribution of the mean has standard error s
threshold <- 1.96*s
(muhat < -threshold) || (muhat > threshold)
} # Return of TRUE means reject the null
# Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with a sample size N.
# We return only one number: the fraction of the time that H0 is rejected.
experiment <- function(Nexperiments, N, truemu) {
sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments
}
# Measure the size of a test, i.e. rejections when H0 is true
experiment(10000, 50, 0)
# Measurement with sample size of 50, and true mu of 0.
# Power study: I.e. Pr(rejection) when H0 is false
# (one special case in here is when the H0 is actually true)
muvalues <- seq(-.15,.15,.01)
# When true mu < -0.15 and when true mu > 0.15,
# the Pr(rejection) veers to 1 (full power) and it's not interesting.
# First do this with sample size of 50
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 50, truth))
}
par(mai=c(.8,.8,.2,.2))
plot(muvalues, results, type="l", lwd=2, ylim=c(0,1),
xlab="True mu", ylab="Pr(Rejection of H0:mu=0)")
abline(h=0.05, lty=2)
# Now repeat this with sample size of 100 (should yield a higher power)
results <- NULL
for (truth in muvalues) {
results <- c(results, experiment(10000, 100, truth))
}
lines(muvalues, results, lwd=2, col="blue")
legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8,
col=c("black","black","blue"), bty="n",
legend=c("N=50", "Size, 0.05", "N=100"))
Do bootstrap inference, as an example, for a sample median
library(boot)
samplemedian <- function(x, d) { # d is a vector of integer indexes
return(median(x[d])) # The genius is in the x[d] notation
}
data <- rnorm(50) # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000) # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)
# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")
OLS by MLE
# OLS likelihood function
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
if (sigma2 <= 0) return(NA)
n <- nrow(X)
e <- y - X%*%beta                  # residual vector
logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2)) # t() = matrix transpose
return(-logl) # since optim() does minimisation by default.
}
# Analytical derivatives
ols.gradient <- function(theta, y, X) {
beta <- theta[-1]
sigma2 <- theta[1]
e <- y - X%*%beta
n <- nrow(X)
g <- numeric(length(theta))
g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2) # d logl / d sigma2
g[-1] <- (t(X) %*% e)/sigma2 # d logl / d beta
return(-g)
}
X <- cbind(1, runif(1000))
theta.true <- c(2,4,6) # error variance = 2, intercept = 4, slope = 6.
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)
# Estimation by OLS --
d <- summary(lm(y ~ X[,2]))
theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1])
cat("OLS theta = ", theta.ols, "\n\n")
cat("\nGradient-free (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\nUsing the gradient (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)
cat("\n\nYou say you want a covariance matrix?\n")
p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), hessian=TRUE,
y=y, X=X)
inverted <- solve(p$hessian)
results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted)))
colnames(results) <- c("Coefficient", "Std. Err.", "t")
rownames(results) <- c("Sigma", "Intercept", "X")
cat("MLE results --\n")
print(results)
cat("Compare with the OLS results --\n")
d$coefficients
# Picture of how the loglikelihood changes if you perturb the sigma
theta <- theta.ols
delta.values <- seq(-1.5, 1.5, .01)
logl.values <- as.numeric(lapply(delta.values,
function(x) {-ols.lf1(theta+c(x,0,0),y,X)}))
plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue",
xlab="Sigma", ylab="Log likelihood")
grid()
To do `moving window volatility' of returns
library(zoo)
# Some data to play with (Nifty on all fridays for calendar 2004) --
p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6, 1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95, 1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2, 1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609, 1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9, 1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5), index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468, 12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545, 12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622, 12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699, 12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776, 12783), class = "Date"), frequency = 0.142857142857143, class = c("zooreg", "zoo"))
# Shift to returns --
r <- 100*diff(log(p))
head(r)
summary(r)
sd(r)
# Compute the moving window vol --
vol <- sqrt(52) * rollapply(r, 20, sd, align = "right") # sqrt(52) since r is weekly
# A pretty plot --
plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)),
lwd=2, col="purple", xlab="2004",
ylab=paste("Annualised sigma, 20-week window"))
grid()
legend(x="bottomleft", col=c("purple", "darkgreen"),
lwd=c(2,2), bty="n", cex=0.8,
legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)"))
par(new=TRUE)
plot(p, type="l", lwd=2, col="darkgreen",
xaxt="n", yaxt="n", xlab="", ylab="")
axis(4)
Joint distributions, marginal distributions, useful tables
# First let me invent some fake data
set.seed(102) # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20,max=60)
D <- data.frame(age, gender, education)
rm(x,age,gender,education)
print(D)
# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)
# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)
# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
digits=c(0,2,2,2,2)) # You have to do | and \hline manually.
# Study age by gender --
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)
# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))
Get the data in place
load(file="demo.rda")
summary(firms)
# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m=lm(log(spread) ~ log(mktcap), firms)
summary(m)
# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap,means$spread))
# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
col=c("blue", "green", "red"),
lty=c(0, 1, 1), lwd=c(0,2,2),
pch=c(0,19,19),
legend=c("firm", "Mean spread in size deciles",
"Size weighted mean spread in size deciles"))
# Within group standard deviations --
aggregate(firms, list(size.category), sd)
# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)
# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)
# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)
Scare the hell out of children with the Cauchy distribution
# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
  if (distribution == "normal") {
    x <- rnorm(N)
  } else {
    x <- rcauchy(N)
  }
  mean(x)
}
k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
lty=c(1,1),
col=c("black", "red"),
legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.
# Here's more scary stuff --
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of normal - ",
sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}
# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.
example of simulation-based inference
# This is in the context of testing for time-series dependence in
# stock market returns data.
# The code here does the idea of Kim, Nelson, Startz (1991).
# We want to use the distribution of realworld returns data, without
# needing assumptions about normality.
# The null is lack of dependence (i.e. an efficient market).
# So repeatedly, the data is permuted, and the sample ACF is computed.
# This gives us the distribution of the ACF under H0: independence, but
# while using the empirical distribution of the returns data.
# Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data.
r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296)
# The 1st autocorrelation from the sample:
acf(r, 1, plot=FALSE)$acf[2]
# Obtain 1000 draws from the distribution of the 1st autocorrelation
# under the null of independence:
set.seed(101)
simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2])
# At 95% --
quantile(simulated, probs=c(.025,.975))
# At 99% --
quantile(simulated, probs=c(.005,.995))
# So we can reject the null at 95% but not at 99%.
# A pretty picture.
plot(density(simulated), col="blue")
abline(v=0)
abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")
Standard computations with well-studied distributions
# The normal distribution is named "norm". With this, we have:
# Normal density
dnorm(c(-1.96,0,1.96))
# Cumulative normal distribution function
pnorm(c(-1.96,0,1.96))
# Inverse of this
qnorm(c(0.025,.5,.975))
pnorm(qnorm(c(0.025,.5,.975)))
# 1000 random numbers from the normal distribution
summary(rnorm(1000))
# Here are the same ideas, for the chi-squared distribution with 10 degrees
# of freedom.
dchisq(c(0,5,10), df=10)
# Cumulative chi-squared distribution function
pchisq(c(0,5,10), df=10)
# Inverse of this
qchisq(c(0.025,.5,.975), df=10)
# 1000 random numbers from this chi-squared distribution
summary(rchisq(1000, df=10))
Two-sample Kolmogorov-Smirnov test
# Goal: Given two vectors of data,
# superpose their CDFs
# and show the results of the two-sample Kolmogorov-Smirnov test
# The function consumes two vectors x1 and x2.
# You have to provide a pair of labels as `legendstrings'.
# If you supply an xlab, it's used
# If you specify log - e.g. log="x" - this is passed on to plot.
# The remaining args that you specify are sent on into ks.test()
two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) {
  stopifnot(length(x1) > 0,
            length(x2) > 0,
            length(legendstrings) == 2)
  hilo <- range(c(x1, x2))
  par(mai=c(.8, .8, .2, .2))
  plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0,
       xlab=xlab, log=log, ylab="Cum. distribution", main="")
  grid()
  plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3)
  legend(x="bottomright", lwd=c(1,3), lty=1, bty="n",
         legend=legendstrings)
  k <- ks.test(x1, x2, ...)
  text(x=hilo[1], y=c(.9, .85), pos=4, cex=.8,
       labels=c(paste("KS test statistic: ", sprintf("%.3g", k$statistic)),
                paste("Prob value: ", sprintf("%.3g", k$p.value))))
  k
}
x1 <- rnorm(100, mean=7, sd=1)
x2 <- rnorm(100, mean=9, sd=1)
# Check error detection --
two.cdfs.plot(x1,x2)
# Typical use --
two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x")
# Send args into ks.test() --
two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less")
To read in a simple data file, and look around its contents
# Suppose you have a file "x.data" which looks like this:
# 1997,3.1,4
# 1998,7.2,19
# 1999,1.7,2
# 2000,1.1,13
# To read it in --
A <- read.table("x.data", sep=",",
col.names=c("year", "my1", "my2"))
nrow(A) # Count the rows in A
summary(A$year) # The column "year" in data frame A
# is accessed as A$year
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Removes the column "my1"
# You might find these useful, to "look around" a dataset --
str(A)
summary(A)
library(Hmisc) # This requires that you've installed the Hmisc package
contents(A)
describe(A)
To read in a simple data file where date data is present
# Suppose you have a file "x.data" which looks like this:
# 1997-07-04,3.1,4
# 1997-07-05,7.2,19
# 1997-07-07,1.7,2
# 1997-07-08,1.1,13
A <- read.table("x.data", sep=",",
col.names=c("date", "my1", "my2"))
A$date <- as.Date(A$date, format="%Y-%m-%d")
# Say ?strptime to learn how to use "%" to specify
# other date formats. Two examples --
# "15/12/2002" needs "%d/%m/%Y"
# "03 Jun 1997" needs "%d %b %Y"
# Actually, if you're using the ISO 8601 date format, i.e.
# "%Y-%m-%d", that's the default setting and you don't need to
# specify the format.
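# Quick illustrative checks of the formats mentioned above (note that "%b"
# relies on an English locale for month names):
as.Date("15/12/2002", format="%d/%m/%Y")
as.Date("03 Jun 1997", format="%d %b %Y")
as.Date("1997-07-04")            # ISO 8601 -- no format string needed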
A$newcol <- A$my1 + A$my2 # Makes a new column in A
newvar <- A$my1 - A$my2 # Makes a new R object "newvar"
A$my1 <- NULL # Delete the `my1' column
summary(A) # Makes summary statistics
To read in files produced by CMIE's "Business Beacon"
# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
  # Generic to all BB files --
  sep="|",                     # CMIE's .txt file is pipe delimited
  skip=3,                      # Skip the 1st 3 lines
  na.strings=c("N.A.","Err"),  # The ways they encode missing data
  # Specific to your immediate situation --
  file="bb_data.text",
  col.names=c("junk", "date", "M3", "M0")
)
A$junk <- NULL # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")
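# To see what that paste() trick does on a single value (illustrative only;
# use "%y" instead of "%Y" if your file carries two-digit years, and note
# that "%b" relies on an English locale):
as.Date(paste("1", "Apr 2004"), format="%d %b %Y")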
Reading and writing ascii files, reading and writing binary files
# And, to measure how much faster it is to work with binary files.
# First manufacture a tall data frame:
# FYI -- runif(n) yields n U(0,1) random numbers.
B <- data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)
# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
# Read in this resulting ascii file:
C <- read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names = 1)
# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C) # should yield the same results
# as summary(B) above.
# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
  B <- data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})
cat("Time writing an ascii file out of dataset B:\n")
system.time(
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)
)
cat("Time reading an ascii file into dataset C:\n")
system.time({
  C <- read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names = 1)
})
cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))
cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary")) # and then read it in from binary file
Sending an R object to a friend, or embedding data into an email
# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.
# First I invent a little dataset --
set.seed(101) # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A <- data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)
# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda") # You can give this file to a friend
load("/tmp/my_data_file.rda")
# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130,
0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363
), x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627,
0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555,
0.224118480458856, 0.411666829371825, 0.0386105608195066), x3 = c(0.700711545301601,
0.956837461562827, 0.213352001970634, 0.661061500199139, 0.923318882007152,
0.795719761401415, 0.0712125543504953, 0.389407767681405, 0.406451216200367,
0.659355078125373)), .Names = c("x1", "x2", "x3"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
# Verify that A and B are near-identical --
A-B
# or,
all.equal(A,B)
Make a time-series object using the "zoo" package
A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
x=runif(4),
y=runif(4))
A$date <- as.Date(A$date) # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().
# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)
# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky
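# A few of those cool things, as a quick illustration --
window(z, start=as.Date("1995-01-02"))   # subset by date
diff(z)                                  # first differences
merge(z, lag(z, k=-1))                   # line a series up against a shifted copy of itself
rollmean(z, 2)                           # 2-period rolling mean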
# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
# z <- read.zoo(filename, format="%d %b %Y")
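# For instance, a self-contained sketch: write the zoo object out with
# write.zoo() and read it straight back --
write.zoo(z, file="/tmp/z.txt")
z2 <- read.zoo("/tmp/z.txt", header=TRUE, format="%Y-%m-%d")
str(z2)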
All manner of import and export of datasets
# Invent a dataset --
A <- data.frame(
name=c("a","b","c"),
ownership=c("Case 1","Case 1","Case 2"),
listed.at=c("NSE",NA,"BSE"),
# Firm "b" is unlisted.
is.listed=c(TRUE,FALSE,TRUE),
# R convention - boolean variables are named "is.something"
x=c(2.2,3.3,4.4),
date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
)
# To a spreadsheet through a CSV file --
write.table(A, file="demo.csv", sep=",", col.names=NA, qmethod="double")
B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)
# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")
# To the Open XML standard for transport for statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")
# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.
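# A small sketch of that pathway: write.foreign() emits a plain data file plus
# a script that SPSS can run to read it. The file names below are made up, and
# I first flatten the date and logical columns into plain types, which travel
# more safely through this route.
A2 <- A
A2$date <- as.character(A2$date)
A2$is.listed <- as.integer(A2$is.listed)
write.foreign(A2, datafile="/tmp/demo_spss.txt", codefile="/tmp/demo_spss.sps",
              package="SPSS")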
Special cases in reading files
# Reading in a .bz2 file --
read.table(bzfile("file.text.bz2")) # Requires you have ./file.text.bz2
# Reading in a .gz file --
read.table(gzfile("file.text.gz")) # Requires you have ./file.text.gz
# Reading from a pipe --
mydata <- read.table(pipe("awk -f filter.awk input.txt"))
# Reading from a URL --
read.table(url("http://www.mayin.org/ajayshah/A/demo.text"))
# This also works --
read.table("http://www.mayin.org/ajayshah/A/demo.text")
# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about:
read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2"))
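# Another route (a sketch): download the .bz2 file to a temporary location
# first, and then hand that to bzfile() --
tmp <- tempfile()
download.file("http://www.mayin.org/ajayshah/A/demo.text.bz2", tmp, mode="wb")
read.table(bzfile(tmp))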
# Reading binary files from a URL --
load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda"))
Reading in a Microsoft .xls file directly
library(gdata)
a <- read.xls("file.xls", sheet=2) # This reads in the 2nd sheet
# Look at what the cat dragged in
str(a)
# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL
# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.
ARMA modeling - estimation, diagnostics, forecasting
# 0. SETUP DATA
rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67)
# Make an R time-series out of the rawdata: specify frequency & start date
gIIP <- ts(rawdata, frequency=12, start=c(1991,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Full data")
grid()
# Based on this, I decide that 4/1995 is the start of the sensible period.
gIIP <- window(gIIP, start=c(1995,4))
print(gIIP)
plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2,
main="Estimation subset")
grid()
# Descriptive statistics about gIIP
mean(gIIP); sd(gIIP); summary(gIIP);
plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth")
acf(gIIP)
# 1. ARMA ESTIMATION
m.ar2 <- arima(gIIP, order = c(2,0,0))
print(m.ar2) # Print it out
# 2. ARMA DIAGNOSTICS
tsdiag(m.ar2) # A pretty picture of diagnostics
## Time series structure in errors
print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box"));
## Sniff for ARCH
print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box"));
## Eyeball distribution of residuals
plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8),
main=paste("Residuals of AR(2)"))
# 3. FORECASTING
## Make a picture of the residuals
plot.ts(m.ar2$residuals, ylab="Innovations", col="blue", lwd=2)
s <- sqrt(m.ar2$sigma2)
abline(h=c(-s,s), lwd=2, col="lightGray")
p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions.
print(p)
## Watch the forecastability decay away from large values towards 0.
## sd(x) is the naive sigma. p$se is the prediction se.
gain <- 100*(1-p$se/sd(gIIP))
plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent",
col="blue", lwd=2)
## Make a pretty picture that puts it all together
ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se,
gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1),
ylab="IIP growth (%)", col=c("blue","red", "red", "red")))
grid()
abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray")
legend(x="bottomleft", cex=0.8, bty="n",
lty=c(1,1,2,2), lwd=c(2,1,1,2),
col=c("blue", "red", "red", "lightGray"),
legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))