The contents of this file are copyright of Cambridge University Press, 1997. 11.1 Introduction ===================== y <- rnorm(20) y y <- rgamma(n=10,shape=2) y <- rgamma(n=10,shape=c(1:10)) sample(10) sample(10,replace=T) set <- c(11,22,33,44,55) sample(set) sample(set,size=10,replace=T) sample(set,size=10,replace=T,prob=c(0.1,0.1,0.1,0.1,0.6)) city city$u city$x city$x[1] city$x[c(1:4)] city$x[c(1,5,10)] city[c(1,5,10),2] city$x[-1] city[c(1:3),] i <- sample(10,replace=T) city[i,] 11.2 Basic Bootstraps ===================== city.fun <- function( data, i ) { d <- data[i,] mean(d$x)/mean(d$u) } city.boot <- boot( data=city, statistic=city.fun, R=50) city.boot plot(city.boot) names(city.boot) unix.time( city.boot <- boot(city,city.fun,R=50) ) dos.time( city.boot <- boot(city,city.fun,R=50) ) \end{verbatim} mat <- as.matrix(city) mat.fun <- function( data, i ) { d <- data[i,] mean(d[,2])/mean(d[,1]) } unix.time( mat.boot <- boot(mat,mat.fun,R=50,keepf=T) ) f <- boot.array(city.boot) f[1:20,] city.w <- function( data, w=rep(1,nrow(data))/nrow(data) ) { w <- w/sum(w) sum(w*data$x)/sum(w*data$u) } city.boot <- boot( city, city.w, R=20, stype="w") city.f <- function( data, f ) mean(f*data$x)/mean(f*data$u) city.boot <- boot( city, city.f, R=20, stype="f") air.fun <- function( data, i ) { d <- data[i,] c( mean(d), var(d)/nrow(data) ) } air.boot <- boot( data=aircondit, statistic=air.fun, R=200) city.subset <- function( data, i, n=10 ) { d <- data[i[1:n],] mean(d[,2])/mean(d[,1]) } city.boot <- boot( data=city, statistic=city.subset, R=200, n=5) boot.array(city.boot,indices=T)[,1:5] aircondit.fun <- function( data ) mean(data$hours) aircondit.sim <- function( data, mle ) { d <- data d$hours <- rexp( n=nrow(data), rate=mle ) d } aircondit.mle <- 1/mean(aircondit$hours) aircondit.para <- boot( data=aircondit, statistic=aircondit.fun, R=20, sim="parametric", ran.gen=aircondit.sim, mle=aircondit.mle) l.city <- log(city) city.mle <- c(apply(l.city,2,mean),sqrt(apply(l.city,2,var)), corr(l.city)) city.sim <- function( data, mle ) { n <- nrow(data) d <- matrix(rnorm(2*n),n,2) d[,2] <- mle[2] + mle[4]*(mle[5]*d[,2]+sqrt(1-mle[5]^2)*d[,1]) d[,1] <- mle[1] + mle[3]*d[,1] data$x <- exp(d[,2]) data$u <- exp(d[,1]) data } city.f <- function( data ) mean(data[,2])/mean(data[,1]) city.para <- boot( city, city.f, R=200, sim="parametric", ran.gen=city.sim, mle=city.mle ) city.boot <- boot( data=city, statistic=function( data, i) city.f(data[i,]), R=200) L.diff <- empinf( data=city, statistic=city.w, stype="w" ) cbind(L.diff,(city$x-city.w(city)*city$u)/mean(city$u)) city.boot <- boot( city, city.fun, R=999) L.reg <- empinf(city.boot) L.reg J <- empinf( data=city, statistic=city.fun, stype="i", type="jack") var.linear(L.diff) var.linear(L.reg) city.tL.reg <- linear.approx(city.boot) city.tL.diff <- linear.approx(city.boot, L=L.diff) split.screen(c(1,2)) screen(1); plot(city.tL.reg,city.boot$t); abline(0,1,lty=2) screen(2); plot(city.tL.diff,city.boot$t); abline(0,1,lty=2) 11.3 Further Ideas ===================== gravity grav <- gravity[as.numeric(gravity$series)>=7,] grav grav.fun <- function( data, i, trim=0.125 ) { d <- data[i,] m <- tapply( d$g, d$series, mean, trim=trim ) m[7]-m[8] } grav.boot <- boot( grav, grav.fun, R=200, strata=grav$series) grav.L <- empinf(grav.boot) grav.tL <- linear.approx(grav.boot) var.linear( grav.L, strata=grav$series) y <- rnorm(99) h <- 0.5 y.gen <- function( data, mle ) { n <- length(data) i <- sample( n, n, replace=T) data[i] + mle*rnorm(n) } y.boot <- boot( y, median, R=200, sim="parametric", ran.gen=y.gen, mle=h ) var(y.boot$t) aml1 <- aml[aml$group==1,] aml1.fun <- function( data ) { surv <- survfit(Surv(data$time,data$cens)) p1 <- min(surv$surv[surv$time<20]) m1 <- min(surv$time[surv$surv<0.5]) c( p1, m1 ) } aml1.ord <- censboot( data=aml1, statistic=aml1.fun, R=50) aml1.ord aml1.fail <- survfit(Surv(time,cens),data=aml1) aml1.cens <- survfit(Surv(time-0.01*cens,1-cens),data=aml1) aml1.con <- censboot( data=aml1, statistic=aml1.fun, R=50, F.surv=aml1.fail, G.surv=aml1.cens, sim="cond") city.fun <- function( data, i ) { d <- data[i,] rat <- mean(d$x)/mean(d$u) L <- (d$x-rat*d$u)/mean(d$u) c( rat, sum(L^2)/nrow(d)^2, L ) } city.boot <- boot( city, city.fun, R=999) city.L <- city.boot$t0[3:12] split.screen(c(1,2)); screen(1); split.screen(c(2,1)); screen(4) attach(city) plot(u,x,type="n",xlim=c(0,300),ylim=c(0,300)) text(u,x,round(city.L,2)) screen(3) plot(u,x,type="n",xlim=c(0,300),ylim=c(0,300)) text(u,x,c(1:10)); abline(0,city.boot$t0[1],lty=2) screen(2) jack.after.boot( boot.out=city.boot, useJ=F, stinf=F, L=city.L) close.screen(all=T) jack.after.boot(city.boot,useJ=F,stinf=F,index=2) jack.after.boot(city.boot,useJ=F,stinf=F,t=city.boot$t[,2]) city.freq <- smooth.f( theta=1.4, boot.out=city.boot) city.w( city, city.freq) 11.4 Tests ===================== fir.mle <- c( sum(fir$count), nrow(fir)) fir.gen <- function( data, mle) { d <- data y <- sample(x=mle[2],size=mle[1],replace=T) j <- as.numeric(names(table(y))) d$count <- tabulate(y,mle[2]) d } fir.fun <- function( data ) (nrow(data)-1)*var(data$count)/mean(data$count) fir.boot <- boot( fir, fir.fun, R=999, sim="parametric", ran.gen=fir.gen, mle=fir.mle ) qqplot(qchisq(c(1:fir.boot$R)/(fir.boot$R+1),df=49),fir.boot$t) abline(0,1,lty=2); abline(h=fir.boot$t0) perm.fun <- function( data, i ) cor( data[,1],data[i,2] ) ducks.perm <- boot( ducks, perm.fun, R=499, sim="permutation") (sum(ducks.perm$t>ducks.perm$t0)+1)/(ducks.perm$R+1) qqnorm(ducks.perm$t,ylim=c(-1,1)) abline(h=ducks.perm$t0,lty=2) duck <- c(ducks[,1],ducks[,2]) n <- nrow(ducks) duck.fun <- function( data, i, n ) { x <- data[i] cor(x[1:n],x[(n+1):(2*n)]) } .Random.seed <- ducks.perm$seed ducks.boot <- boot( duck, duck.fun, R=499, strata=rep(c(1,2),c(n,n)), n=n) (sum(ducks.boot$t>ducks.boot$t0)+1)/(ducks.boot$R+1) z <- grav$g z[grav$series==8] <- -z[grav$series==8] z.tilt <- exp.tilt( L=z, theta=0, strata=grav$series) z.tilt grav.test <- function( data, i ) { d <- data[i,] diff(tapply(d$g,d$series,mean))[7] } grav.boot <- boot( data=grav, statistic=grav.test, R=999, weights=z.tilt$p, strata=grav$series) (sum(grav.boot$t>grav.boot$t0)+1)/(grav.boot$R+1) 11.5 Confidence Intervals ========================== boot.ci( boot.out=city.boot) boot.ci( boot.out=city.boot, type=c("norm","perc","basic","BCa"), conf=c(0.8,0.9), index=1) boot.ci( city.boot, h=log, hinv=exp, hdot=function(u) 1/u ) abc.ci( data=city, statistic=city.w) 11.6 Linear Regression ========================== fit.model <- function( data ) { fit <- glm(log(brain)~log(body),data=data) l1 <- l1fit(log(data$body),log(data$brain)) c( coef(fit), coef(l1) ) } mammals.fun <- function( data, i ) fit.model( data[i,] ) mammals.boot <- boot( mammals, mammals.fun, R=99 ) mammals.boot$t mam.lm <- glm(log(brain)~log(body),data=mammals) mam.diag <- glm.diag(mam.lm) glm.diag.plots(mam.lm) res <- (mam.diag$res-mean(mam.diag$res))*mam.diag$sd mam <- data.frame(mammals,res=res,fit=fitted(mam.lm)) mam.fun <- function( data, i ) { d <- data d$brain <- exp( d$fit+d$res[i] ) fit.model(d) } mam.boot <- boot( mam, mam.fun, R=99 ) mam.boot mam.w <- function( data, w) coef( glm( log(data$brain)~log(data$body), weights=w))[2] mam.L <- empinf( data=mammals, statistic=mam.w ) sqrt(var.linear(mam.L)) mam.mle <- c( nrow(mam), (5+sqrt(5))/10 ) mam.wild <- function( data, mle ) { d <- data i <- 2*rbinom( mle[1], size=1, prob=1-mle[2])-1 d$brain <- exp( d$fit+d$res*(1-i*sqrt(5))/2 ) d } mam.boot.wild <- boot( mam, fit.model, R=20, sim="parametric", ran.gen=mam.wild, mle=mam.mle) d.pred <- mam[c(46,47),] pred <- function( data, d.pred ) predict( glm(log(brain)~log(body),data=data), d.pred ) mam.pred <- function( data, i, i.pred, d.pred ) { d <- data d$brain <- exp( d$fit+d$res[i] ) pred( d, d.pred) - (d.pred$fit + d$res[i.pred] ) } mam.boot.pred <- boot( mam, mam.pred, R=199, m=2, d.pred=d.pred ) orig <- matrix(pred( mam, d.pred ),mam.boot.pred$R,2,byrow=T) exp(apply(orig+mam.boot.pred$t,2,quantile,c(0.025,0.5,0.975))) x1 <- runif(50); x2 <- runif(50); x3 <- runif(50) x4 <- runif(50); x5 <- runif(50); y <- rnorm(50)+2*x1+2*x2 fake <- data.frame(y,x1,x2,x3,x4,x5) subset.boot <- function(data, i, size=0) { n <- nrow(data) i.t <- i[1:(n-size)] data.t <- data[i.t, ] res0 <- data$y - mean(data.t$y) lm.d <- lm(y ~ x1, data=data.t) res1 <- data$y - predict.lm(lm.d, data) lm.d <- update(lm.d, .~.+x2) res2 <- data$y - predict.lm(lm.d, data) lm.d <- update(lm.d, .~.+x3) res3 <- data$y - predict.lm(lm.d, data) lm.d <- update(lm.d, .~.+x4) res4 <- data$y - predict.lm(lm.d, data) lm.d <- update(lm.d, .~.+x5) res5 <- data$y - predict.lm(lm.d, data) meansq <- function( y ) mean( y^2 ) apply(cbind(res0,res1,res2,res3,res4,res5),2,meansq)/n } fake.boot.40 <- boot(fake, subset.boot, R=100, size=40) delta.hat.40 <- apply(fake.boot.40$t,2,mean) plot(c(0:5),delta.hat.40,xlab="Number of covariates", ylab="Delta hat (M)",type="l",ylim=c(0,0.1)) .Random.seed <- fake.boot.40$seed fake.boot.30 <- boot(fake, subset.boot, R=100, size=30) delta.hat.30 <- apply(fake.boot.30$t,2,mean) lines((0:5),delta.hat.30,lty=2) 11.7 Further Regression ========================== calcium.fun <- function( data, i ) { d <- data[i,] d.nls <- nls(cal~beta0*(1-exp(-time*beta1)),data=d, start=list(beta0=5,beta1=0.2)) c(coefficients(d.nls),sum(d.nls$residuals^2)/(nrow(d)-2)) } cal.boot <- boot( calcium, calcium.fun, R=19, strata=calcium$time) leuk.glm <- glm(time~log10(wbc)+ag-1,Gamma(log),data=leuk) leuk.diag <- glm.diag(leuk.glm) muhat <- fitted(leuk.glm) rL <- log(leuk$time/muhat)/sqrt(1-leuk.diag$h) eps <- 10^(-4) u <- -log(seq(from=eps,to=1-eps,by=eps)) d <- sign(u-1)*sqrt( 2*(u-1-log(u)) )/leuk.diag$sd r.dev <- smooth.spline( d, u) z <- predict(r.dev, leuk.diag$rd)$y leuk.mle <- data.frame(muhat,rL,z) fit.model <- function(data) { data.glm <- glm(time~log10(wbc)+ag-1,Gamma(log),data=data) c(coefficients(data.glm),deviance(data.glm)) } leuk.gen <- function(data,mle) { i <- sample(nrow(data),replace=T) data$time <- mle$muhat*mle$z[i] data } leuk.boot <- boot(leuk, fit.model, R=19, sim="parametric", ran.gen=leuk.gen, mle=leuk.mle ) mel.cox <- coxph(Surv(time,status==1)~log(thickness)+strata(ulcer), data=melanoma) mel.surv <- survfit(mel.cox) mel.cens <- survfit(Surv(time-0.01*(status!=1),status!=1)~, data=melanoma) mel.fun <- function( d ) { attach(d) cox <- coxph(Surv(time,status==1)~log(thickness)+strata(ulcer)) eta <- unique(cox$linear.predictors) u <- unique(thickness) sp <- smooth.spline(u,eta,df=20) th <- seq(from=0.25,to=10,by=0.25) eta <- predict(sp,th)$y detach("d") eta } \end{verbatim} attach(melanoma) mel.boot <- censboot( melanoma, mel.fun, R=99, strata=ulcer) mel.boot.mod <- censboot( melanoma, mel.fun, R=99, F.surv=mel.surv, G.surv=mel.cens, strata=ulcer, cox=mel.cox, sim="model" ) mel.boot.con <- censboot( melanoma, mel.fun, R=99, F.surv=mel.surv, G.surv=mel.cens, strata=ulcer, cox=mel.cox, sim="cond" ) th <- seq(from=0.25,to=10,by=0.25) split.screen(c(2,1)) screen(1) plot(th,mel.boot$t0,type="n",xlab="tumour thickness (mm)", xlim=c(0,10),ylim=c(-2,2),ylab="linear predictor") lines(th,mel.boot$t0,lwd=3) rug(jitter(thickness)) for (i in 1:19) lines(th,mel.boot$t[i,],lwd=0.5) screen(2) plot(th,mel.boot$t0,type="n",xlab="tumour thickness (mm)", xlim=c(0,10),ylim=c(-2,2),ylab="linear predictor") lines(th,mel.boot$t0,lwd=3) mel.env <- envelope(mel.boot$t,level=0.95) lines(th,mel.env$point[1,],lty=1) lines(th,mel.env$point[2,],lty=1) mel.env <- envelope(mel.boot.mod$t,level=0.95) lines(th,mel.env$point[1,],lty=2) lines(th,mel.env$point[2,],lty=2) mel.env <- envelope(mel.boot.con$t,level=0.95) lines(th,mel.env$point[1,],lty=3) lines(th,mel.env$point[2,],lty=3) detach("melanoma") attach(motor) motor.smooth <- smooth.spline(times,accel,w=1/v) motor.small <- smooth.spline(times,accel,w=1/v, spar=motor.smooth$spar/2) motor.big <- smooth.spline(times,accel,w=1/v, spar=motor.smooth$spar*2) res <- (motor$accel-motor.small$y)/sqrt(1-motor.small$lev) motor.mle <- data.frame(bigfit=motor.big$y,res=res) xpoints <- c(10,20,25,30,35,45) motor.fun <- function( data, x ) { y.smooth <- smooth.spline(data$times,data$accel,w=1/data$v) predict(y.smooth,x)$y } motor.gen <- function( data, mle ) { d <- data i <- c(1:nrow(data)) i1 <- sample(i[data$strata==1],replace=T) i2 <- sample(i[data$strata==2],replace=T) i3 <- sample(i[data$strata==3],replace=T) d$accel <- mle$bigfit + mle$res[c(i1,i2,i3)] d } motor.boot <- boot( motor, motor.fun, R=999, sim="parametric", ran.gen=motor.gen, mle=motor.mle, x=xpoints ) mu.big <- predict(motor.big,xpoints)$y mu <- predict(motor.smooth,xpoints)$y ylims <- apply(motor.boot$t,2,quantile,c(0.05,0.95)) ytop <- mu - (ylims[1,]-mu.big) ybot <- mu - (ylims[2,]-mu.big) 11.8 Time Series ========================== sun <- 2*(sqrt(sunspot+1)-1) ts.plot(sun) sun.ar <- ar(sun) sun.ar$order sun.fun <- function(tsb) { ar.fit <- ar(tsb, order.max=25) c(ar.fit$order, mean(tsb), tsb) } sun.1 <- tsboot(sun, sun.fun, R=99, l=20, sim="fixed") tsplot( sun.1$t[1,3:291],main="Block simulation, l=20") table(sun.1$t[,1]) var(sun.1$t[,2]) qqnorm(sun.1$t[,2]) sun.2 <- tsboot(sun, sun.fun, R=99, l=20, sim="geom") sun.model <- list(order=c(sun.ar$order,0,0),ar=sun.ar$ar) sun.res <- sun.ar$resid[!is.na(sun.ar$resid)] sun.res <- sun.res - mean(sun.res) sun.sim <- function(res,n.sim, ran.args) { rg1 <- function(n, res) sample(res, n, replace=T) ts.orig <- ran.args$ts ts.mod <- ran.args$model mean(ts.orig)+rts(arima.sim(model=ts.mod, n=n.sim, rand.gen=rg1, res=as.vector(res))) } sun.3 <- tsboot(sun.res, sun.fun, R=99, sim="model", n.sim=114, ran.gen=sun.sim,ran.args=list(ts=sun, model=sun.model)) sun.black <- function(res, n.sim, ran.args) { ts.orig <- ran.args$ts ts.mod <- ran.args$model mean(ts.orig) + rts(arima.sim(model=ts.mod,n=n.sim,innov=res)) } sun.1b <- tsboot(sun.res, sun.fun, R=99, l=20, sim="fixed", ran.gen=sun.black,ran.args=list(ts=log(sun), model=sun.model)) 11.9 Improved Simulation ========================== city.bal <- boot( city, city.fun, R=20, sim="balanced") control( city.boot, bias.adj=T) city.con <- control( city.boot) city.con$L city.con$bias city.con$var city.con$quantiles city.top <- exp.tilt( L=city.L, theta=2, t0=city.w(city) ) city.bot <- exp.tilt( L=city.L, theta=1.2, t0=city.w(city) ) city.tilt <- boot( city, city.fun, R=c(100,99), weights=rbind(city.top$p,city.bot$p) ) imp.weights( city.tilt) imp.moments( city.tilt) imp.quantile( city.tilt) imp.prob( city.tilt, t0=1.2, def=F) z <- (city.tilt$t[,1]-city.tilt$t0[1])/sqrt(city.tilt$t[,2]) imp.quantile( boot.out=city.tilt, t=z ) city.tilt <- tilt.boot( city, city.fun, R=c(500,250,249)) q <- smooth.f( theta=1.4, boot.out=city.tilt ) city.w( city, q ) imp.moments( city.tilt, q=q) imp.quantile( city.tilt, q=q) saddle( A=city.L/nrow(city), u=2-city.w(city) ) city.t0 <- c(0, sqrt(var.linear( city.L)) ) city.sad <- saddle.distn( A=city.L/nrow(city), t0=city.t0 ) city.sad city.t0 <- c( city.w(city), sqrt(var.linear(city.L)) ) Afn <- function( t, data) data$x-t*data$u ufn <- function( t, data ) 0 saddle( A=Afn( 2, city), u=0 ) city.sad <- saddle.distn( A=Afn, u=ufn, t0=city.t0, data=city) K.adj <- function( xi ) { L <- city$x-city.t*city$u nrow(city)*log(sum( exp( xi*L ) )/nrow(city) )-city.t*xi } K2 <- function( xi ) { L <- city$x-city.t*city$u p <- exp( L*xi ) nrow(city)*( sum( L^2*p )/sum(p) - (sum(L*p)/sum(p))^2 ) } city.t <- 2 saddle( K.adj=K.adj, K2=K2) bigcity.L <- (bigcity$x-city.w(bigcity)*bigcity$u)/mean(bigcity$u) bigcity.t0 <- c( city.w(bigcity), sqrt(var.linear(bigcity.L)) ) Afn <- function( t, data ) cbind( data$x-t*data$u, 1 ) ufn <- function( t, data ) c(0,25) saddle( A=Afn(2, bigcity), u=ufn(2, bigcity), wdist="p", type="cond") city.sad <- saddle.distn( A=Afn, u=ufn, wdist="p", type="cond", data=bigcity, t0=bigcity.t0) 11.10 Semiparametric likelihoods ================================ gam.L _ function( y, tmin=min(y)+0.1, tmax=max(y)-0.1, n.t ) { gam.loglik <- function( l.nu, mu, y) { nu <- exp(l.nu) -sum( log( dgamma( nu*y/mu, nu)*nu/mu ) ) } out <- matrix( NA, n.t+1, 3) for (it in 0:n.t) { t <- tmin + (tmax-tmin)*it/n.t fit <- nlminb( 0, gam.loglik, mu=t, y=y) out[1+it,] <- c( t, exp(fit$parameters), -fit$objective) } out } air.gam <- gam.L( aircondit7$hours, 40, 120, 100) air.gam[,3] <- air.gam[,3] - max(air.gam[,3]) plot(air.gam[,1],air.gam[,3],type="l",xlab="theta", ylab="Log likelihood",xlim=c(40,120)) abline(h=-0.5*qchisq(0.95,1),lty=2) air.EL <- EL.profile(aircondit7$hours,tmin=40,tmax=120,n.t=100) lines(air.EL[,1],air.EL[,2],lty=2) air.EEF <- EEF.profile(aircondit7$hours,tmin=40,tmax=120,n.t=100) lines(air.EEF[,1],air.EEF[,3],lty=3)