###################################################
## Functions to augment Poisson regression modeling

joint.fun <- function(x, mu, stdev, df, i = NULL, ...) {
    ## Poisson distribution with log link function.
    ## This function assumes that we integrate over the distribution of the
    ## linear predictor as a Student's t distribution with mean mu and scale
    ## stdev, with df degrees of freedom:
    ##   integral(-Inf, Inf) f(theta) g(theta, x) dtheta, where f is a density,
    ## becomes, with the substitution z = F(theta), theta = Finverse(z),
    ## dz = f(theta) dtheta:
    ##   integral(0, 1) g(Finverse(z), x) dz.
    ## This makes the integral better behaved, as it mirrors the Monte Carlo
    ## method with uniform samples of theta.
    lx <- mu + stdev * qt(x, df)
    rate <- exp(lx)
    if (is.null(i)) return(rate)
    exp(-rate) * rate^i / factorial(i)
}

posterior <- function(N = NULL, Nbins = NULL, jf = joint.fun, pval = c(.05, .5, .95),
                      truncated = TRUE, ...) {
    ## Evaluates the posterior probabilities given the joint function of the
    ## likelihood with the posterior. The parameters mu, stdev and df are
    ## arguments to joint.fun but are passed using ..., so this function is
    ## generic and could be used with any joint.fun for discrete data.
    ## When N is not NULL, truncated = TRUE renormalizes the distribution over
    ## 0:N; truncated = FALSE makes the last bin P(x > N).
    if (is.null(N)) {
        warning("Quantile output cannot be combined from multiple models")
        qval <- jf(x = pval, ...)
        names(qval) <- paste(pval * 100, "%", sep = "")
        return(qval)
    }
    if (N == 0)  # integral of the rate itself, i.e. the posterior mean rate
        return(integrate(jf, lower = 0, upper = 1, ... = ..., i = NULL)$value)
    round <- !is.null(Nbins)
    end <- N + 2 - truncated
    pvalue <- rep(0.0, end)
    for (i in 1:(N + 1)) {
        test <- try(integrate(jf, lower = 0, upper = 1, ... = ..., i = i - 1), silent = TRUE)
        if (!inherits(test, "try-error")) pvalue[i] <- test$value
    }
    if (truncated) pvalue <- pvalue / sum(pvalue)
    else pvalue[end] <- 1 - sum(pvalue)
    if (round) out <- posterior.histogram(p = pvalue, N = Nbins)
    else out <- pvalue
    if (truncated) names(out) <- 0:N
    else names(out) <- c(0:N, paste(N + 1, "+"))
    return(out)
}

posterior.histogram <- function(p, N) {
    ## Takes probabilities and converts them to binned integer counts summing to N.
    count <- p * N
    alpha <- uniroot(f = function(x) sum(trunc(count + x)) - N, c(0, 1))
    count <- trunc(count + alpha$root)
    if (alpha$f.root > 0)  # root finder failed to reach zero
        warning(paste("Total count", sum(count), "cannot be made equal to", N))
    count
}
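## Example (a minimal sketch; the numbers are illustrative, not from a fitted
## model): the posterior predictive count distribution for a linear predictor
## with mean log(5), scale 0.2 and 20 residual degrees of freedom, truncated
## at 15 counts, then binned into 100 pseudo-observations.
#   p <- posterior(N = 15, mu = log(5), stdev = 0.2, df = 20)
#   sum(p)                               # 1 by construction when truncated = TRUE
#   posterior.histogram(p = p, N = 100)  # integer bin counts summing to 100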
posterior.poisson <- function(object, newdata, bias = TRUE, accuracy = 10^-10, use.t = TRUE,
                              rate = FALSE, simple = FALSE, ...) {
    ## Companion to glm() for the Poisson family, augmented to approximate the
    ## overdispersed Poisson as well. Currently the fit is only for Poisson
    ## random variables. When the model family quasi(link = log, variance = mu)
    ## is specified, the dispersion is provided.
    if (simple) type <- "response" else type <- "link"
    new <- predict.glm(object = object, newdata = newdata, type = type, se.fit = TRUE)
    ## se.fit assumes the responses are independent.
    if (simple) return(rbind(mu1 = new$fit, mu2 = new$se.fit^2 + new$fit^2))
    if (use.t) df <- object$df.residual else df <- 10^12  # large value approximates the normal
    if (df < 10)  # protect predictions
        stop("Prediction cannot be done with residual degrees of freedom < 10 using t-distribution")
    if (bias) {
        ## Bias correction; FWIW, E(exp(a*x)) for a t distribution of any
        ## degrees of freedom is unbounded, so this is an approximation,
        ## good for large N.
        new$fit <- new$fit - new$se.fit^2/2 * df/(df - 2) -
                   new$se.fit^4/4 * (1/(df - 4)) * (df/(df - 2))^2
    }
    out <- vector(mode = "list", length = length(new$fit))
    if (rate) {
        return(function(rate, type = c("density", "prob"), lower.tail = TRUE, sel = NULL) {
            if (!is.null(sel)) new <- lapply(new, "[", sel)
            out <- outer(log(rate), new$fit, "-") / outer(rep(1, length(rate)), new$se.fit, "*")
            type <- match.arg(type)
            if (type == "density") {
                ## dout = drate/(rate*se), so f(rate) = f(out)*dout/drate = f(out)/(se*rate);
                ## approximately a gamma distribution with mean = exp(fit + se^2/2),
                ## var = mean^2*(exp(se^2)-1); shape = mean^2/var = 1/(exp(se^2)-1);
                ## rate = shape/mean.
                out <- dt(x = out, df = df) / outer(rate, new$se.fit, "*")
                out[rate <= 0] <- 0  # not quite accurate for the t distribution but reasonable here
            } else out <- pt(q = out, df = df, lower.tail = lower.tail)
            dimnames(out) <- list(rate = format(rate, digits = 4), pred = 1:length(new$fit))
            return(out)
        })
    }
    for (i in seq(along = out))
        out[[i]] <- posterior(mu = new$fit[i], stdev = new$se.fit[i], df = df,
                              rel.tol = accuracy, ...)
    names(out) <- 1:length(out)
    return(as.matrix(data.frame(out, check.names = FALSE)))
}

################ BIC and verification

bic.poisson <- function(glmobject, ...) {
    ## Takes the output of bic.glm and makes predictions based on the data.
    ## Returns a single matrix of count predictions mixed over the models.
    predictions <- lapply(glmobject$models, function(x) posterior.poisson(x, ... = ...))
    probs <- glmobject$postprob
    if (mode(predictions[[1]]) == "function") {
        return(function(rate, type = "density", lower.tail = TRUE, sel = NULL) {
            predictions2 <- lapply(predictions,
                function(pfun, rr = rate, Type = type, Lower.tail = lower.tail, Select = sel)
                    pfun(rate = rr, type = Type, lower.tail = Lower.tail, sel = Select))
            mixpredictions(predictions = predictions2, probs = probs)
        })
    } else out <- mixpredictions(predictions = predictions, probs = glmobject$postprob)
    if (length(rownames(out)) == 2 && all(rownames(out) == c("mu1", "mu2"))) {
        out[2, ] <- sqrt(out[2, ] - out[1, ]^2)  # convert the second moment to a standard error
        rownames(out) <- c("mu", "se")
    }
    out
}

mixpredictions <- function(predictions, probs) {
    ## Mixes the per-model prediction arrays using the posterior model probabilities.
    names <- dimnames(predictions[[1]])  # keep
    dims <- dim(predictions[[1]])        # save
    lenp <- length(predictions)
    predictions <- array(unlist(predictions), c(dims, lenp))
    dimnames(predictions) <- c(names, list(model = 1:lenp))
    predictions <- aperm(predictions, c(3, 1, 2))  # dimension 1 = model
    probs <- array(probs, dim(predictions))        # recycle probabilities through models
    ppredictions <- predictions * probs
    out <- apply(ppredictions, c(2, 3), sum)
    out
}
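## Example (a minimal sketch with simulated data; names and values are
## illustrative): posterior predictive count distributions from a plain
## Poisson glm, one column per row of newdata.
#   set.seed(1)
#   d <- data.frame(x = runif(50), y = rpois(50, 5))
#   fit <- glm(y ~ x, family = poisson, data = d)
#   posterior.poisson(object = fit, newdata = d[1:3, ], N = 15)         # rows are counts 0:15
#   posterior.poisson(object = fit, newdata = d[1:3, ], simple = TRUE)  # mu1 = E(x), mu2 = E(x^2)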
crossvalidation <- function(data, modeltype = c("bic", "glm", "best"),
                            validation = c("coverage", "mean"), N = 20, formula = NULL,
                            family = "poisson", progressbar = TRUE, trace = FALSE,
                            run.test = TRUE, BlockSize = 1, ...) {
    ## Simple leave-one-out (or leave-BlockSize-out) cross validation.
    ##
    ## Coverage probabilities: a prediction is made for the lower and upper
    ## limits (here counts); both symmetrical and smallest intervals can be
    ## chosen. For example, for 90% coverage the "smallest" covtype interval is
    ## the smallest (a, b) such that P(count in (a, b)) >= .9; the "symmetrical"
    ## covtype interval is the largest a and smallest b such that
    ## P(count < a) <= .05 and P(count > b) <= .05, so P(count in (a, b)) >= .90.
    ## ... arguments are passed to the prediction routine.
    ##
    ## If the validation type is "mean", a matrix with one row per observation
    ## and columns observed, predicted, sigmax is returned, and several error
    ## measurements are calculated, e.g. mean square error, mean absolute
    ## deviation, logarithmic error. sigmax is the prediction error for x:
    ## sqrt(var(E(x|lambda)) + E(var(x|lambda))), so given mu and se from the
    ## prediction algorithm for the rate it is sqrt(mu + se^2). The distribution
    ## is approximately negative binomial with p = mu/sigmax^2 and
    ## r = mu*p/(1-p). (Note that the rate is approximately gamma distributed
    ## with alpha = r and beta = p/(1-p).)
    ##
    ## If the validation type is "coverage", the output is a list with two
    ## arrays and two vectors, one called lower and another called upper; the
    ## vector of observed coverages is returned as well as the original.
    ## BlockSize determines the chunk sizes.
    modeltype <- match.arg(modeltype)
    validation <- match.arg(validation)
    predout <- NULL
    R1 <- nrow(data)
    BlockSize <- max(1, min(R1, BlockSize))
    R <- (R1 - 1) %/% BlockSize + 1
    RT <- c(1:R1, rep(0, R * BlockSize - R1))  # pad row indices with zeros (zero indices are ignored by subscripting)
    R <- length(RT) %/% BlockSize
    dim(RT) <- c(R, BlockSize)  # one block of row indices per row
    RT <- RT[, colSums(RT) > 0, drop = FALSE]
    BlockSize <- ncol(RT)
    if (!progressbar || !require(tcltk)) progressbar <- FALSE
    if (progressbar)
        pb <- tkProgressBar(title = paste("Leave one out cross validation with Blocksize", BlockSize),
                            label = "Initializing", min = 0, max = R, initial = 0, width = 300)
    for (i in 1:R) {
        if (progressbar)
            setTkProgressBar(pb, value = i, label = paste(i, "of", R, "validations complete", sep = " "))
        ## Set up the cross validation split.
        xdata <- data[-RT[i, ], ]
        ydata <- data[RT[i, ], ]
        if (R == 1) {
            warning("in-sample validation")
            xdata <- data
            ydata <- data
        }
        if (modeltype == "bic" || modeltype == "best") {
            output <- bic.glm.formula(f = formula, data = xdata, glm.family = family,
                                      return.model = TRUE, ...)
            if (trace)
                cat(paste(paste(output$label, 100 * round(output$postprob, 2), sep = ":"),
                          collapse = ";"), "\n")
            if (modeltype == "best")
                output <- list(postprob = 1, models = list(output$models[[1]]))
        } else {
            output <- glm(data = xdata, family = family, formula = formula,
                          control = glm.control(), ...)
            output <- list(postprob = 1, models = list(output))
        }
        ## Now we can use the same prediction function regardless of model type.
        if (validation == "mean")
            ## Generates a matrix with top row mu and bottom row se of the rate.
            predout <- cbind(predout, bic.poisson(glmobject = output, newdata = ydata,
                                                  simple = TRUE, ...))
        else
            ## Force N since we need counts here; rates cannot be verified against counts.
            predout <- cbind(predout, bic.poisson(glmobject = output, newdata = ydata,
                                                  simple = FALSE, N = N, ...))
    }
    if (R > 1) {
        ## Restore the original row order of the predictions.
        tRT <- t(RT)
        Rorder <- order(tRT[tRT > 0])
        predout <- predout[, Rorder]
    }
    attr(predout, "y") <- model.extract(model.frame(formula = formula, data = data), "response")
    if (run.test && validation == "coverage") predout <- PIT(predout, ...)
    return(predout)
}
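## Example (a minimal sketch with simulated data; illustrative only):
## leave-one-out cross validation of a plain Poisson glm, scored with the
## probability integral transform and the scoring rules computed by PIT().
#   set.seed(1)
#   d <- data.frame(x = runif(40), y = pmin(rpois(40, 4), 15))  # clamp y to the count grid 0:15
#   cv <- crossvalidation(data = d, modeltype = "glm", validation = "coverage",
#                         N = 15, formula = y ~ x, family = "poisson",
#                         progressbar = FALSE)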
PIT <- function(df, u = seq(0, 1, .05), y = NULL, eps = 1e-7, plot = TRUE,
                titletext = "Bayesian Model Average", use.aic = NULL,
                print.tests = TRUE, nbest = NULL, ...) {
    ## PIT: probability integral transform for discrete predictions.
    ## df is the matrix of predicted distributions for each left-out
    ## observation, i.e. df[, 1] is the predicted distribution for the first
    ## observation: one column per observation and one row per count value
    ## (0:M). df is generated by crossvalidation(). y is the vector of
    ## observations.
    ## u is a vector of probabilities. For continuous data, F(u) is just the
    ## empirical probability that F(y) <= u, i.e. the mean value of F(y) <= u.
    ## For discrete data we randomize: let U = F(y-1)*(1-V) + F(y)*V, i.e.
    ## U = F(y-1) + (F(y) - F(y-1))*V with V uniform on [0, 1]. Then
    ## F(u) = P(U <= u) is 0 for u <= F(y-1), 1 for u >= F(y), and
    ## (u - F(y-1))/(F(y) - F(y-1)) = (u - F(y-1))/f(y) in between, where f(y)
    ## is the probability mass function.
    ## To get the histogram, take differences of the cumulative output.
    ## use.aic and nbest are dummy arguments used to "eat" variables passed via
    ## ... so that rect() does not throw a warning.
    if (is.null(y)) y <- attr(df, "y")
    ones <- colSums(df)
    M <- ncol(df)
    if (any(abs(ones - 1) > eps)) stop("Density must add to 1")
    if (length(y) != M)
        stop(paste("Observation length", length(y), "not equal to", M,
                   "number of predicted distributions"))
    lu <- length(u)  # number of histogram samples
    if (any(abs(ones - 1) > 0)) df <- t(t(df) / ones)  # normalize
    P <- rbind(rep(0, M), apply(df, 2, cumsum))  # P[y+1, j] = F(y-1), with F(-1) = 0
    selections <- cbind(y + 1, 1:M)
    Py1 <- rep(P[selections], lu)  # F(y-1)
    dy <- rep(df[selections], lu)  # f(y)
    tfun <- function(x) ifelse(x <= 0, 0, ifelse(x >= 1, 1, x))  # piecewise linear clamp to [0, 1]
    z <- tfun((rep(u, each = M) - Py1) / dy)
    dim(z) <- c(M, lu)
    out <- colSums(z) / M  # if f(y) is 0 this counts the observations with F(y) <= u
    cumulative <- data.frame(P = out, u = u)
    if (plot) PIT.plot(cumulative, titletext = titletext, ...)
    scoretests <- rmse(df, y)
    mu <- scoretests$mu
    sigma <- scoretests$sigma
    tests <- unlist(scoretests[-(1:2)])
    alltests <- c(tests, RPS = rankedprobscore(df, y))
    if (print.tests) print(alltests)
    return(list(Pdf = cumulative, tests = alltests, mu = mu, sigma = sigma, y = y, df = df))
}

rmse <- function(df, y) {
    ## Calculates scoring rules using mean values; tests courtesy of Czado et al.
    df <- t(t(df) / colSums(df))
    range <- 0:(nrow(df) - 1)
    mu <- colSums(df * range)
    sigma <- sqrt(colSums(df * range^2) - mu^2)
    px <- df[cbind(y + 1, 1:ncol(df))]  # predicted mass at the observed count
    p2 <- colSums(df^2)
    qs <- mean(-2 * px + p2)            # quadratic (Brier) score
    sphs <- mean(-px / sqrt(p2))        # spherical score
    lss <- mean(-log(px))               # logarithmic score
    z <- y - mu
    mse <- mean(z^2)                    # mean squared error
    nses <- mean((z / sigma)^2)         # normalized squared error score
    dss <- nses + mean(2 * log(sigma))  # Dawid-Sebastiani score
    mad <- mean(abs(z))                 # mean absolute deviation
    return(list(mu = mu, sigma = sigma, LogS = lss, QS = qs, SphS = sphs,
                MSE = mse, nses = nses, DSS = dss, mad = mad))
}

rankedprobscore <- function(df, y) {
    ## Ranked probability score. df is the matrix of predicted distributions
    ## (one column per observation, one row per count value 0:M), as generated
    ## by crossvalidation(); y is the vector of observations.
    pk <- apply(df, 2, cumsum)  # predictive cumulative distributions
    M <- nrow(df) - 1
    N <- ncol(df)
    yk <- outer(0:M, y, ">=")   # empirical (step) cumulative distributions
    score <- (pk - yk)^2
    return(sum(score) / length(y))
}

PIT.plot <- function(Pdf, titletext = "Bayesian Model Average", add = FALSE, ...) {
    ## Draws the PIT histogram implied by the cumulative PIT values in Pdf.
    x <- Pdf$P
    u <- Pdf$u
    ytop <- diff(x)
    ybottom <- rep(0, length(u) - 1)
    xright <- u[-1]
    xleft <- rev(rev(u)[-1])
    maxx <- max(ytop)
    if (!add)
        plot(type = "n", x = c(0, 1), y = c(0, maxx),
             xlab = "Probability Integral Transform", ylab = "relative frequency",
             main = titletext)
    rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, ...)
}
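## Example (a minimal sketch; a well-calibrated forecast yields a roughly flat
## PIT histogram): score a constant Poisson(4) forecast against Poisson(4) draws.
#   set.seed(2)
#   y <- pmin(rpois(200, 4), 14)                         # clamp to the count grid 0:14
#   df <- matrix(dpois(0:14, 4), nrow = 15, ncol = 200)  # same forecast for every observation
#   df <- t(t(df) / colSums(df))                         # renormalize after truncation
#   res <- PIT(df, y = y, titletext = "Poisson(4) forecast")
#   res$tests                                            # LogS, QS, SphS, MSE, DSS, RPS, ...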