###################################################
## Functions to augment Poisson regression modeling

joint.fun <- function(x, mu, stdev, df, i = NULL, ...) {
    ## Poisson distribution with log link function.
    ## This function assumes that we integrate over the distribution of the
    ## linear predictor as a Student's t distribution with mean mu and scale
    ## stdev, with df degrees of freedom:
    ##   integral(-Inf, Inf) f(theta) g(theta, x) dtheta, where f is a density,
    ## becomes, with the substitution z = F(theta), theta = Finverse(z),
    ## dz = f(theta) dtheta:
    ##   integral(0, 1) g(Finverse(z), x) dz.
    ## This makes the integral better behaved, as it mirrors the Monte Carlo
    ## method with uniform samples of theta.
    lx <- mu + stdev * qt(x, df)
    rate <- exp(lx)
    if (is.null(i)) return(rate)
    exp(-rate) * rate^i / factorial(i)
}

posterior <- function(N = NULL, Nbins = NULL, jf = joint.fun, pval = c(.05, .5, .95),
                      truncated = TRUE, ...) {
    ## Evaluates the posterior probabilities given the joint function of the
    ## likelihood with the posterior. The parameters mu, stdev and df are
    ## arguments to joint.fun but are passed using ..., so this function is
    ## generic and could be used with any joint.fun for discrete data.
    ## When N is not NULL, truncated = TRUE renormalizes the distribution over
    ## 0:N; truncated = FALSE makes the last bin P(x > N).
    if (is.null(N)) {
        warning("Quantile output cannot be combined from multiple models")
        qval <- jf(x = pval, ...)
        names(qval) <- paste(pval * 100, "%", sep = "")
        return(qval)
    }
    if (N == 0)  # integral of the rate itself, i.e. the posterior mean rate
        return(integrate(jf, lower = 0, upper = 1, ... = ..., i = NULL)$value)
    round <- !is.null(Nbins)
    end <- N + 2 - truncated
    pvalue <- rep(0.0, end)
    for (i in 1:(N + 1)) {
        test <- try(integrate(jf, lower = 0, upper = 1, ... = ..., i = i - 1), silent = TRUE)
        if (!inherits(test, "try-error")) pvalue[i] <- test$value
    }
    if (truncated) pvalue <- pvalue / sum(pvalue)
    else pvalue[end] <- 1 - sum(pvalue)
    if (round) out <- posterior.histogram(p = pvalue, N = Nbins)
    else out <- pvalue
    if (truncated) names(out) <- 0:N
    else names(out) <- c(0:N, paste(N + 1, "+"))
    return(out)
}

posterior.histogram <- function(p, N) {
    ## Takes probabilities and converts them to binned integer counts summing to N.
    count <- p * N
    alpha <- uniroot(f = function(x) sum(trunc(count + x)) - N, c(0, 1))
    count <- trunc(count + alpha$root)
    if (alpha$f.root > 0)  # root finder failed to reach zero
        warning(paste("Total count", sum(count), "cannot be made equal to", N))
    count
}
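## Example (a minimal sketch; the numbers are illustrative, not from a fitted
## model): the posterior predictive count distribution for a linear predictor
## with mean log(5), scale 0.2 and 20 residual degrees of freedom, truncated
## at 15 counts, then binned into 100 pseudo-observations.
#   p <- posterior(N = 15, mu = log(5), stdev = 0.2, df = 20)
#   sum(p)                               # 1 by construction when truncated = TRUE
#   posterior.histogram(p = p, N = 100)  # integer bin counts summing to 100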
posterior.poisson <- function(object, newdata, bias = TRUE, accuracy = 10^-10, use.t = TRUE,
                              rate = FALSE, simple = FALSE, ...) {
    ## Companion to glm() for the Poisson family, augmented to approximate the
    ## overdispersed Poisson as well. Currently the fit is only for Poisson
    ## random variables. When the model family quasi(link = log, variance = mu)
    ## is specified, the dispersion is provided.
    if (simple) type <- "response" else type <- "link"
    new <- predict.glm(object = object, newdata = newdata, type = type, se.fit = TRUE)
    ## se.fit assumes the responses are independent.
    if (simple) return(rbind(mu1 = new$fit, mu2 = new$se.fit^2 + new$fit^2))
    if (use.t) df <- object$df.residual else df <- 10^12  # large value approximates the normal
    if (df < 10)  # protect predictions
        stop("Prediction cannot be done with residual degrees of freedom < 10 using t-distribution")
    if (bias) {
        ## Bias correction; FWIW, E(exp(a*x)) for a t distribution of any
        ## degrees of freedom is unbounded, so this is an approximation,
        ## good for large N.
        new$fit <- new$fit - new$se.fit^2/2 * df/(df - 2) -
                   new$se.fit^4/4 * (1/(df - 4)) * (df/(df - 2))^2
    }
    out <- vector(mode = "list", length = length(new$fit))
    if (rate) {
        return(function(rate, type = c("density", "prob"), lower.tail = TRUE, sel = NULL) {
            if (!is.null(sel)) new <- lapply(new, "[", sel)
            out <- outer(log(rate), new$fit, "-") / outer(rep(1, length(rate)), new$se.fit, "*")
            type <- match.arg(type)
            if (type == "density") {
                ## dout = drate/(rate*se), so f(rate) = f(out)*dout/drate = f(out)/(se*rate);
                ## approximately a gamma distribution with mean = exp(fit + se^2/2),
                ## var = mean^2*(exp(se^2)-1); shape = mean^2/var = 1/(exp(se^2)-1);
                ## rate = shape/mean.
                out <- dt(x = out, df = df) / outer(rate, new$se.fit, "*")
                out[rate <= 0] <- 0  # not quite accurate for the t distribution but reasonable here
            } else out <- pt(q = out, df = df, lower.tail = lower.tail)
            dimnames(out) <- list(rate = format(rate, digits = 4), pred = 1:length(new$fit))
            return(out)
        })
    }
    for (i in seq(along = out))
        out[[i]] <- posterior(mu = new$fit[i], stdev = new$se.fit[i], df = df,
                              rel.tol = accuracy, ...)
    names(out) <- 1:length(out)
    return(as.matrix(data.frame(out, check.names = FALSE)))
}

################ BIC and verification

bic.poisson <- function(glmobject, ...) {
    ## Takes the output of bic.glm and makes predictions based on the data.
    ## Returns a single matrix of count predictions mixed over the models.
    predictions <- lapply(glmobject$models, function(x) posterior.poisson(x, ... = ...))
    probs <- glmobject$postprob
    if (mode(predictions[[1]]) == "function") {
        return(function(rate, type = "density", lower.tail = TRUE, sel = NULL) {
            predictions2 <- lapply(predictions,
                function(pfun, rr = rate, Type = type, Lower.tail = lower.tail, Select = sel)
                    pfun(rate = rr, type = Type, lower.tail = Lower.tail, sel = Select))
            mixpredictions(predictions = predictions2, probs = probs)
        })
    } else out <- mixpredictions(predictions = predictions, probs = glmobject$postprob)
    if (length(rownames(out)) == 2 && all(rownames(out) == c("mu1", "mu2"))) {
        out[2, ] <- sqrt(out[2, ] - out[1, ]^2)  # convert the second moment to a standard error
        rownames(out) <- c("mu", "se")
    }
    out
}

mixpredictions <- function(predictions, probs) {
    ## Mixes the per-model prediction arrays using the posterior model probabilities.
    names <- dimnames(predictions[[1]])  # keep
    dims <- dim(predictions[[1]])        # save
    lenp <- length(predictions)
    predictions <- array(unlist(predictions), c(dims, lenp))
    dimnames(predictions) <- c(names, list(model = 1:lenp))
    predictions <- aperm(predictions, c(3, 1, 2))  # dimension 1 = model
    probs <- array(probs, dim(predictions))        # recycle probabilities through models
    ppredictions <- predictions * probs
    out <- apply(ppredictions, c(2, 3), sum)
    out
}
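## Example (a minimal sketch with simulated data; names and values are
## illustrative): posterior predictive count distributions from a plain
## Poisson glm, one column per row of newdata.
#   set.seed(1)
#   d <- data.frame(x = runif(50), y = rpois(50, 5))
#   fit <- glm(y ~ x, family = poisson, data = d)
#   posterior.poisson(object = fit, newdata = d[1:3, ], N = 15)         # rows are counts 0:15
#   posterior.poisson(object = fit, newdata = d[1:3, ], simple = TRUE)  # mu1 = E(x), mu2 = E(x^2)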
crossvalidation <- function(data, modeltype = c("bic", "glm", "best"),
                            validation = c("coverage", "mean"), N = 20, formula = NULL,
                            family = "poisson", progressbar = TRUE, trace = FALSE,
                            run.test = TRUE, BlockSize = 1, ...) {
    ## Simple leave-one-out (or leave-BlockSize-out) cross validation.
    ##
    ## Coverage probabilities: a prediction is made for the lower and upper
    ## limits (here counts); both symmetrical and smallest intervals can be
    ## chosen. For example, for 90% coverage the "smallest" covtype interval is
    ## the smallest (a, b) such that P(count in (a, b)) >= .9; the "symmetrical"
    ## covtype interval is the largest a and smallest b such that
    ## P(count < a) <= .05 and P(count > b) <= .05, so P(count in (a, b)) >= .90.
    ## ... arguments are passed to the prediction routine.
    ##
    ## If the validation type is "mean", a matrix with one row per observation
    ## and columns observed, predicted, sigmax is returned, and several error
    ## measurements are calculated, e.g. mean square error, mean absolute
    ## deviation, logarithmic error. sigmax is the prediction error for x:
    ## sqrt(var(E(x|lambda)) + E(var(x|lambda))), so given mu and se from the
    ## prediction algorithm for the rate it is sqrt(mu + se^2). The distribution
    ## is approximately negative binomial with p = mu/sigmax^2 and
    ## r = mu*p/(1-p). (Note that the rate is approximately gamma distributed
    ## with alpha = r and beta = p/(1-p).)
    ##
    ## If the validation type is "coverage", the output is a list with two
    ## arrays and two vectors, one called lower and another called upper; the
    ## vector of observed coverages is returned as well as the original.
    ## BlockSize determines the chunk sizes.
    modeltype <- match.arg(modeltype)
    validation <- match.arg(validation)
    predout <- NULL
    R1 <- nrow(data)
    BlockSize <- max(1, min(R1, BlockSize))
    R <- (R1 - 1) %/% BlockSize + 1
    RT <- c(1:R1, rep(0, R * BlockSize - R1))  # pad row indices with zeros (zero indices are ignored by subscripting)
    R <- length(RT) %/% BlockSize
    dim(RT) <- c(R, BlockSize)  # one block of row indices per row
    RT <- RT[, colSums(RT) > 0, drop = FALSE]
    BlockSize <- ncol(RT)
    if (!progressbar || !require(tcltk)) progressbar <- FALSE
    if (progressbar)
        pb <- tkProgressBar(title = paste("Leave one out cross validation with Blocksize", BlockSize),
                            label = "Initializing", min = 0, max = R, initial = 0, width = 300)
    for (i in 1:R) {
        if (progressbar)
            setTkProgressBar(pb, value = i, label = paste(i, "of", R, "validations complete", sep = " "))
        ## Set up the cross validation split.
        xdata <- data[-RT[i, ], ]
        ydata <- data[RT[i, ], ]
        if (R == 1) {
            warning("in-sample validation")
            xdata <- data
            ydata <- data
        }
        if (modeltype == "bic" || modeltype == "best") {
            output <- bic.glm.formula(f = formula, data = xdata, glm.family = family,
                                      return.model = TRUE, ...)
            if (trace)
                cat(paste(paste(output$label, 100 * round(output$postprob, 2), sep = ":"),
                          collapse = ";"), "\n")
            if (modeltype == "best")
                output <- list(postprob = 1, models = list(output$models[[1]]))
        } else {
            output <- glm(data = xdata, family = family, formula = formula,
                          control = glm.control(), ...)
            output <- list(postprob = 1, models = list(output))
        }
        ## Now we can use the same prediction function regardless of model type.
        if (validation == "mean")
            ## Generates a matrix with top row mu and bottom row se of the rate.
            predout <- cbind(predout, bic.poisson(glmobject = output, newdata = ydata,
                                                  simple = TRUE, ...))
        else
            ## Force N since we need counts here; rates cannot be verified against counts.
            predout <- cbind(predout, bic.poisson(glmobject = output, newdata = ydata,
                                                  simple = FALSE, N = N, ...))
    }
    if (R > 1) {
        ## Restore the original row order of the predictions.
        tRT <- t(RT)
        Rorder <- order(tRT[tRT > 0])
        predout <- predout[, Rorder]
    }
    attr(predout, "y") <- model.extract(model.frame(formula = formula, data = data), "response")
    if (run.test && validation == "coverage") predout <- PIT(predout, ...)
    return(predout)
}
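## Example (a minimal sketch with simulated data; illustrative only):
## leave-one-out cross validation of a plain Poisson glm, scored with the
## probability integral transform and the scoring rules computed by PIT().
#   set.seed(1)
#   d <- data.frame(x = runif(40), y = pmin(rpois(40, 4), 15))  # clamp y to the count grid 0:15
#   cv <- crossvalidation(data = d, modeltype = "glm", validation = "coverage",
#                         N = 15, formula = y ~ x, family = "poisson",
#                         progressbar = FALSE)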
PIT <- function(df, u = seq(0, 1, .05), y = NULL, eps = 1e-7, plot = TRUE,
                titletext = "Bayesian Model Average", use.aic = NULL,
                print.tests = TRUE, nbest = NULL, ...) {
    ## PIT: probability integral transform for discrete predictions.
    ## df is the matrix of predicted distributions for each left-out
    ## observation, i.e. df[, 1] is the predicted distribution for the first
    ## observation: one column per observation and one row per count value
    ## (0:M). df is generated by crossvalidation(). y is the vector of
    ## observations.
    ## u is a vector of probabilities. For continuous data, F(u) is just the
    ## empirical probability that F(y) <= u, i.e. the mean value of F(y) <= u.
    ## For discrete data we randomize: let U = F(y-1)*(1-V) + F(y)*V, i.e.
    ## U = F(y-1) + (F(y) - F(y-1))*V with V uniform on [0, 1]. Then
    ## F(u) = P(U <= u) is 0 for u <= F(y-1), 1 for u >= F(y), and
    ## (u - F(y-1))/(F(y) - F(y-1)) = (u - F(y-1))/f(y) in between, where f(y)
    ## is the probability mass function.
    ## To get the histogram, take differences of the cumulative output.
    ## use.aic and nbest are dummy arguments used to "eat" variables passed via
    ## ... so that rect() does not throw a warning.
    if (is.null(y)) y <- attr(df, "y")
    ones <- colSums(df)
    M <- ncol(df)
    if (any(abs(ones - 1) > eps)) stop("Density must add to 1")
    if (length(y) != M)
        stop(paste("Observation length", length(y), "not equal to", M,
                   "number of predicted distributions"))
    lu <- length(u)  # number of histogram samples
    if (any(abs(ones - 1) > 0)) df <- t(t(df) / ones)  # normalize
    P <- rbind(rep(0, M), apply(df, 2, cumsum))  # P[y+1, j] = F(y-1), with F(-1) = 0
    selections <- cbind(y + 1, 1:M)
    Py1 <- rep(P[selections], lu)  # F(y-1)
    dy <- rep(df[selections], lu)  # f(y)
    tfun <- function(x) ifelse(x <= 0, 0, ifelse(x >= 1, 1, x))  # piecewise linear clamp to [0, 1]
    z <- tfun((rep(u, each = M) - Py1) / dy)
    dim(z) <- c(M, lu)
    out <- colSums(z) / M  # if f(y) is 0 this counts the observations with F(y) <= u
    cumulative <- data.frame(P = out, u = u)
    if (plot) PIT.plot(cumulative, titletext = titletext, ...)
    scoretests <- rmse(df, y)
    mu <- scoretests$mu
    sigma <- scoretests$sigma
    tests <- unlist(scoretests[-(1:2)])
    alltests <- c(tests, RPS = rankedprobscore(df, y))
    if (print.tests) print(alltests)
    return(list(Pdf = cumulative, tests = alltests, mu = mu, sigma = sigma, y = y, df = df))
}

rmse <- function(df, y) {
    ## Calculates scoring rules using mean values; tests courtesy of Czado et al.
    df <- t(t(df) / colSums(df))
    range <- 0:(nrow(df) - 1)
    mu <- colSums(df * range)
    sigma <- sqrt(colSums(df * range^2) - mu^2)
    px <- df[cbind(y + 1, 1:ncol(df))]  # predicted mass at the observed count
    p2 <- colSums(df^2)
    qs <- mean(-2 * px + p2)            # quadratic (Brier) score
    sphs <- mean(-px / sqrt(p2))        # spherical score
    lss <- mean(-log(px))               # logarithmic score
    z <- y - mu
    mse <- mean(z^2)                    # mean squared error
    nses <- mean((z / sigma)^2)         # normalized squared error score
    dss <- nses + mean(2 * log(sigma))  # Dawid-Sebastiani score
    mad <- mean(abs(z))                 # mean absolute deviation
    return(list(mu = mu, sigma = sigma, LogS = lss, QS = qs, SphS = sphs,
                MSE = mse, nses = nses, DSS = dss, mad = mad))
}

rankedprobscore <- function(df, y) {
    ## Ranked probability score. df is the matrix of predicted distributions
    ## (one column per observation, one row per count value 0:M), as generated
    ## by crossvalidation(); y is the vector of observations.
    pk <- apply(df, 2, cumsum)  # predictive cumulative distributions
    M <- nrow(df) - 1
    N <- ncol(df)
    yk <- outer(0:M, y, ">=")   # empirical (step) cumulative distributions
    score <- (pk - yk)^2
    return(sum(score) / length(y))
}

PIT.plot <- function(Pdf, titletext = "Bayesian Model Average", add = FALSE, ...) {
    ## Draws the PIT histogram implied by the cumulative PIT values in Pdf.
    x <- Pdf$P
    u <- Pdf$u
    ytop <- diff(x)
    ybottom <- rep(0, length(u) - 1)
    xright <- u[-1]
    xleft <- rev(rev(u)[-1])
    maxx <- max(ytop)
    if (!add)
        plot(type = "n", x = c(0, 1), y = c(0, maxx),
             xlab = "Probability Integral Transform", ylab = "relative frequency",
             main = titletext)
    rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, ...)
}
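## Example (a minimal sketch; a well-calibrated forecast yields a roughly flat
## PIT histogram): score a constant Poisson(4) forecast against Poisson(4) draws.
#   set.seed(2)
#   y <- pmin(rpois(200, 4), 14)                         # clamp to the count grid 0:14
#   df <- matrix(dpois(0:14, 4), nrow = 15, ncol = 200)  # same forecast for every observation
#   df <- t(t(df) / colSums(df))                         # renormalize after truncation
#   res <- PIT(df, y = y, titletext = "Poisson(4) forecast")
#   res$tests                                            # LogS, QS, SphS, MSE, DSS, RPS, ...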