################################################################################ 
### file:    roc.R
### author:  Peter DeWitt
### 
### purpose: take in a formula for a logistic regression model and
###          return a data frame with the sensitivity and specificity
###          
###          two main functions to write, the function to find the 
###          sensitivity and specificity, returning a data frame and a
###          second function to plot the roc curves using the ggplot2
###          library.
###
### changes: 18 March 2011 - file creation - no need for two functions
###                          everything you need is in one function.
###
###   3 August 2011 - reformated the code to meet the google style guide and
###  added the to the function to show the ROC for the training set and for the
###  testing set on the same plot.
###   
################################################################################


roc <- function(formula, data, testing.data = NULL, show.auc = TRUE){
  require(ggplot2)

  training.model <- glm(formula, data=data, family = binomial())
  testing.model  <- glm(formula, data=testing.data, family = binomial())

  training.pred.vals <- training.model$fitted.values
  testing.pred.vals  <- predict(training.model, newdata = testing.data, 
                                type = "response") 
     
  true.pos <- function(x, pred.vals, model){
                sum((pred.vals >= x) & (model$y))
              }
  true.neg <- function(x, pred.vals, model){
                sum((pred.vals <  x) & !(model$y))
              }

  false.pos <- function(x, pred.vals, model){
                 sum((pred.vals >= x) & !(model$y))
               }
  false.neg <- function(x, pred.vals, model){
                 sum((pred.vals <  x) & (model$y))
               }

  n <- 200 # number of thresholds to check
  x <- matrix(seq(1, 0, length = n))
  
  training.true.positives  <- apply(x, 1, true.pos, 
                                    pred.vals = training.pred.vals, 
                                    model = training.model)
  training.true.negatives  <- apply(x, 1, true.neg, 
                                    pred.vals = training.pred.vals, 
                                    model = training.model)
  training.false.positives <- apply(x, 1, false.pos, 
                                    pred.vals = training.pred.vals, 
                                    model = training.model)
  training.false.negatives <- apply(x, 1, false.neg, 
                                    pred.vals = training.pred.vals, 
                                    model = training.model)

  training.sensitivity <- training.true.positives / (training.true.positives +
                                                     training.false.negatives)
  training.specificity <- training.true.negatives / (training.true.negatives + 
                                                     training.false.positives)
  
  testing.true.positives  <- apply(x, 1, true.pos, 
                                   pred.vals = testing.pred.vals, 
                                   model = testing.model)
  testing.true.negatives  <- apply(x, 1, true.neg, 
                                   pred.vals = testing.pred.vals, 
                                   model = testing.model)
  testing.false.positives <- apply(x, 1, false.pos, 
                                   pred.vals = testing.pred.vals, 
                                   model = testing.model)
  testing.false.negatives <- apply(x, 1, false.neg, 
                                   pred.vals = testing.pred.vals, 
                                   model = testing.model)

  testing.sensitivity <- testing.true.positives / (testing.true.positives + 
                                                   testing.false.negatives)
  testing.specificity <- testing.true.negatives / (testing.true.negatives + 
                                                   testing.false.positives)
  
  
  df <- data.frame(
    false.positives = c(1 - training.specificity, 1 - testing.specificity), 
    true.positives  = c(training.sensitivity, testing.sensitivity),
    model           = gl(2, k = n, labels = c("Training", "Validating")))

  training.auc <- sum((df[2:n, 1] - df[1:(n-1), 1]) * 1/2 * 
                      (df[2:n, 2] + df[1:(n-1), 2]))

  testing.auc <- sum((df[(n+2):(2*n), 1] - df[(n+1):(2*n-1), 1]) * 1/2 * 
                     (df[(n+2):(2*n), 2] + df[(n+1):(2*n-1), 2]))  
    
  q <- ggplot(data = df) + 
         aes(x = false.positives, y = true.positives) + 
         geom_line(aes(colour = model), size = 1.25) +  
         geom_segment(aes(x = 0, y = 0), xend = 1, yend = 1, 
                      colour = "black", lty = 2) +
         xlab("1$-$specificity") + ylab("Sensitivity") + 
         opts(legend.position = c(0.85, 0.50),
              legend.background = theme_rect(fill = 'white')) +
         theme_bw()
  
  if (show.auc){
    q <- q + geom_text(aes(x = 0.6, y = 0.1), hjust = 0, vjust = 0, 
                       label = paste("Training AUC:", round(training.auc, 4))) + 
             geom_text(aes(x = 0.6, y = 0.0), hjust = 0, vjust = 0,
                       label = paste("Validation AUC:", round(testing.auc, 4)))
  }

  return(list(testing.auc = training.auc, 
              validation.auc = testing.auc, 
              plot = q))

}

#example data set to use while writing the function
#mydata <- read.csv(url("http://www.ats.ucla.edu/stat/r/dae/binary.csv"))
#mylogit <- glm(admit~gre+gpa+as.factor(rank), family=binomial(link="logit"),
#               na.action=na.pass, data = mydata)

#temp <- 
#  roc(admit ~ gre + gpa, 
#      data = mydata[1:200, ], 
#      testing.data = mydata[201:400, ]) 
#temp$auc
#temp$values
#temp$plot + opts(title = ("Monkeys"))

###################################################
### End of file ### End of file ### End of file ###
###################################################