################################################################################ ### file: roc.R ### author: Peter DeWitt ### ### purpose: take in a formula for a logistic regression model and ### return a data frame with the sensitivity and specificity ### ### two main functions to write, the function to find the ### sensitivity and specificity, returning a data frame and a ### second function to plot the roc curves using the ggplot2 ### library. ### ### changes: 18 March 2011 - file creation - no need for two functions ### everything you need is in one function. ### ### 3 August 2011 - reformated the code to meet the google style guide and ### added the to the function to show the ROC for the training set and for the ### testing set on the same plot. ### ################################################################################ roc <- function(formula, data, testing.data = NULL, show.auc = TRUE){ require(ggplot2) training.model <- glm(formula, data=data, family = binomial()) testing.model <- glm(formula, data=testing.data, family = binomial()) training.pred.vals <- training.model$fitted.values testing.pred.vals <- predict(training.model, newdata = testing.data, type = "response") true.pos <- function(x, pred.vals, model){ sum((pred.vals >= x) & (model$y)) } true.neg <- function(x, pred.vals, model){ sum((pred.vals < x) & !(model$y)) } false.pos <- function(x, pred.vals, model){ sum((pred.vals >= x) & !(model$y)) } false.neg <- function(x, pred.vals, model){ sum((pred.vals < x) & (model$y)) } n <- 200 # number of thresholds to check x <- matrix(seq(1, 0, length = n)) training.true.positives <- apply(x, 1, true.pos, pred.vals = training.pred.vals, model = training.model) training.true.negatives <- apply(x, 1, true.neg, pred.vals = training.pred.vals, model = training.model) training.false.positives <- apply(x, 1, false.pos, pred.vals = training.pred.vals, model = training.model) training.false.negatives <- apply(x, 1, false.neg, pred.vals = training.pred.vals, model = training.model) training.sensitivity <- training.true.positives / (training.true.positives + training.false.negatives) training.specificity <- training.true.negatives / (training.true.negatives + training.false.positives) testing.true.positives <- apply(x, 1, true.pos, pred.vals = testing.pred.vals, model = testing.model) testing.true.negatives <- apply(x, 1, true.neg, pred.vals = testing.pred.vals, model = testing.model) testing.false.positives <- apply(x, 1, false.pos, pred.vals = testing.pred.vals, model = testing.model) testing.false.negatives <- apply(x, 1, false.neg, pred.vals = testing.pred.vals, model = testing.model) testing.sensitivity <- testing.true.positives / (testing.true.positives + testing.false.negatives) testing.specificity <- testing.true.negatives / (testing.true.negatives + testing.false.positives) df <- data.frame( false.positives = c(1 - training.specificity, 1 - testing.specificity), true.positives = c(training.sensitivity, testing.sensitivity), model = gl(2, k = n, labels = c("Training", "Validating"))) training.auc <- sum((df[2:n, 1] - df[1:(n-1), 1]) * 1/2 * (df[2:n, 2] + df[1:(n-1), 2])) testing.auc <- sum((df[(n+2):(2*n), 1] - df[(n+1):(2*n-1), 1]) * 1/2 * (df[(n+2):(2*n), 2] + df[(n+1):(2*n-1), 2])) q <- ggplot(data = df) + aes(x = false.positives, y = true.positives) + geom_line(aes(colour = model), size = 1.25) + geom_segment(aes(x = 0, y = 0), xend = 1, yend = 1, colour = "black", lty = 2) + xlab("1$-$specificity") + ylab("Sensitivity") + opts(legend.position = c(0.85, 0.50), legend.background = theme_rect(fill = 'white')) + theme_bw() if (show.auc){ q <- q + geom_text(aes(x = 0.6, y = 0.1), hjust = 0, vjust = 0, label = paste("Training AUC:", round(training.auc, 4))) + geom_text(aes(x = 0.6, y = 0.0), hjust = 0, vjust = 0, label = paste("Validation AUC:", round(testing.auc, 4))) } return(list(testing.auc = training.auc, validation.auc = testing.auc, plot = q)) } #example data set to use while writing the function #mydata <- read.csv(url("http://www.ats.ucla.edu/stat/r/dae/binary.csv")) #mylogit <- glm(admit~gre+gpa+as.factor(rank), family=binomial(link="logit"), # na.action=na.pass, data = mydata) #temp <- # roc(admit ~ gre + gpa, # data = mydata[1:200, ], # testing.data = mydata[201:400, ]) #temp$auc #temp$values #temp$plot + opts(title = ("Monkeys")) ################################################### ### End of file ### End of file ### End of file ### ###################################################