# Function for plotting an ROC curve # assumes that obs is a vector of type logical and that probs is a vector of probabilities # probs[i] is the model probability that obs[i] == TRUE # returns a vector of threshold probabilities p and for each p[i] the false +ve and true +ve classification rates ROC <- function(obs, probs) { na.idx <- is.na(obs) | is.na(probs) obs <- obs[!na.idx] probs <- probs[!na.idx] n <- length(obs) n.pos <- sum(obs) delta <- 0.001 p <- seq(0, 1, by=delta) k <- length(p) falsepos <- rep(0, k) truepos <- rep(0, k) for (i in 1:k) { pred <- probs >= p[i] falsepos[i] <- sum(pred & !obs)/(n - n.pos) truepos[i] <- sum(pred & obs)/n.pos } plot(falsepos, truepos, type="l", xlab="false +ves", ylab="true +ves", main="ROC") return(data.frame('p' = p, 'falsepos' = falsepos, 'truepos' = truepos)) } # Logistic Regression for stagec data # Load data stagec <- read.csv('stagec.csv') stagec$ploidy <- factor(stagec$ploidy, labels = c('diploid', 'tetraploid', 'aneuploid')) stagec$eet <- factor(stagec$eet, labels = 0:1) stagec <- stagec[-1] # Remove pgtime # Remove rows with missing data na.idx <- apply(is.na.data.frame(stagec), 1, sum) stagec <- stagec[!na.idx,] # Split into test and training sets n <- length(stagec[[1]]) test.idx <- 1:(n/4) # or randomly using test.idx <- sort(sample(n, n/4)) stagec.test <- stagec[test.idx,] stagec.train <- stagec[-test.idx,] # Full model stagec.full <- glm(pgstat ~ age + eet + g2 + grade + gleason + ploidy, family = binomial, data = stagec.train) summary(stagec.full) stagec.full.out <- predict(stagec.full, newdata = stagec.test, type = 'response') roc.full <- ROC(stagec.test$pgstat == 1, stagec.full.out) # Stepwise model selection library(MASS) stagec.reduced <- stepAIC(stagec.full) # for a large data set this will take some time stagec.reduced.out <- predict(stagec.reduced, newdata = stagec.test, type = 'response') windows() roc.reduced <- ROC(stagec.test$pgstat == 1, stagec.reduced.out) # Plot the effects of selected factors on model probabilities factors <- c('g2', 'grade', 'ploidy') stagec.reduced.terms <- predict(stagec.reduced, newdata = stagec.train, type = 'terms', terms = factors) windows() par(mfrow=c(1,3)) for (i in 1:3) { j <- (1:7)[names(stagec.train) == factors[i]] plot(stagec.train[[j]], stagec.reduced.terms[,i], xlab = factors[i], ylab = 'contribution') } par(mfrow=c(1,1))