# This is spam.R file
# It contains some commands that might be helpful for
# the spam problem

library(rpart)

# Column names shared by spam.data, spam.test and spam.train: 48 word-frequency
# columns (wf*), 6 character-frequency columns (cf*), 3 capital-run-length
# columns (crl*) and the class label `spam`.
spam.names <- c(
  "wfmake", "wfaddress", "wfall", "wf3d", "wfour", "wfover", "wfremove",
  "wfinternet", "wforder", "wfmail", "wfreceive", "wfwill", "wfpeople",
  "wfreport", "wfaddresses", "wffree", "wfbusiness", "wfemail", "wfyou",
  "wfcredit", "wfyour", "wffont", "wf000", "wfmoney", "wfhp", "wfhpl",
  "wfgeorge", "wf650", "wflab", "wflabs", "wftelnet", "wf857", "wfdata",
  "wf415", "wf85", "wftechnology", "wf1999", "wfparts", "wfpm", "wfdirect",
  "wfcs", "wfmeeting", "wforiginal", "wfproject", "wfre", "wfedu", "wftable",
  "wfconference", "cfsc", "cfpar", "cfbrack", "cfexc", "cfdollar", "cfpound",
  "crlaverage", "crllongest", "crltotal", "spam"
)

# Read one comma-separated spam file, attach the shared column names and turn
# the `spam` label into a factor. Extra arguments are forwarded to read.table().
read_spam <- function(path, ...) {
  spam_df <- read.table(path, sep = ",", ...)
  names(spam_df) <- spam.names
  spam_df$spam <- as.factor(spam_df$spam)
  spam_df
}

# Prune a tree grown with cp = 0 back to the complexity parameter that has the
# smallest cross-validated error in its cptable. Cross-validation folds are
# drawn at random, so the selected cp can differ between runs.
prune_at_min_xerror <- function(tree) {
  cp_table <- tree$cptable
  if (is.null(cp_table) || !("xerror" %in% colnames(cp_table))) {
    # No cross-validation results to choose from: leave the tree as it is.
    return(tree)
  }
  best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
  prune(tree, cp = best_cp)
}

# Test error rates given true and predicted labels (coded 0 = non-spam,
# 1 = spam). Returns a named vector:
#   total   - fraction of all test cases that are misclassified
#   spam    - fraction of true spam predicted as non-spam
#   nonspam - fraction of true non-spam predicted as spam
test_error_rates <- function(truevalues, predictions) {
  missed_spam <- truevalues == 1 & predictions == 0
  flagged_nonspam <- truevalues == 0 & predictions == 1
  c(
    total = (sum(missed_spam) + sum(flagged_nonspam)) / length(truevalues),
    spam = sum(missed_spam) / sum(truevalues == 1),
    nonspam = sum(flagged_nonspam) / sum(truevalues == 0)
  )
}

# Part A:
spam.data <- read_spam("spam.data", na.strings = NA)
dim(spam.data)

# Part B
spam.test <- read_spam("spam.test")
spam.train <- read_spam("spam.train")

# Grow the full tree (cp = 0) with 10-fold cross-validation, then prune.
my.control <- rpart.control(cp = 0, xval = 10)
spamtraintree <- rpart(
  spam ~ .,
  data = spam.train,
  method = "class",
  control = my.control
)

# find optimal cp and prune:
besttraintree <- prune_at_min_xerror(spamtraintree)

# make predictions using besttraintree; go through the predict() generic
# because predict.rpart is not exported by current versions of rpart.
predictions <- predict(besttraintree, newdata = spam.test, type = "class")

# compute totaltesterror, spamtesterror, nonspamtesterror:
truevalues <- spam.test$spam
len <- length(truevalues)
errors <- test_error_rates(truevalues, predictions)
totaltesterror <- errors[["total"]]
spamtesterror <- errors[["spam"]]
nonspamtesterror <- errors[["nonspam"]]

totaltesterror
spamtesterror
nonspamtesterror

# Part C:
my.control <- rpart.control(cp = 0, xval = 10)

# matrix() fills column-wise, so loss1[2, 1] = 2 and loss1[1, 2] = 1.
# NOTE(review): rpart documents L[i, j] as the loss of calling a class-i case
# class j, which makes a missed spam twice as costly as a flagged non-spam
# here -- confirm this is the intended direction.
loss1 <- matrix(c(0, 2, 1, 0), ncol = 2)
spamtraintree1 <- rpart(
  spam ~ .,
  data = spam.train,
  method = "class",
  control = my.control,
  parms = list(loss = loss1)
)

# find optimal cp and prune, then evaluate on the test set:
besttraintree1 <- prune_at_min_xerror(spamtraintree1)
predictions1 <- predict(besttraintree1, newdata = spam.test, type = "class")
errors1 <- test_error_rates(truevalues, predictions1)
errors1

# Same layout as loss1, with the off-diagonal cost raised from 2 to 10.
loss2 <- matrix(c(0, 10, 1, 0), ncol = 2)
spamtraintree2 <- rpart(
  spam ~ .,
  data = spam.train,
  method = "class",
  control = my.control,
  parms = list(loss = loss2)
)

# find optimal cp and prune, then evaluate on the test set:
besttraintree2 <- prune_at_min_xerror(spamtraintree2)
predictions2 <- predict(besttraintree2, newdata = spam.test, type = "class")
errors2 <- test_error_rates(truevalues, predictions2)
errors2