[R] predict() for LDA and GLM

palanski palanski at gmail.com
Thu Mar 22 14:02:13 CET 2012


Here is the full code. See the last part, marked #(f), for the question
being asked in this post:

#(a) Split datapoints into training (70 points) and test (30 points) sets.
#Read in ass4-data.txt and ass3-phodata.txt
ass4data = read.delim('http://www.moseslab.csb.utoronto.ca/alan/ass4-data.txt',
                      header = FALSE, sep = "\t")

#Separate all positive and negative hits
ass4q1.neg = ass4data[which(ass4data[,1] == 0),]
ass4q1.pos = ass4data[which(ass4data[,1] == 1),]

#Reset row names
rownames(ass4q1.neg) = NULL
rownames(ass4q1.pos) = NULL

#Sample 70% (35 out of 50 in each positive/negative set) for the training
#set, the rest for the test set
ass4q1.negRid = sample(1:nrow(ass4q1.neg),floor(0.7*nrow(ass4q1.neg)))
ass4q1.posRid = sample(1:nrow(ass4q1.pos),floor(0.7*nrow(ass4q1.pos)))

#Combine negative and positive rows from each subset to create the training
#and test sets
ass4q1.trainSet = as.matrix(rbind(ass4q1.neg[ass4q1.negRid,],
                                  ass4q1.pos[ass4q1.posRid,]))
ass4q1.testSet = rbind(ass4q1.neg[-(ass4q1.negRid),],
                       ass4q1.pos[-(ass4q1.posRid),])

#Reset row names
rownames(ass4q1.trainSet) = NULL
rownames(ass4q1.testSet) = NULL

ass4q1.trainSetDF = as.data.frame(ass4q1.trainSet)
ass4q1.trainSetDF$V1 = factor(ass4q1.trainSetDF$V1)

ass4q1.testSetDF = as.data.frame(ass4q1.testSet)
ass4q1.testSetDF$V1 = factor(ass4q1.testSetDF$V1)
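
#A quick sanity check of the split sizes (a small addition; the expected
#counts follow from the 70/30 split described above):
stopifnot(nrow(ass4q1.trainSetDF) == 70, nrow(ass4q1.testSetDF) == 30)
table(ass4q1.trainSetDF$V1)  #expect 35 of each class in the training set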


##############
#(b)Load MASS, e1071 and glmnet
library(MASS)
library(e1071)
library(glmnet)

#############
#(c)How many features does the data contain?
#The data contains 32 features (data columns 2 through 33; column 1 is the
#binary class label)

#############
#(d)How does the number of parameters required for Naïve Bayes, LDA, and
#Logistic Regression (unregularized) scale as a function of the number of
#features?

#If Y is binary and there are p features <X1 ... Xp>, count the parameters
#each model must estimate.

#Naive Bayes
#One class prior P(Y=1), plus a per-class mean and variance for every
#feature: mu(Y=1), mu(Y=0), sigma(Y=1), sigma(Y=0) for each of the p
#features.
#Parameters = 1 + 4p

#Linear Discriminant Analysis
#Have to estimate one shared covariance matrix and p mean values per class.
#The covariance matrix is p x p, but it is symmetric, so the upper and
#lower triangles are duplicates; counting one triangle plus the diagonal
#gives p(p + 1)/2 entries.
#Calculating p mean values for each class is 2p (2 classes of binary Y).
#Thus:
#Parameters = p(p + 1)/2 + 2p

#Logistic Regression
#One intercept plus one coefficient per feature.
#Parameters = 1 + p
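
#Quick numeric check of the three formulas at p = 32 (the full feature
#count in this data set):
p = 32
c(NB = 1 + 4*p, LDA = p*(p + 1)/2 + 2*p, LR = 1 + p)
#gives NB = 129, LDA = 592, LR = 33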

#To plot the relationship: 
#Columns hold the parameter counts for Naive Bayes, LDA and Logistic
#Regression, respectively
ass4q1.dVS = matrix(NA, ncol(ass4q1.trainSet) - 1, 3)

for (p in 1:(ncol(ass4q1.trainSet) - 1)){
ass4q1.dVS[p,1] = (1 + (4*p))
ass4q1.dVS[p,2] = ((p*(p + 1) / 2) + 2*p)
ass4q1.dVS[p,3] = (1 + p)
}


png('ass4q1.dVS.png')
plot(ass4q1.dVS[,2], type="o", col="blue", ylim=c(0,max(ass4q1.dVS)),
     ann=FALSE)
lines(ass4q1.dVS[,1], type="o", pch=22, lty=2, col="red")
lines(ass4q1.dVS[,3], type="o", pch=23, lty=3, col="green")
title(main = "Number of parameters as a function of features",
      col.main="red", font.main=4)
title(xlab= "Features", col.lab="red")
title(ylab= "Parameters", col.lab="red")
legend(1, max(ass4q1.dVS), c("LDA", "Naive Bayes", "Logistic Regression"),
       cex=0.8, col=c("blue","red","green"), pch=c(1,22,23), lty=1:3)
dev.off()

#############
#(e)Train Naïve Bayes, LDA and Logistic Regression to classify the training
#data using the first two, four, eight, 16 or 32 features, starting from the
#left of the file. Plot the classification error (FP + FN)/(TP+FP+TN+FN) on
#the training data as a function of the number of parameters for each method.

#Contingency table organized as:
#TN FN
#FP TP

#Organize tables to store the data (column 1: number of parameters,
#column 2: classification error):
ass4q1.dNBtable = matrix(NA, 5, 2)
ass4q1.dLDAtable = matrix(NA, 5, 2)
ass4q1.dGLMtable = matrix(NA, 5, 2)

i = 1
for(p in c(2,4,8,16,32)){
	ass4q1.dNBtable[i,1] = (1 + (4*p))	
	ass4q1.dLDAtable[i,1] = ((p *(p + 1) / 2) + 2*p)
	ass4q1.dGLMtable[i,1] = (1+p)
	i = i+1
}
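
#The parameter column can equivalently be filled without the loop (a sketch
#using the same formulas; "feats" is a new helper vector):
feats = c(2, 4, 8, 16, 32)
ass4q1.dNBtable[,1] = 1 + 4*feats
ass4q1.dLDAtable[,1] = feats*(feats + 1)/2 + 2*feats
ass4q1.dGLMtable[,1] = 1 + feats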

#Copying blank tables for part (f)
ass4q1.dNBtable.testData = ass4q1.dNBtable
ass4q1.dLDAtable.testData = ass4q1.dLDAtable
ass4q1.dGLMtable.testData = ass4q1.dGLMtable


#############
#2 Features
#2 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:3], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:3]),
                          ass4q1.trainSetDF[,1])

ass4q1.dNBtable[1,2] = (ass4q1.dNB.cTable[2,1] +
                        ass4q1.dNB.cTable[1,2])/(sum(ass4q1.dNB.cTable))

#2 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:3])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:3])$class,
                           ass4q1.trainSetDF[,1])

ass4q1.dLDAtable[1,2] = (ass4q1.dLDA.cTable[2,1] +
                         ass4q1.dLDA.cTable[1,2])/(sum(ass4q1.dLDA.cTable))

#2 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:3],
                  family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:3]),
                           ass4q1.trainSetDF[,1])

ass4q1.dGLMtable[1,2] = ((35-sum(ass4q1.dGLM.cTable[1:35,1]) +
                         (35-sum(ass4q1.dGLM.cTable[36:70,2]))) / 70)
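
#A more direct way to build a 2x2 contingency table from the logistic model
#(a sketch, not the original approach: threshold the fitted probabilities
#at 0.5; ass4q1.dGLM.pred is a new helper object):
ass4q1.dGLM.pred = factor(as.numeric(predict(ass4q1.dGLM,
                          type = "response") > 0.5), levels = c(0,1))
table(ass4q1.dGLM.pred, ass4q1.trainSetDF[,1])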

#############
#4 Features
#4 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:5], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:5]),
                          ass4q1.trainSetDF[,1])

ass4q1.dNBtable[2,2] = (ass4q1.dNB.cTable[2,1] +
                        ass4q1.dNB.cTable[1,2])/(sum(ass4q1.dNB.cTable))

#4 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:5])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:5])$class,
                           ass4q1.trainSetDF[,1])

ass4q1.dLDAtable[2,2] = (ass4q1.dLDA.cTable[2,1] +
                         ass4q1.dLDA.cTable[1,2])/(sum(ass4q1.dLDA.cTable))

#4 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:5],
                  family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:5]),
                           ass4q1.trainSetDF[,1])

ass4q1.dGLMtable[2,2] = ((35-sum(ass4q1.dGLM.cTable[1:35,1]) +
                         (35-sum(ass4q1.dGLM.cTable[36:70,2]))) / 70)

#############
#8 Features
#8 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:9], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:9]),
                          ass4q1.trainSetDF[,1])

ass4q1.dNBtable[3,2] = (ass4q1.dNB.cTable[2,1] +
                        ass4q1.dNB.cTable[1,2])/(sum(ass4q1.dNB.cTable))

#8 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:9])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:9])$class,
                           ass4q1.trainSetDF[,1])

ass4q1.dLDAtable[3,2] = (ass4q1.dLDA.cTable[2,1] +
                         ass4q1.dLDA.cTable[1,2])/(sum(ass4q1.dLDA.cTable))

#8 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:9],
                  family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:9]),
                           ass4q1.trainSetDF[,1])

ass4q1.dGLMtable[3,2] = ((35-sum(ass4q1.dGLM.cTable[1:35,1]) +
                         (35-sum(ass4q1.dGLM.cTable[36:70,2]))) / 70)

#############
#16 Features
#16 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:17], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:17]),
                          ass4q1.trainSetDF[,1])

ass4q1.dNBtable[4,2] = (ass4q1.dNB.cTable[2,1] +
                        ass4q1.dNB.cTable[1,2])/(sum(ass4q1.dNB.cTable))

#16 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:17])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:17])$class,
                           ass4q1.trainSetDF[,1])

ass4q1.dLDAtable[4,2] = (ass4q1.dLDA.cTable[2,1] +
                         ass4q1.dLDA.cTable[1,2])/(sum(ass4q1.dLDA.cTable))

#16 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:17],
                  family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:17]),
                           ass4q1.trainSetDF[,1])

ass4q1.dGLMtable[4,2] = ((35-sum(ass4q1.dGLM.cTable[1:35,1]) +
                         (35-sum(ass4q1.dGLM.cTable[36:70,2]))) / 70)

#############
#32 Features
#32 features for NaiveBayes
ass4q1.dNB = naiveBayes(ass4q1.trainSetDF[,2:33], ass4q1.trainSetDF[,1])
ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.trainSetDF[,2:33]),
                          ass4q1.trainSetDF[,1])

ass4q1.dNBtable[5,2] = (ass4q1.dNB.cTable[2,1] +
                        ass4q1.dNB.cTable[1,2])/(sum(ass4q1.dNB.cTable))

#32 features for LDA
ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:33])
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.trainSetDF[,2:33])$class,
                           ass4q1.trainSetDF[,1])

ass4q1.dLDAtable[5,2] = (ass4q1.dLDA.cTable[2,1] +
                         ass4q1.dLDA.cTable[1,2])/(sum(ass4q1.dLDA.cTable))

#32 features for GLM
ass4q1.dGLM = glm(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:33],
                  family = "binomial")
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.trainSetDF[,2:33]),
                           ass4q1.trainSetDF[,1])

ass4q1.dGLMtable[5,2] = ((35-sum(ass4q1.dGLM.cTable[1:35,1]) +
                         (35-sum(ass4q1.dGLM.cTable[36:70,2]))) / 70)

png('ass4q1.dTables.png')
plot(ass4q1.dLDAtable[,1], ass4q1.dLDAtable[,2], type="o", col="blue",
     ylim=c(0,.4), ann=FALSE)
lines(ass4q1.dNBtable[,1], ass4q1.dNBtable[,2], type="o", pch=22, lty=2,
      col="red")
lines(ass4q1.dGLMtable[,1], ass4q1.dGLMtable[,2], type="o", pch=23, lty=3,
      col="green")
title(main = "Classification error as a function of number of parameters",
      col.main="red", font.main=4)
title(xlab= "Parameters", col.lab="red")
title(ylab= "(FP + FN)/(TP+FP+TN+FN)", col.lab="red")
legend(300, .4, c("LDA", "Naive Bayes", "Logistic Regression"), cex=0.8,
       col=c("blue","red","green"), pch=c(1,22,23), lty=1:3)
dev.off()
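
#The five near-identical blocks above could also be written as one loop.
#A sketch (not the original code; nb.err, lda.err, glm.err, d, fit and ct
#are new helper names): fitting LDA and GLM via a formula with a data
#argument is also what lets predict() accept newdata in part (f) below.
nb.err = lda.err = glm.err = numeric(5)
i = 1
for (p in c(2, 4, 8, 16, 32)) {
	d = ass4q1.trainSetDF[, 1:(p + 1)]   #V1 plus the first p features
	#Naive Bayes
	fit = naiveBayes(d[, -1], d$V1)
	ct = table(predict(fit, d[, -1]), d$V1)
	nb.err[i] = (ct[2,1] + ct[1,2]) / sum(ct)
	#LDA: formula + data, so predict(fit, newdata) works later
	fit = lda(V1 ~ ., data = d)
	ct = table(predict(fit, d)$class, d$V1)
	lda.err[i] = (ct[2,1] + ct[1,2]) / sum(ct)
	#Logistic regression: threshold fitted probabilities at 0.5
	fit = glm(V1 ~ ., data = d, family = "binomial")
	ct = table(factor(as.numeric(fitted(fit) > 0.5), levels = c(0,1)), d$V1)
	glm.err[i] = (ct[2,1] + ct[1,2]) / sum(ct)
	i = i + 1
}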

#############
#(f)Plot the classification error as a function of the number of parameters
#on the test data for each method. Does this differ from your answer in
#part (e)? Explain why.

#############
#2 Features
#2 features for NaiveBayes

ass4q1.dNB.cTable = table(predict(ass4q1.dNB, ass4q1.testSetDF[,2:3]),
                          ass4q1.testSetDF[,1])

ass4q1.dNBtable.testData[1,2] = (ass4q1.dNB.cTable[2,1] +
                                 ass4q1.dNB.cTable[1,2])/(sum(ass4q1.dNB.cTable))

#WORKS!


#2 features for LDA
ass4q1.dLDA.cTable = table(predict(ass4q1.dLDA, ass4q1.testSetDF[,2:3])$class,
                           ass4q1.testSetDF[,1])

#DOESN'T WORK!

ass4q1.dLDAtable.testData[1,2] = (ass4q1.dLDA.cTable[2,1] +
                                  ass4q1.dLDA.cTable[1,2])/(sum(ass4q1.dLDA.cTable))


#2 features for GLM
ass4q1.dGLM.cTable = table(predict(ass4q1.dGLM, ass4q1.testSetDF[,2:3]),
                           ass4q1.testSetDF[,1])
#DOESN'T WORK!

ass4q1.dGLMtable.testData[1,2] = ((35-sum(ass4q1.dGLM.cTable[1:35,1]) +
                                  (35-sum(ass4q1.dGLM.cTable[36:70,2]))) / 70)
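
#For reference, a minimal sketch of the fix suggested in the reply quoted
#below: describe the model with a formula over named columns and pass the
#data.frame via the data argument, so predict() can match those columns in
#newdata (ass4q1.dLDA2, ass4q1.dGLM2 and ass4q1.dGLM2.p are new names; the
#2-feature models from part (e) are refit here):
ass4q1.dLDA2 = lda(V1 ~ V2 + V3, data = ass4q1.trainSetDF)
table(predict(ass4q1.dLDA2, ass4q1.testSetDF)$class, ass4q1.testSetDF[,1])

ass4q1.dGLM2 = glm(V1 ~ V2 + V3, data = ass4q1.trainSetDF,
                   family = "binomial")
ass4q1.dGLM2.p = predict(ass4q1.dGLM2, newdata = ass4q1.testSetDF,
                         type = "response")
table(factor(as.numeric(ass4q1.dGLM2.p > 0.5), levels = c(0,1)),
      ass4q1.testSetDF[,1])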
Uwe Ligges-3 wrote
> 
> On 22.03.2012 03:24, palanski wrote:
>> Hi!
>>
>> I'm using GLM, LDA and NaiveBayes for binomial classification. My
>> training
>> set is 70 rows long with 32 features, and my test set is 30 rows long
>> with
>> 32 features.
>>
>> Using Naive Bayes, I can train a model, and then predict the test set
>> with
>> it like so:
>>
>> ass4q1.dLDA = lda(ass4q1.trainSet[,1]~ass4q1.trainSet[,2:3])
>> table(predict(ass4q1.dNB, ass4q1.testSetDF[,2:3]), ass4q1.testSetDF[,1])
>>
>>
>> However, when the same is done for LDA or GLM, suddenly it tells me that
>> the
>> number of rows doesn't match and doesn't predict my test data. The error
>> for
>> GLM, as an example, is:
>>
>> Error in table(predict(ass4q1.dGLM, ass4q1.testSetDF[, 2:3]),
>> ass4q1.testSetDF[,  :
>>    all arguments must have the same length
>> In addition: Warning message:
>> 'newdata' had 30 rows but variable(s) found have 70 rows
>  >
>>
>> What am I missing?
> 
> A correct formula describing the model with separate variables with the 
> data.frame passed to the data argument of the lda() function.
> A reproducible example is missing, hence this is just a guess.
> 
> Uwe Ligges





