[BioC] How to do k-fold validation using SVM

Ramon Diaz rdiaz@cnio.es
Mon, 27 Jan 2003 14:37:39 +0100


--------------Boundary-00=_RIKDOLK72K5YKH5275KI
Content-Type: text/plain;
  charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable

Sorry for jumping into the thread so late: I just read the posting today.
Anyway, I have used the following code a couple of times, and maybe it is of
some help to you. In each "round" (for each set of training data) I select
the best K genes (where best means with largest abs. value of t statistic)
and then fit the svm using those K genes.


For a couple of reasons, I use mt.maxT (from the multtest library) to get the
t statistic, but you can modify the function at your convenience (like an
ANOVA for > 3 or whatever you want). Note also that I use a linear kernel.


Hope it helps,

Ramón

gene.select <- function(data, class, size = NULL, threshold = NULL) {
    ## Return the identifiers (first column of the mt.maxT table,
    ## presumably the original gene indices -- confirm against the
    ## multtest docs) of the 'size' genes ranked best by mt.maxT.
    ## data:      subjects in rows, genes in columns.
    ## class:     class labels for the subjects.
    ## size:      how many top-ranked genes to keep.
    ## threshold: accepted but currently unused.
    ##
    ## An apply() over t.test() would give the same statistic but is
    ## slower than mt.maxT:
    ##   t.stat <- apply(data, 2, function(x) abs(t.test(x ~ class)$statistic))
    ##
    ## B = 1: no permutations are needed here, only the observed
    ## statistics -- TODO confirm this matches the author's intent.
    tmp <- mt.maxT(t(data), class, B = 1)[, c(1, 2)]
    ## seq_len(size) instead of seq(1:size): the latter is merely
    ## 1:size in disguise and misbehaves for size = 0.
    selected <- tmp[seq_len(size), 1]
    return(selected)
}


cross.valid.svm <- function(data, y, knumber = 10, size = 200) {
    ## K-fold cross-validation of a linear-kernel SVM with per-fold
    ## gene selection: genes are re-selected inside every fold, so the
    ## error estimate is not contaminated by the selection step.
    ##
    ## data:    subjects in rows, genes in columns (transposed inside
    ##          gene.select to be fed to mt.maxT).
    ## y:       class labels; the 0/1 comparisons below assume the
    ##          classes are coded 0 and 1.
    ## knumber: number of folds; set knumber = NULL for leave-one-out.
    ## size:    number of "best" genes (largest abs. t statistic) used
    ##          when building the classifier.
    ##
    ## Returns a list with a per-fold error data frame and the overall
    ## average error rate.

    ## First, selecting the data subsets for cross-validation.
    if (is.null(knumber)) knumber <- length(y)  ## leave-one-out
    N <- length(y)
    if (knumber > N) stop("knumber has to be <= number of subjects")
    reps <- floor(N / knumber)
    reps.last <- N - (knumber - 1) * reps
    ## Fold labels: folds 1..(knumber - 1) get 'reps' subjects each and
    ## the last fold absorbs the remainder; then shuffle the labels.
    ## seq_len(knumber - 1) (not seq(1, knumber - 1)) so the degenerate
    ## knumber = 1 case yields an empty vector instead of c(1, 0).
    index.select <- c(rep(seq_len(knumber - 1), reps),
                      rep(knumber, reps.last))
    index.select <- sample(index.select)

    cv.errors <- matrix(-99, nrow = knumber, ncol = 4)

    ## Fit a model on each training set and score its held-out fold.
    for (sample.number in seq_len(knumber)) {
        ## Gene selection on the training folds only.
        gene.subset <- gene.select(data[index.select != sample.number, ],
                                   y[index.select != sample.number],
                                   size = size)

        ## Predict from an svm fitted on that gene subset.
        y.f <- factor(y)
        test.set <- data[index.select == sample.number, gene.subset]
        ## A single held-out subject collapses to a vector; restore the
        ## matrix shape predict() expects (leave-one-out case).
        if (is.null(dim(test.set))) test.set <- matrix(test.set, nrow = 1)
        predicted <- predict(svm(data[index.select != sample.number, gene.subset],
                                 y.f[index.select != sample.number],
                                 kernel = "linear"),
                             test.set)

        truth <- y.f[index.select == sample.number]
        cv.errors[sample.number, 1] <- length(which(truth == 1 & predicted == 0))
        cv.errors[sample.number, 2] <- length(which(truth == 0 & predicted == 1))
        cv.errors[sample.number, 3] <- length(which(truth != predicted))
        cv.errors[sample.number, 4] <- length(predicted)
    }
    cv.errors <- data.frame(cv.errors)
    names(cv.errors) <- c("true.1.pred.0", "true.0.pred.1",
                          "total.error", "number.predicted")
    average.error.rate <- sum(cv.errors[, 3]) / sum(cv.errors[, 4])
    return(list(cv.errors = cv.errors,
                average.error.rate = average.error.rate))
}


## An example code:
cross.valid.svm(matrix.covar, class.vector, k =3D 10, size =3D 10)

-- 
Ramón Díaz-Uriarte
Bioinformatics Unit
Centro Nacional de Investigaciones Oncológicas (CNIO)
(Spanish National Cancer Center)
Melchor Fernández Almagro, 3
28029 Madrid (Spain)
http://bioinfo.cnio.es/~rdiaz


--------------Boundary-00=_RIKDOLK72K5YKH5275KI
Content-Type: text/plain;
  charset="iso-8859-1";
  name="cv-svm.R"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="cv-svm.R"

cross.valid.optim.svm <- function(data, y, knumber = 10, size = 200) {
    ## K-fold cross-validation of a linear-kernel SVM where gene
    ## selection is done ONCE on the complete data set (the "optim"
    ## variant).  NOTE(review): because the held-out subjects also take
    ## part in the selection step, the resulting error estimate is
    ## optimistically biased relative to cross.valid.svm, which
    ## re-selects genes inside each fold.
    ##
    ## data:    subjects in rows, genes in columns (transposed inside
    ##          gene.select to be fed to mt.maxT).
    ## y:       class labels; the 0/1 comparisons below assume the
    ##          classes are coded 0 and 1.
    ## knumber: number of folds; set knumber = NULL for leave-one-out.
    ## size:    number of "best" genes (largest abs. t statistic) used
    ##          when building the classifier.
    ##
    ## Returns a list with a per-fold error data frame and the overall
    ## average error rate.

    ## Gene selection on ALL subjects -- deliberate here, and the whole
    ## point of this variant.
    gene.subset <- gene.select(data, y, size = size)

    ## Selecting the data subsets for cross-validation.
    if (is.null(knumber)) knumber <- length(y)  ## leave-one-out
    N <- length(y)
    if (knumber > N) stop("knumber has to be <= number of subjects")
    reps <- floor(N / knumber)
    reps.last <- N - (knumber - 1) * reps
    ## Fold labels: folds 1..(knumber - 1) get 'reps' subjects each and
    ## the last fold absorbs the remainder; then shuffle the labels.
    ## seq_len(knumber - 1) (not seq(1, knumber - 1)) so the degenerate
    ## knumber = 1 case yields an empty vector instead of c(1, 0).
    index.select <- c(rep(seq_len(knumber - 1), reps),
                      rep(knumber, reps.last))
    index.select <- sample(index.select)

    cv.errors <- matrix(-99, nrow = knumber, ncol = 4)

    ## Fit a model on each training set and score its held-out fold.
    for (sample.number in seq_len(knumber)) {
        ## Predict from an svm fitted on the pre-selected gene subset.
        y.f <- factor(y)
        test.set <- data[index.select == sample.number, gene.subset]
        ## A single held-out subject collapses to a vector; restore the
        ## matrix shape predict() expects (leave-one-out case).
        if (is.null(dim(test.set))) test.set <- matrix(test.set, nrow = 1)
        predicted <- predict(svm(data[index.select != sample.number, gene.subset],
                                 y.f[index.select != sample.number],
                                 kernel = "linear"),
                             test.set)

        truth <- y.f[index.select == sample.number]
        cv.errors[sample.number, 1] <- length(which(truth == 1 & predicted == 0))
        cv.errors[sample.number, 2] <- length(which(truth == 0 & predicted == 1))
        cv.errors[sample.number, 3] <- length(which(truth != predicted))
        cv.errors[sample.number, 4] <- length(predicted)
    }
    cv.errors <- data.frame(cv.errors)
    names(cv.errors) <- c("true.1.pred.0", "true.0.pred.1",
                          "total.error", "number.predicted")
    average.error.rate <- sum(cv.errors[, 3]) / sum(cv.errors[, 4])
    return(list(cv.errors = cv.errors,
                average.error.rate = average.error.rate))
}

gene.select <- function(data, class, size = NULL, threshold = NULL) {
    ## Return the identifiers (first column of the mt.maxT table,
    ## presumably the original gene indices -- confirm against the
    ## multtest docs) of the 'size' genes ranked best by mt.maxT.
    ## data:      subjects in rows, genes in columns.
    ## class:     class labels for the subjects.
    ## size:      how many top-ranked genes to keep.
    ## threshold: accepted but currently unused.
    ##
    ## An apply() over t.test() would give the same statistic but is
    ## slower than mt.maxT:
    ##   t.stat <- apply(data, 2, function(x) abs(t.test(x ~ class)$statistic))
    ##
    ## B = 1: no permutations are needed here, only the observed
    ## statistics -- TODO confirm this matches the author's intent.
    tmp <- mt.maxT(t(data), class, B = 1)[, c(1, 2)]
    ## seq_len(size) instead of seq(1:size): the latter is merely
    ## 1:size in disguise and misbehaves for size = 0.
    selected <- tmp[seq_len(size), 1]
    return(selected)
}



cross.valid.optim.svm(matrix.data, censored, k = 10, size = 10)

--------------Boundary-00=_RIKDOLK72K5YKH5275KI--