AW: [R] read.table problems

Jens Oehlschlägel-Akiyoshi jens.oehlschlaegel-akiyoshi@mdfactory.de
Mon, 15 Nov 1999 16:43:04 +0100


Two short remarks

1) I still do changes on my version of read.table() as I might need a proper
csv.reader during the next days,
   so perhaps you might want to wait

2) Of course an alternative to patching read.table() is renaming my version
to e.g. read.table.csv()
   (but it is not a full csv.reader yet)

Here is an update, which by default strips quotation marks enclosing
colnames, character data or numeric data
With this version you can read in csv data such that

 "white.begin"  ,  "predy"   , "x"     , "y"
 "       asdf"  ,  0.367985  , "25.34" , 0

gives as resulting col.names instead of

> names(temp)
[1] "\"highq\"" "\"lowq\""  "\"predy\"" "\"x\""     "\"y\""

> names(temp)
[1] "highq" "lowq"  "predy" "x"     "y"

and as resulting data

 "       asdf", 0.367985, 25.34, 0

however it will fail on character data containing the separator or quotation
marks

Best regards


Jens Oehlschlägel-Akiyoshi



read.table <-
function (file
, header = FALSE
, sep = ""
, row.names
, col.names
, as.is = FALSE
, na.strings = "NA"
, skip = 0
, dec.sep = '.'                ## added by JOA
, strip.white = TRUE           ## added by JOA, set to FALSE for backward
compatibility
, quotation.mark = '"'         ## added by JOA
, strip.quotation.marks = TRUE ## added by JOA, set to FALSE for backward
compatibility
)
{
    ##### remark: this version of read.table seems to works fine for
numerical csv data
    ##### but will fail on character data containing the separator or
quotation marks
    ### Start JOA changes
    ## could have a warning
    #if (strip.white && sep[1] %in% c("", " ", "\t"))warning("read.table:
strip.white AND with white space seperator !?")
    ## .Internal(type.convert) calls do_typecvt() which calls C-library
function strtod() which uses '.' as dec.sep
    ## thus let's replace other dec.sep here
    if (dec.sep[1]==sep[1])stop("parsing rule violation: sep must not equal
dec.sep")
    type.convert <- function(x, na.strings = "NA", as.is = FALSE,
dec=dec.sep){
    	if (dec[1]!="."){
    	  # R-developers may know a more efficient internal function to replace
characters
    	  # I also don't know whether it is efficient to change parameters
before calling .Internal
    	  # however it seems to work
    	  # In case a global option$dec.sep is introduced, this fix needs to be
adapted
    	  x <- gsub(dec[1], '.', x, ignore.case=FALSE, extended=FALSE)
    	}
      .Internal(type.convert(x, na.strings, as.is))
    }
    quoted.string.matcher <- paste('\(^', quotation.mark, '\)\(.*\)\(',
quotation.mark, '$\)', sep='')
    unquote <- function(text, quoted.string)sub(quoted.string.matcher,
'\\2', text, extended=T)
    ### Stop JOA changes

    row.lens <- count.fields(file, sep, skip)
    nlines <- length(row.lens)
    rlabp <- nlines > 1 && (row.lens[2] - row.lens[1]) == 1
    if (rlabp && missing(header))
        header <- TRUE
    if (header) {
        col.names <- scan(file, what = "", sep = sep, nlines = 1,
            strip.white = strip.white, # added by JOA
            quiet = TRUE, skip = skip)
        skip <- skip + 1
        row.lens <- row.lens[-1]
        nlines <- nlines - 1
    }
    if (strip.quotation.marks) col.names <- unquote(col.names) ## added by
JOA
    else if (missing(col.names))
        col.names <- paste("V", 1:row.lens[1], sep = "")
    cols <- unique(row.lens)
    if (length(cols) != 1) {
        cat("\nrow.lens=\n")
        print(row.lens)
        stop("all rows must have the same length.")
    }
    what <- rep(list(""), cols)
    if (rlabp)
        col.names <- c("row.names", col.names)
    names(what) <- col.names
    data <- scan(file = file, what = what, sep = sep, skip = skip,
        na.strings = na.strings,
        strip.white = strip.white, ## added by JOA
        quiet = TRUE)

    if (cols != length(data)) {
        warning(paste("cols =", cols, " != length(data) =", length(data)))
        cols <- length(data)
    }
    if (is.logical(as.is)) {
        as.is <- rep(as.is, length = cols)
    }
    else if (is.numeric(as.is)) {
        if (any(as.is < 1 | as.is > cols))
            stop("invalid numeric as.is expression")
        i <- rep(FALSE, cols)
        i[as.is] <- TRUE
        as.is <- i
    }
    else if (length(as.is) != cols)
        stop(paste("as.is has the wrong length", length(as.is),
            "!= cols =", cols))

   ## Start changed by JOA
   if (strip.quotation.marks) {
         for (i in 1:cols)
             if (as.is[i])
                 data[[i]] <- unquote(data[[i]])
             else
                 data[[i]] <- type.convert( unquote(data[[i]]) )
   } else {
       for (i in 1:cols) if (!as.is[i])
           data[[i]] <- type.convert(data[[i]])
   }
   ## Stop changed by JOA

    if (missing(row.names)) {
        if (rlabp) {
            row.names <- data[[1]]
            data <- data[-1]
        }
        else row.names <- as.character(1:nlines)
    }
    else if (is.null(row.names)) {
        row.names <- as.character(1:nlines)
    }
    else if (is.character(row.names)) {
        if (length(row.names) == 1) {
            rowvar <- (1:cols)[match(col.names, row.names, 0) ==
                1]
            row.names <- data[[rowvar]]
            data <- data[-rowvar]
        }
    }
    else if (is.numeric(row.names) && length(row.names) == 1) {
        rlabp <- row.names
        row.names <- data[[rlabp]]
        data <- data[-rlabp]
    }
    else stop("invalid row.names specification")
    class(data) <- "data.frame"
    row.names(data) <- row.names
    data
}

-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-devel mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !)  To: r-devel-request@stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._