[Rd] writeForeignSAS and potential extensions

Stephen Weigand Weigand.Stephen at mayo.edu
Thu Jul 13 20:48:19 CEST 2006


Dear R-devel,

I've made some potential extensions to writeForeignSAS 
in 'foreign' that I wanted to pass along if anyone is 
interested.  I've attached the diff -u output against 
the version found in foreign_0.8-15 and an .R file 
with my changes.  (In this .R file, the function is named 
writeForeignSAS7 to simplify testing/comparisons.)

I've tried to alter the current version as little as
possible while making the following changes:

* Try to convert data.frame names to SAS-legal names and
allow the user to specify an 8- or 32-character limit.

* For factors, try to convert the variable name to a
SAS-legal 8-character name not ending in a digit 

* Read in 'datafile' with DSD specified in the INFILE
statement.  SAS says this "changes how SAS treats
delimiters when list input is used and sets the default
delimiter to a comma.  When you specify DSD, SAS treats
two consecutive delimiters as a missing value and
removes quotation marks from character values."  The
point of this is the added safety of using 'quote=TRUE' 
when writing 'datafile' via write.table

* Functionality to write out Dates and read them in with
an INFORMAT statement

* Functionality to write out datetime variables
(assuming a class of POSIXct) and read them in with an
INFORMAT statement

* In order to handle character variables a bit better,
use a LENGTH statement to tell SAS the maximum character
width of values in the variable. Without this, some
character values can be truncated.

If it'd be helpful to make any changes or add anything, 
I'd be happy try to do so.

Finally, some testing code that works in SAS 6.12, 8.2,
and 9.

d <-
  structure(list(a.b = as.integer(c(1, 2)),
                 alphabetsoup = 
                 structure(as.integer(c(1, 2)),
                   .Label = c("A", "B"), 
                   class = "factor"),
                 datevar1 = structure(c(13342, 12977), 
                                      class = "Date"),
                 datetimevar1 = structure(c(1152802685, 
                                            1152716285),
                   class = c("POSIXt", "POSIXct")),
                 charactervariable = c("L", 
                    "Last, First")),
            .Names = c("a.b", "alphabetsoup",
                "datevar1", "datetimevar1",
                "charactervariable"),
            row.names = c("1", "2"),
            class = "data.frame")

require(foreign)

### adQuote here to (temporarily) avoid ':::'
adQuote <- function (x) paste("\"", x, "\"", sep = "")

dfile <- file.path(tempdir(), "test.dat")
cfile <- file.path(tempdir(), "test.sas")
write.foreign(d, datafile = dfile, codefile = cfile,
              package = "SAS7", validvarname = "V6")
file.show(dfile)
file.show(cfile)

Sincerely,

Stephen

::::::::::::::::::::::::::::::::::  
Stephen Weigand 
Division of Biostatistics 
Mayo Clinic Rochester, Minn., USA 
Phone (507) 266-1650, fax 284-9542
-------------- next part --------------
--- writeForeignSAS.R	Fri Feb 17 03:30:53 2006
+++ /tmp/writeForeignSAS.R	Thu Jul 13 12:24:24 2006
@@ -1,21 +1,52 @@
-writeForeignSAS<-function(df,datafile,codefile,dataname="rdata"){
+make.SAS.names <- function(varnames, validvarname = c("V7", "V6")){
+  validvarname <- match.arg(validvarname)
+  nmax <- if(validvarname == "V7") 32 else 8
   
+  x <- sub("^([0-9])", "_\\1", varnames)
+  x <- gsub("[^a-zA-Z0-9_]", "_", x)
+  x <- abbreviate(x, minlength = nmax)
+  
+  if (any(nchar(x) > nmax) || any(duplicated(x)))
+    stop("Cannot uniquely abbreviate the variable names to ",
+         nmax, " or fewer characters")
+  names(x) <- varnames
+  x
+}
+
+make.SAS.formats <- function(varnames){  
+  x <- sub("^([0-9])", "_\\1", varnames)
+  x <- gsub("[^a-zA-Z0-9_]", "_", x)
+  x <- sub("([0-9])$", "\\1f", x) # can't end in digit so append 'f'
+  x <- abbreviate(x, minlength = 8)
+  
+  if(any(nchar(x) > 8) || any(duplicated(x)))
+    stop("Cannot uniquely abbreviate format names to conform to ",
+         " eight-character limit and not ending in a digit")
+  names(x) <- varnames
+  x  
+}
+
+writeForeignSAS7<-function(df,datafile,codefile,dataname="rdata",
+                          validvarname = c("V7", "V6")){
   factors <- sapply(df, is.factor)
   strings <- sapply(df, is.character)
-
+  dates <- sapply(df, FUN = function(x) inherits(x, "Date"))
+  datetimes <- sapply(df, FUN = function(x) inherits(x, "POSIXct"))
+  
   varlabels <- names(df)
-  varnames <- abbreviate(names(df), 8)
-  if (any(sapply(varnames, nchar) > 8)) 
-    stop("Cannot abbreviate the variable names to eight or fewer letters")
-  if (any(abbreviated <- (varnames != varlabels))) 
-    message("Some variable names were abbreviated.")
+  varnames <- make.SAS.names(names(df), validvarname = validvarname)
+  if (any(varnames != varlabels))
+    message("Some variable names were abbreviated or otherwise altered.")
   
   
   dfn<-df
   if (any(factors))
     dfn[factors]<-lapply(dfn[factors], as.numeric)     
+  if (any(datetimes))
+    dfn[datetimes] <- lapply(dfn[datetimes],
+                             FUN = function(x) format(x, "%d%b%Y %H:%M:%S"))
   write.table(dfn, file = datafile, row = FALSE, col = FALSE, 
-              sep = ",", quote = FALSE, na = ".")
+              sep = ",", quote = TRUE, na = "")
   lrecl<-max(sapply(readLines(datafile),nchar))+4
 
   cat("* Written by R;\n", file=codefile)
@@ -22,24 +53,50 @@
   cat("* ",deparse(sys.call(-2))[1],";\n\n",file=codefile,append=TRUE)
   if (any(factors)){
     cat("PROC FORMAT;\n",file=codefile,append=TRUE)
-    for(v in 1:ncol(df)){
-      if (factors[v]){
-        cat("value ",varnames[v],"\n",file=codefile,append=TRUE)
-        values<-levels(df[[v]])
+    fmtnames <- make.SAS.formats(varnames[factors])
+    fmt.values <- lapply(df[, factors, drop = FALSE], levels)
+    names(fmt.values) <- fmtnames
+    for (f in fmtnames){
+      cat("value",f,"\n",file=codefile,append = TRUE)
+      values<-fmt.values[[f]]
         for(i in 1:length(values)){
           cat("    ",i,"=",adQuote(values[i]),"\n",file=codefile,append=TRUE)
         }
         cat(";\n\n",file=codefile,append=TRUE)
-      }
-      }
+     }
   }
 
   cat("DATA ",dataname,";\n",file=codefile,append=TRUE)
+
+  if (any(strings)){    
+    cat("LENGTH", file = codefile, append = TRUE)
+    lengths <- sapply(df[,strings, drop = FALSE],
+                      FUN = function(x) max(nchar(x)))
+    names(lengths) <- varnames[strings]
+    for(v in varnames[strings])
+      cat("\n", v, "$", lengths[v],file=codefile,append=TRUE)
+    cat("\n;\n\n", file = codefile, append = TRUE)
+  }
+
+  if (any(dates)){    
+    cat("INFORMAT", file = codefile, append = TRUE)
+    for(v in varnames[dates])
+      cat("\n", v, file = codefile, append = TRUE)
+    cat("\n YYMMDD10.\n;\n\n", file = codefile, append = TRUE)
+  }                  
+
+  if (any(datetimes)){    
+    cat("INFORMAT", file = codefile, append = TRUE)
+    for(v in varnames[datetimes])
+      cat("\n", v, file = codefile, append = TRUE)
+    cat("\n DATETIME18.\n;\n\n", file = codefile, append = TRUE)
+  }
+  
   cat("INFILE ",adQuote(datafile),
-      "\n     DELIMITER=','",
+      "\n     DSD", 
       "\n     LRECL=",lrecl,";\n",
       file=codefile,append=TRUE)
-
+  
   cat("INPUT",file=codefile,append=TRUE)
   for(v in 1:ncol(df)){
     cat("\n",varnames[v],file=codefile,append=TRUE)
@@ -49,16 +106,26 @@
   cat("\n;\n",file=codefile,append=TRUE)
 
   for(v in 1:ncol(df)){
-    if (abbreviated[v])
+    if (varnames[v] != names(varnames)[v]) 
       cat("LABEL ",varnames[v],"=",adQuote(varlabels[v]),";\n",
           file=codefile,append=TRUE)
-  } 
-
-  for(v in 1:ncol(df)){
-    if(factors[v])
-      cat("FORMAT ",varnames[v],paste(varnames[v],".",sep=""),";\n",
+  }
+  
+  if (any(factors)){
+    for (f in 1:length(fmtnames)) 
+      cat("FORMAT", names(fmtnames)[f],paste(fmtnames[f],".",sep = ""),";\n",
           file=codefile,append=TRUE)
   }
-
+  
+  if (any(dates)){    
+    for(v in varnames[dates])
+      cat("FORMAT", v, "yymmdd10.;\n", file = codefile, append = TRUE)
+  }        
+  
+  if (any(datetimes)){    
+    for(v in varnames[datetimes])
+      cat("FORMAT", v, "datetime18.;\n", file = codefile, append = TRUE)
+  }
+  
   cat("RUN;\n",file=codefile,append=TRUE)
 } 
-------------- next part --------------
make.SAS.names <- function(varnames, validvarname = c("V7", "V6")){
  validvarname <- match.arg(validvarname)
  nmax <- if(validvarname == "V7") 32 else 8
  
  x <- sub("^([0-9])", "_\\1", varnames)
  x <- gsub("[^a-zA-Z0-9_]", "_", x)
  x <- abbreviate(x, minlength = nmax)
  
  if (any(nchar(x) > nmax) || any(duplicated(x)))
    stop("Cannot uniquely abbreviate the variable names to ",
         nmax, " or fewer characters")
  names(x) <- varnames
  x
}

make.SAS.formats <- function(varnames){  
  x <- sub("^([0-9])", "_\\1", varnames)
  x <- gsub("[^a-zA-Z0-9_]", "_", x)
  x <- sub("([0-9])$", "\\1f", x) # can't end in digit so append 'f'
  x <- abbreviate(x, minlength = 8)
  
  if(any(nchar(x) > 8) || any(duplicated(x)))
    stop("Cannot uniquely abbreviate format names to conform to ",
         " eight-character limit and not ending in a digit")
  names(x) <- varnames
  x  
}

writeForeignSAS7<-function(df,datafile,codefile,dataname="rdata",
                          validvarname = c("V7", "V6")){
  factors <- sapply(df, is.factor)
  strings <- sapply(df, is.character)
  dates <- sapply(df, FUN = function(x) inherits(x, "Date"))
  datetimes <- sapply(df, FUN = function(x) inherits(x, "POSIXct"))
  
  varlabels <- names(df)
  varnames <- make.SAS.names(names(df), validvarname = validvarname)
  if (any(varnames != varlabels))
    message("Some variable names were abbreviated or otherwise altered.")
  
  
  dfn<-df
  if (any(factors))
    dfn[factors]<-lapply(dfn[factors], as.numeric)     
  if (any(datetimes))
    dfn[datetimes] <- lapply(dfn[datetimes],
                             FUN = function(x) format(x, "%d%b%Y %H:%M:%S"))
  write.table(dfn, file = datafile, row = FALSE, col = FALSE, 
              sep = ",", quote = TRUE, na = "")
  lrecl<-max(sapply(readLines(datafile),nchar))+4

  cat("* Written by R;\n", file=codefile)
  cat("* ",deparse(sys.call(-2))[1],";\n\n",file=codefile,append=TRUE)
  if (any(factors)){
    cat("PROC FORMAT;\n",file=codefile,append=TRUE)
    fmtnames <- make.SAS.formats(varnames[factors])
    fmt.values <- lapply(df[, factors, drop = FALSE], levels)
    names(fmt.values) <- fmtnames
    for (f in fmtnames){
      cat("value",f,"\n",file=codefile,append = TRUE)
      values<-fmt.values[[f]]
        for(i in 1:length(values)){
          cat("    ",i,"=",adQuote(values[i]),"\n",file=codefile,append=TRUE)
        }
        cat(";\n\n",file=codefile,append=TRUE)
     }
  }

  cat("DATA ",dataname,";\n",file=codefile,append=TRUE)

  if (any(strings)){    
    cat("LENGTH", file = codefile, append = TRUE)
    lengths <- sapply(df[,strings, drop = FALSE],
                      FUN = function(x) max(nchar(x)))
    names(lengths) <- varnames[strings]
    for(v in varnames[strings])
      cat("\n", v, "$", lengths[v],file=codefile,append=TRUE)
    cat("\n;\n\n", file = codefile, append = TRUE)
  }

  if (any(dates)){    
    cat("INFORMAT", file = codefile, append = TRUE)
    for(v in varnames[dates])
      cat("\n", v, file = codefile, append = TRUE)
    cat("\n YYMMDD10.\n;\n\n", file = codefile, append = TRUE)
  }                  

  if (any(datetimes)){    
    cat("INFORMAT", file = codefile, append = TRUE)
    for(v in varnames[datetimes])
      cat("\n", v, file = codefile, append = TRUE)
    cat("\n DATETIME18.\n;\n\n", file = codefile, append = TRUE)
  }
  
  cat("INFILE ",adQuote(datafile),
      "\n     DSD", 
      "\n     LRECL=",lrecl,";\n",
      file=codefile,append=TRUE)
  
  cat("INPUT",file=codefile,append=TRUE)
  for(v in 1:ncol(df)){
    cat("\n",varnames[v],file=codefile,append=TRUE)
    if(strings[v])
      cat(" $ ",file=codefile,append=TRUE)
  }
  cat("\n;\n",file=codefile,append=TRUE)

  for(v in 1:ncol(df)){
    if (varnames[v] != names(varnames)[v]) 
      cat("LABEL ",varnames[v],"=",adQuote(varlabels[v]),";\n",
          file=codefile,append=TRUE)
  }
  
  if (any(factors)){
    for (f in 1:length(fmtnames)) 
      cat("FORMAT", names(fmtnames)[f],paste(fmtnames[f],".",sep = ""),";\n",
          file=codefile,append=TRUE)
  }
  
  if (any(dates)){    
    for(v in varnames[dates])
      cat("FORMAT", v, "yymmdd10.;\n", file = codefile, append = TRUE)
  }        
  
  if (any(datetimes)){    
    for(v in varnames[datetimes])
      cat("FORMAT", v, "datetime18.;\n", file = codefile, append = TRUE)
  }
  
  cat("RUN;\n",file=codefile,append=TRUE)
} 


More information about the R-devel mailing list