[Rd] more powerful iconv

Matt Shotwell shotwelm at musc.edu
Sat Jun 19 22:53:00 CEST 2010


R community,

As you may know, R's iconv doesn't work well converting to and from
encodings that allow embedded nulls. For example

> iconv("foo", to="UTF-16")
Error in iconv("foo", to = "UTF-16") : 
  embedded nul in string: '\xff\xfef\0o\0o\0'

However, I don't believe embedded nulls are at issue here, but rather
that R's iconv doesn't accept objects of type RAWSXP. The iconv
mechanism, after all, operates on encoded binary data, and not
necessarily null terminated C strings. I'd like to submit a very small
patch (12 lines w/o documentation) that allows R's iconv to operate on
raw objects, while not interfering or affecting the behavior of iconv on
character vectors. To keep this message terse, I've put additional
discussion, description of what the patch does, and examples here:
http://biostatmatt.com/archives/456

Also, here is a link to the patch file:
http://biostatmatt.com/R/R-devel-iconv-0.0.patch

If this change is adopted, I'd be happy to submit a documentation patch
also.

-Matt

Index: src/library/base/R/New-Internal.R
===================================================================
--- src/library/base/R/New-Internal.R	(revision 52328)
+++ src/library/base/R/New-Internal.R	(working copy)
@@ -239,7 +239,7 @@
 
 iconv <- function(x, from = "", to = "", sub = NA, mark = TRUE)
 {
-    if(!is.character(x)) x <- as.character(x)
+    if(!is.character(x) && !is.raw(x)) x <- as.character(x)
     .Internal(iconv(x, from, to, as.character(sub), mark))
 }
 
Index: src/main/sysutils.c
===================================================================
--- src/main/sysutils.c	(revision 52328)
+++ src/main/sysutils.c	(working copy)
@@ -548,16 +548,17 @@
 	int mark;
 	const char *from, *to;
 	Rboolean isLatin1 = FALSE, isUTF8 = FALSE;
+	Rboolean isRawx = (TYPEOF(x) == RAWSXP);
 
-	if(TYPEOF(x) != STRSXP)
-	    error(_("'x' must be a character vector"));
+	if(TYPEOF(x) != STRSXP && !isRawx)
+	    error(_("'x' must be a character vector or raw"));
 	if(!isString(CADR(args)) || length(CADR(args)) != 1)
 	    error(_("invalid '%s' argument"), "from");
 	if(!isString(CADDR(args)) || length(CADDR(args)) != 1)
 	    error(_("invalid '%s' argument"), "to");
 	if(!isString(CADDDR(args)) || length(CADDDR(args)) != 1)
 	    error(_("invalid '%s' argument"), "sub");
-	if(STRING_ELT(CADDDR(args), 0) == NA_STRING) sub = NULL;
+	if(STRING_ELT(CADDDR(args), 0) == NA_STRING || isRawx) sub = NULL;
 	else sub = translateChar(STRING_ELT(CADDDR(args), 0));
 	mark = asLogical(CAD4R(args));
 	if(mark == NA_LOGICAL)
@@ -584,7 +585,7 @@
 	PROTECT(ans = duplicate(x));
 	R_AllocStringBuffer(0, &cbuff);  /* 0 -> default */
 	for(i = 0; i < LENGTH(x); i++) {
-	    si = STRING_ELT(x, i);
+	    si = isRawx ? x : STRING_ELT(x, i);
 	top_of_loop:
 	    inbuf = CHAR(si); inb = LENGTH(si);
 	    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
@@ -622,7 +623,7 @@
 		goto next_char;
 	    }
 
-	    if(res != -1 && inb == 0) {
+	    if(res != -1 && inb == 0 && !isRawx) {
 		cetype_t ienc = CE_NATIVE;
 
 		nout = cbuff.bufsize - 1 - outb;
@@ -632,7 +633,12 @@
 		}
 		SET_STRING_ELT(ans, i, mkCharLenCE(cbuff.data, nout, ienc));
 	    }
-	    else SET_STRING_ELT(ans, i, NA_STRING);
+	    else if(!isRawx) SET_STRING_ELT(ans, i, NA_STRING);
+	    else {
+		nout = cbuff.bufsize - 1 - outb;
+		ans = allocVector(RAWSXP, nout);
+		memcpy(RAW(ans), cbuff.data, nout);
+	    }
 	}
 	Riconv_close(obj);
 	R_FreeStringBuffer(&cbuff);

-- 
Matthew S. Shotwell
Graduate Student
Division of Biostatistics and Epidemiology
Medical University of South Carolina
http://biostatmatt.com



More information about the R-devel mailing list