[Rd] [R] HTTP User-Agent header

Seth Falcon sfalcon at fhcrc.org
Fri Jul 28 18:44:08 CEST 2006


[moved from R-help to R-devel]

Prof Brian Ripley <ripley at stats.ox.ac.uk> writes:
> Otherwise, see ?download.file and choose a different download method,
> or look at the source code (src/modules/internet/nanohttp.c) and submit a 
> patch.

I have a rough draft patch, see below, that adds a User-Agent header
to HTTP requests made in R via download.file.  If there is interest, I
will polish it.

Why have R identify itself?  Well, I think it is reasonable behavior
for legitimate "browsers" to identify themselves.  It will help a user
whose institution has a rather harsh web proxy policy (however, silly
it may be).  It will also be of use in tracking use of R, versions,
and OSes on CRAN mirrors.

Here is an example of what the user-agent string will be for an R
running on OSX:

    R (2.4.0 powerpc-apple-darwin8.7.0 powerpc darwin8.7.0)

And here is the patch...

+ seth

Index: src/include/R_ext/R-ftp-http.h
===================================================================
--- src/include/R_ext/R-ftp-http.h	(revision 38709)
+++ src/include/R_ext/R-ftp-http.h	(working copy)
@@ -36,7 +36,7 @@
 int   R_FTPRead(void *ctx, char *dest, int len);
 void  R_FTPClose(void *ctx);
 
-void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK);
+void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers, int cacheOK);
 int	RxmlNanoHTTPRead(void *ctx, void *dest, int len);
 void	RxmlNanoHTTPClose(void *ctx);
 int 	RxmlNanoHTTPReturnCode(void *ctx);
Index: src/include/Rmodules/Rinternet.h
===================================================================
--- src/include/Rmodules/Rinternet.h	(revision 38709)
+++ src/include/Rmodules/Rinternet.h	(working copy)
@@ -9,7 +9,7 @@
 typedef Rconnection (*R_NewUrlRoutine)(char *description, char *mode);
 typedef Rconnection (*R_NewSockRoutine)(char *host, int port, int server, char *mode); 
 
-typedef void * (*R_HTTPOpenRoutine)(const char *url, const int cacheOK);
+typedef void * (*R_HTTPOpenRoutine)(const char *url, const char *headers, const int cacheOK);
 typedef int    (*R_HTTPReadRoutine)(void *ctx, char *dest, int len);
 typedef void   (*R_HTTPCloseRoutine)(void *ctx);
 	      
Index: src/main/names.c
===================================================================
--- src/main/names.c	(revision 38709)
+++ src/main/names.c	(working copy)
@@ -885,7 +885,7 @@
 {"sockSelect",do_sockselect,0,	11,     3,      {PP_FUNCALL, PREC_FN,	0}},
 {"getAllConnections",do_getallconnections,0,11, 0,      {PP_FUNCALL, PREC_FN,	0}},
 {"summary.connection",do_sumconnection,0,11,    1,      {PP_FUNCALL, PREC_FN,	0}},
-{"download", 	do_download,	0,      11,     5,      {PP_FUNCALL, PREC_FN,	0}},
+{"download", 	do_download,	0,      11,     6,      {PP_FUNCALL, PREC_FN,	0}},
 {"nsl", 	do_nsl,		0,      11,     1,      {PP_FUNCALL, PREC_FN,	0}},
 {"gzcon", 	do_gzcon,	0,      11,     3,      {PP_FUNCALL, PREC_FN,	0}},
 
Index: src/main/memory.c
===================================================================
--- src/main/memory.c	(revision 38709)
+++ src/main/memory.c	(working copy)
@@ -2478,8 +2478,11 @@
 SEXP (STRING_ELT)(SEXP x, int i) {
 #ifdef USE_TYPE_CHECKING
     if(TYPEOF(x) != STRSXP)
+      x = 1/(1-1);
+    /*
 	error("%s() can only be applied to a '%s', not a '%s'", 
 	      "STRING_ELT", "character vector", type2char(TYPEOF(x)));
+    */
 #endif
     return STRING_ELT(x, i);
 }
Index: src/main/internet.c
===================================================================
--- src/main/internet.c	(revision 38709)
+++ src/main/internet.c	(working copy)
@@ -129,7 +129,7 @@
 {
     if(!initialized) internet_Init();
     if(initialized > 0)
-	return (*ptr->HTTPOpen)(url, 0);
+	return (*ptr->HTTPOpen)(url, NULL, 0);
     else {
 	error(_("internet routines cannot be loaded"));
 	return NULL;
Index: src/library/utils/R/unix/download.file.R
===================================================================
--- src/library/utils/R/unix/download.file.R	(revision 38709)
+++ src/library/utils/R/unix/download.file.R	(working copy)
@@ -1,5 +1,5 @@
-download.file <- function(url, destfile, method,
-                          quiet = FALSE, mode = "w", cacheOK = TRUE)
+download.file <- function(url, destfile, method, quiet = FALSE, mode = "w",
+                          cacheOK = TRUE, headers = NULL)
 {
     method <- if (missing(method))
         ifelse(!is.null(getOption("download.file.method")),
@@ -7,6 +7,8 @@
                "auto")
     else
         match.arg(method, c("auto", "internal", "wget", "lynx"))
+    if (is.null(headers))
+      headers <- httpUserAgent()
 
     if(method == "auto") {
         if(capabilities("http/ftp"))
@@ -22,7 +24,8 @@
             stop("no download method found")
     }
     if(method == "internal")
-        status <- .Internal(download(url, destfile, quiet, mode, cacheOK))
+      status <- .Internal(download(url, destfile, quiet, mode, headers,
+                                   cacheOK))
     else if(method == "wget") {
         extra <- if(quiet) " --quiet" else ""
         if(!cacheOK) extra <- paste(extra, "--cache=off")
Index: src/library/utils/R/windows/download.file.R
===================================================================
--- src/library/utils/R/windows/download.file.R	(revision 38709)
+++ src/library/utils/R/windows/download.file.R	(working copy)
@@ -1,5 +1,5 @@
-download.file <- function(url, destfile, method,
-                          quiet = FALSE, mode = "w", cacheOK = TRUE)
+download.file <- function(url, destfile, method, quiet = FALSE, mode = "w",
+                          cacheOK = TRUE, headers = NULL)
 {
     method <- if (missing(method))
         ifelse(!is.null(getOption("download.file.method")),
@@ -7,6 +7,8 @@
                "auto")
     else
         match.arg(method, c("auto", "internal", "wget", "lynx"))
+    if (is.null(headers))
+      headers <- httpUserAgent()
 
     if(method == "auto") {
         if(capabilities("http/ftp"))
@@ -22,7 +24,8 @@
             stop("no download method found")
     }
     if(method == "internal")
-        status <- .Internal(download(url, destfile, quiet, mode, cacheOK))
+        status <- .Internal(download(url, destfile, quiet, mode, headers,
+                                     cacheOK))
     else if(method == "wget") {
         extra <- if(quiet) " --quiet" else ""
         if(!cacheOK) extra <- paste(extra, "--cache=off")
Index: src/library/utils/R/readhttp.R
===================================================================
--- src/library/utils/R/readhttp.R	(revision 38709)
+++ src/library/utils/R/readhttp.R	(working copy)
@@ -6,3 +6,15 @@
         stop("transfer failure")
     file.show(file, delete.file = delete.file, title = title, ...)
 }
+
+
+httpUserAgent <- function(agent)
+{
+    if (missing(agent)) {
+        Rver <- paste(R.version$major, R.version$minor, sep=".")
+        Rdetails <- paste(Rver, R.version$platform, R.version$arch,
+                          R.version$os)
+        agent <- paste("R (", Rdetails, ")", sep="")
+    }
+    paste("User-Agent: ", agent, "\r\n", sep="")
+}
Index: src/library/utils/man/download.file.Rd
===================================================================
--- src/library/utils/man/download.file.Rd	(revision 38709)
+++ src/library/utils/man/download.file.Rd	(working copy)
@@ -29,6 +29,13 @@
 
   \item{cacheOK}{logical.  Is a server-side cached value acceptable?
     Implemented for the \code{"internal"} and \code{"wget"} methods.}
+
+  \item{headers}{character. Headers to be used in the HTTP request.
+  This should be a character vector of length one formatted correctly
+  for use in the HTTP header.  The default value of \code{NULL}
+  results in a standard user agent header to be added to the HTTP
+  request that identified R as \code{User-Agent: R (X.Y.Z platform
+  arch os)}.  }
 }
 \details{
   The function \code{download.file} can be used to download a single
Index: src/modules/internet/internet.c
===================================================================
--- src/modules/internet/internet.c	(revision 38709)
+++ src/modules/internet/internet.c	(working copy)
@@ -28,7 +28,7 @@
 #include <Rconnections.h>
 #include <R_ext/R-ftp-http.h>
 
-static void *in_R_HTTPOpen(const char *url, const int cacheOK);
+static void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK);
 static int   in_R_HTTPRead(void *ctx, char *dest, int len);
 static void  in_R_HTTPClose(void *ctx);
 
@@ -70,7 +70,7 @@
 
     switch(type) {
     case HTTPsh:
-	ctxt = in_R_HTTPOpen(url, 0);
+	ctxt = in_R_HTTPOpen(url, NULL, 0);
 	if(ctxt == NULL) {
 	  /* if we call error() we get a connection leak*/
 	  /* so do_url has to raise the error*/
@@ -238,14 +238,14 @@
 }
 #endif
 
-/* download(url, destfile, quiet, mode, cacheOK) */
+/* download(url, destfile, quiet, mode, headers, cacheOK) */
 
 #define CPBUFSIZE 65536
 #define IBUFSIZE 4096
 static SEXP in_do_download(SEXP call, SEXP op, SEXP args, SEXP env)
 {
-    SEXP ans, scmd, sfile, smode;
-    char *url, *file, *mode;
+    SEXP ans, scmd, sfile, smode, sheaders;
+    char *url, *file, *mode, *headers;
     int quiet, status = 0, cacheOK;
 
     checkArity(op, args);
@@ -268,6 +268,14 @@
     if(!isString(smode) || length(smode) != 1)
 	error(_("invalid '%s' argument"), "mode");
     mode = CHAR(STRING_ELT(smode, 0));
+    sheaders = CAR(args); args = CDR(args);
+    if(TYPEOF(sheaders) == NILSXP)
+        headers = NULL;
+    else {
+        if(!isString(sheaders) || length(sheaders) != 1)
+            error(_("invalid '%s' argument"), "headers");
+        headers = CHAR(STRING_ELT(sheaders, 0));
+    }
     cacheOK = asLogical(CAR(args));
     if(cacheOK == NA_LOGICAL)
 	error(_("invalid '%s' argument"), "cacheOK");
@@ -319,7 +327,7 @@
 #ifdef Win32
 	R_FlushConsole();
 #endif
-	ctxt = in_R_HTTPOpen(url, cacheOK);
+	ctxt = in_R_HTTPOpen(url, headers, cacheOK);
 	if(ctxt == NULL) status = 1;
 	else {
 	    if(!quiet) REprintf(_("opened URL\n"), url);
@@ -473,7 +481,7 @@
 
 #if defined(SUPPORT_LIBXML) && !defined(USE_WININET)
 
-void *in_R_HTTPOpen(const char *url, int cacheOK)
+void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK)
 {
     inetconn *con;
     void *ctxt;
@@ -484,15 +492,15 @@
     if(timeout == NA_INTEGER || timeout <= 0) timeout = 60;
 
     RxmlNanoHTTPTimeout(timeout);
-    ctxt = RxmlNanoHTTPOpen(url, NULL, cacheOK);
+    ctxt = RxmlNanoHTTPOpen(url, NULL, headers, cacheOK);
     if(ctxt != NULL) {
 	int rc = RxmlNanoHTTPReturnCode(ctxt);
 	if(rc != 200) {
 	    char *msg;
-	    RxmlNanoHTTPClose(ctxt);
 	    /* bug work-around: it will crash on OS X if passed directly */
 	    msg = _("cannot open: HTTP status was '%d %s'");
 	    warning(msg, rc, RxmlNanoHTTPStatusMsg(ctxt));
+	    RxmlNanoHTTPClose(ctxt);
 	    return NULL;
 	} else {
 	    type = RxmlNanoHTTPContentType(ctxt);
Index: src/modules/internet/nanohttp.c
===================================================================
--- src/modules/internet/nanohttp.c	(revision 38709)
+++ src/modules/internet/nanohttp.c	(working copy)
@@ -1034,6 +1034,9 @@
  * @contentType:  if available the Content-Type information will be
  *                returned at that location
  *
+ * @headers: headers to be used in the HTTP request.  These must be name/value
+ *           pairs separated by ':', each on their own line.
+ *
  * This function try to open a connection to the indicated resource
  * via HTTP GET.
  *
@@ -1042,10 +1045,11 @@
  */
 
 void*
-RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK)
+RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers,
+                 int cacheOK)
 {
     if (contentType != NULL) *contentType = NULL;
-    return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, NULL, cacheOK);
+    return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, headers, cacheOK);
 }
 
 /**



More information about the R-devel mailing list