[Rd] [R] HTTP User-Agent header

Seth Falcon sfalcon at fhcrc.org
Sun Jul 30 16:39:04 CEST 2006


Robert Gentleman <rgentlem at fhcrc.org> writes:
> OK, that suggests setting at the options level would solve both of your 
> problems and that seems like the best approach. I don't really want to 
> pass this around as a parameter through the maze of functions that might 
> actually download something if we don't have to.

I have an updated patch that adds an HTTPUserAgent option.  The
default is a string like:

    R (2.4.0 x86_64-unknown-linux-gnu x86_64 linux-gnu)

If the HTTPUserAgent option is NULL, no user agent header is added to
HTTP requests (this is the current behavior).  This option allows R to
use an arbitrary user agent header.

The patch adds two non-exported functions to utils: 
   1) defaultUserAgent - returns a string like above
   2) makeUserAgent - formats content of HTTPUserAgent option for use
      as part of an HTTP request header.

I've tested on OSX and Linux, but not on Windows.  When USE_WININET is
defined, a user agent string of "R" was already being used.  With this
patch, the HTTPUserAgent options is used.  I'm unsure if NULL is
allowed.

Also, in src/main/internet.c there is a comment:
  "Next 6 are for use by libxml, only"
and then a definition for R_HTTPOpen.  Not sure how/when these get
used.  The user agent for these calls remains unspecified with this
patch.

+ seth


Patch summary:
 src/include/R_ext/R-ftp-http.h   |    2 +-
 src/include/Rmodules/Rinternet.h |    2 +-
 src/library/base/man/options.Rd  |    5 +++++
 src/library/utils/R/readhttp.R   |   25 +++++++++++++++++++++++++
 src/library/utils/R/zzz.R        |    3 ++-
 src/main/internet.c              |    2 +-
 src/modules/internet/internet.c  |   37 +++++++++++++++++++++++++------------
 src/modules/internet/nanohttp.c  |    8 ++++++--
 8 files changed, 66 insertions(+), 18 deletions(-)



Index: src/include/R_ext/R-ftp-http.h
===================================================================
--- src/include/R_ext/R-ftp-http.h	(revision 38715)
+++ src/include/R_ext/R-ftp-http.h	(working copy)
@@ -36,7 +36,7 @@
 int   R_FTPRead(void *ctx, char *dest, int len);
 void  R_FTPClose(void *ctx);
 
-void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK);
+void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers, int cacheOK);
 int	RxmlNanoHTTPRead(void *ctx, void *dest, int len);
 void	RxmlNanoHTTPClose(void *ctx);
 int 	RxmlNanoHTTPReturnCode(void *ctx);
Index: src/include/Rmodules/Rinternet.h
===================================================================
--- src/include/Rmodules/Rinternet.h	(revision 38715)
+++ src/include/Rmodules/Rinternet.h	(working copy)
@@ -9,7 +9,7 @@
 typedef Rconnection (*R_NewUrlRoutine)(char *description, char *mode);
 typedef Rconnection (*R_NewSockRoutine)(char *host, int port, int server, char *mode); 
 
-typedef void * (*R_HTTPOpenRoutine)(const char *url, const int cacheOK);
+typedef void * (*R_HTTPOpenRoutine)(const char *url, const char *headers, const int cacheOK);
 typedef int    (*R_HTTPReadRoutine)(void *ctx, char *dest, int len);
 typedef void   (*R_HTTPCloseRoutine)(void *ctx);
 	      
Index: src/main/internet.c
===================================================================
--- src/main/internet.c	(revision 38715)
+++ src/main/internet.c	(working copy)
@@ -129,7 +129,7 @@
 {
     if(!initialized) internet_Init();
     if(initialized > 0)
-	return (*ptr->HTTPOpen)(url, 0);
+	return (*ptr->HTTPOpen)(url, NULL, 0);
     else {
 	error(_("internet routines cannot be loaded"));
 	return NULL;
Index: src/library/utils/R/zzz.R
===================================================================
--- src/library/utils/R/zzz.R	(revision 38715)
+++ src/library/utils/R/zzz.R	(working copy)
@@ -9,7 +9,8 @@
              internet.info = 2,
              pkgType = .Platform$pkgType,
              str = list(strict.width = "no"),
-             example.ask = "default")
+             example.ask = "default",
+             HTTPUserAgent = defaultUserAgent())
     extra <-
         if(.Platform$OS.type == "windows") {
             list(mailer = "none",
Index: src/library/utils/R/readhttp.R
===================================================================
--- src/library/utils/R/readhttp.R	(revision 38715)
+++ src/library/utils/R/readhttp.R	(working copy)
@@ -6,3 +6,28 @@
         stop("transfer failure")
     file.show(file, delete.file = delete.file, title = title, ...)
 }
+
+
+
+defaultUserAgent <- function()
+{
+    Rver <- paste(R.version$major, R.version$minor, sep=".")
+    Rdetails <- paste(Rver, R.version$platform, R.version$arch,
+                      R.version$os)
+    paste("R (", Rdetails, ")", sep="")
+}
+
+
+makeUserAgent <- function(format = TRUE) {
+    agent <- getOption("HTTPUserAgent")
+    if (is.null(agent)) {
+        return(NULL)
+    }
+    if (length(agent) != 1)
+      stop(sQuote("HTTPUserAgent"),
+           " option must be a length one character vector or NULL")
+    if (format)
+      paste("User-Agent: ", agent[1], "\r\n", sep = "")
+    else
+      agent[1]
+}
Index: src/library/base/man/options.Rd
===================================================================
--- src/library/base/man/options.Rd	(revision 38715)
+++ src/library/base/man/options.Rd	(working copy)
@@ -368,6 +368,11 @@
     \item{\code{help.try.all.packages}:}{default for an argument of
       \code{\link{help}}.}
 
+    \item{\code{HTTPUserAgent}:}{string used as the user agent in HTTP
+      requests.  If \code{NULL}, HTTP requests will be made without a
+      user agent header.  The default is \code{R (<version> <platform>
+      <arch> <os>)}}
+
     \item{\code{internet.info}:}{The minimum level of information to be
       printed on URL downloads etc.  Default is 2, for failure causes.
       Set to 1 or 0 to get more information.}
Index: src/modules/internet/internet.c
===================================================================
--- src/modules/internet/internet.c	(revision 38715)
+++ src/modules/internet/internet.c	(working copy)
@@ -28,7 +28,7 @@
 #include <Rconnections.h>
 #include <R_ext/R-ftp-http.h>
 
-static void *in_R_HTTPOpen(const char *url, const int cacheOK);
+static void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK);
 static int   in_R_HTTPRead(void *ctx, char *dest, int len);
 static void  in_R_HTTPClose(void *ctx);
 
@@ -70,7 +70,7 @@
 
     switch(type) {
     case HTTPsh:
-	ctxt = in_R_HTTPOpen(url, 0);
+	ctxt = in_R_HTTPOpen(url, NULL, 0);
 	if(ctxt == NULL) {
 	  /* if we call error() we get a connection leak*/
 	  /* so do_url has to raise the error*/
@@ -238,14 +238,14 @@
 }
 #endif
 
-/* download(url, destfile, quiet, mode, cacheOK) */
+/* download(url, destfile, quiet, mode, headers, cacheOK) */
 
 #define CPBUFSIZE 65536
 #define IBUFSIZE 4096
 static SEXP in_do_download(SEXP call, SEXP op, SEXP args, SEXP env)
 {
-    SEXP ans, scmd, sfile, smode;
-    char *url, *file, *mode;
+    SEXP ans, scmd, sfile, smode, sheaders, agentFun;
+    char *url, *file, *mode, *headers;
     int quiet, status = 0, cacheOK;
 
     checkArity(op, args);
@@ -271,6 +271,17 @@
     cacheOK = asLogical(CAR(args));
     if(cacheOK == NA_LOGICAL)
 	error(_("invalid '%s' argument"), "cacheOK");
+#ifdef USE_WININET
+    PROTECT(agentFun = lang2(install("makeUserAgent"), ScalarLogical(0)));
+#else
+    PROTECT(agentFun = lang1(install("makeUserAgent")));
+#endif
+    PROTECT(sheaders = eval(agentFun, R_FindNamespace(mkString("utils"))));
+    UNPROTECT(1);
+    if(TYPEOF(sheaders) == NILSXP)
+        headers = NULL;
+    else 
+        headers = CHAR(STRING_ELT(sheaders, 0));
 #ifdef Win32
     if (!pbar.wprog) {
 	pbar.wprog = newwindow(_("Download progress"), rect(0, 0, 540, 100),
@@ -319,7 +330,7 @@
 #ifdef Win32
 	R_FlushConsole();
 #endif
-	ctxt = in_R_HTTPOpen(url, cacheOK);
+	ctxt = in_R_HTTPOpen(url, headers, cacheOK);
 	if(ctxt == NULL) status = 1;
 	else {
 	    if(!quiet) REprintf(_("opened URL\n"), url);
@@ -466,14 +477,14 @@
 
     PROTECT(ans = allocVector(INTSXP, 1));
     INTEGER(ans)[0] = status;
-    UNPROTECT(1);
+    UNPROTECT(2);
     return ans;
 }
 
 
 #if defined(SUPPORT_LIBXML) && !defined(USE_WININET)
 
-void *in_R_HTTPOpen(const char *url, int cacheOK)
+void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK)
 {
     inetconn *con;
     void *ctxt;
@@ -484,7 +495,7 @@
     if(timeout == NA_INTEGER || timeout <= 0) timeout = 60;
 
     RxmlNanoHTTPTimeout(timeout);
-    ctxt = RxmlNanoHTTPOpen(url, NULL, cacheOK);
+    ctxt = RxmlNanoHTTPOpen(url, NULL, headers, cacheOK);
     if(ctxt != NULL) {
 	int rc = RxmlNanoHTTPReturnCode(ctxt);
 	if(rc != 200) {
@@ -605,7 +616,8 @@
 }
 #endif /* USE_WININET_ASYNC */
 
-static void *in_R_HTTPOpen(const char *url, const int cacheOK)
+static void *in_R_HTTPOpen(const char *url, const char *headers, 
+                           const int cacheOK)
 {
     WIctxt  wictxt;
     DWORD status, d1 = 4, d2 = 0, d3 = 100;
@@ -622,7 +634,7 @@
     wictxt->length = -1;
     wictxt->type = NULL;
     wictxt->hand =
-	InternetOpen("R", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL,
+	InternetOpen(headers, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL,
 #ifdef USE_WININET_ASYNC
 		     INTERNET_FLAG_ASYNC
 #else
@@ -870,7 +882,8 @@
 #endif
 
 #ifndef HAVE_INTERNET
-static void *in_R_HTTPOpen(const char *url, const int cacheOK)
+static void *in_R_HTTPOpen(const char *url, const char *headers, 
+                           const int cacheOK)
 {
     return NULL;
 }
Index: src/modules/internet/nanohttp.c
===================================================================
--- src/modules/internet/nanohttp.c	(revision 38715)
+++ src/modules/internet/nanohttp.c	(working copy)
@@ -1034,6 +1034,9 @@
  * @contentType:  if available the Content-Type information will be
  *                returned at that location
  *
+ * @headers: headers to be used in the HTTP request.  These must be name/value
+ *           pairs separated by ':', each on their own line.
+ *
  * This function try to open a connection to the indicated resource
  * via HTTP GET.
  *
@@ -1042,10 +1045,11 @@
  */
 
 void*
-RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK)
+RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers,
+                 int cacheOK)
 {
     if (contentType != NULL) *contentType = NULL;
-    return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, NULL, cacheOK);
+    return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, headers, cacheOK);
 }
 
 /**



More information about the R-devel mailing list