[R] What is the fastest way to see what are in an RData file?

Gustaf Rydevik gustaf.rydevik at gmail.com
Thu Dec 17 12:33:29 CET 2009


On Wed, Dec 16, 2009 at 10:13 PM, Peng Yu <pengyu.ut at gmail.com> wrote:
>
> Currently, I load the RData file then ls() and str(). But loading the file
> takes too long if the file is big. Most of the time, I only interested what
> the variables are in the the file and the attributes of the variables (like
> if it is a data.frame, matrix, what are the colnames/rownames, etc.)
>
> I'm wondering if there is any facility in R to help me avoid loading the
> whole file.


I thought this was interesting as well, so i did a bit of searching
through the R-help list archives and found this answer by Simon
Urbanek:
https://stat.ethz.ch/pipermail/r-devel/2007-August/046724.html
The link to a c-routine that does what you want still works, but for
future reference I'm pasting the code below.

Regards,
Gustaf

----------------------------
/*  rdcopy v0.1-0 - extract objects or display contents of RData RDX2 files
 *
 *  Copyright (C) 2007        Simon Urbanek
 *  based in part on src/main/serialize.c and src/main/saveload.c from R:
 *  Copyright (C) 1995, 1996  Robert Gentleman and Ross Ihaka
 *  Copyright (C) 1997--2007  Robert Gentleman, Ross Ihaka and the
 *                            R Development Core Team
 *  License: GPL v2
 *
 *  Although R includes are needed to compile this (for constants),
 *  libR does NOT have to be linked.
 */

#include <stdio.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <R.h>
#include <Rinternals.h>

#ifndef _
#define _(X) X
#endif

#undef error
void error(char *fmt, ...) {
  va_list(ap);

  va_start(ap, fmt);
  vprintf(fmt, ap);
  va_end(ap);
  exit(1);
}

/* .RData:

 byte 0..4  XDR2. - file magic ("XDR2\n"=XDR ver2)
 byte 5..6  X.    - format ("A\n"=ASCII, "B\n"=binary, "X\n"=XDR)
 byte 7...          RXDR2 stream.

 Note: RXDR2 format in NOT a valid XDR format! Strings and
       raw bytes are not padded and thus cannot be read
       using XDR alone.
*/

/* we need to override this so that we don't have to really use libR */
SEXP R_NilValue = 0;

/*  those are directly from serialize.c */
#define REFSXP            255
#define NILVALUE_SXP      254
#define GLOBALENV_SXP     253
#define UNBOUNDVALUE_SXP  252
#define MISSINGARG_SXP    251
#define BASENAMESPACE_SXP 250
#define NAMESPACESXP      249
#define PACKAGESXP        248
#define PERSISTSXP        247
#define CLASSREFSXP       246
#define GENERICREFSXP     245
#define BCREPDEF          244
#define BCREPREF          243
#define EMPTYENV_SXP      242
#define BASEENV_SXP       241

/* map type to a name */
static const char *nameSEXP(int type) {
  switch (type) {
  case REFSXP: return "REF";
  case NILVALUE_SXP: return "NULL";
  case GLOBALENV_SXP: return ".GlobalEnv";
  case UNBOUNDVALUE_SXP: return "<unbound>";
  case MISSINGARG_SXP: return "<missing>";
  case BASENAMESPACE_SXP: return "<<base>>";
  case NAMESPACESXP: return "NAMESPACE";
  case PACKAGESXP: return "PACKAGE";
  case PERSISTSXP: return "PERSIST";
  case CLASSREFSXP: return "CLASSREF";
  case GENERICREFSXP: return "GENERICREF";
  case BCREPDEF: return "BC-REP-DEF";
  case BCREPREF: return "BC-REP-REF";
  case EMPTYENV_SXP: return "<empty-env>";
  case BASEENV_SXP: return "<base-env>";
  case NILSXP: return "NIL";
  case SYMSXP: return "SYM";
  case LISTSXP: return "LIST";
  case CLOSXP: return "CLO";
  case ENVSXP: return "ENV";
  case PROMSXP: return "PROM";
  case LANGSXP: return "LANG";
  case SPECIALSXP: return "SPECIAL";
  case BUILTINSXP: return "BUILTIN";
  case CHARSXP: return "CHAR";
  case LGLSXP: return "LGL";
  case INTSXP: return "INT";
  case REALSXP: return "REAL";
  case CPLXSXP: return "CPLX";
  case STRSXP: return "STR";
  case DOTSXP: return "...";
  case ANYSXP: return "ANY";
  case VECSXP: return "VEC";
  case EXPRSXP: return "EXPR";
  case BCODESXP: return "BCODE";
  case EXTPTRSXP: return "EXTPTR";
  case WEAKREFSXP: return "WEAKREF";
  case RAWSXP: return "RAW";
  case S4SXP: return "S4";
  }
  return "?";
}

/* again from serialize.c */

#define IS_OBJECT_BIT_MASK (1 << 8)
#define HAS_ATTR_BIT_MASK (1 << 9)
#define HAS_TAG_BIT_MASK (1 << 10)
#define ENCODE_LEVELS(v) (v << 12)
#define DECODE_LEVELS(v) (v >> 12)
#define DECODE_TYPE(v) (v & 255)

/* this structure is passed acros all functions. it encapsulates both
the reading an book-keeping */

typedef struct {
  XDR xdrs;
  char *buf;
  long bs;
  FILE *f;
  int lev;
  char *flag;
  int refs;
  long *ref; /* reference offsets */
  int maxrefs; /* length of the refes vector */
  int verb;
  int mode;
  int flags;
  long target;
  FILE *copyf;
} SaveLoadData;

#define M_Read         0
#define M_NonRefCopy   1
#define M_Copy         2
#define M_NonRefSelect 3
#define F_NOREF  1

/* the following is partially based on src/main/saveload.c from R */

static void XdrInInit(FILE *fp, SaveLoadData *d, long sbsize)
{
  xdrstdio_create(&d->xdrs, fp, XDR_DECODE);
  d->buf = (char*) malloc(sbsize);
  if (!(d->buf))
    error(_("cannot allocate memory for a string buffer"));
  d->bs = sbsize;
  d->f = fp;
  d->lev = 0;
  d->flag = 0;
  d->flags = 0;
  d->refs = 0;
  d->maxrefs = 2048;
  d->ref = (long*) malloc(sizeof(long)*d->maxrefs);
  d->copyf = 0;
  d->mode = M_Read;
}

static void XdrInTerm(SaveLoadData *d)
{
  xdr_destroy(&d->xdrs);
  free(d->buf);
  if (d->f) fclose(d->f);
  if (d->copyf) fclose(d->copyf);
}

static void XdrSkipBytes(SaveLoadData *d, int n) {
  while (n > d->bs) {
    XdrSkipBytes(d, d->bs);
    n-=d->bs;
  }
  fread(d->buf, 1, n, d->f);
  if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy))
    fwrite(d->buf, 1, n, d->copyf);
  /* fseek(d->f, n, SEEK_CUR); */
}

static int XdrInInteger(SaveLoadData *d)
{
  int i=0;
  if (!xdr_int(&d->xdrs, &i)) {
    xdr_destroy(&d->xdrs);
    error(_("a I read error occurred"));
  }
  if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy)) {
    unsigned int y = (unsigned int) i;
    unsigned char ib[4];
    ib[0]=y>>24; ib[1]=(y>>16)&255; ib[2]=(y>>8)&255; ib[3]=y&255;
    fwrite(ib, 1, 4, d->copyf);
  }
  return i;
}

static double XdrInReal(SaveLoadData *d)
{
  double x;
  if (!xdr_double(&d->xdrs, &x)) {
    xdr_destroy(&d->xdrs);
    error(_("a R read error occurred"));
  }
  if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy)) {
    unsigned long long y = *((unsigned long long*) &x);
    unsigned char ib[8];
    ib[0]=y>>56; ib[1]=(y>>48)&255; ib[2]=(y>>40)&255; ib[3]=(y>>32)&255;
    ib[4]=(y>>24)&255; ib[5]=(y>>16)&255; ib[6]=(y>>8)&255; ib[7]=y&255;
    fwrite(ib, 1, 8, d->copyf);
  }
  return x;
}

static Rcomplex XdrInComplex(SaveLoadData *d)
{
  Rcomplex x;
  if (!xdr_double(&d->xdrs, &(x.r)) || !xdr_double(&d->xdrs, &(x.i))) {
    xdr_destroy(&d->xdrs);
    error(_("a CR read error occurred"));
  }
  if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy)) {
    unsigned long long y = *((unsigned long long*) &x.r);
    unsigned long long v = *((unsigned long long*) &x.i);
    unsigned char ib[16];
    ib[0]=y>>56; ib[1]=(y>>48)&255; ib[2]=(y>>40)&255; ib[3]=(y>>32)&255;
    ib[4]=(y>>24)&255; ib[5]=(y>>16)&255; ib[6]=(y>>8)&255; ib[7]=y&255;
    ib[8]=v>>56; ib[9]=(v>>48)&255; ib[10]=(v>>40)&255; ib[11]=(v>>32)&255;
    ib[12]=(v>>24)&255; ib[13]=(v>>16)&255; ib[14]=(v>>8)&255; ib[15]=v&255;
    fwrite(ib, 1, 16, d->copyf);
  }
  return x;
}

static char *XdrInBytes(SaveLoadData *d, char *buf, unsigned int len) {
  if (!buf) {
    XdrSkipBytes(d, len);
    return d->buf;
  }

  fread(buf, 1, len, d->f);
  if (d->copyf && (d->mode == M_NonRefCopy || d->mode == M_Copy))
    fwrite(buf, 1, len, d->copyf);
  return buf;
}

static char *XdrInString(SaveLoadData *d)
{
  if (!xdr_string(&d->xdrs, &d->buf, d->bs)) {
    xdr_destroy(&d->xdrs);
    error(_("a S read error occurred"));
  }
  return d->buf;
}

/* back to serialize.c */

#define UNPACK_REF_INDEX(i) ((i) >> 8)

static SEXP ReadItem(SaveLoadData *d);

static SEXP InStringVec(SaveLoadData *d) {
  int per = XdrInInteger(d);
  int len = XdrInInteger(d);
  int i = 0;
  while (i < len) {
    ReadItem(d);
    i++;
  }
  return R_NilValue;
}

static void AddReadRef(SaveLoadData *d, long off) {
  if (d->flags & F_NOREF) return;
  d->ref[d->refs++] = off;
  if (d->verb) printf(" {ref=%d}", d->refs);
  if (d->refs>=d->maxrefs) {
    XdrInTerm(d);
    error(_("too many references in the data file"));
  }
}

#define InVec(fp, obj, accessor, infunc, length)                        \
  {                                                            \
    int cnt;                                                \
    for (cnt = 0; cnt < length; ++cnt)              \
      /*accessor(obj, cnt,*/ infunc(fp);        \
  }


static SEXP ReadBCLang(SaveLoadData *d, int type) {
  switch (type) {
  case BCREPREF:
    XdrInInteger(d);
    return R_NilValue;
  case BCREPDEF:
  case LANGSXP:
  case LISTSXP:
    {
      int pos = -1;
      if (type == BCREPDEF) {
    pos  = XdrInInteger(d);
    type = XdrInInteger(d);
      }
      /*TAG*/ ReadItem(d);
      /*CAR*/ ReadBCLang(d, XdrInInteger(d));
      /*CDR*/ ReadBCLang(d, XdrInInteger(d));
      return R_NilValue;
    }
  default: return ReadItem(d);
  }
}

static SEXP ReadBC(SaveLoadData *d) {
  ReadItem(d); /* code */
  { /* consts */
    int blen = XdrInInteger(d);
    int bc = 0;
    while (bc < blen) {
      int type = XdrInInteger(d);
      switch (type) {
      case BCODESXP:
    ReadBC(d);
    break;
      case LANGSXP:
      case LISTSXP:
      case BCREPDEF:
      case BCREPREF:
    ReadBCLang(d, type);
    break;
      default:
    ReadItem(d);
      }
      bc++;
    }
  }
}

static SEXP ReadItem_(SaveLoadData *d, long boe, int cut);

static SEXP ReadItem(SaveLoadData *d) {
  long boe = ftell(d->f);
  if (d->mode==M_NonRefSelect && boe==d->target) {
    printf(" -> saving object at %ld\n", boe);
    d->mode=M_NonRefCopy;
    ReadItem_(d, boe, 1);
    d->mode=M_Read;
  } else ReadItem_(d, boe, 0);
  return 0;
}

static SEXP ReadItem_(SaveLoadData *d, long boe, int cut) {
  int flags = XdrInInteger(d);
  int type = DECODE_TYPE(flags);
  int lev = DECODE_LEVELS(flags);
  int hasattr = flags & HAS_ATTR_BIT_MASK ? 1 : 0;
  int hastag  = flags & HAS_TAG_BIT_MASK ? 1 : 0;
  int isobj   = flags & IS_OBJECT_BIT_MASK ? 1 : 0;
  SEXP s = R_NilValue;
  int len;
  int isroot = 0;

  char px[64], *cpx=px+d->lev; *cpx=0; while (--cpx>=px) *cpx=' ';

  if (!d->flag) d->flag="";
  if (type!=CHARSXP && d->verb)
    printf("\n@%-7ld%s%s %s %08x [type=%d%s%s%s]", boe, px, d->flag,
nameSEXP(type), flags, type, hasattr?",ATTR":"", hastag?",TAG":"",
isobj?",OBJ":"");
  d->flag="";

  switch(type) {
  case NILVALUE_SXP:      return 0/* R_NilValue */;
  case EMPTYENV_SXP:      return 0/*R_EmptyEnv*/;
  case BASEENV_SXP:       return 0/*R_BaseEnv*/;
  case GLOBALENV_SXP:     return 0/*R_GlobalEnv*/;
  case UNBOUNDVALUE_SXP:  return 0/*R_UnboundValue*/;
  case MISSINGARG_SXP:    return 0/*R_MissingArg*/;
  case BASENAMESPACE_SXP:
    return 0/*R_BaseNamespace*/;
  case REFSXP:
    {
      int refi = UNPACK_REF_INDEX(flags);
      if (!refi) refi = XdrInInteger(d);
      if (d->verb) printf("<REFSXP: %d>", refi);
      if (d->mode==M_NonRefCopy) {
    long cp = ftell(d->f);
    long cop = ftell(d->copyf);
    long back = -4;
    SaveLoadData e;
    e.verb=0;
    if (!UNPACK_REF_INDEX(flags)) back -= 4;
    if (refi<1 || refi>d->refs) {
      XdrInTerm(d);
      error(_("invalid reference %d"), refi);
    }
    if (fseek(d->f, d->ref[refi-1], SEEK_SET)) {
      XdrInTerm(d);
      error(_("unable to seek to reference %d"), refi);
    }
    if (fseek(d->copyf, back, SEEK_CUR)) { /* backup to overwise the
reference */
      XdrInTerm(d);
      error(_("unable to seek in the output stream"));
    }
    XdrInInit(d->f, &e, d->bs);
    e.flags=F_NOREF; e.copyf=d->copyf; e.mode=d->mode;
    ReadItem(&e);
    e.copyf=0; e.f=0; /* we need to delete those to Term doesn't close them */
    XdrInTerm(&e);
    if (fseek(d->f, cp, SEEK_SET)) {
      XdrInTerm(d);
      error(_("unable to return to reference point"));
    }
      }
      return R_NilValue;
    }
  case PERSISTSXP:
    InStringVec(d);
    AddReadRef(d, boe);
    return s;
  case SYMSXP:
    d->lev++;
    ReadItem(d); /* print name */
    AddReadRef(d, boe);
    d->lev--;
    return s;
  case PACKAGESXP:
    InStringVec(d);
    AddReadRef(d, boe);
    return s;
  case NAMESPACESXP:
    InStringVec(d);
    AddReadRef(d, boe);
    return s;
  case ENVSXP:
    {
      int locked = XdrInInteger(d);
      AddReadRef(d, boe);

      d->lev++;
      /*ENCLOS*/ ReadItem(d);
      /*FRAME*/ ReadItem(d);
      /*TAG*/ ReadItem(d);
      /*ATTR*/ ReadItem(d);
      /* We don't write out the object bit for environments, so
reconstruct it here if needed. */
      /* Convert a NULL enclosure to baseenv()
     if (ENCLOS(s) == R_NilValue) SET_ENCLOS(s, R_BaseEnv); */
      d->lev--;

      return s;
    }
  case LISTSXP:
    if (d->lev==0) isroot=1;
  case LANGSXP:
  case CLOSXP:
  case PROMSXP:
  case DOTSXP:
    d->lev++;
    if (hasattr) { d->flag="ATT"; ReadItem(d); }
    if (hastag) { d->flag="TAG"; ReadItem(d);
      if (isroot) printf(d->verb?"\n%s\t%ld":"%s\t%ld\n", d->buf, boe);
    };
    /*CAR*/ d->flag="CAR"; ReadItem(d);
    if (cut) { /* if this is the selected object, then we cannot
proceed to CDR but close it instead */
      unsigned char ib[4] = { 0, 0, 0, NILVALUE_SXP };
      fwrite(ib, 1, 4, d->copyf);
      d->lev--;
      return 0;
    }
    /*CDR*/ d->flag="CDR"; if (isroot) d->lev=0; ReadItem(d);
    /* For reading closures and promises stored in earlier versions,
convert NULL env to baseenv()
    if      (type == CLOSXP && CLOENV(s) == R_NilValue) SET_CLOENV(s,
R_BaseEnv);
    else if (type == PROMSXP && PRENV(s) == R_NilValue) SET_PRENV(s,
R_BaseEnv); */
    if (d->lev>0) d->lev--;
    isroot=0;
    return s;
  default:
    /* These break out of the switch to have their ATTR, LEVELS, and
OBJECT fields filled in.  Each leaves the newly allocated value
PROTECTed */
    switch (type) {
    case EXTPTRSXP:
      d->lev++;
      AddReadRef(d, boe);
      /*PtrProtected*/ ReadItem(d);
      /*PtrTag*/ ReadItem(d);
      d->lev--;
      break;
    case WEAKREFSXP:
      AddReadRef(d, boe);
      break;
    case SPECIALSXP:
    case BUILTINSXP:
      len = XdrInInteger(d);
      XdrInBytes(d, 0, len);
      break;
    case CHARSXP:
      len = XdrInInteger(d);
      if (len == -1)
    s = 0 /*NA_STRING*/;
      else {
    char *c = XdrInBytes(d, 0, len);
    c[len]=0;
    if (d->verb>1) printf(" '%s'", c);
      }
      break;
    case LGLSXP:
      len = XdrInInteger(d);
      InVec(d, s, SET_LOGICAL_ELT, XdrInInteger, len);
      break;
    case INTSXP:
      len = XdrInInteger(d);
      InVec(d, s, SET_INTEGER_ELT, XdrInInteger, len);
      break;
    case REALSXP:
      len = XdrInInteger(d);
      InVec(d, s, SET_REAL_ELT, XdrInReal, len);
      break;
    case CPLXSXP:
      len = XdrInInteger(d);
      InVec(d, s, SET_COMPLEX_ELT, XdrInComplex, len);
      break;
    case STRSXP:
      {
    int count = 0;
    len = XdrInInteger(d);
    d->lev++;
    for (; count < len; ++count)
      ReadItem(d);
    d->lev--;
      }
      break;
    case VECSXP:
    case EXPRSXP:
      {
    int count = 0;
    len = XdrInInteger(d);
    d->lev++;
    for (; count < len; ++count)
      ReadItem(d);
    d->lev--;
      }
      break;
    case BCODESXP:
      {
    int count = 0;
    len = XdrInInteger(d);
    while (count < len) {
      ReadBC(d);
      count++;
    }
      }
      break;
    case CLASSREFSXP:
      error(_("this version of R cannot read class references"));
    case GENERICREFSXP:
      error(_("this version of R cannot read generic function references"));
    case RAWSXP:
      len = XdrInInteger(d);
      XdrSkipBytes(d, len);
      break;
    case S4SXP:
      break;
    default:
      s = R_NilValue; /* keep compiler happy */
      error(_("ReadItem: unknown type %i, perhaps written by later
version of R"), type);
    }
    d->lev++;
    if (hasattr) ReadItem(d);
    d->lev--;
    return s;
  }
}

int main(int ac, char **av) {
  char sig[16];
  int ver, wri, rel;
  FILE *f, *of = 0;
  SaveLoadData sal, *d = &sal;

  if (ac<2) {
    printf("\n Usage: rdcopy <source> [-v | <target> <offset>]\n\n
Extracts an object from a RData file.\n Use rdlist to obtain all valid
offsets for each object.\n\n");
    return 1;
  }

  f = fopen(av[1], "rb");
  sal.verb = 0;
  if (!f)
    error(_("unable to open file %s"), av[1]);

  if (fread(sig, 1, 7, f)!=7) {
    fclose(f);
    error(_("unable to read magic number"));
  }
  sig[7]=0;

  if (!strcmp(sig, "XDR2\nX\n")) {
    { char *c=sig; while(*c) { if (*c<' ') *c='.'; c++; } }
    printf("Format: '%s'\n", sig);
    fclose(f);
    error(_("XDR v2 is the only supported format"));
  }

  if (ac>2) {
    if (!strcmp(av[2],"-v")) {
      d->verb=2;
    } else {
      of = fopen(av[2], "wb");
      if (!of) {
    fclose(f);
    error(_("unable to create %s"), av[2]);
      }
      fwrite(sig, 1, 7, of);
    }
  }
  XdrInInit(f, d, 64*1024);
  d->mode=of?M_NonRefCopy:M_Read;
  d->copyf=of;
  d->target=ac>3?atol(av[3]):0;
  ver=XdrInInteger(d);
  wri=XdrInInteger(d);
  rel=XdrInInteger(d);
  printf("Format version %x, R version = %d.%d.%d, release = %x\n",
ver, wri>>16, (wri>>8)&255, wri&255, rel);

  if (ver != 2) {
    XdrInTerm(d);
    error(_("Sorry, this tool supported RXDR version 2 format only\n"));
  }

  if (of) d->mode=M_NonRefSelect;
  ReadItem(d);

  XdrInTerm(d);

  if (d->mode!=M_Read)
    printf("\nNo object selected. Please use above offsets to select
an object.\n");

  return 0;
}




More information about the R-help mailing list