[R] How to join many records against SQL database

Andrew Ziem ahz001 at gmail.com
Sun Mar 1 00:19:10 CET 2009


This is a working example of how to merge records with a SQL database
given the constraints
1. The database is too large to pull all the records
2. The database permissions don't allow creating a table for
temporarily storing identifiers
3. The R database driver doesn't allow creating temporary table
4. There are too many identifiers to pass in a single query


library(RSQLite)

max_ids <- 100 # try a larger number for greater efficiency

# manufacture data
y <- data.frame( list(1:1000), list(rnorm(1000)) )
colnames(y) <- c("id", "rnd")

m <- dbDriver("SQLite")
tfile <- tempfile()
con <- dbConnect(m, dbname = tfile)
dbWriteTable(con, "y", y)

rm(y)

x <- data.frame(1:1000)
x['letter'] <- head(rep(LETTERS, 1000/(24+1)),1000)
colnames(x) <- c("id", "letter")

# define functions
get_y <- function(ids)
{
        ids2 <- paste(ids, collapse = ",")
        cmd <- paste("select * from y where id in (", ids2, ")")
        d <- dbSendQuery(con, cmd)
        rs <- fetch(d)
        rs[,2:3]
}

loop <- function()
{
        y <<- get_y(x[x$part==0, 'id'])

        for (i in 1:(max_ids-1))
        {
                y2 <- get_y(x[x$part==i, 'id'])
                y <<- rbind(y, y2)
        }
}

call_y <- function(z)
{
        get_y(x[x$part==z, 'id'])
}

# work
x$part <- as.numeric(row.names(x)) %% max_ids

system.time(loop())

merged <- subset(merge(x, y), select=c('id', 'letter', 'rnd'))




Andrew




More information about the R-help mailing list