[R] web scraping image

Curtis DeGasperi curtis.degasperi at gmail.com
Thu Jun 4 18:31:13 CEST 2015


I'm working on a script that downloads data from the USGS NWIS server.
dataRetrieval makes it easy to quickly get the data in a neat tabular
format, and retrieving the tabular text files was also fairly easy for
me using download.file.

However, I'm not skilled enough to work out how to download the nice
graphic files that can be produced dynamically from the USGS NWIS
server (for example:
http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img)

My question is how do I get the image from this web page and save it
to a local directory? scrapeR returns the information from the page
and I suspect this is a possible solution path, but I don't know what
the next step is.
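
One idea I've sketched out (completely untested) is below: if the
format=img URL actually returns an HTML page that embeds the chart
through an <img> tag rather than the image bytes themselves (which the
empty download below makes me suspect), the src of that tag could be
pulled out with the XML package and downloaded with mode='wb'. The
helper name get_nwis_graphic() and the handling of relative links are
just my own assumptions, not documented NWIS behavior.

## Untested sketch: read the format=img page as HTML, pull the first <img>
## src, and download that file directly in binary mode
require(XML)

get_nwis_graphic <- function(siteno, destfile) {  # hypothetical helper
  page.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
                    siteno, '&agency_cd=USGS&format=img', sep = "")
  html <- paste(readLines(page.url, warn = FALSE), collapse = "\n")
  doc <- htmlParse(html, asText = TRUE)           # parse the returned page
  src <- xpathSApply(doc, "//img", xmlGetAttr, "src")
  if (length(src) == 0) stop("no <img> tag found on page")
  src <- src[[1]]                                 # assume the chart is the first <img>
  if (!grepl("^http", src)) {                     # make a relative link absolute
    src <- paste('http://nwis.waterdata.usgs.gov', src, sep = "")
  }
  download.file(src, destfile, mode = 'wb')       # binary mode so the gif isn't mangled
}

## e.g. get_nwis_graphic('12144500', 'graphic_12144500.gif')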

My code provided below works from a list I've created of USGS flow
gauging stations.

Curtis

## Code to process USGS daily flow data for high and low flow analysis
## Need to start with list of gauge ids to process
## Can't figure out how to automate download of images

require(dataRetrieval)
require(data.table)
require(scrapeR)

df <- read.csv("usgs_stations.csv", header=TRUE)

lstas <- length(df$siteno) # length of locator list

print(paste('Processing...', df$name[1], ' ', df$siteno[1], sep = ""))

datall <- readNWISpeak(df$siteno[1])

for (a in 2:lstas) {
  # Print station being processed
  print(paste('Processing...', df$name[a], ' ', df$siteno[a], sep = ""))

  dat <- readNWISpeak(df$siteno[a])

  datall <- rbind(datall, dat)

}

write.csv(datall, file = "usgs_peaks.csv")

# Retrieve ascii text files and graphics

for (a in 1:lstas) {

  print(paste('Processing...', df$name[a], ' ', df$siteno[a], sep = ""))

  graphic.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
                       df$siteno[a], '&agency_cd=USGS&format=img', sep = "")
  peakfq.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
                      df$siteno[a], '&agency_cd=USGS&format=hn2', sep = "")
  tab.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
                   df$siteno[a], '&agency_cd=USGS&format=rdb', sep = "")

  graphic.fn <- paste('graphic_', df$siteno[a], '.gif', sep = "")
  peakfq.fn <- paste('peakfq_', df$siteno[a], '.txt', sep = "")
  tab.fn <- paste('tab_', df$siteno[a], '.txt', sep = "")

  download.file(graphic.url, graphic.fn, mode = 'wb') # This apparently doesn't work - file is empty
  download.file(peakfq.url, peakfq.fn)
  download.file(tab.url, tab.fn)
}

# scrapeR
pageSource <- scrape(url = "http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img",
                     headers = TRUE, parse = FALSE)
page <- scrape(object = "pageSource")
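
If I stay with scrapeR, my (unverified) guess at the next step is to
call scrape() with parse = TRUE so the result is a parsed document,
then use xpathSApply() from the XML package to pull the src attribute
of the <img> tag and hand that to download.file(). The assumptions
that the chart is the first <img> on the page and that relative links
hang off nwis.waterdata.usgs.gov are mine.

# Untested continuation of the scrapeR idea
require(XML)  # for xpathSApply/xmlGetAttr (XML is a dependency of scrapeR anyway)

parsed <- scrape(url = "http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img",
                 headers = FALSE, parse = TRUE)
img.src <- xpathSApply(parsed[[1]], "//img", xmlGetAttr, "src")[[1]]  # assumes the chart is the first <img>
if (!grepl("^http", img.src)) {
  img.src <- paste('http://nwis.waterdata.usgs.gov', img.src, sep = "")  # assumed base for relative links
}
download.file(img.src, 'graphic_12144500.gif', mode = 'wb')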


