How to access data in HTML tables through R!

Download R script here.

library(XML)
library(chron)
library(plyr)

# Download the race-results page and parse it into an HTML document tree.
results.url <- "http://www.firm-racing.com/result_report.asp?RID=792&type=1"
b <- readLines(results.url)
bdoc <- htmlParse(b, asText = TRUE)

# Text of every <div> found under a <table>/<tr>/<td> path. Presumably these
# are the page's own column labels (TODO confirm against the live page); they
# are kept for inspection only, because `header` is replaced further down.
result.header <- getNodeSet(bdoc, path = "//table/tr/td/div")
header <- unlist(lapply(result.header, xmlValue))

# Text of every <div> found under a <table>/<td> path (no <tr> in between).
# These values are laid out row by row into the results matrix below.
result.table <- getNodeSet(bdoc, path = "//table/td/div")

# Column names used for the data frame; this overwrites the scraped labels.
header <- c("bib", "category", "swim_cat", "swim_ov", "swim_time",
            "TT1", "bike_cat", "bike_ov", "bike_time", "TT2",
            "run_cat", "run_ov", "run_time", "overall_cat",
            "overall_ov", "overall_time")

# One matrix row per racer. The column count is taken from `header` so the
# matrix width and the column names cannot drift apart.
racer.rslt <- matrix(unlist(lapply(result.table, xmlValue)),
                     ncol = length(header), byrow = TRUE)
result.df <- as.data.frame(racer.rslt, stringsAsFactors = FALSE)
names(result.df) <- header

I added the code below later to “prettify” the time values, i.e. to put them in the “00:00:00” form that the “chron” package expects. This will help later if you want to do any time analyses on the data!

# Right-align the digits of each time string onto a "00:00:00" template, so
# that e.g. "12:34" becomes "00:12:34" and "1:02:03" becomes "01:02:03".
#
# x        : character vector of time strings (coerced with as.character()).
# template : the zero-filled target layout; its separators are kept as-is.
#
# Returns a character vector the same length as `x`. A value that contains no
# digits comes back as the unmodified template.
pad.hms <- function(x, template = "00:00:00") {
  # Surrounding whitespace would shift the right-alignment, so drop it first.
  x <- gsub("^\\s+|\\s+$", "", as.character(x))
  vapply(x, function(value) {
    padded <- template
    offset <- nchar(template) - nchar(value)
    # Only digits are copied. The class must be [0-9]: listing the digits
    # separated by spaces would also match the space character itself.
    digit.pos <- gregexpr("[0-9]", value)[[1]]
    # gregexpr() reports -1 when nothing matches; skip that sentinel.
    for (p in digit.pos[digit.pos > 0]) {
      substr(padded, p + offset, p + offset) <- substr(value, p, p)
    }
    padded
  }, character(1), USE.NAMES = FALSE)
}

# Columns of result.df that hold times: swim, TT1, bike, TT2, run, overall.
time.cols <- c(5, 6, 9, 10, 13, 16)
for (i in time.cols) {
  # Normalise the whole column at once, then convert it to a chron "times"
  # object so it can be used in time arithmetic.
  result.df[[header[i]]] <- times(pad.hms(result.df[[header[i]]]))
}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s