I'd like to know how to ignore links that do not fit the conditions set forth in title, date_time, and text; thus managing to continue scraping the site.
The error that occurs when a link does not have or does not follow the conditions: "Error in data.frame (title, date_time, text): arguments imply differing number of rows: 1, 0 "
Below is the script:
# iniciar bibliotecas
library(XML)
library(xlsx)
#url_base <- "http://www.saocarlosagora.com.br/busca/?q=PDT&page=2"
url_base <- "http://www.saocarlosagora.com.br/busca/?q=bolt&page=koxa"
url_base <- gsub("bolt", "PDT", url_base)
links_saocarlos <- c()
for (i in 1:4){
url1 <- gsub("koxa", i, url_base)
pag<- readLines(url1)
pag<- htmlParse(pag)
pag<- xmlRoot(pag)
links <- xpathSApply(pag, "//div[@class='item']/a", xmlGetAttr, name="href")
links <- paste("http://www.saocarlosagora.com.br/", links, sep ="")
links_saocarlos<- c(links_saocarlos, links)
}
dados <- data.frame()
for(links in links_saocarlos){
pag1<- readLines(links)
pag1<- htmlParse(pag1)
pag1<- xmlRoot(pag1)
titulo <- xpathSApply(pag1, "//div[@class='row-fluid row-margin']/h2", xmlValue)
data_hora <- xpathSApply (pag1, "//div[@class='horarios']", xmlValue)
texto <- xpathSApply(pag1, "//div[@id='HOTWordsTxt']/p", xmlValue)
dados <- rbind(dados, data.frame(titulo, data_hora, texto))
}
agregar <- aggregate(dados$texto,list(dados$titulo,dados$data_hora),paste,collapse=' ')