Code from Data 2

Amazon spidering

Spidering - download the right set of pages

#> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
#> setwd(wdir)
#> library(XML)
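#> # htmlParse() and xpathSApply() used below come from the XML package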
 
amazonDat <- function(fBooks,maxBooks){
# Spiders (snowballs) the Amazon books from a given seed-book
#  amazonDat('Books.csv',10)
# Vladimir Batagelj, 20-21. nov. 2004 / 10. nov. 2006 / 12. nov. 2017
  cat("AmazonDat:",date(),"\n\n")
  bdic <- new.env(hash=TRUE,parent=emptyenv())
  csv <- file(fBooks,"w");  cat('*Books',date(),"\n",file=csv)
  rep <- file("Report.txt","w")
  url1 <- 'http://www.amazon.com/exec/obidos/tg/detail/-/'
  url2 <- '?v=glance';
  book <- '0521840856'; books <- c(book)
  assign(book,1,env=bdic)
  cat(length(bdic),' "',book,'"\n',sep='',file=rep)
  while (length(books)>0){
    bk <- books[1]; books <- books[-1]
    bID <- get(bk,env=bdic); cat(bID,'\n')
    page <- paste(url1,bk,url2,sep='')
    html <- readLines(con<-url(page)); close(con)
    S <- html[nchar(html)>0]
#   process Page
    Page <- htmlParse(S)
    titl <- xpathSApply(Page,"//title",xmlValue)
    cat(paste(bID,titl,sep=";"),"\n",file=csv)
#   extract links to other books
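#   (lines containing both "a-link-normal" and "data-a-dynamic-image" are taken
#    as links to related books; the 10-character ASIN follows "/dp/")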
    i <- grep("a-link-normal",S,ignore.case=TRUE)
    j <- grep(" data-a-dynamic-image",S,ignore.case=TRUE)
    k <- intersect(i,j)
    cut <- S[k]
    ii <- regexpr("/dp/",cut)+4
    for (b in 1:length(ii)) {
      j <- ii[b];
      if (j > 0) {
        bk <- substr(cut[b],j,j+9)
        cat('test',b,bk,'\n'); flush.console()
        if (exists(bk,env=bdic,inherits=FALSE)){
          bID <- get(bk,env=bdic,inherits=FALSE)
        } else {
          line <- cut[b]
          assign(bk,length(bdic)+1,env=bdic)
          if (length(bdic) <= maxBooks) books <- append(books,bk)
          cat(length(bdic),' "',bk,'"\n',sep='',file=rep)
          cat('new vertex ',length(bdic),' - ',bk,'\n')
          l <- regexpr('alt="',line)[1]+5; part <- substr(line,l,nchar(line))
          r <- regexpr('"',part)[1]-1; t <- substr(part,1,r)
          cat(length(bdic),": ",t,"\n", sep="")
        }
      }
    }
    flush.console()
  }
  close(rep); close(csv); cat('Amazon - END\n')
}
 
amazonDat('Books.csv',10)
> amazonDat('Books.csv',10)
AmazonDat: Mon Nov 13 12:58:41 2017 
 
1 
test 1 0521387078 
new vertex  2  -  0521387078 
2: Social Network Analysis: Methods and Applications (Structural Analysis in the Social Sciences)
2 
test 1 1446247414 
new vertex  3  -  1446247414 
3: Analyzing Social Networks
test 2 0195379470 
new vertex  4  -  0195379470 
4: Understanding Social Networks: Theories, Concepts, and Findings
test 3 0199206651 
new vertex  5  -  0199206651 
5: Networks: An Introduction
test 4 1493909827 
new vertex  6  -  1493909827 
6: Statistical Analysis of Network Data with R (Use R!)
test 5 0521600979 
new vertex  7  -  0521600979 
7: Models and Methods in Social Network Analysis (Structural Analysis in the Social Sciences)
test 6 0521195330 
new vertex  8  -  0521195330 
8: Networks, Crowds, and Markets: Reasoning about a Highly Connected World
test 7 1446247414 
test 8 0195379470 
test 9 1493909827 
test 10 1473952123 
new vertex  9  -  1473952123 
9: Social Network Analysis
test 11 0521600979 
test 12 1412947154 
new vertex  10  -  1412947154 
10: Social Network Analysis: History, Theory and Methodology
3 
test 1 0195379470 
test 2 0521387078 
test 3 1412947154 
test 4 1446276139 
new vertex  11  -  1446276139 
11: Doing Social Network Research: Network-based Research Design for Social Scientists
test 5 1473952123 
test 6 3319238825 
new vertex  12  -  3319238825 
12: A User&rsquo;s Guide to Network Analysis in R
test 7 0195379470 
...
test 6 1446276139 
test 7 1446247414 
test 8 0195379470 
test 9 1483325210 
test 10 1446209040 
new vertex  27  -  1446209040 
27: Social Network Analysis
test 11 0521387078 
test 12 0761963391 
new vertex  28  -  0761963391 
28: Social Network Analysis: A Handbook
10 
test 1 1446247414 
test 2 0195379470 
test 3 0521387078 
test 4 1412927498 
new vertex  29  -  1412927498 
29: Social Network Analysis (Quantitative Applications in the Social Sciences)
test 5 1446209040 
test 6 1446276139 
test 7 1446247414 
test 8 0195379470 
test 9 0521387078 
test 10 1473952123 
test 11 1483325210 
test 12 1446209040 
Amazon - END
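
The run writes Books.csv with a '*Books <date>' header line followed by one "id;title" row per visited book. A minimal sketch (not part of the original script) for loading it back into R, assuming no title contains a semicolon:

books <- read.table("Books.csv", sep=";", skip=1, quote="", comment.char="",
                    col.names=c("bID","title"), stringsAsFactors=FALSE)
head(books)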

Amazon spidering and scraping

Scraping - extract interesting data from downloaded page(s)
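
The scraper below takes the product-details list apart by pairing each bold label matched by //li/b with the text node that follows it, matched by //li/b/../text(). A minimal sketch of this pairing on a made-up HTML fragment (the fragment is illustrative, not a real Amazon page):

library(XML)
frag <- '<ul><li><b>Paperback:</b> 857 pages</li>
         <li><b>Publisher:</b> Cambridge University Press (2005)</li></ul>'
doc <- htmlParse(frag, asText=TRUE)
lab <- xpathSApply(doc, "//li/b", xmlValue)            # "Paperback:" "Publisher:"
val <- xpathSApply(doc, "//li/b/../text()", xmlValue)  # " 857 pages" " Cambridge ..."
trimws(val[grep("Publi", lab, ignore.case=TRUE)[1]])   # "Cambridge University Press (2005)"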

#> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
#> setwd(wdir)
#> library(XML)
 
amazonDat <- function(seeds,fBooks,maxBooks,step){
# Creates a dataset about books from Amazon
#  amazonDat(seeds,'Books.csv',10,0)
# Vladimir Batagelj, 20-21. nov. 2004 / 10. nov. 2006 / 12. nov. 2017
  cover <- c("hard","paper","cd","dvd","spiral","loose","book","calendar",
    "map","bound","bind","bond","leather","audio","music") 
  Cover <- c("Hardcover","Paperback","CD","CD","Spiral","Loose leaf","Book",
    "Calendar","Map","Special","Special","Special","Leather","Audio","Audio") 
  lc <- length(cover)
  cat("AmazonDat:",date(),"\n\n")
  bdic <- new.env(hash=TRUE,parent=emptyenv())
  csv <- file(fBooks,"w"); # cat('*Books',date(),"\n",file=csv)
  h <- paste("bID","Amazon","bind","npag","pub","year","lang","wid",
    "thi","hei","duni","weig","wuni","pric","titl",sep=";")
  cat(h,"\n",file=csv)
  rep <- file("Report.txt","w")
  url1 <- 'http://www.amazon.com/exec/obidos/tg/detail/-/'
  url2 <- '?v=glance'
  books <- seeds
  for(bk in seeds){
    assign(bk,1+length(bdic),env=bdic)
    cat(length(bdic),' "',bk,'"\n',sep='',file=rep)
  }
  while (length(books)>0){
    bk <- books[1]; books <- books[-1]
    bID <- get(bk,env=bdic)
    if((step==0)|(bID%%step==0)) cat(bID,":",date(),'\n')
    page <- paste(url1,bk,url2,sep='')
    html <- readLines(con<-url(page)); close(con)
    S <- html[nchar(html)>0]
#   process Page
    Page <- htmlParse(S)
    titl <- gsub(";","//",xpathSApply(Page,"//title",xmlValue))
    lab <- xpathSApply(Page,"//li/b",xmlValue)
    r <- as.vector(unlist(xpathSApply(Page,"//li/b/../text()")))
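  # lab holds the bold labels of the product-details items (e.g. "Paperback:", "Publisher:"),
  # r the text nodes that follow them (the corresponding values)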
  # cover and pages
    bind <- "Other"
    for(i in 1:lc){
      J <- grep(cover[i],lab,ignore.case=TRUE)
      if(length(J)>0) break
    }
    if(length(J)>0){
      j <- J[1]; bind <- Cover[i]; pag <- xmlValue(r[[j]])
      npag <- as.numeric(gsub("([0-9]+).*$","\\1",pag))
    } else npag <- NA 
  # publisher and publication year
    J <- grep("Publi",lab,ignore.case=TRUE)
    if(length(J)>0){
      j <- J[1]; pub <- trimws(xmlValue(r[[j]]))
      L <- unlist(strsplit(pub,"\\(")); pub <- trimws(L[1])
      if(length(L)>1){N <- as.numeric(strsplit(L[2], "\\D+")[[1]][-1])
      } else N <- as.numeric(strsplit(L[1], "\\D+")[[1]][-1])
      if(length(N)>0) year <- max(N) else year <- NA
      if(grepl(";",pub)) pub <- strsplit(pub,";")[[1]][1]
    } else { pub <- NA; year <- NA }
  # language
    J <- grep("Lang",lab,ignore.case=TRUE)
    if(length(J)>0){ j <- J[1]; lang <- trimws(xmlValue(r[[j]]))
    } else lang <- NA 
  # dimensions
    J <- grep("Dim",lab,ignore.case=TRUE)
    if(length(J)>0){
      j <- J[1]; pdim <- trimws(xmlValue(r[[j]]))
      L <- unlist(strsplit(pdim,"x"))
      if(length(L)==3){ wid <- as.numeric(L[1]); thi <- as.numeric(L[2]);
        K <- unlist(strsplit(trimws(L[3])," "))
        hei <- as.numeric(K[1]); duni <- K[2]
      } else { wid <- NA; thi <- NA; hei <- NA; duni <- NA }
    } else { wid <- NA; thi <- NA; hei <- NA; duni <- NA }
  # weight
    J <- grep("Weight",lab,ignore.case=TRUE)
    if(length(J)>0){ j <- J[1]  
      L <- unlist(strsplit(trimws(xmlValue(r[[j]]))," "))
      weig <- as.numeric(L[1]); wuni <- L[2]
    } else {weig <- NA; wuni <- NA}
  # price
  # <span class=\"a-size-medium a-color-price header-price\">\
    j <- grep("a-size-medium a-color-price header-price",S)
    if(length(j)>0){
      k <- grep("\\$",S[j:(j+10)]); price <- trimws(S[j+k-1])
      pric <- as.numeric(substr(price,2,nchar(price)))
    } else { pric <- NA }
    cat(paste(bID,paste('"',bk,'"',sep=''),bind,npag,pub,year,lang,
      wid,thi,hei,duni,weig,wuni,pric,titl,sep=";"),"\n",file=csv)
#   extract links to other books
    i <- grep("a-link-normal",S,ignore.case=TRUE)
    j <- grep(" data-a-dynamic-image",S,ignore.case=TRUE)
    k <- intersect(i,j)
    cut <- S[k]
    ii <- regexpr("/dp/",cut)+4
    for (b in 1:length(ii)) {
      j <- ii[b];
      if (j > 0) {
        bk <- substr(cut[b],j,j+9)
#        cat('test',b,bk,'\n'); flush.console()
        if (exists(bk,env=bdic,inherits=FALSE)){
          bID <- get(bk,env=bdic,inherits=FALSE)
        } else {
          line <- cut[b]
          assign(bk,length(bdic)+1,env=bdic); nd <- length(bdic)
          if (nd <= maxBooks) books <- append(books,bk)
          cat(nd,' "',bk,'"\n',sep='',file=rep)
          l <- regexpr('alt="',line)[1]+5; part <- substr(line,l,nchar(line))
          r <- regexpr('"',part)[1]-1; t <- substr(part,1,r)
          if(step==0)cat('new book ',nd,' - ',bk,'\n    ',t,"\n",sep="")
        }
      }
    }
    flush.console()
  }
  close(rep); close(csv); cat("Amazon - END:",date(),"\n")
}
 
seeds <- c("0521387078", "0312569378", "1451648537", "1452101248", "0385343841",
     "1451627289", "140123206X", "0307408841", "0425243214", "0553801473")
seeds <- c('0521840856')
amazonDat(seeds,'Books.csv',10,0)
> seeds <- c('0521840856')
> amazonDat(seeds,'newBooks.csv',1000,10)
AmazonDat: Mon Nov 13 09:44:35 2017 
 
10 : Mon Nov 13 09:45:38 2017 
20 : Mon Nov 13 09:46:50 2017 
30 : Mon Nov 13 09:48:05 2017 
40 : Mon Nov 13 09:49:23 2017 
50 : Mon Nov 13 09:50:42 2017 
60 : Mon Nov 13 09:51:54 2017 
70 : Mon Nov 13 09:53:16 2017 
80 : Mon Nov 13 09:54:19 2017 
90 : Mon Nov 13 09:55:09 2017 
100 : Mon Nov 13 09:56:07 2017 
110 : Mon Nov 13 09:57:26 2017 
120 : Mon Nov 13 09:58:39 2017 
... 
890 : Mon Nov 13 12:17:36 2017 
900 : Mon Nov 13 12:19:44 2017 
910 : Mon Nov 13 12:21:52 2017 
920 : Mon Nov 13 12:24:05 2017 
930 : Mon Nov 13 12:26:22 2017 
940 : Mon Nov 13 12:28:32 2017 
950 : Mon Nov 13 12:30:47 2017 
960 : Mon Nov 13 12:33:14 2017 
Error in if (j > 0) { : missing value where TRUE/FALSE needed

The crawl found 4179 different books (listed in Report.txt); 967 of them were scraped before the run stopped.
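
The "missing value where TRUE/FALSE needed" error is most likely (an assumption, not verified against the offending page) caused by a page on which no line matches both grep patterns: cut then has length 0, 1:length(ii) becomes 1:0, and ii[1] evaluates to NA. A hedged sketch of a guard for the inner loop over extracted links:

    ii <- regexpr("/dp/", cut) + 4
    for (b in seq_along(ii)) {         # empty loop when cut has no elements
      j <- ii[b]
      if (!is.na(j) && j > 3) {        # j == 3 means regexpr() found no "/dp/"
        bk <- substr(cut[b], j, j + 9)
        # ... continue as in amazonDat() above
      }
    }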

