Spidering - download the right set of pages
#> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
#> setwd(wdir)
#> library(XML)

amazonDat <- function(fBooks, maxBooks, seed = "0521840856") {
  # Spiders (snowball) the Amazon book pages starting from a seed book.
  #
  # Arguments:
  #   fBooks   - path of the output file; it gets a "*Books <date>" header
  #              and one "bID;title" line per downloaded page.
  #   maxBooks - a newly discovered book is queued for download only while
  #              the number of known books does not exceed this value
  #              (every discovered book is still recorded in Report.txt).
  #   seed     - identifier of the start book (the 10 characters that follow
  #              "/dp/" in a product link).
  #
  # Side effects: writes fBooks and "Report.txt" in the working directory,
  # downloads pages over the network and prints progress to the console.
  #
  # Usage: amazonDat('Books.csv', 10)
  # Vladimir Batagelj, 20-21. nov. 2004 / 10. nov. 2006 / 12. nov. 2017
  cat("AmazonDat:", date(), "\n\n")
  bdic <- new.env(hash = TRUE, parent = emptyenv())

  # Both output connections are closed on any exit path, including errors.
  csv <- file(fBooks, "w")
  on.exit(close(csv), add = TRUE)
  cat('*Books', date(), "\n", file = csv)
  report <- file("Report.txt", "w")
  on.exit(close(report), add = TRUE)

  url1 <- 'http://www.amazon.com/exec/obidos/tg/detail/-/'
  url2 <- '?v=glance'
  books <- c(seed)
  assign(seed, 1, envir = bdic)
  cat(length(bdic), ' "', seed, '"\n', sep = '', file = report)

  # `books` is a FIFO queue: take from the front, append at the back.
  while (length(books) > 0) {
    bk <- books[1]
    books <- books[-1]
    bID <- get(bk, envir = bdic)
    cat(bID, '\n')
    page <- paste(url1, bk, url2, sep = '')
    con <- url(page)
    # Release the connection even when the download fails.
    html <- tryCatch(readLines(con), finally = close(con))
    S <- html[nzchar(html)]

    # process Page
    Page <- XML::htmlParse(S)
    titl <- XML::xpathSApply(Page, "//title", XML::xmlValue)
    cat(paste(bID, titl, sep = ";"), "\n", file = csv)

    # extract links to other books: lines carrying both markers
    link_lines <- S[intersect(
      grep("a-link-normal", S, ignore.case = TRUE),
      grep(" data-a-dynamic-image", S, ignore.case = TRUE)
    )]
    dp_pos <- regexpr("/dp/", link_lines)
    # seq_along() gives an empty loop for a page without links; 1:length()
    # would run over c(1, 0) and stop with
    # "missing value where TRUE/FALSE needed".
    for (b in seq_along(dp_pos)) {
      # regexpr() reports "no match" as -1; test it before adding the offset.
      if (dp_pos[b] < 0) next
      first <- dp_pos[b] + 4
      bk <- substr(link_lines[b], first, first + 9)
      if (!nzchar(bk)) next  # nothing after "/dp/": not usable as a key
      cat('test', b, bk, '\n'); flush.console()
      if (exists(bk, envir = bdic, inherits = FALSE)) next  # already known

      line <- link_lines[b]
      assign(bk, length(bdic) + 1, envir = bdic)
      nd <- length(bdic)
      if (nd <= maxBooks) books <- append(books, bk)
      cat(nd, ' "', bk, '"\n', sep = '', file = report)
      cat('new vertex ', nd, ' - ', bk, '\n')

      # The text of the alt attribute is printed as the book's label.
      alt_pos <- regexpr('alt="', line)[1]
      if (alt_pos > 0) {
        rest <- substr(line, alt_pos + 5, nchar(line))
        label <- substr(rest, 1, regexpr('"', rest)[1] - 1)
      } else {
        label <- ""
      }
      cat(nd, ": ", label, "\n", sep = "")
    }
    flush.console()
  }
  cat('Amazon - END\n')
}

amazonDat('Books.csv',10)
> amazonDat('Books.csv',10) AmazonDat: Mon Nov 13 12:58:41 2017 1 test 1 0521387078 new vertex 2 - 0521387078 2: Social Network Analysis: Methods and Applications (Structural Analysis in the Social Sciences) 2 test 1 1446247414 new vertex 3 - 1446247414 3: Analyzing Social Networks test 2 0195379470 new vertex 4 - 0195379470 4: Understanding Social Networks: Theories, Concepts, and Findings test 3 0199206651 new vertex 5 - 0199206651 5: Networks: An Introduction test 4 1493909827 new vertex 6 - 1493909827 6: Statistical Analysis of Network Data with R (Use R!) test 5 0521600979 new vertex 7 - 0521600979 7: Models and Methods in Social Network Analysis (Structural Analysis in the Social Sciences) test 6 0521195330 new vertex 8 - 0521195330 8: Networks, Crowds, and Markets: Reasoning about a Highly Connected World test 7 1446247414 test 8 0195379470 test 9 1493909827 test 10 1473952123 new vertex 9 - 1473952123 9: Social Network Analysis test 11 0521600979 test 12 1412947154 new vertex 10 - 1412947154 10: Social Network Analysis: History, Theory and Methodology 3 test 1 0195379470 test 2 0521387078 test 3 1412947154 test 4 1446276139 new vertex 11 - 1446276139 11: Doing Social Network Research: Network-based Research Design for Social Scientists test 5 1473952123 test 6 3319238825 new vertex 12 - 3319238825 12: A User’s Guide to Network Analysis in R test 7 0195379470 ... test 6 1446276139 test 7 1446247414 test 8 0195379470 test 9 1483325210 test 10 1446209040 new vertex 27 - 1446209040 27: Social Network Analysis test 11 0521387078 test 12 0761963391 new vertex 28 - 0761963391 28: Social Network Analysis: A Handbook 10 test 1 1446247414 test 2 0195379470 test 3 0521387078 test 4 1412927498 new vertex 29 - 1412927498 29: Social Network Analysis (Quantitative Applications in the Social Sciences) test 5 1446209040 test 6 1446276139 test 7 1446247414 test 8 0195379470 test 9 0521387078 test 10 1473952123 test 11 1483325210 test 12 1446209040 Amazon - END
Scraping - extract interesting data from downloaded page(s)
#> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
#> setwd(wdir)
#> library(XML)

amazonDat <- function(seeds, fBooks, maxBooks, step) {
  # Creates a dataset about books from Amazon by crawling from seed books.
  #
  # Arguments:
  #   seeds    - character vector of start book identifiers.
  #   fBooks   - path of the output file: a ";"-separated table with one
  #              header line and one row per downloaded page.
  #   maxBooks - a newly discovered book is queued for download only while
  #              the number of known books does not exceed this value
  #              (every discovered book is still recorded in Report.txt).
  #   step     - progress is printed for every book whose number is a
  #              multiple of step; with step == 0 every book and every newly
  #              discovered book is printed.
  #
  # Side effects: writes fBooks and "Report.txt" in the working directory,
  # downloads pages over the network and prints progress to the console.
  #
  # Usage: amazonDat(c('0521840856'), 'Books.csv', 10, 0)
  # Vladimir Batagelj, 20-21. nov. 2004 / 10. nov. 2006 / 12. nov. 2017

  # Label keywords (matched case-insensitively) and the binding each implies;
  # the first keyword that matches any label wins.
  cover <- c("hard","paper","cd","dvd","spiral","loose","book","calendar",
    "map","bound","bind","bond","leather","audio","music")
  Cover <- c("Hardcover","Paperback","CD","CD","Spiral","Loose leaf","Book",
    "Calendar","Map","Special","Special","Special","Leather","Audio","Audio")

  # Text paired with the first label matching `pattern`; NA when no label
  # matches or the label has no text node at the same position.
  # NOTE(review): labels and text nodes are paired by position, which assumes
  # one text node per <li><b> label - confirm against the page layout.
  field_text <- function(lab, vals, pattern) {
    hit <- grep(pattern, lab, ignore.case = TRUE)
    if (length(hit) == 0 || hit[1] > length(vals)) return(NA_character_)
    XML::xmlValue(vals[[hit[1]]])
  }

  # All runs of digits in x as numbers (empty for NA or digit-free input).
  digit_runs <- function(x) {
    if (is.na(x)) return(numeric(0))
    as.numeric(regmatches(x, gregexpr("[0-9]+", x))[[1]])
  }

  # First "$<amount>" in txt as a number (thousands separators removed).
  parse_price <- function(txt) {
    m <- regmatches(txt, regexpr("\\$[0-9][0-9,]*(\\.[0-9]+)?", txt))
    if (length(m) == 0) return(NA_real_)
    as.numeric(gsub("[$,]", "", m))
  }

  cat("AmazonDat:", date(), "\n\n")
  bdic <- new.env(hash = TRUE, parent = emptyenv())

  # Both output connections are closed on any exit path, including errors.
  csv <- file(fBooks, "w")
  on.exit(close(csv), add = TRUE)
  # cat('*Books',date(),"\n",file=csv)
  h <- paste("bID","Amazon","bind","npag","pub","year","lang","wid",
    "thi","hei","duni","weig","wuni","pric","titl", sep = ";")
  cat(h, "\n", file = csv)
  report <- file("Report.txt", "w")
  on.exit(close(report), add = TRUE)

  url1 <- 'http://www.amazon.com/exec/obidos/tg/detail/-/'
  url2 <- '?v=glance'

  # Number the seeds and record each one in the report. (The previous code
  # wrote a single line using `book`, which is not defined in this function.)
  seeds <- unique(seeds)
  books <- seeds
  for (bk in seeds) {
    assign(bk, 1 + length(bdic), envir = bdic)
    cat(length(bdic), ' "', bk, '"\n', sep = '', file = report)
  }

  # `books` is a FIFO queue: take from the front, append at the back.
  while (length(books) > 0) {
    bk <- books[1]
    books <- books[-1]
    bID <- get(bk, envir = bdic)
    if (step == 0 || bID %% step == 0) cat(bID, ":", date(), '\n')
    page <- paste(url1, bk, url2, sep = '')
    con <- url(page)
    # Release the connection even when the download fails.
    html <- tryCatch(readLines(con), finally = close(con))
    S <- html[nzchar(html)]

    # process Page
    Page <- XML::htmlParse(S)
    # ";" is the field separator of the output file, so replace it in titles.
    titl <- gsub(";", "//", XML::xpathSApply(Page, "//title", XML::xmlValue))
    titl <- if (length(titl) > 0) titl[1] else ""  # exactly one row per book
    lab <- XML::xpathSApply(Page, "//li/b", XML::xmlValue)
    vals <- as.vector(unlist(XML::xpathSApply(Page, "//li/b/../text()")))

    # cover and pages
    bind <- "Other"
    npag <- NA
    for (i in seq_along(cover)) {
      if (any(grepl(cover[i], lab, ignore.case = TRUE))) {
        bind <- Cover[i]
        pag <- field_text(lab, vals, cover[i])
        npag <- as.numeric(gsub("([0-9]+).*$", "\\1", pag))
        break
      }
    }

    # publisher and publication year
    pub <- NA
    year <- NA
    pub_raw <- trimws(field_text(lab, vals, "Publi"))
    if (!is.na(pub_raw)) {
      parts <- unlist(strsplit(pub_raw, "\\("))
      pub <- trimws(parts[1])
      # Take the year from the parenthesised part when there is one.
      # All digit runs are considered; dropping the first split token lost
      # the year whenever the text started with a digit, e.g. "(1994)".
      N <- digit_runs(if (length(parts) > 1) parts[2] else parts[1])
      if (length(N) > 0) year <- max(N)
      if (grepl(";", pub)) pub <- strsplit(pub, ";")[[1]][1]
    }

    # language
    lang <- trimws(field_text(lab, vals, "Lang"))

    # dimensions: expected form "<wid> x <thi> x <hei> <unit>"
    wid <- NA; thi <- NA; hei <- NA; duni <- NA
    pdim <- trimws(field_text(lab, vals, "Dim"))
    if (!is.na(pdim)) {
      L <- unlist(strsplit(pdim, "x"))
      if (length(L) == 3) {
        wid <- as.numeric(L[1])
        thi <- as.numeric(L[2])
        K <- unlist(strsplit(trimws(L[3]), " "))
        hei <- as.numeric(K[1])
        duni <- K[2]
      }
    }

    # weight: expected form "<number> <unit> ..."
    weig <- NA; wuni <- NA
    wtxt <- trimws(field_text(lab, vals, "Weight"))
    if (!is.na(wtxt)) {
      L <- unlist(strsplit(wtxt, " "))
      weig <- as.numeric(L[1])
      wuni <- L[2]
    }

    # price
    # <span class=\"a-size-medium a-color-price header-price\">\
    # Use the first marker line and the first "$" line within the next ten
    # lines, so several or zero matches cannot produce several or empty rows.
    pric <- NA
    j <- grep("a-size-medium a-color-price header-price", S)
    if (length(j) > 0) {
      near <- S[j[1]:min(j[1] + 10, length(S))]
      hit <- grep("\\$", near)
      if (length(hit) > 0) pric <- parse_price(near[hit[1]])
    }

    cat(paste(bID, paste('"', bk, '"', sep = ''), bind, npag, pub, year, lang,
      wid, thi, hei, duni, weig, wuni, pric, titl, sep = ";"), "\n", file = csv)

    # extract links to other books: lines carrying both markers
    link_lines <- S[intersect(
      grep("a-link-normal", S, ignore.case = TRUE),
      grep(" data-a-dynamic-image", S, ignore.case = TRUE)
    )]
    dp_pos <- regexpr("/dp/", link_lines)
    # seq_along() gives an empty loop for a page without links; 1:length()
    # would run over c(1, 0) and stop with
    # "missing value where TRUE/FALSE needed".
    for (b in seq_along(dp_pos)) {
      # regexpr() reports "no match" as -1; test it before adding the offset.
      if (dp_pos[b] < 0) next
      first <- dp_pos[b] + 4
      bk <- substr(link_lines[b], first, first + 9)
      if (!nzchar(bk)) next  # nothing after "/dp/": not usable as a key
      # cat('test',b,bk,'\n'); flush.console()
      if (exists(bk, envir = bdic, inherits = FALSE)) next  # already known

      line <- link_lines[b]
      assign(bk, length(bdic) + 1, envir = bdic)
      nd <- length(bdic)
      if (nd <= maxBooks) books <- append(books, bk)
      cat(nd, ' "', bk, '"\n', sep = '', file = report)

      # The text of the alt attribute is printed as the book's label.
      alt_pos <- regexpr('alt="', line)[1]
      if (alt_pos > 0) {
        rest <- substr(line, alt_pos + 5, nchar(line))
        label <- substr(rest, 1, regexpr('"', rest)[1] - 1)
      } else {
        label <- ""
      }
      if (step == 0) cat('new book ', nd, ' - ', bk, '\n ', label, "\n", sep = "")
    }
    flush.console()
  }
  cat("Amazon - END:", date(), "\n")
}

# A larger seed set; it is replaced by the single seed assigned right after.
seeds <- c("0521387078", "0312569378", "1451648537", "1452101248", "0385343841",
  "1451627289", "140123206X", "0307408841", "0425243214", "0553801473")
seeds <- c('0521840856')
amazonDat(seeds,'Books.csv',10,0)
> seeds <- c('0521840856') > amazonDat(seeds,'newBooks.csv',1000,10) AmazonDat: Mon Nov 13 09:44:35 2017 10 : Mon Nov 13 09:45:38 2017 20 : Mon Nov 13 09:46:50 2017 30 : Mon Nov 13 09:48:05 2017 40 : Mon Nov 13 09:49:23 2017 50 : Mon Nov 13 09:50:42 2017 60 : Mon Nov 13 09:51:54 2017 70 : Mon Nov 13 09:53:16 2017 80 : Mon Nov 13 09:54:19 2017 90 : Mon Nov 13 09:55:09 2017 100 : Mon Nov 13 09:56:07 2017 110 : Mon Nov 13 09:57:26 2017 120 : Mon Nov 13 09:58:39 2017 ... 890 : Mon Nov 13 12:17:36 2017 900 : Mon Nov 13 12:19:44 2017 910 : Mon Nov 13 12:21:52 2017 920 : Mon Nov 13 12:24:05 2017 930 : Mon Nov 13 12:26:22 2017 940 : Mon Nov 13 12:28:32 2017 950 : Mon Nov 13 12:30:47 2017 960 : Mon Nov 13 12:33:14 2017 Error in if (j > 0) { : missing value where TRUE/FALSE needed
4179 different books were found in the crawl (see file Report.txt); 967 of them were scraped.