Collect data about selected type(s) of articles from Yandex.Market.
For example: https://market.yandex.ru/catalog/54943/list
Кофемолки электрические
Look at the source code of the web page (right-click, View page source). We focus on the description of Кофемолка Lumme LU-2601
and try to guess the general format - the pages are generated by a program, so we can assume that they follow some rules.
... <div class="n-snippet-card2__stickers"></div> <div class="n-snippet-card2__part n-snippet-card2__part_type_left"> <a class="n-snippet-card2__image link" href="/product--kofemolka-lumme-lu-2601/5044064?show-uid=388597511141994346616006&nid=54943&context=search" title="Кофемолка Lumme LU-2601"> <img class="image" src="//avatars.mds.yandex.net/get-mpic/195452/img_id8810795763912299792/6hq" title="Кофемолка Lumme LU-2601" alt="Кофемолка Lumme LU-2601" srcset="//avatars.mds.yandex.net/get-mpic/195452/img_id8810795763912299792/9hq 1.5x"/></a> <div class="n-snippet-card2__colors"> <div class="n-snippet-color-set"> <div class="n-snippet-color-set__items"> <span class="n-snippet-color-set__item" title="белый" style="background-color: #ffffff" data-color="#ffffff"></span> <span class="n-snippet-color-set__item" title="синий" style="background-color: #0000ff" data-color="#0000ff"></span> <span class="n-snippet-color-set__item" title="черный" style="background-color: #000000" data-color="#000000"></span></div></div></div></div> <div class="n-snippet-card2__part n-snippet-card2__part_type_center"> <div class="n-snippet-card2__header n-snippet-card2__header_has_rating"> <div class="n-snippet-card2__title"> <a class="link n-link_theme_blue" href="/product--kofemolka-lumme-lu-2601/5044064?show-uid=388597511141994346616006&nid=54943&context=search" title="Кофемолка Lumme LU-2601"> Кофемолка Lumme LU-2601</a></div> <div class="n-snippet-card2__header-stickers"> <a class="n-snippet-card2__rating link link_theme_gray link_type_reviews" href="/product--kofemolka-lumme-lu-2601/5044064/reviews?show-uid=388597511141994346616006"> <div class="rating hint i-bem rating_outline_yes" date-rate="4" data-bem="{"hint":{"content":"Рейтинг модели 4 из 5","offset":15}}"> <div class="rating__value">4.0</div> <div class="rating__corner"> <div class="rating__triangle"></div></div></div> <span>11 отзывов</span></a></div></div> <div class="n-snippet-card2__content"> <ul class="n-snippet-card2__desc 
n-snippet-card2__desc_type_list"> <li class="n-snippet-card2__desc-item">компактная модель для дома</li> <li class="n-snippet-card2__desc-item">мощность 150 Вт</li> </ul> <div class="n-snippet-card2__reasons-to-buy-item"> <span class="n-badge-bestseller b-zone b-spy-visible i-bem n-reasons-to-buy n-reasons-to-buy_tag_bestseller n-reasons-to-buy_isShown_false " data-bem="{"n-badge-bestseller":"","b-spy-visible":"","b-zone":{"name":"reason-to-buy","data":{"entity":"product","id":5044064,"taggedEntity":"bestseller","isShown":false}}}"></span> <div class="n-reasons-to-buy n-reasons-to-buy_type_interest-simple b-zone b-spy-visible i-bem n-reasons-to-buy_tag_interest" data-bem="{"n-reasons-to-buy":"","b-spy-visible":"","b-zone":{"name":"reason-to-buy","data":{"entity":"product","id":5044064,"taggedEntity":"interest"}}}"> <div class="n-reasons-to-buy__content"> <span class="n-reasons-to-buy__label">107 человек купили этот товар</span> <div class="n-reasons-to-buy__bottom"></div> <div data-bem="{"popup":{"directions":{"to":"bottom","axis":"left","tail":{"axis":"left"}}}}" class="popup i-bem popup_autoclosable_yes popup_adaptive_yes popup_animate_yes popup_theme_info popup_type_reasons-to-buy "> <div class="popup__under "></div> <i class="popup__tail"></i> <div class="popup__content">По данным Яндекс.Маркета за 2 месяца.</div></div></div></div></div></div></div> <div class="n-snippet-card2__part n-snippet-card2__part_type_right"> <div class="n-snippet-card2__top"> <div class="n-snippet-card2__price"> <div class="n-snippet-card2__main-price"> <div class="n-snippet-card2__main-price-wrapper"> <a class="link n-smart-link i-bem" href=" kofemolka-lumme-lu-2601/5044064/offers?track=srchlink&show-uid=388597511141994346616006" data-bem="{"n-smart-link":{"subscriptions":["filters"]}}"> <div class="price">560 ₽</div></a></div></div></div> <div class="n-snippet-card2__more-prices-link"> <a class="link n-link_theme_blue link_type_prices i-bem" 
href="/product--kofemolka-lumme-lu-2601/5044064/offers?track=srchbtn&show-uid=388597511141994346616006&context=search"> 67 предложений</a> от <span class="price">490 ₽</span></div> <div class="n-badge-review b-zone b-spy-visible i-bem n-reasons-to-buy n-reasons-to-buy_tag_review" data-bem="{"n-badge-review":"","b-spy-visible":"","b-zone":{"name":"reason-to-buy","data":{"entity":"product","id":5044064,"taggedEntity":"review","isShown":false}}}"> </div></div><div class="n-snippet-card2__bottom"></div></div></div> <div class="n-snippet-card2 i-bem b-zone b-spy-visible" data-id="model-1940199972" data-bem="{"n-snippet-card2":{"cpc":"/redir/C-RMiAr-MEeCjhvu2MRw9uZFQ1WOGnjNwiLy8M4yuHHvPz6AId1jmp2PXtcFFFrRpPh_Oljic384McE42EwahgmTKbXKkMgjFGEPM95-pUpNkGvj5x6TLRh3qHw1HYnhs60duZtJauK4-RqdmKQH2TEvbxSxj_lsA4cSA5gHirL37wKl0CVwynKrZKqv5OoxpxVKStTgmRAPal9PZfIZ5Ra3c0SuFX6VcD53UztUeRKm-hkur0Z99r9OzKTeIOBRvTpVLM4XRSvUGeKcH-scIuqCpxkeRTvXNXgrbua4ZbdwDEka1oBtzHGclqUVnzcMP1LzlJMcjv2OPkvAYALwph9gH8QYzmmO4ORveXeupfnC6uQbDe1d64YxzRs9MpvQY-DwFlGTxplF1U2mneAwxm58E9yMgDtBKOPIq-U4tQe426rIS2kUvgW9cFXRZ2trGUZplC-YHlIC1CD7q1mMf3lCf9b4KFQY?data=41WTYndNxdlaG-5xTfn6okY8p8Pwg5ELMXbquG1iQx52ZhrBVOCtIdipII7JN8Ld8qZlM0aStPkuhXZ7f1ZI0yfehQtsiFZK8wyFBGuk0cDk2j4lqLvTSfUwfosigDETpCNtMTlGedP9yY_suf9KBd8XmjSTNTsm&b64e=1&sign=b9372f3257a57248d04d5eeeea50d3e8&keyno=1","context":"search","type":"model","clickActionType":"offer-card","modelId":1940199972,"entity":"product"},"b-zone":{"name":"snippet-card2","data":{"isCpa20":false,"id":1940199972}},"b-spy-visible":""}"> <div class="n-snippet-card2__hover"> <div class="n-snippet-card2__toolbar"> <div class="n-product-toolbar i-bem n-product-toolbar_label_no" data-bem="{"n-product-toolbar": {}}"> <a class="n-product-toolbar__item link link_theme_minor wishlist-control wishlist-control_type_toggle pseudo-checkbox hint b-zone b-spy-events i-bem" data-bem="{"wishlist-control":{"item":{"type":"model","itemId":"1940199972","hid":"90598","displayName":"Кофемолка Polaris 
PCG 1620","modelId":"1940199972","addSign":true},"image":{"url":"//avatars.mds.yandex.net/get-mpic/372220/img_id4464756392254855210.jpeg/2hq"}},"hint":{"to":"top","content":"Добавить в отложенные"},"pseudo-checkbox":"","metrika":"","b-zone":{"name":"to-wishlist"}}"> <i class="image image_name_favorite"></i> <i class="image image_name_favorite-activated"></i> <span class="n-product-toolbar__item-label n-product-toolbar__item-label_activated_no">Отложить</span> <span class="n-product-toolbar__item-label n-product-toolbar__item-label_activated_yes">Отложено</span></a> <div class="n-product-toolbar__item link link_theme_minor hint n-user-lists_type_compare i-bem" data-bem="{"n-user-lists_type_compare":{"id":"1940199972","name":"Кофемолка Polaris PCG 1620","hid":"90598","catname":"Кофемолки","image":{"url":"//avatars.mds.yandex.net/get-mpic/372220/img_id4464756392254855210.jpeg/2hq"},"link":"/compare?track=rmmbr","inComparison":false},"hint":{"to":"top"}}"> <i class="image image_name_compare"></i> <i class="image image_name_compare-in-list"></i> <span class="n-product-toolbar__item-label n-product-toolbar__item-label_activated_no">Сравнить</span> <span class="n-product-toolbar__item-label n-product-toolbar__item-label_activated_yes">В сравнении</span> </div></div></div></div> <div class="n-snippet-card2__stickers"></div> <div class="n-snippet-card2__part n-snippet-card2__part_type_left"> <a class="n-snippet-card2__image link link_type_cpc" href="/product--kofemolka-polaris-pcg-1620/1940199972?show-uid=388597511141994346616007&nid=54943&context=search" title="Кофемолка Polaris PCG 1620"><img class="image" src="//avatars.mds.yandex.net/get-mpic/372220/img_id4464756392254855210.jpeg/6hq" title="Кофемолка Polaris PCG 1620" alt="Кофемолка Polaris PCG 1620" srcset="//avatars.mds.yandex.net/get-mpic/372220/img_id4464756392254855210.jpeg/9hq 1.5x"/></a> ...
We identify the 'positions' of interesting items inside the page:
Borders
... <div class="n-snippet-card2__stickers"></div> <div class="n-snippet-card2__part n-snippet-card2__part_type_left"> ... ITEM description <span class="n-product-toolbar__item-label n-product-toolbar__item-label_activated_yes">В сравнении</span> </div></div></div></div> ...
Title and Rating
For Title there are many options
<div class="n-snippet-card2__header n-snippet-card2__header_has_rating"> <div class="n-snippet-card2__title"> <a class="link n-link_theme_blue" href="/product--kofemolka-lumme-lu-2601/5044064?show-uid=388597511141994346616006&nid=54943&context=search" title="Кофемолка Lumme LU-2601"> Кофемолка Lumme LU-2601 </a> </div> <div class="n-snippet-card2__header-stickers"> <a class="n-snippet-card2__rating link link_theme_gray link_type_reviews" href="/product--kofemolka-lumme-lu-2601/5044064/reviews?show-uid=388597511141994346616006"> <div class="rating hint i-bem rating_outline_yes" date-rate="4" data-bem="{"hint":{"content":"Рейтинг модели 4 из 5","offset":15}}"> <div class="rating__value">4.0</div> <div class="rating__corner"><div class="rating__triangle"></div></div> </div> <span>11 отзывов</span> </a> </div> </div>
Colors
<div class="n-snippet-color-set__items"> <span class="n-snippet-color-set__item" title="белый" style="background-color: #ffffff" data-color="#ffffff"></span> <span class="n-snippet-color-set__item" title="синий" style="background-color: #0000ff" data-color="#0000ff"></span> <span class="n-snippet-color-set__item" title="черный" style="background-color: #000000" data-color="#000000"></span> </div>
Snippets
<div class="n-snippet-card2__content"> <ul class="n-snippet-card2__desc n-snippet-card2__desc_type_list"> <li class="n-snippet-card2__desc-item">компактная модель для дома</li> <li class="n-snippet-card2__desc-item">мощность 150 Вт</li> </ul>
Buyers
<span class="n-reasons-to-buy__label">107 человек купили этот товар</span>
Price
<div class="price">560 ₽</div>
Additional
<div class="n-snippet-card2__more-prices-link"> <a class="link n-link_theme_blue link_type_prices i-bem" href="/product--kofemolka-lumme-lu-2601/5044064/offers?track=srchbtn&show-uid=388597511141994346616006&context=search"> 67 предложений</a> от <span class="price">490 ₽</span> </div>
Details
<a class="link n-link_theme_blue" href="/product--kofemolka-lumme-lu-2601/5044064?show-uid=388597511141994346616006&nid=54943&context=search" title="Кофемолка Lumme LU-2601">Кофемолка Lumme LU-2601</a>
We will first try to use XML parsing.
> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test/yandex" > setwd(wdir) > library(XML) > page <- "https://market.yandex.ru/catalog/54943/list" > html <- readLines(con<-url(page),encoding='UTF-8'); close(con) > S <- html[nchar(html)>0] > length(S) [1] 47 > Page <- htmlParse(S) > Items <- xpathSApply(doc=Page,path='//div[@class="n-snippet-card2__part n-snippet-card2__part_type_left"]') > length(Items) [1] 48 > A <- xpathSApply(Page,'//a[@class="n-snippet-card2__image link link_type_cpc"]',xmlAttrs) > length(A) [1] 63 > A[,1:3] [,1] class "n-snippet-card2__image link link_type_cpc" href "/product--kofemolka-bosch-mkm-6000-6003/116335?show-uid=388626211772703675316001&nid=54943&context=search" title "ĂšĂ¾Ñ„ĂµĂÂĽĂ¾ûĂºð Bosch MKM 6000/6003" [,2] class "n-snippet-card2__image link link_type_cpc" href "/product--kofemolka-polaris-pcg-1420/1718505703?show-uid=388626211772703675316002&nid=54943&context=search" title "ĂšĂ¾Ñ„ĂµĂÂĽĂ¾ûĂºð Polaris PCG 1420" [,3] class "n-snippet-card2__image link link_type_cpc" href "/product--kofemolka-redmond-rcg-m1606/13939040?show-uid=388626211772703675316003&nid=54943&context=search" title "ĂšĂ¾Ñ„ĂµĂÂĽĂ¾ûĂºð REDMOND RCG-M1606" > T <- U <- id <- vector("character",ncol(A)) > for(k in 1:ncol(A)){ + tit <- A["title",k]; i <- regexpr(' ',tit)+1; j <- nchar(tit) + T[k] <- as.vector(substr(tit,i,j)) + h <- A["href",k]; j <- unlist(gregexpr('\\?',h))[1]-1 + url <- as.vector(substr(h,1,j)); U[k] <- url + i <- unlist(gregexpr('/',url))[2]+1 + id[k] <- as.vector(substr(url,i,j)) + } > Y <- data.frame(row.names=id,tit=T,url=U) > Y tit url 116335 Bosch MKM 6000/6003 /product--kofemolka-bosch-mkm-6000-6003/116335 1718505703 Polaris PCG 1420 /product--kofemolka-polaris-pcg-1420/1718505703 13939040 REDMOND RCG-M1606 /product--kofemolka-redmond-rcg-m1606/13939040 1940199972 Polaris PCG 1620 /product--kofemolka-polaris-pcg-1620/1940199972 103390395 Bosch TSM6A01 /product--kofemolka-bosch-tsm6a01/103390395 1712957216 VITEK VT-7123 ST 
/product--kofemolka-vitek-vt-7123-st/1712957216 10955013 Moulinex AR 1108 /product--kofemolka-moulinex-ar-1108/10955013 6120309 De'Longhi KG 79 /product--kofemolka-de-longhi-kg-79/6120309 6120310 De'Longhi KG 89 /product--kofemolka-de-longhi-kg-89/6120310 7345095 Scarlett SC-4245 /product--kofemolka-scarlett-sc-4245/7345095 13214665 REDMOND RCG-1604 /product--kofemolka-redmond-rcg-1604/13214665 13487082 Scarlett SC-CG44501 /product--kofemolka-scarlett-sc-cg44501/13487082 6046078 De'Longhi KG 49 /product--kofemolka-de-longhi-kg-49/6046078 13939059 Polaris PCG 1120 /product--kofemolka-polaris-pcg-1120/13939059 6076508 UNIT UCG-112 /product--kofemolka-unit-ucg-112/6076508 8492231 FIRST AUSTRIA 5486 /product--kofemolka-first-austria-5486/8492231 14126332 CENTEK CT-1357 /product--kofemolka-centek-ct-1357/14126332 6046079 De'Longhi KG 40 /product--kofemolka-de-longhi-kg-40/6046079 1964619536 SUPRA CGS-311 /product--kofemolka-supra-cgs-311/1964619536 1728566477 Kitfort ĂšĂ¢-1315 /product--kofemolka-kitfort-kt-1315/1728566477 >
It seems that the structure of Yandex pages is not as regular as expected. I decided to turn to the brute-force approach - regular expressions.
> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test/yandex" > setwd(wdir) > page <- "https://market.yandex.ru/catalog/54943/list" > html <- readLines(con<-url(page),encoding='UTF-8'); close(con) > nchar(html) > S <- paste(html,collapse=" ") > nchar(S) [1] 471629 > begItem <- '<div class="n-snippet-card2__stickers">' > Ib <- as.vector(gregexpr(begItem,S)[[1]]) > K <- length(Ib) > Ib <- c(Ib,Ib[K]+10000) > sp <- rawToChar(as.raw(160)) > titles <- rates <- votes <- colors <- ids <- prods <- prices <- units <- buyers <- c() > for(k in 1:K){ + ls <- Ib[k]; rs <- Ib[k+1]-1 + seg <- substr(S,ls,rs) + # Id and Product URL + ix <- as.vector(gregexpr('<div class="n-snippet-card2__title">',seg)[[1]]) + ls <- ix[1] + 36; pro <- substr(seg,ls,ls+150) + i <- as.vector(regexpr('href="/',pro))+6; j <- as.vector(regexpr('\\?',pro)) + prod <- substr(pro,i,j-1); prods <- append(prods,prod) + id <- strsplit(prod,'/')[[1]][3]; ids <- append(ids,id) + # Title + ix <- gregexpr('search" title=',seg) + it <- as.vector(ix[[1]])+15; Tit <- substr(seg,it,it+100) + jt <- as.vector(regexpr('"',Tit)) + tit <- substr(Tit,1,jt-1); titles <- append(titles,tit) + # Rating + ir <- as.vector(gregexpr('<div class="rating__value">',seg)[[1]]) + rate <- NA; vote <- NA + if(ir>0){ + rat <- substr(seg,ir+27,ir+130) + j <- as.vector(regexpr('<',rat)) + if(length(j)>0) { + rate <- as.numeric(substr(rat,1,j-1)) + # Number of voters + i <- as.vector(regexpr('<span>',rat)) + vot <- substr(rat,i+6,i+15) + j <- as.vector(regexpr(' отзыв',vot)) + if(length(j)>0) vote <- as.numeric(substr(vot,1,j-1)) + } } + rates <- append(rates,rate); votes <- append(votes,vote) + # Colors + ic <- as.vector(gregexpr('<span class="n-snippet-color-set__item" title="',seg)[[1]]) + colrs <- NA + if(ic[1]>0){ cols <- c(); ic <- ic+47 + for(c in ic){ + col <- substr(seg,c,c+15); j <- as.vector(regexpr('"',col)) + color <- substr(col,1,j-1); cols <- c(cols,color) + } + colrs <- paste(cols,collapse=";") + } + colors <- 
append(colors,colrs) + # Price + i <- as.vector(gregexpr('<div class="price">',seg)[[1]])+19 + pr <- substr(seg,i,i+15); j <- as.vector(regexpr('</div>',pr)) + pri <- substr(pr,1,j-1) + pric <- gsub(" ","",gsub(sp,"",gsub("от","",pri))) + j <- nchar(pric); price <- as.numeric(substr(pric,1,j-1)) + unit <- substr(pric,j,j) + prices <- c(prices,price); units <- c(units,unit) + # Buyers + ix <- as.vector(gregexpr('<span class="n-reasons-to-buy__label">',seg)[[1]]) + i <- ix[1] + 38; buy <- substr(seg,i,i+50) + j <- as.vector(regexpr("челов",buy))[1] + if((j<0)&&(length(ix)>1)){i <- ix[2] + 38; buy <- substr(seg,i,i+50)} + j <- as.vector(regexpr("челов",buy))[1]; + if((j<0)&&(length(ix)>2)){i <- ix[3] + 38; buy <- substr(seg,i,i+50)} + j <- as.vector(regexpr("челов",buy))[1]; buyer <- NA + if(j>0){ buys <- substr(buy,1,j-1); + buyer <- as.numeric(gsub(" ","",gsub(sp,"",buys))) } + buyers <- c(buyers,buyer) + } > Y <- data.frame(id=ids,title=titles,rate=rates,vote=votes,price=prices, + unit=units,buyer=buyers,color=colors,prod=prods) > data.frame(Y$id,Y$rate,Y$vote,Y$price,Y$buyer) Y.id Y.rate Y.vote Y.price Y.buyer 1 116335 4.5 223 1090 1387 2 1718505703 5.0 5 990 26 3 13939040 5.0 NA 1320 56 4 1717575774 5.0 2 565 89 5 5044064 4.0 11 560 110 6 1940199972 4.5 NA 3990 30 7 12358796 4.5 3 422 35 8 13487123 4.5 6 808 NA 9 103390395 NA NA 1180 33 10 1712957216 5.0 2 1310 NA ...
In some prices the “Russian” space - the non-breaking space, character code 160 - is used instead of the ordinary space (code 32)
> pri <- "990 \u20bd" > utf8ToInt(pri) [1] 57 57 48 160 8381
In prices over 999 an additional ordinary space is inserted as a thousands separator
> pri <- "2 070 ₽" > utf8ToInt(pri) [1] 50 32 48 55 48 32 8381
On the second page a new form of price appears: от 20 468 ₽ (i.e. "from 20 468 ₽").
On the pages for computers our model of prices doesn't work.
The page for shoes has a different structure.