Mineral waters

Mineral Waters on the World

October 27, 2013

# Mineral Waters
# Vladimir Batagelj  October 27, 2013 / October 9, 2009
# ------------------------------------------------------------------------------
# setwd("C:/Users/batagelj/test/R/mineral")
# source("C:\\Users\\batagelj\\test\\R\\mineral\\mineralWaters.R")
# mineral()

initial <- function(s,term) substr(s,1,regexpr(term,s[1])[1]-1)

# 	

mineral <- function(){
  library(XML)
  dat <- file("waters.dat","w")
  cat('*** Mineral waters:',date(),'\n*** from http://www.mineralwaters.org/\n')
  cat('*** Mineral waters:',date(),'\n*** from http://www.mineralwaters.org/\n',
    file=dat)
  url <- 'http://www.mineralwaters.org/index.php?func=disp&parval='
  ind <- 1:4138
#  ind <- c(173,306,956,1255,1347,2044,2079,2660,3659) # strange chars
  for(p in ind){
    pa <- paste(url,p,sep='')
    st <- readLines(con<-url(pa),warn=FALSE); close(con)
    dw <- htmlParse(st)
    ti <- dw['//h2']
    if(length(ti)==0){
      cat('***',p,'Missing\n'); flush.console()
      next
    }
    tit <- xmlValue(ti[[1]]); Encoding(tit) <- "UTF-8"
    coun <- xmlValue(dw['//p/b'][[5]])
    pl <- dw['//table/tr/td/table/tr/td/p[i]']
    plac <- NA
    if(length(pl)>0){
      pla <- xmlValue(pl[[1]]); Encoding(pla) <- "UTF-8" 
      if(substr(pla,1,4)=="Name") {
         plac <- substr(pla,26,nchar(pla))
      } else if(length(pl)>1) { 
         pla <- xmlValue(pl[[2]]); Encoding(pla) <- "UTF-8" 
         if(substr(pla,1,4)=="Name") plac <- substr(pla,26,nchar(pla))
      }     
    } 
    cat('***',p,tit,'\n'); flush.console()
    cat(p,tit,coun,plac,"",sep='|',file=dat)
    j <- grep('No data from analysis available',st,ignore.case=TRUE)
    if(length(j)!=0) {cat('\n',file=dat); flush(dat); next}
    i <- grep('Country: ',st,ignore.case=TRUE)
    v <- st[i]; i <- regexpr('<table',v)
    v <- substr(v,i+6,nchar(v))
    i <- regexpr('<table',v); j <- regexpr('</table',v)
    v <- substr(v,i+6,j-1)
    if(nchar(v)<15) {cat('\n',file=dat); flush(dat); next}
    t <- unlist(strsplit(v,'<tr><td>'))
    for(k in 2:length(t)){
      u <- unlist(strsplit(t[k],'</td><td></td><td>'))
      q <- unlist(strsplit(initial(u[2],'</td>'),'  '))
      e <- u[1]; i <- regexpr('\">',e)
      if(i>0) e <- initial(substr(e,i+2,nchar(e)),'</a>')
      cat(e,q[1],q[2],"",sep='|',file=dat)
    }
    cat('\n',file=dat); flush(dat)
  }
  cat('*** Finished:',date(),'\n',file=dat)
  close(dat)
  cat('*** Finished:',date(),'\n\n')
}

October 18, 2009

initial <- function(s,term) substr(s,1,regexpr(term,s[1])[1]-1)

mineral <- function(){
  library(XML)
  dat <- file("waters.dat","w")
  url <- 'http://www.pmgeiser.ch'
  page <- 'http://www.pmgeiser.ch/mineral/index.php?func=alpha&parval=0'
  stran <- readLines(con<-url(page)); close(con)
  doc <- htmlParse(stran)
  ind <- unlist(doc['/html/body/table/tr/td/p/a/@href'],use.names=FALSE)
  ind <- ind[regexpr('/mineral',ind)==1]
  cat('*** Mineral waters:',date(),'\n',file=dat)
  cat('*** from http://www.pmgeiser.ch/\n',file=dat)
#  ind <- c('/mineral/index.php?func=alpha&parval=0')
  for(p in ind){
    pag <- paste(url,p,sep='')
    str <- readLines(con<-url(pag),warn=FALSE); close(con)
    dow <- htmlParse(str)
    wat <- unlist(dow['//li/a/@href'],use.names=FALSE)
    for(w in wat){
      pa <- paste(url,w,sep='')
      st <- readLines(con<-url(pa),warn=FALSE); close(con)
      dw <- htmlParse(st)
      tit <- xmlValue(dw['//h2'][[1]])
      coun <- xmlValue(dw['//p/b'][[6]])
      i <- grep('Country: ',st,ignore.case=TRUE)
      s <- unlist(strsplit(st[i],'Place of Source</i>: '))
      plac <- initial(s[2],'</p>')
      cat('***',tit,'\n'); flush.console()
      cat('W',tit,'\n',sep='|',file=dat)
      cat('C',coun,'\n',sep='|',file=dat)
      cat('P',plac,'\n',sep='|',file=dat)
      flush(dat)
      j <- grep('No data from analysis available',st,ignore.case=TRUE)
      if(length(j)!=0) next
      v <- st[i]; i <- regexpr('<table',v)
      v <- substr(v,i+6,nchar(v))
      i <- regexpr('<table',v); j <- regexpr('</table',v)
      v <- substr(v,i+6,j-1)
      if(nchar(v)<15) next
      t <- unlist(strsplit(v,'<tr><td>'))
      for(k in 2:length(t)){
        u <- unlist(strsplit(t[k],'</td><td></td><td>'))
        q <- unlist(strsplit(initial(u[2],'</td>'),'  '))
        e <- u[1]; i <- regexpr('\">',e)
        if(i>0) e <- initial(substr(e,i+2,nchar(e)),'</a>')
        cat('E',e,q[1],q[2],'\n',sep='|',file=dat)
        flush(dat)
      }
    }
  }
  close(dat)
}

Fondazione AMGA - acque in bottiglia

World

# http://www.acqueinbottiglia.fondazioneamga.org/world_search_det.asp?ID=1735

setwd("C:/Users/batagelj/work/R/mineral")

trim <- function (x) gsub("^\\s+|\\s+$", "", x)

mineralA <- function(from,to){
  library(XML)
  dat <- file("watersMGAw.dat","w")
  cat('*** Mineral waters - MGA world:',date(),
    '\n*** from http://www.acqueinbottiglia.fondazioneamga.org/\n')
  cat('*** Mineral waters - MGA world:',date(),
    '\n*** from http://www.acqueinbottiglia.fondazioneamga.org/\n',file=dat)
  url <- 'http://www.acqueinbottiglia.fondazioneamga.org/world_search_det.asp?ID='
  ind <- from:to
  cat('*** Started:',date(),'\n',from,'>')
  for(p in ind){
    pa <- paste(url,p,sep='')
    st <- tryCatch(suppressWarnings(readLines(con<-url(pa),warn=FALSE)),
      error=function(e) e)
    close(con)
    if(typeof(st)!="character") {cat("\n***",p); flush.console(); next } 
    dw <- htmlParse(st)
    ti <- dw['//div/div/div[@class="bandablutitolo"]']
    tit <- xmlValue(ti[[1]]); Encoding(tit) <- "UTF-8"
    ta <- dw['//table/tr']
    if(length(ta)==0){
      cat('***',p,'Missing\n'); flush.console()
      next
    }
    cat(p,tit,"",sep="|",file=dat)
    for(i in 1:length(ta)) {
      tat <- xmlValue(ta[[i]]); Encoding(tat) <- "UTF-8"
      u <- unlist(strsplit(tat,':',fixed=TRUE))
      if (length(u)>1) cat(paste(trim(u),collapse="|"),"|",sep="",file=dat)
    }
    cat('\n',file=dat); flush(dat); cat('.')
    if(p%%50==0) cat("\n",p+1,">")
    flush.console()
  }
  close(dat)
  cat('\n*** Finished:',date(),'\n\n')
}

On November 29, 2013 there were waters with indices in 480:2488 (some missing).

Italy

# http://www.acqueinbottiglia.fondazioneamga.org/search_det.asp?ID=16

setwd("C:/Users/batagelj/work/R/mineral")

trim <- function (x) gsub("^\\s+|\\s+$", "", x)

mineralA <- function(from,to){
  library(XML)
  dat <- file("watersMGAi.dat","w")
  cat('*** Mineral waters - MGA Italia:',date(),
    '\n*** from http://www.acqueinbottiglia.fondazioneamga.org/\n')
  cat('*** Mineral waters - MGA Italia:',date(),
    '\n*** from http://www.acqueinbottiglia.fondazioneamga.org/\n',file=dat)
  url <- 'http://www.acqueinbottiglia.fondazioneamga.org/search_det.asp?ID='
  ind <- from:to
  cat('*** Started:',date(),'\n',from,'>')
  for(p in ind){
    pa <- paste(url,p,sep='')
    st <- tryCatch(suppressWarnings(readLines(con<-url(pa),warn=FALSE)),
      error=function(e) e)
    close(con)
    if(typeof(st)!="character") {cat("\n***",p); flush.console(); next }
    dw <- htmlParse(st)
    ti <- dw['//div/div/div[@class="bandablutitolo"]']
    tit <- xmlValue(ti[[1]]); Encoding(tit) <- "UTF-8"
    ta <- dw['//table/tr']
    if(length(ta)==0){
      cat('***',p,'Missing\n'); flush.console()
      next
    }
    cat(p,tit,"",sep="|",file=dat)
    for(i in 1:length(ta)) {
      tat <- xmlValue(ta[[i]]); Encoding(tat) <- "UTF-8"
      u <- unlist(strsplit(tat,':',fixed=TRUE))
      if (length(u)>1) cat(paste(trim(u),collapse="|"),"|",sep="",file=dat)
    }
    cat('\n',file=dat); flush(dat); cat('.')
    if(p%%50==0) cat("\n",p+1,">")
    flush.console()
  }
  close(dat)
  cat('\n*** Finished:',date(),'\n\n')
}

On November 29, 2013 there were waters with indices in 1:523 (some missing).

notes/data/mw.txt · Last modified: 2015/07/16 21:16 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki