Code from Data 1

Cyrillic / Unicode in R

> a <- "TOMATO";  b <- "TOMATO"
> a == b
[1] FALSE
> utf8ToInt(a)
[1] 84 79 77 65 84 79
> utf8ToInt(b)
[1] 1058 1054 1052 1040 1058 1054

Reading fixed width data

> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
> setwd(wdir)
# Field widths (in characters) and the column names for the fixed-width file.
> W <- c(4,3,5,8,4,9,8,4,9)
> H <- c("D","H","V","CD","CH","CDCH","CD2","CH2","X")
# skip=1 ignores the first line of trees.dat; the names are then set from H.
# The data frame is called `trees` rather than `T`, because assigning to `T`
# masks R's built-in shorthand for TRUE.
> trees <- read.fwf("trees.dat",widths=W,skip=1,sep="")
> names(trees) <- H
> head(trees)
     D  H    V      CD  CH     CDCH     CD2 CH2       X
1  8.3 70 10.3 -4.9484  -6  29.6903 24.4865  36 4822.30
2  8.6 65 10.3 -4.6484 -11  51.1323 21.6075 121 4807.40
3  8.8 63 10.2 -4.4484 -13  57.8290 19.7881 169 4878.72
4 10.5 72 16.4 -2.7484  -4  10.9935  7.5536  16 7938.00
5 10.7 81 18.8 -2.5484   5 -12.7419  6.4943  25 9273.69
6 10.8 83 19.7 -2.4484   7 -17.1387  5.9946  49 9681.12
>

CSV - wines

> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
> setwd(wdir)
# header=FALSE: the first line of wine.data is read as data, so the column
# names are supplied by hand in fn and attached with names().
> wines <- read.csv("wine.data",header=FALSE)
> fn <- c("Class","Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium",
+    "Total phenols","Flavanoids","Nonflavanoid phenols","Proanthocyanins",
+    "Color intensity","Hue","OD280/OD315 of diluted wines","Proline")
> names(wines) <- fn
> dim(wines)
[1] 178  14
> wines[1:3,]
  Class Alcohol Malic acid  Ash Alcalinity of ash Magnesium Total phenols
1     1   14.23       1.71 2.43              15.6       127          2.80
2     1   13.20       1.78 2.14              11.2       100          2.65
3     1   13.16       2.36 2.67              18.6       101          2.80
  Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity  Hue
1       3.06                 0.28            2.29            5.64 1.04
2       2.76                 0.26            1.28            4.38 1.05
3       3.24                 0.30            2.81            5.68 1.03
  OD280/OD315 of diluted wines Proline
1                         3.92    1065
2                         3.40    1050
3                         3.17    1185
>

Reading Excel files

> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
> setwd(wdir)
> library(readxl)
Warning message:
package ‘readxl’ was built under R version 3.4.2 
> M <- read_excel("./data-7523-2017-10-13.xlsx")
> str(M)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':       93 obs. of  5 variables:
 $ <U+041A><U+043E><U+0434>
: chr  "1" "2" "3" "4" ...
 $ <U+0413><U+043E><U+0434> 
: chr  "2010" "2010" "2010" "2010" ...
 $ global_id
: chr  "37591658" "37591659" "37591660" "37591661" ...
 $ <U+041C><U+0435><U+0441><U+044F><U+0446> 
: chr  "<U+044F><U+043D><U+0432><U+0430><U+0440><U+044C>" "<U+0444><U+0435><U+0432><U+0440><U+0430><U+043B><U+044C>" ...
 $ <U+0413><U+043E><U+0441><U+0443><U+0434><U+0430><U+0440><U+0441><U+0442><U+0432><U+0435><U+043D><U+043D><U+0430><U+044F> ...
# Replace the Cyrillic column names shown by str(M) with short ASCII ones.
> names(M) <- c("code","year","ID","month","marriage")
> mm <- M$month
# The first 12 month values are used as the factor levels, in that order, so
# as.numeric(month) yields the month number 1..12.
> mN <- mm[1:12]
> month <- factor(mm,levels=mN)
> as.numeric(month)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1
[26]  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2
[51]  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3
[76]  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9
> D <- read_excel("./data-7522-2017-10-13.xlsx")
> names(D) <- c("code","year","ID","month","divorce")
> str(D)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':       93 obs. of  5 variables:
 $ code   : chr  "1" "2" "3" "4" ...
 $ year   : chr  "2010" "2010" "2010" "2010" ...
 $ ID     : chr  "37591658" "37591659" "37591660" "37591661" ...
 $ month  : chr  "<U+044F><U+043D><U+0432><U+0430><U+0440><U+044C>" "<U+0444><U+0435><U+0432><U+0440><U+0430><U+043B><U+044C>" ...
 $ divorce: chr  "3302" "2937" "4361" "3943" ...
# One row per month: a "year-monthnumber" label plus the marriage and divorce
# counts converted from character to integer.
# NOTE(review): M and D are combined by row position; assumes both sheets list
# the months in the same order - confirm.
> MD <- data.frame(month=paste0(M$year,"-",as.numeric(month)),
+   mar=as.integer(M$marriage),
+   div=as.integer(D$divorce))
> L <- nrow(MD)
> summary(MD$mar)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   3642    5207    7260    7908   10882   13915 
> summary(MD$div)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2596    3459    3720    3677    3918    4474 
> me <- as.Date(paste(MD$month,'-01',sep=''),"%Y-%m-%d")
> plot(me,MD$mar,ylim=c(0,14000),xlab="months",ylab="freq",
+ main="Marriage and divorce in Moscow",type="n")
> points(me,MD$mar,col="red",pch=20,cex=1.5)
> points(me,MD$div,col="blue",pch=20,cex=1.5)
> plot(me,MD$mar,col="red",pch=20,cex=1.5,ylim=c(0,14000),xlab="months",
+ ylab="freq",main="Marriage and divorce in Moscow",type='l')
> plot(1:L,1:L,ylim=c(0,14000),main="Marriage and divorce in Moscow",
+ xlab="",ylab="",type="n")
> points(1:L,sort(MD$mar),col="red",pch=20,cex=1.5)
> points(1:L,sort(MD$div),col="blue",pch=20,cex=1.5)
> (q <- (m <- sum(MD$mar))/(d <- sum(MD$div)))
[1] 2.150769
> c(m,d)
[1] 735477 341960
> plot(MD$mar,MD$div,xlab="marriages",ylab="divorces",pch=16,main="Marriage and divorce in Moscow")
> rp <- lm(MD$div ~ MD$mar)
> abline(rp,col="red",lwd=2)
> plot(month,MD$mar,col="red",pch=20,cex=1.5,ylim=c(0,14000),xlab="months",
+ ylab="freq",main="Marriage in Moscow")
# Year index 1, 2, ... (2010 -> 1) and the number of years present in the data.
> y <- as.integer(M$year)-2009
> ny <- max(y)
# One colour per year: take the first ny of ny+2 heat colours (dropping the two
# palest) and reverse them.
> b <- rev(heat.colors(n=ny+2)[1:ny])
> plot(1:12,1:12,ylim=c(0,14000),xlab="months",ylab="freq",main="Marriage in Moscow",type="n")
# Draw one line per year; seq_along() keeps X and Y the same length even for a
# year with no observations, where 1:length(Y) would give c(1,0).
> for(k in seq_len(ny)){ Y <- MD$mar[y==k]; X <- seq_along(Y); points(X,Y,col=b[k],pch=20,cex=1.5,type="b") }

The Book of the Thousand Nights and a Night, Vol 1

> page <- "http://www.gutenberg.org/cache/epub/3435/pg3435.txt"
> text <- readLines(con<-url(page)); close(con)
> length(text)
[1] 17581
> i <- grep("\\*\\*\\* START OF THIS PROJECT GUTENBERG EBOOK",text,ignore.case=TRUE)
> i
[1] 21
> j <- grep("End of the Project Gutenberg EBook",text,ignore.case=TRUE)[1]
> j
[1] 17220
# Keep only the lines strictly between the Gutenberg start and end markers.
> book <- text[(i+1):(j-1)]
# Split on runs of punctuation or whitespace; apostrophes count as punctuation,
# which is presumably why a bare "s" shows up among the frequent words below.
> separator <- "[[:punct:]]+|[[:space:]]+"
> items <- unlist(strsplit(book,separator))
# Drop the empty strings produced by the split and fold everything to lower case.
> words <- tolower(items[nchar(items)>0])
> length(words)
[1] 178740
> t <- table(words)
> z <- rev(sort(t))
> z[1:10]
words
  the   and    of    to     a     i    in    he    my   his 
10596 10259  5015  4111  3176  3169  2457  2331  2069  1867 
> length(z)
[1] 14546
> plot(1:length(z),z,log="xy",pch=16,cex=0.7,xlab="",ylab="freq",
+ main="The Book of the Thousand Nights and a Night, Vol 1")
> N <- names(z)  
> sw <- read.table("stopwords.dat",header=FALSE,stringsAsFactors=FALSE)$V1
> head(sw)
[1] "a"          "about"      "above"      "across"     "after"     
[6] "afterwards"
> spec <- z[!(N %in% sw)]
> spec[1:30]
words
      fn     thou     thee     said        o      thy        s    allah 
    1398      949      919      852      777      723      704      527 
    king     till      man       al     came      day    night     went 
     409      397      382      369      352      332      323      303 
    ...
> sw <- c(sw,"fn","thou","thee","o","s","thy")
> spec <- z[!(N %in% sw)]
> library(wordcloud)
> wordcloud(names(spec)[1:100],spec[1:100],scale=c(5,.5))

Downloading files from a directory

Petruccelli data

files.dir

alum.dat
anneal.dat
anscomb.xls
anscombe.dat
batch.dat
beam.dat
bearings.dat
boxes.dat
boxmix.dat
bread.dat
> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test"
> setwd(wdir)
> dir.create("./DL")
> pa <- "https://msu.edu/course//stt/351/snapshot.afs/Petruccelli_dat/"
> L <- as.vector(read.csv("files.dir",header=FALSE)$V1)
> length(L)
[1] 10
# Download every listed file into the ./DL directory created above.
# mode="wb" keeps binary files (e.g. anscomb.xls) intact on Windows, where the
# default text mode would translate line endings; a failed download is reported
# instead of being silently discarded.
> for(fn in L){
+   fname <- paste(pa,fn,sep=""); cat("---",fn,date(),"\n")
+   test <- tryCatch(download.file(fname,file.path("DL",fn),method="auto",mode="wb"),
+     error=function(e) e)
+   if(inherits(test,"error")) cat("*** failed:",fn,conditionMessage(test),"\n")
+ }
--- alum.dat Fri Nov 10 14:58:50 2017 
trying URL 'https://msu.edu/course//stt/351/snapshot.afs/Petruccelli_dat/alum.dat'
Content type 'text/plain' length 1810 bytes
downloaded 1810 bytes
 
...
 
--- bread.dat Fri Nov 10 14:59:00 2017 
trying URL 'https://msu.edu/course//stt/351/snapshot.afs/Petruccelli_dat/bread.dat'
Content type 'text/plain' length 973 bytes
downloaded 973 bytes
 
> date()
[1] "Fri Nov 10 14:59:00 2017"

XML

> library(XML)
# Parse TrumpPutin.xml into an XML document and take its root node.
> pnews <- xmlParse(file="TrumpPutin.xml")
> root <- xmlRoot(pnews)
# First child of the root's first child; its 4th child is the <date> element
# printed below.
> meta <- root[[1]][[1]]
> meta[[4]]
<date key="9-11-2017">9-11-2017</date> 
> xmlAttrs(meta[[4]])
        key 
"9-11-2017" 
> xmlValue(meta[[4]])
[1] "9-11-2017"
> cont <- root[[1]][[2]]
> xmlName(cont)
[1] "content"
> xmlSize(cont)
[1] 4
> xpathSApply(doc = pnews, path = "//person")
[[1]]
<person key="Donald Trump">Трампе</person> 
 
[[2]]
<person key="Vladimir Putin">Владимир Путин</person> 
 
[[3]]
<person key="Donald Trump">Дональд Трамп</person> 
 
[[4]]
<person key="Donald Trump">Американский лидер</person> 
 
[[5]]
<person key="Vladimir Putin">Путиным</person> 
 
[[6]]
<person key="Donald Trump">главы Белого дома</person> 
 
[[7]]
<person key="Vladimir Putin">российский президент</person> 
 
[[8]]
<person key="Vladimir Putin">Владимир Путин</person> 
 
[[9]]
<person key="Sergey Lavrov">Сергей Лавров</person> 
 
[[10]]
<person key="Sergey Lavrov">Лавров</person> 
 
[[11]]
<person key="Barack Obama">Барака Обамы</person> 
 
[[12]]
<person key="Sergey Lavrov">Лаврова</person> 
 
[[13]]
<person key="Sergey Lavrov">Лавров</person> 
 
> xpathSApply(doc = pnews, path = "//country")
[[1]]
<country key="Russia">России</country> 
 
[[2]]
<country key="USA">США</country> 
 
[[3]]
<country key="Vietnam">вьетнамском <place key="">Дананге</place></country> 
 
[[4]]
<country key="Russia">России</country> 
 
[[5]]
<country key="USA">США</country> 
 
[[6]]
<country key="Ukraine">Украине</country> 
 
[[7]]
<country key="Siria">сирийского конфликта</country> 
 
[[8]]
<country key="North Korea; South Korea">
  <topic key="Korean peninsula">ситуацию на Корейском полуострове</topic>
</country> 
 
[[9]]
<country key="Siria">
  <topic key="conflict Siria">сирийское урегулирование</topic>
</country> 
 
[[10]]
<country key="Iraq">
  <topic key="Iraq">Ирак</topic>
</country> 
 
[[11]]
<country key="Israel; Palestine">
  <topic key="confict Israel-Palestine"> палестино-израильский конфликт</topic>
</country> 
 
[[12]]
<country key="Ukraine">
  <topic key="Ukraine">Украину</topic>
</country> 
 
> (P <- xpathSApply(pnews,"//person",xmlAttrs))
             key              key              key              key 
  "Donald Trump" "Vladimir Putin"   "Donald Trump"   "Donald Trump" 
             key              key              key              key 
"Vladimir Putin"   "Donald Trump" "Vladimir Putin" "Vladimir Putin" 
             key              key              key              key 
 "Sergey Lavrov"  "Sergey Lavrov"   "Barack Obama"  "Sergey Lavrov" 
             key 
 "Sergey Lavrov" 
> xpathSApply(pnews,"//person",xmlValue)
 [1] "Трампе"               "Владимир Путин"       "Дональд Трамп"       
 [4] "Американский лидер"   "Путиным"              "главы Белого дома"   
 [7] "российский президент" "Владимир Путин"       "Сергей Лавров"       
[10] "Лавров"               "Барака Обамы"         "Лаврова"             
[13] "Лавров"              
> (C <- xpathSApply(pnews,"//country",xmlAttrs))
                       key                        key 
                  "Russia"                      "USA" 
                       key                        key 
                 "Vietnam"                   "Russia" 
                       key                        key 
                     "USA"                  "Ukraine" 
                       key                        key 
                   "Siria" "North Korea; South Korea" 
                       key                        key 
                   "Siria"                     "Iraq" 
                       key                        key 
       "Israel; Palestine"                  "Ukraine" 
> (Pn <- table(P))
P
  Barack Obama   Donald Trump  Sergey Lavrov Vladimir Putin 
             1              4              4              4 
# Split multi-country keys on ";" together with any following whitespace, so
# that "North Korea; South Korea" yields "South Korea" rather than
# " South Korea" (the leading space made such names sort first as separate,
# oddly named entries).
> (Cn <- table(unlist(strsplit(C,";[[:space:]]*"))))
 
       Iraq      Israel North Korea   Palestine      Russia       Siria 
          1           1           1           1           2           2 
South Korea     Ukraine         USA     Vietnam 
          1           2           2           1 
> 

eval

> s <- "3+4*5"
> s
[1] "3+4*5"
> cat(s,"=",eval(parse(text=s)),"\n")
3+4*5 = 23 

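Attention, danger: eval(parse(text=s)) runs whatever R code the string contains, so never use it on text from an untrusted source.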

JSON - jsonlite

> library(jsonlite)
# Read john.json as text lines and parse them into a nested R list.
> J <- fromJSON(readLines("john.json"))
> J
$firstName
[1] "John"
 
$lastName
[1] "Smith"
 
$isAlive
[1] TRUE
 
$age
[1] 25
 
$address
$address$streetAddress
[1] "21 2nd Street"
 
$address$city
[1] "New York"
 
$address$state
[1] "NY"
 
$address$postalCode
[1] "10021-3100"
 
 
$phoneNumbers
    type       number
1   home 212 555-1234
2 office 646 555-4567
 
$children
list()
 
$spouse
NULL
 
> john <- toJSON(J)
> john
{"firstName":["John"],"lastName":["Smith"],"isAlive":[true],"age":[25],"address":{"streetAddress":["21 2nd Street"],
"city":["New York"],"state":["NY"],"postalCode":["10021-3100"]},"phoneNumbers":[{"type":"home","number":"212 555-1234"},
{"type":"office","number":"646 555-4567"}],"children":[],"spouse":{}} 
# Write the JSON text to john2.json through a UTF-8 connection, then close the
# connection so it is not left registered in the session.
> js <- file("john2.json",encoding="UTF-8")
> write(john,file=js)
> close(js)

Environments as dictionaries

# An environment with an empty parent used as a dictionary: assign() stores a
# value under a key, exists() tests for a key and get() retrieves the value.
# The argument is spelled out as `envir=` instead of relying on partial
# matching of `env=`; inherits=FALSE restricts the lookup to ime itself.
> ime <- new.env(hash=TRUE,parent=emptyenv())
> assign("Liska",list(i=1,n=1),envir=ime)
> length(ime)
[1] 1
> exists("Liska",envir=ime,inherits=FALSE)
[1] TRUE
> exists("Šeka",envir=ime,inherits=FALSE)
[1] FALSE
> assign("Šeka",list(i=2,n=1),envir=ime)
> exists("Šeka",envir=ime,inherits=FALSE)
[1] TRUE
> get("Šeka",envir=ime,inherits=FALSE)
$i
[1] 2
 
$n
[1] 1

Environments can also be used to keep selected variables alive after a function call has returned.

> raj <- new.env()
# Three ways of assigning inside a function:
#   u <- a       local variable, gone once exA returns;
#   v <<- b      no v exists in an enclosing scope, so v is created in the
#                global environment and is visible afterwards;
#   assign(...)  stores z in the environment raj only (envir= spelled out
#                instead of relying on partial matching of env=).
> exA <- function(a,b,c){u <- a; v <<- b; assign("z",c,envir=raj)}
> u
Error: object 'u' not found
> v
Error: object 'v' not found
> z
Error: object 'z' not found
> exA(3,4,5)
> u
Error: object 'u' not found
> v
[1] 4
> z
Error: object 'z' not found
> get("z",envir=raj)
[1] 5

EDA

ru/hse/eda/cdata.txt · Last modified: 2017/11/13 11:20 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki