WA and AI networks

WA and AI networks

May 6-9, 2017

Articles / Works

In the source Excel file articles.xlsx (copy of articles_25_04_2017.xlsx) there were several bad records - incomplete or extending over some lines. I first removed the records with only article's ID and corrected multi-line records.

4870 del, 4729 del, 4728 del, 4544-6 join, 4529 del, …, 2157-8 join, 2017 del, 487 del

Reading Excel xlsx file with package readxl

> setwd("C:/Users/batagelj/Documents/2017/malceva/elib")
> library(readxl)
> Ar <- read_excel("./articles.xlsx",col_types = rep("text",25))
> WID <- Ar[,1]
> Wti <- Ar[,2]
> Encoding(Wti) <- "UTF-8"
> AID <- Ar[,4]
> Afn <- Ar[,5]
> Encoding(Afn) <- "UTF-8"
> head(Afn)
[1] "ЗЫРЯНОВ СЕРГЕЙ ГРИГОРЬЕВИЧ"       "ДАВЫДЕНКО ВЛАДИМИР АЛЕКСАНДРОВИЧ"
[3] "FARMER J."                        NA                                
[5] "PISELLI FORTUNATA"                "KOKSAL YONCA"                    
> Wty <- factor(Ar[,7]); tylev <- levels(Wty)
> tylev
 [1] "0"                                         "автореферат диссертации "                 
 [3] "брошюра "                                  "глава в книге "                           
 [5] "депонированная рукопись "                  "диссертация "                             
 [7] "монография "                               "сборник статей "                          
 [9] "сборник тезисов докладов на конференции "  "сборник трудов конференции "              
[11] "статья в журнале "                         "статья в журнале - аннотация "            
[13] "статья в журнале - краткое сообщение "     "статья в журнале - материалы конференции "
[15] "статья в журнале - научная статья "        "статья в журнале - научный отчет "        
[17] "статья в журнале - обзорная статья "       "статья в журнале - переписка "            
[19] "статья в журнале - разное "                "статья в журнале - редакторская заметка " 
[21] "статья в журнале - рецензия "              "статья в открытом архиве "                
[23] "статья в сборнике статей "                 "статья в сборнике трудов конференции "    
[25] "тезисы доклада на конференции "            "учебное пособие "                         
> dim(Ar)
[1] 5227   25
> Wlang <- factor(Ar[,8]); langlev <- levels(Wlang)
> langlev
[1] "английский "   "не определен " "русский "      "украинский "   "французский " 
> Wjour <- factor(Ar[,10]); jourlev <- levels(Wjour)
> length(jourlev)
[1] 2148
> head(jourlev)
[1] " АГАПОВ ВАЛЕРИЙ СЕРГЕЕВИЧ, СМУЛЬСКИЙ СЕРГЕЙ ВЛАДИМИРОВИЧ"                
[2] " КОЛПИНА Л.В.1"                                                          
[3] "2010 IEEE INTERNATIONAL CONFERENCE ON COMMUNICATIONS WORKSHOPS, ICC 2010"
[4] "2011"                                                                    
[5] "2013"                                                                    
[6] "2014"                                                                    
> Wv <- Ar[,11]
> head(Wv)
[1] "4"  "3"  "1"  "12" "7"  "10"
> year <- as.integer(Ar[,12])
> table(year)
year
1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 
  14   30   41   43   43   41   31   38   39   84   80   92   59   73   78   86   60 
2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 
  41   60   59   72  119  273  328  491  619  913 1175  145 
> bp <- Ar[,13]
> K <- Ar[,19]
> head(K)
[1] "политический пространство; POLITICAL AREA; электоральный кластер; ELECTORAL CLUSTER; социальный сеть; 
    SOCIAL NETWORK; транзитивный политический режим; TRANSITIVE POLITICAL REGIME; 
    отчуждение гражданин от власть; ALIENATION OF CITIZENS FROM THE POWER; партия как политический институт;
    PARTY AS A POLITICAL INSTITUTE"
[2] "социологический исследование; социология труд; социальный сеть; неформальный экономика; 
    хозяйственный деятельность; трудовой отношение; неформальный занятость; неформальный трудоустройство" 
[3] "information technology; internet; information provision; nursing" 
[4] "peer counseling; financial aid; fictive kin; social capital"  
[5] "community; social network; social network analysis; urban neighborhood"
[6] "social network; nationalism; state transformation; ottoman empire"
> Encoding(K) <- "UTF-8"

Short names of works

> S <- strsplit(Afn," ")
> S[[1]]
[1] "ЗЫРЯНОВ"     "СЕРГЕЙ"      "ГРИГОРЬЕВИЧ"
> S[[201]]
[1] "BROWN"  "SHEILA"
> S[[55]]
[1] NA
> S[[281]]
[1] "СКОРНИЧЕНКО" "НАТАЛЬЯ"     "НИКОЛАЕВНА" 
> short <- vector(mode="character",length=n)
> for(i in 1:n) {
+    Last <- substr(S[[i]][1],1,8); Init <- substr(S[[i]][2],1,1)
+    short[i] <- paste(Last,"_",Init,"(",year[i],")",Wv[i],":",bp[i],sep="")
+ }
> head(short)
[1] "ЗЫРЯНОВ_С(2008)4:24"   "ДАВЫДЕНК_В(2007)3:74"  "FARMER_J(1999)1:49"   
[4] "NA_NA(2006)12:1687"    "PISELLI_F(2007)7:867"  "KOKSAL_Y(2008)10:1498"
> short[281]
[1] "СКОРНИЧЕ_Н(2008)4:185"
> A <- data.frame(ID=WID,short=short,title=Wti,autID=AID,autName=Afn,type=Wty,lang=Wlang,journ=Wjour,
+   vol=Wv,year=year,bp=bp,keyw=K)
> save(A,file="articles.Rdata")

WJ

> WIn <- factor(WID); Wlev <- levels(WIn)
> nw <- length(Wlev); nj <- length(jourlev)
> wname <- vector(mode="character",length=nw)
> for(i in 1:n){
+    inw <- as.integer(WIn[i])
+    if(wname[inw]=="") wname[inw] <- short[i] else
+    if(wname[inw]!=short[i]) cat("***",i,inw,wname[inw],short[i],"\n",sep=" ")
+ }
> head(wname)
[1] "ЗЫРЯНОВ_С(2008)4:24"   "ДАВЫДЕНК_В(2007)3:74"  "FARMER_J(1999)1:49"   
[4] "NA_NA(2006)12:1687"    "PISELLI_F(2007)7:867"  "KOKSAL_Y(2008)10:1498"
> 
> Encoding(wname) <- "UTF-8"
> jname <- gsub("\"", "\'", jourlev)
> Encoding(jname) <- "UTF-8"
> net <- file("WJ.net","w")
> writeLines(paste("*vertices ",nw+nj," ",nw,sep=""),net,useBytes=T)
> for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T)
> for(i in 1:nj) writeLines(paste(nw+i,' "',jname[i],'"',sep=""),net,useBytes=T)
> writeLines("*arcs",net,useBytes=T)
> for(i in 1:n) writeLines(paste(as.integer(WIn[i]),nw+as.integer(Wjour[i]),sep=" "),net,useBytes=T)
> close(net)

For some works the corresponding journal is not known. They result in a NA value in the list of arcs. I manually introduced an additional node 7336 Unknown and replaced NAs with 7376.

year

> clu <- file("year.clu","w")
> writeLines(paste("*vertices ",nw,sep=""),clu)
> for(i in 1:n) writeLines(paste(year[WIn[i]]),clu)
> close(clu)

language

> clu <- file("lang.clu","w"); nl <- length(langlev)
> writeLines(paste(rep("% ",nl),1:nl,langlev,collapse="\n"),clu,useBytes=T)
> writeLines(paste("*vertices ",nw,sep=""),clu,useBytes=T)
> for(i in 1:n) writeLines(paste(as.integer(Wlang[WIn[i]])),clu)
> close(clu)

type

> clu <- file("type.clu","w"); nt <- length(tylev)
> writeLines(paste(rep("% ",nt),1:nt,tylev,collapse="\n"),clu,useBytes=T)
> writeLines(paste("*vertices ",nw,sep=""),clu,useBytes=T)
> for(i in 1:n) writeLines(paste(as.integer(Wty[WIn[i]])),clu)
> close(clu)

list of works

> lst <- file("workList.csv","w")
> writeLines("index;workID;short;workTitle",lst,useBytes=T)
> for(i in 1:nw) writeLines(paste(i,Wlev[i],short[i],Wti[i],sep=";"),lst,useBytes=T)
> close(lst)

WK

> keys <- tolower(K)
> trimws <- function(s) gsub("(^ +)|( +$)","",s)
> S <- trimws(unlist(strsplit(paste(keys,collapse=";"),";")))
> head(S)
[1] "политический пространство" "political area"           
[3] "электоральный кластер"     "electoral cluster"        
[5] "социальный сеть"           "social network"  
> ik <- factor(S); Klev <- levels(ik); nk <- length(Klev)
> nk
[1] 18181

If we list the first 210 keywords in Klev we see that there are some “strange” keywords. Typical examples are included in vector test. We decided to make an additional cleaning of keywords

> test <- c(Klev[1],Klev[2],Klev[11],Klev[179],Klev[190],Klev[194])
> test
[1] "'' one - on - one '' marketing system" "'' арабская весна ''"                 
[3] "\" cloud \" computing"                 "`` big city ''"                       
[5] "a . v . repina"                        "a country of the world"               
> trimws(gsub(" +"," ",gsub("^a ","",gsub("`","",gsub("'","",gsub('\\"',"",gsub("\\."," ",gsub("-"," ",test))))))))
[1] "one on one marketing system" "арабская весна"              "cloud computing"            
[4] "big city"                    "v repina"                    "country of the world"

We recompute ik, Klev and nk

> cleanK <- function(S)
+    trimws(gsub(" +"," ",gsub("^a ","",gsub("`","",gsub("'","",gsub('\\"',"",gsub("\\."," ",gsub("-"," ",S))))))))
> S <- trimws(unlist(strsplit(paste(keys,collapse=";"),";")))
> ik <- factor(cleanK(S)); Klev <- levels(ik); nk <- length(Klev)
> nk
[1] 17962
> Encoding(Klev) <- "UTF-8"
> net <- file("WK.net","w")
> writeLines(paste("*vertices ",nw+nk," ",nw,sep=""),net,useBytes=T)
> for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T)
> for(i in 1:nk) writeLines(paste(nw+i,' "',Klev[i],'"',sep=""),net,useBytes=T)
> writeLines("*arcs",net,useBytes=T)
> for(i in 1:n){
+    inw <- as.integer(WIn[i])
+    if(!is.na(keys[i])){
+       S <- cleanK(unlist(strsplit(keys[i],";")))
+       for(k in S) { ink <- as.integer(factor(k,levels=Klev));
+          if(!is.na(ink)) writeLines(paste(inw,nw+ink,sep=" "),net,useBytes=T) }
+    }
+ }
> close(net)

russian

Another problem with keywords is that some of them exist in Russian a English version. May be the Russian/English partition could prove to be useful.

> # utf8ToInt("Ά") = 902   Greek Capital Letter Alpha With Tonos
> latin <- function(s){ LL <- utf8ToInt(gsub(" ","",s))<902; sum(LL)/length(LL) }
> as.vector(sapply(test,latin))
[1] 1.0000000 0.2352941 1.0000000 1.0000000 1.0000000 1.0000000
> russian <- as.integer(as.vector(sapply(test,latin))<0.5)
> russian
[1] 0 1 0 0 0 0
 
> russian <- as.integer(as.vector(sapply(Klev,latin))<0.5)
> clu <- file("russian.clu","w")
> writeLines("% 0 English\n% 1 Russian",clu)
> writeLines(paste("*vertices ",nk,sep=""),clu)
> writeLines(paste(russian,sep="\n"),clu)
> close(clu)

WA works X authors network

First version

I first tried to convert the source file ListOfAuthors_Authorship affiliation.txt into a CSV file readable by Excel. I replaced all ”;” with “§” and afterwards all tabs “\t” by ”;”. There was an error in line 4691. I also added a header. I saved it to the file affil.csv. It opens in Excel.

Trying to read the file affil.csv in R it is reporting some problems. To bypass them I used

> csv <- file("affil.csv","r")
> lines <- readLines(csv)
> close(csv)
> Encoding(lines) <- "UTF-8"
> S <- strsplit(lines,";")
> ns <- length(S); nm <- ns-1
> wId <- vector(mode="character",length=nm)
> aId <- vector(mode="character",length=nm)
> aNm <- vector(mode="character",length=nm)
> iId <- vector(mode="character",length=nm)
> iNm <- vector(mode="character",length=nm)
> for(i in 2:ns){wId[i-1] <- S[[i]][1]; aId[i-1] <- S[[i]][2];
+   aNm[i-1] <- S[[i]][3]; iId[i-1] <- S[[i]][4]; iNm[i-1] <- S[[i]][5] }
> wIn <- factor(wId,levels=Wlev)
> aIn <- factor(aId); alev <- levels(aIn)
> lw <- length(wlev); na <- length(alev)
> aname <- vector(mode="character",length=na)
> for(i in 1:nm){
+    ina <- as.integer(aIn[i])
+    if(aname[ina]=="") aname[ina] <- aNm[i] else
+    if(aname[ina]!=aNm[i]) cat("***",i,ina,aId[i],aname[ina],aNm[i],"\n",sep=" ")
+ }

There are authors that are using different names - see the list.

Now we are ready to export the WA network file in Pajek format

> Encoding(aname) <- "UTF-8"
> net <- file("WA.net","w")
> writeLines(paste("*vertices ",nw+na," ",nw,sep=""),net,useBytes=T)
> for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T)
> for(i in 1:na) writeLines(paste(nw+i,' "',aname[i],'"',sep=""),net,useBytes=T)
> writeLines("*arcs",net,useBytes=T)
> for(i in 1:nm) writeLines(paste(as.integer(wIn[i]),nw+as.integer(aIn[i]),sep=" "),net,useBytes=T)
> close(net)
>

There was some searching on Google to learn how to write out from R an UTF-8 encoded file.

In an text editor I added some comments and saved the file as a UTF-8 encoded with BOM (signature).

We prepare also a CSV file linking author's ID with his/her name

> lst <- file("authList.csv","w")
> writeLines("index;authID;authName",lst,useBytes=T)
> for(i in 1:na) writeLines(paste(i,alev[i],aname[i],sep=";"),lst,useBytes=T)
> close(lst)

AI authors X institutions network

The procedures are similar to the procedures for computing WA natwork

> iname <- vector(mode="character",length=ni)
> for(i in 1:nm){
+    ini <- as.integer(iIn[i])
+    if(iname[ini]=="") iname[ini] <- iNm[i] else
+    if(iname[ini]!=iNm[i]) cat("***",i,ini,iId[i],"\n   ",iname[ini],"\n   ",iNm[i],"\n",sep=" ")
+ }

There are no different institutions with the same ID.

Let's construct the AI network

> iName <- gsub("\"", "\'", iname)
> Encoding(iName) <- "UTF-8"
> net <- file("AI.net","w")
> writeLines(paste("*vertices ",na+ni," ",na,sep=""),net,useBytes=T)
> for(i in 1:na) writeLines(paste(i,' "',aname[i],'"',sep=""),net,useBytes=T)
> for(i in 1:ni) writeLines(paste(na+i,' "',iName[i],'"',sep=""),net,useBytes=T)
> writeLines("*arcs",net,useBytes=T)
> for(i in 1:nm) writeLines(paste(as.integer(aIn[i]),na+as.integer(iIn[i]),sep=" "),net,useBytes=T)
> close(net)

We first replaced ” in names of institutions with '.

And finally a list of institutions

> lst <- file("instList.csv","w")
> writeLines("index;instID;instName",lst,useBytes=T)
> for(i in 1:ni) writeLines(paste(i,ilev[i],iName[i],sep=";"),lst,useBytes=T)
> close(lst)

Correction of institutions

The corrections were prepared on the file Renames.xlsx. I sorted the data by index (first column) and added the header line. I saved new version to file Rename.xlsx.

> setwd("C:/Users/batagelj/Documents/2017/malceva/elib")
> library(readxl)
> IN <- read_excel("./Rename.xlsx",sheet=5,col_types=rep("text",4))
> old <- IN[,3]; Encoding(old) <- "UTF-8"
> new <- IN[,4]; Encoding(new) <- "UTF-8"
> ind <- factor(new); lab <-levels(ind); nn <- length(lab)
> lst <- file("instNew.csv","w")
> writeLines("index;instName",lst,useBytes=T)
> for(i in 1:nn) writeLines(paste(i,lab[i],sep=";"),lst,useBytes=T)
> close(lst)
> ni <- length(ind)
> clu <- file("instShrink.clu","w")
> writeLines(paste("*vertices ",ni,sep=""),clu)
> writeLines(paste(as.integer(ind),sep="\n"),clu)
> close(clu)

Analysis

http://vladowiki.fmf.uni-lj.si/doku.php?id=notes:net:colln
1273. Sredin seminar
1274. Sredin seminar
1275. Sredin seminar
Cerinšek, M., Batagelj, V.: Network analysis of Zentralblatt MATH data. Scientometrics, 102(2015)1, 977-1001. arXiv
Bodlaj, J, Batagelj, V: Network Analysis of Publications on Topological Indices from the Web of Science. Molecular informatics, Volume 33, Issue 8, pages 514–535, August 2014. DOI: 10.1002/minf.201400014
Batagelj, V, Cerinšek, M: On bibliographic networks. Scientometrics 96 (2013) 3, 845-864. (DOI) 10.1007/s11192-012-0940-1. arXiv
Kejžar, N., Korenjak Černe, S., Batagelj, V.: Network Analysis of Works on Clustering and Classification from Web of Science. Classification as a Tool for Research. Hermann Locarek-Junge, Claus Weihs eds. Proceedings of IFCS 2009. Studies in Classification, Data Analysis, and Knowledge Organization, Part 3, 525-536, Springer, Berlin, 2010.