====== WA and AI networks ====== May 6-9, 2017 ===== Articles / Works ===== In the source Excel file ''articles.xlsx'' (copy of ''articles_25_04_2017.xlsx'') there were several bad records - incomplete or extending over some lines. I first removed the records with only article's ID and corrected multi-line records. 4870 del, 4729 del, 4728 del, 4544-6 join, 4529 del, ..., 2157-8 join, 2017 del, 487 del Reading Excel xlsx file with package readxl > setwd("C:/Users/batagelj/Documents/2017/malceva/elib") > library(readxl) > Ar <- read_excel("./articles.xlsx",col_types = rep("text",25)) > WID <- Ar[,1] > Wti <- Ar[,2] > Encoding(Wti) <- "UTF-8" > AID <- Ar[,4] > Afn <- Ar[,5] > Encoding(Afn) <- "UTF-8" > head(Afn) [1] "ЗЫРЯНОВ СЕРГЕЙ ГРИГОРЬЕВИЧ" "ДАВЫДЕНКО ВЛАДИМИР АЛЕКСАНДРОВИЧ" [3] "FARMER J." NA [5] "PISELLI FORTUNATA" "KOKSAL YONCA" > Wty <- factor(Ar[,7]); tylev <- levels(Wty) > tylev [1] "0" "автореферат диссертации " [3] "брошюра " "глава в книге " [5] "депонированная рукопись " "диссертация " [7] "монография " "сборник статей " [9] "сборник тезисов докладов на конференции " "сборник трудов конференции " [11] "статья в журнале " "статья в журнале - аннотация " [13] "статья в журнале - краткое сообщение " "статья в журнале - материалы конференции " [15] "статья в журнале - научная статья " "статья в журнале - научный отчет " [17] "статья в журнале - обзорная статья " "статья в журнале - переписка " [19] "статья в журнале - разное " "статья в журнале - редакторская заметка " [21] "статья в журнале - рецензия " "статья в открытом архиве " [23] "статья в сборнике статей " "статья в сборнике трудов конференции " [25] "тезисы доклада на конференции " "учебное пособие " > dim(Ar) [1] 5227 25 > Wlang <- factor(Ar[,8]); langlev <- levels(Wlang) > langlev [1] "английский " "не определен " "русский " "украинский " "французский " > Wjour <- factor(Ar[,10]); jourlev <- levels(Wjour) > length(jourlev) [1] 2148 > head(jourlev) [1] " АГАПОВ ВАЛЕРИЙ СЕРГЕЕВИЧ, СМУЛЬСКИЙ СЕРГЕЙ ВЛАДИМИРОВИЧ" [2] " КОЛПИНА Л.В.1" [3] "2010 IEEE INTERNATIONAL CONFERENCE ON COMMUNICATIONS WORKSHOPS, ICC 2010" [4] "2011" [5] "2013" [6] "2014" > Wv <- Ar[,11] > head(Wv) [1] "4" "3" "1" "12" "7" "10" > year <- as.integer(Ar[,12]) > table(year) year 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 14 30 41 43 43 41 31 38 39 84 80 92 59 73 78 86 60 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 41 60 59 72 119 273 328 491 619 913 1175 145 > bp <- Ar[,13] > K <- Ar[,19] > head(K) [1] "политический пространство; POLITICAL AREA; электоральный кластер; ELECTORAL CLUSTER; социальный сеть; SOCIAL NETWORK; транзитивный политический режим; TRANSITIVE POLITICAL REGIME; отчуждение гражданин от власть; ALIENATION OF CITIZENS FROM THE POWER; партия как политический институт; PARTY AS A POLITICAL INSTITUTE" [2] "социологический исследование; социология труд; социальный сеть; неформальный экономика; хозяйственный деятельность; трудовой отношение; неформальный занятость; неформальный трудоустройство" [3] "information technology; internet; information provision; nursing" [4] "peer counseling; financial aid; fictive kin; social capital" [5] "community; social network; social network analysis; urban neighborhood" [6] "social network; nationalism; state transformation; ottoman empire" > Encoding(K) <- "UTF-8" Short names of works > S <- strsplit(Afn," ") > S[[1]] [1] "ЗЫРЯНОВ" "СЕРГЕЙ" "ГРИГОРЬЕВИЧ" > S[[201]] [1] "BROWN" "SHEILA" > S[[55]] [1] NA > S[[281]] [1] "СКОРНИЧЕНКО" "НАТАЛЬЯ" "НИКОЛАЕВНА" > short <- vector(mode="character",length=n) > for(i in 1:n) { + Last <- substr(S[[i]][1],1,8); Init <- substr(S[[i]][2],1,1) + short[i] <- paste(Last,"_",Init,"(",year[i],")",Wv[i],":",bp[i],sep="") + } > head(short) [1] "ЗЫРЯНОВ_С(2008)4:24" "ДАВЫДЕНК_В(2007)3:74" "FARMER_J(1999)1:49" [4] "NA_NA(2006)12:1687" "PISELLI_F(2007)7:867" "KOKSAL_Y(2008)10:1498" > short[281] [1] "СКОРНИЧЕ_Н(2008)4:185" > A <- data.frame(ID=WID,short=short,title=Wti,autID=AID,autName=Afn,type=Wty,lang=Wlang,journ=Wjour, + vol=Wv,year=year,bp=bp,keyw=K) > save(A,file="articles.Rdata") ===== WJ ===== > WIn <- factor(WID); Wlev <- levels(WIn) > nw <- length(Wlev); nj <- length(jourlev) > wname <- vector(mode="character",length=nw) > for(i in 1:n){ + inw <- as.integer(WIn[i]) + if(wname[inw]=="") wname[inw] <- short[i] else + if(wname[inw]!=short[i]) cat("***",i,inw,wname[inw],short[i],"\n",sep=" ") + } > head(wname) [1] "ЗЫРЯНОВ_С(2008)4:24" "ДАВЫДЕНК_В(2007)3:74" "FARMER_J(1999)1:49" [4] "NA_NA(2006)12:1687" "PISELLI_F(2007)7:867" "KOKSAL_Y(2008)10:1498" > > Encoding(wname) <- "UTF-8" > jname <- gsub("\"", "\'", jourlev) > Encoding(jname) <- "UTF-8" > net <- file("WJ.net","w") > writeLines(paste("*vertices ",nw+nj," ",nw,sep=""),net,useBytes=T) > for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T) > for(i in 1:nj) writeLines(paste(nw+i,' "',jname[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:n) writeLines(paste(as.integer(WIn[i]),nw+as.integer(Wjour[i]),sep=" "),net,useBytes=T) > close(net) For some works the corresponding journal is not known. They result in a NA value in the list of arcs. I manually introduced an additional node 7336 ''Unknown'' and replaced NAs with 7376. ===== year ===== > clu <- file("year.clu","w") > writeLines(paste("*vertices ",nw,sep=""),clu) > for(i in 1:n) writeLines(paste(year[WIn[i]]),clu) > close(clu) ===== language ===== > clu <- file("lang.clu","w"); nl <- length(langlev) > writeLines(paste(rep("% ",nl),1:nl,langlev,collapse="\n"),clu,useBytes=T) > writeLines(paste("*vertices ",nw,sep=""),clu,useBytes=T) > for(i in 1:n) writeLines(paste(as.integer(Wlang[WIn[i]])),clu) > close(clu) ===== type ===== > clu <- file("type.clu","w"); nt <- length(tylev) > writeLines(paste(rep("% ",nt),1:nt,tylev,collapse="\n"),clu,useBytes=T) > writeLines(paste("*vertices ",nw,sep=""),clu,useBytes=T) > for(i in 1:n) writeLines(paste(as.integer(Wty[WIn[i]])),clu) > close(clu) ===== list of works ===== > lst <- file("workList.csv","w") > writeLines("index;workID;short;workTitle",lst,useBytes=T) > for(i in 1:nw) writeLines(paste(i,Wlev[i],short[i],Wti[i],sep=";"),lst,useBytes=T) > close(lst) ===== WK ===== > keys <- tolower(K) > trimws <- function(s) gsub("(^ +)|( +$)","",s) > S <- trimws(unlist(strsplit(paste(keys,collapse=";"),";"))) > head(S) [1] "политический пространство" "political area" [3] "электоральный кластер" "electoral cluster" [5] "социальный сеть" "social network" > ik <- factor(S); Klev <- levels(ik); nk <- length(Klev) > nk [1] 18181 If we list the first 210 keywords in Klev we see that there are some "strange" keywords. Typical examples are included in vector test. We decided to make an additional cleaning of keywords > test <- c(Klev[1],Klev[2],Klev[11],Klev[179],Klev[190],Klev[194]) > test [1] "'' one - on - one '' marketing system" "'' арабская весна ''" [3] "\" cloud \" computing" "`` big city ''" [5] "a . v . repina" "a country of the world" > trimws(gsub(" +"," ",gsub("^a ","",gsub("`","",gsub("'","",gsub('\\"',"",gsub("\\."," ",gsub("-"," ",test)))))))) [1] "one on one marketing system" "арабская весна" "cloud computing" [4] "big city" "v repina" "country of the world" We recompute ik, Klev and nk > cleanK <- function(S) + trimws(gsub(" +"," ",gsub("^a ","",gsub("`","",gsub("'","",gsub('\\"',"",gsub("\\."," ",gsub("-"," ",S)))))))) > S <- trimws(unlist(strsplit(paste(keys,collapse=";"),";"))) > ik <- factor(cleanK(S)); Klev <- levels(ik); nk <- length(Klev) > nk [1] 17962 > Encoding(Klev) <- "UTF-8" > net <- file("WK.net","w") > writeLines(paste("*vertices ",nw+nk," ",nw,sep=""),net,useBytes=T) > for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T) > for(i in 1:nk) writeLines(paste(nw+i,' "',Klev[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:n){ + inw <- as.integer(WIn[i]) + if(!is.na(keys[i])){ + S <- cleanK(unlist(strsplit(keys[i],";"))) + for(k in S) { ink <- as.integer(factor(k,levels=Klev)); + if(!is.na(ink)) writeLines(paste(inw,nw+ink,sep=" "),net,useBytes=T) } + } + } > close(net) ===== russian ===== Another problem with keywords is that some of them exist in Russian a English version. May be the Russian/English partition could prove to be useful. > # utf8ToInt("Ά") = 902 Greek Capital Letter Alpha With Tonos > latin <- function(s){ LL <- utf8ToInt(gsub(" ","",s))<902; sum(LL)/length(LL) } > as.vector(sapply(test,latin)) [1] 1.0000000 0.2352941 1.0000000 1.0000000 1.0000000 1.0000000 > russian <- as.integer(as.vector(sapply(test,latin))<0.5) > russian [1] 0 1 0 0 0 0 > russian <- as.integer(as.vector(sapply(Klev,latin))<0.5) > clu <- file("russian.clu","w") > writeLines("% 0 English\n% 1 Russian",clu) > writeLines(paste("*vertices ",nk,sep=""),clu) > writeLines(paste(russian,sep="\n"),clu) > close(clu) ===== WA works X authors network ===== [[notes:net:dm:wa0|First version]] I first tried to convert the source file ''ListOfAuthors_Authorship affiliation.txt'' into a CSV file readable by Excel. I replaced all ";" with "§" and afterwards all tabs "\t" by ";". There was an error in line 4691. I also added a header. I saved it to the file ''affil.csv''. It opens in Excel. Trying to read the file ''affil.csv'' in R it is reporting some problems. To bypass them I used > csv <- file("affil.csv","r") > lines <- readLines(csv) > close(csv) > Encoding(lines) <- "UTF-8" > S <- strsplit(lines,";") > ns <- length(S); nm <- ns-1 > wId <- vector(mode="character",length=nm) > aId <- vector(mode="character",length=nm) > aNm <- vector(mode="character",length=nm) > iId <- vector(mode="character",length=nm) > iNm <- vector(mode="character",length=nm) > for(i in 2:ns){wId[i-1] <- S[[i]][1]; aId[i-1] <- S[[i]][2]; + aNm[i-1] <- S[[i]][3]; iId[i-1] <- S[[i]][4]; iNm[i-1] <- S[[i]][5] } > wIn <- factor(wId,levels=Wlev) > aIn <- factor(aId); alev <- levels(aIn) > lw <- length(wlev); na <- length(alev) > aname <- vector(mode="character",length=na) > for(i in 1:nm){ + ina <- as.integer(aIn[i]) + if(aname[ina]=="") aname[ina] <- aNm[i] else + if(aname[ina]!=aNm[i]) cat("***",i,ina,aId[i],aname[ina],aNm[i],"\n",sep=" ") + } There are authors that are using different names - see the [[notes:net:DM:alist|list]]. Now we are ready to export the WA network file in Pajek format > Encoding(aname) <- "UTF-8" > net <- file("WA.net","w") > writeLines(paste("*vertices ",nw+na," ",nw,sep=""),net,useBytes=T) > for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T) > for(i in 1:na) writeLines(paste(nw+i,' "',aname[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:nm) writeLines(paste(as.integer(wIn[i]),nw+as.integer(aIn[i]),sep=" "),net,useBytes=T) > close(net) > There was some searching on Google to learn how to write out from R an UTF-8 encoded file. In an text editor I added some comments and saved the file as a UTF-8 encoded with BOM (signature). We prepare also a CSV file linking author's ID with his/her name > lst <- file("authList.csv","w") > writeLines("index;authID;authName",lst,useBytes=T) > for(i in 1:na) writeLines(paste(i,alev[i],aname[i],sep=";"),lst,useBytes=T) > close(lst) ===== AI authors X institutions network ===== The procedures are similar to the procedures for computing WA natwork > iname <- vector(mode="character",length=ni) > for(i in 1:nm){ + ini <- as.integer(iIn[i]) + if(iname[ini]=="") iname[ini] <- iNm[i] else + if(iname[ini]!=iNm[i]) cat("***",i,ini,iId[i],"\n ",iname[ini],"\n ",iNm[i],"\n",sep=" ") + } There are no different institutions with the same ID. Let's construct the AI network > iName <- gsub("\"", "\'", iname) > Encoding(iName) <- "UTF-8" > net <- file("AI.net","w") > writeLines(paste("*vertices ",na+ni," ",na,sep=""),net,useBytes=T) > for(i in 1:na) writeLines(paste(i,' "',aname[i],'"',sep=""),net,useBytes=T) > for(i in 1:ni) writeLines(paste(na+i,' "',iName[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:nm) writeLines(paste(as.integer(aIn[i]),na+as.integer(iIn[i]),sep=" "),net,useBytes=T) > close(net) We first replaced " in names of institutions with '. And finally a list of institutions > lst <- file("instList.csv","w") > writeLines("index;instID;instName",lst,useBytes=T) > for(i in 1:ni) writeLines(paste(i,ilev[i],iName[i],sep=";"),lst,useBytes=T) > close(lst) ===== Correction of institutions ===== The corrections were prepared on the file ''Renames.xlsx''. I sorted the data by index (first column) and added the header line. I saved new version to file ''Rename.xlsx''. > setwd("C:/Users/batagelj/Documents/2017/malceva/elib") > library(readxl) > IN <- read_excel("./Rename.xlsx",sheet=5,col_types=rep("text",4)) > old <- IN[,3]; Encoding(old) <- "UTF-8" > new <- IN[,4]; Encoding(new) <- "UTF-8" > ind <- factor(new); lab <-levels(ind); nn <- length(lab) > lst <- file("instNew.csv","w") > writeLines("index;instName",lst,useBytes=T) > for(i in 1:nn) writeLines(paste(i,lab[i],sep=";"),lst,useBytes=T) > close(lst) > ni <- length(ind) > clu <- file("instShrink.clu","w") > writeLines(paste("*vertices ",ni,sep=""),clu) > writeLines(paste(as.integer(ind),sep="\n"),clu) > close(clu) ===== Analysis ===== * http://vladowiki.fmf.uni-lj.si/doku.php?id=notes:net:colln * {{notes:bm2:files:ss1273.pdf|1273. Sredin seminar}} * {{pub:pdf:sreda1274.pdf|1274. Sredin seminar}} * {{pub:pdf:sreda1275.pdf|1275. Sredin seminar}} * Cerinšek, M., Batagelj, V.: [[http://link.springer.com/article/10.1007/s11192-014-1419-z|Network analysis of Zentralblatt MATH data]]. Scientometrics, 102(2015)1, 977-1001. [[http://arxiv.org/abs/1409.1726|arXiv]] * Bodlaj, J, Batagelj, V: [[http://onlinelibrary.wiley.com/doi/10.1002/minf.201400014/abstract|Network Analysis of Publications on Topological Indices from the Web of Science.]] Molecular informatics, Volume 33, Issue 8, pages 514–535, August 2014. DOI: 10.1002/minf.201400014 * Batagelj, V, Cerinšek, M: [[http://www.springerlink.com/openurl.asp?genre=article&id=doi:10.1007/s11192-012-0940-1|On bibliographic networks]]. Scientometrics 96 (2013) 3, 845-864. (DOI) 10.1007/s11192-012-0940-1. [[http://arxiv.org/abs/1301.4655|arXiv]] * Kejžar, N., Korenjak Černe, S., Batagelj, V.: [[http://www.springerlink.com/content/v660rt6440033841/| Network Analysis of Works on Clustering and Classification from Web of Science]]. Classification as a Tool for Research. Hermann Locarek-Junge, Claus Weihs eds. Proceedings of IFCS 2009. Studies in Classification, Data Analysis, and Knowledge Organization, Part 3, 525-536, Springer, Berlin, 2010.