May 6-9, 2017
In the source Excel file articles.xlsx
(copy of articles_25_04_2017.xlsx
) there were several bad records - incomplete or extending over some lines. I first removed the records with only article's ID and corrected multi-line records.
4870 del, 4729 del, 4728 del, 4544-6 join, 4529 del, …, 2157-8 join, 2017 del, 487 del
Reading Excel xlsx file with package readxl
> setwd("C:/Users/batagelj/Documents/2017/malceva/elib") > library(readxl) > Ar <- read_excel("./articles.xlsx",col_types = rep("text",25)) > WID <- Ar[,1] > Wti <- Ar[,2] > Encoding(Wti) <- "UTF-8" > AID <- Ar[,4] > Afn <- Ar[,5] > Encoding(Afn) <- "UTF-8" > head(Afn) [1] "ЗЫРЯНОВ СЕРГЕЙ ГРИГОРЬЕВИЧ" "ДАВЫДЕНКО ВЛАДИМИР АЛЕКСАНДРОВИЧ" [3] "FARMER J." NA [5] "PISELLI FORTUNATA" "KOKSAL YONCA" > Wty <- factor(Ar[,7]); tylev <- levels(Wty) > tylev [1] "0" "автореферат диссертации " [3] "брошюра " "глава в книге " [5] "депонированная рукопись " "диссертация " [7] "монография " "сборник статей " [9] "сборник тезисов докладов на конференции " "сборник трудов конференции " [11] "статья в журнале " "статья в журнале - аннотация " [13] "статья в журнале - краткое сообщение " "статья в журнале - материалы конференции " [15] "статья в журнале - научная статья " "статья в журнале - научный отчет " [17] "статья в журнале - обзорная статья " "статья в журнале - переписка " [19] "статья в журнале - разное " "статья в журнале - редакторская заметка " [21] "статья в журнале - рецензия " "статья в открытом архиве " [23] "статья в сборнике статей " "статья в сборнике трудов конференции " [25] "тезисы доклада на конференции " "учебное пособие " > dim(Ar) [1] 5227 25 > Wlang <- factor(Ar[,8]); langlev <- levels(Wlang) > langlev [1] "английский " "не определен " "русский " "украинский " "французский " > Wjour <- factor(Ar[,10]); jourlev <- levels(Wjour) > length(jourlev) [1] 2148 > head(jourlev) [1] " АГАПОВ ВАЛЕРИЙ СЕРГЕЕВИЧ, СМУЛЬСКИЙ СЕРГЕЙ ВЛАДИМИРОВИЧ" [2] " КОЛПИНА Л.В.1" [3] "2010 IEEE INTERNATIONAL CONFERENCE ON COMMUNICATIONS WORKSHOPS, ICC 2010" [4] "2011" [5] "2013" [6] "2014" > Wv <- Ar[,11] > head(Wv) [1] "4" "3" "1" "12" "7" "10" > year <- as.integer(Ar[,12]) > table(year) year 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 14 30 41 43 43 41 31 38 39 84 80 92 59 73 78 86 60 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 41 60 59 72 119 273 328 491 619 913 1175 145 > bp <- Ar[,13] > K <- Ar[,19] > head(K) [1] "политический пространство; POLITICAL AREA; электоральный кластер; ELECTORAL CLUSTER; социальный сеть; SOCIAL NETWORK; транзитивный политический режим; TRANSITIVE POLITICAL REGIME; отчуждение гражданин от власть; ALIENATION OF CITIZENS FROM THE POWER; партия как политический институт; PARTY AS A POLITICAL INSTITUTE" [2] "социологический исследование; социология труд; социальный сеть; неформальный экономика; хозяйственный деятельность; трудовой отношение; неформальный занятость; неформальный трудоустройство" [3] "information technology; internet; information provision; nursing" [4] "peer counseling; financial aid; fictive kin; social capital" [5] "community; social network; social network analysis; urban neighborhood" [6] "social network; nationalism; state transformation; ottoman empire" > Encoding(K) <- "UTF-8"
Short names of works
> S <- strsplit(Afn," ") > S[[1]] [1] "ЗЫРЯНОВ" "СЕРГЕЙ" "ГРИГОРЬЕВИЧ" > S[[201]] [1] "BROWN" "SHEILA" > S[[55]] [1] NA > S[[281]] [1] "СКОРНИЧЕНКО" "НАТАЛЬЯ" "НИКОЛАЕВНА" > short <- vector(mode="character",length=n) > for(i in 1:n) { + Last <- substr(S[[i]][1],1,8); Init <- substr(S[[i]][2],1,1) + short[i] <- paste(Last,"_",Init,"(",year[i],")",Wv[i],":",bp[i],sep="") + } > head(short) [1] "ЗЫРЯНОВ_С(2008)4:24" "ДАВЫДЕНК_В(2007)3:74" "FARMER_J(1999)1:49" [4] "NA_NA(2006)12:1687" "PISELLI_F(2007)7:867" "KOKSAL_Y(2008)10:1498" > short[281] [1] "СКОРНИЧЕ_Н(2008)4:185" > A <- data.frame(ID=WID,short=short,title=Wti,autID=AID,autName=Afn,type=Wty,lang=Wlang,journ=Wjour, + vol=Wv,year=year,bp=bp,keyw=K) > save(A,file="articles.Rdata")
> WIn <- factor(WID); Wlev <- levels(WIn) > nw <- length(Wlev); nj <- length(jourlev) > wname <- vector(mode="character",length=nw) > for(i in 1:n){ + inw <- as.integer(WIn[i]) + if(wname[inw]=="") wname[inw] <- short[i] else + if(wname[inw]!=short[i]) cat("***",i,inw,wname[inw],short[i],"\n",sep=" ") + } > head(wname) [1] "ЗЫРЯНОВ_С(2008)4:24" "ДАВЫДЕНК_В(2007)3:74" "FARMER_J(1999)1:49" [4] "NA_NA(2006)12:1687" "PISELLI_F(2007)7:867" "KOKSAL_Y(2008)10:1498" > > Encoding(wname) <- "UTF-8" > jname <- gsub("\"", "\'", jourlev) > Encoding(jname) <- "UTF-8" > net <- file("WJ.net","w") > writeLines(paste("*vertices ",nw+nj," ",nw,sep=""),net,useBytes=T) > for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T) > for(i in 1:nj) writeLines(paste(nw+i,' "',jname[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:n) writeLines(paste(as.integer(WIn[i]),nw+as.integer(Wjour[i]),sep=" "),net,useBytes=T) > close(net)
For some works the corresponding journal is not known. They result in a NA value in the list of arcs. I manually introduced an additional node 7336 Unknown
and replaced NAs with 7376.
> clu <- file("year.clu","w") > writeLines(paste("*vertices ",nw,sep=""),clu) > for(i in 1:n) writeLines(paste(year[WIn[i]]),clu) > close(clu)
> clu <- file("lang.clu","w"); nl <- length(langlev) > writeLines(paste(rep("% ",nl),1:nl,langlev,collapse="\n"),clu,useBytes=T) > writeLines(paste("*vertices ",nw,sep=""),clu,useBytes=T) > for(i in 1:n) writeLines(paste(as.integer(Wlang[WIn[i]])),clu) > close(clu)
> clu <- file("type.clu","w"); nt <- length(tylev) > writeLines(paste(rep("% ",nt),1:nt,tylev,collapse="\n"),clu,useBytes=T) > writeLines(paste("*vertices ",nw,sep=""),clu,useBytes=T) > for(i in 1:n) writeLines(paste(as.integer(Wty[WIn[i]])),clu) > close(clu)
> lst <- file("workList.csv","w") > writeLines("index;workID;short;workTitle",lst,useBytes=T) > for(i in 1:nw) writeLines(paste(i,Wlev[i],short[i],Wti[i],sep=";"),lst,useBytes=T) > close(lst)
> keys <- tolower(K) > trimws <- function(s) gsub("(^ +)|( +$)","",s) > S <- trimws(unlist(strsplit(paste(keys,collapse=";"),";"))) > head(S) [1] "политический пространство" "political area" [3] "электоральный кластер" "electoral cluster" [5] "социальный сеть" "social network" > ik <- factor(S); Klev <- levels(ik); nk <- length(Klev) > nk [1] 18181
If we list the first 210 keywords in Klev we see that there are some “strange” keywords. Typical examples are included in vector test. We decided to make an additional cleaning of keywords
> test <- c(Klev[1],Klev[2],Klev[11],Klev[179],Klev[190],Klev[194]) > test [1] "'' one - on - one '' marketing system" "'' арабская весна ''" [3] "\" cloud \" computing" "`` big city ''" [5] "a . v . repina" "a country of the world" > trimws(gsub(" +"," ",gsub("^a ","",gsub("`","",gsub("'","",gsub('\\"',"",gsub("\\."," ",gsub("-"," ",test)))))))) [1] "one on one marketing system" "арабская весна" "cloud computing" [4] "big city" "v repina" "country of the world"
We recompute ik, Klev and nk
> cleanK <- function(S) + trimws(gsub(" +"," ",gsub("^a ","",gsub("`","",gsub("'","",gsub('\\"',"",gsub("\\."," ",gsub("-"," ",S)))))))) > S <- trimws(unlist(strsplit(paste(keys,collapse=";"),";"))) > ik <- factor(cleanK(S)); Klev <- levels(ik); nk <- length(Klev) > nk [1] 17962 > Encoding(Klev) <- "UTF-8" > net <- file("WK.net","w") > writeLines(paste("*vertices ",nw+nk," ",nw,sep=""),net,useBytes=T) > for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T) > for(i in 1:nk) writeLines(paste(nw+i,' "',Klev[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:n){ + inw <- as.integer(WIn[i]) + if(!is.na(keys[i])){ + S <- cleanK(unlist(strsplit(keys[i],";"))) + for(k in S) { ink <- as.integer(factor(k,levels=Klev)); + if(!is.na(ink)) writeLines(paste(inw,nw+ink,sep=" "),net,useBytes=T) } + } + } > close(net)
Another problem with keywords is that some of them exist in Russian a English version. May be the Russian/English partition could prove to be useful.
> # utf8ToInt("Ά") = 902 Greek Capital Letter Alpha With Tonos > latin <- function(s){ LL <- utf8ToInt(gsub(" ","",s))<902; sum(LL)/length(LL) } > as.vector(sapply(test,latin)) [1] 1.0000000 0.2352941 1.0000000 1.0000000 1.0000000 1.0000000 > russian <- as.integer(as.vector(sapply(test,latin))<0.5) > russian [1] 0 1 0 0 0 0 > russian <- as.integer(as.vector(sapply(Klev,latin))<0.5) > clu <- file("russian.clu","w") > writeLines("% 0 English\n% 1 Russian",clu) > writeLines(paste("*vertices ",nk,sep=""),clu) > writeLines(paste(russian,sep="\n"),clu) > close(clu)
I first tried to convert the source file ListOfAuthors_Authorship affiliation.txt
into a CSV file readable by Excel. I replaced all ”;” with “§” and afterwards all tabs “\t” by ”;”. There was an error in line 4691. I also added a header. I saved it to the file affil.csv
. It opens in Excel.
Trying to read the file affil.csv
in R it is reporting some problems. To bypass them I used
> csv <- file("affil.csv","r") > lines <- readLines(csv) > close(csv) > Encoding(lines) <- "UTF-8" > S <- strsplit(lines,";") > ns <- length(S); nm <- ns-1 > wId <- vector(mode="character",length=nm) > aId <- vector(mode="character",length=nm) > aNm <- vector(mode="character",length=nm) > iId <- vector(mode="character",length=nm) > iNm <- vector(mode="character",length=nm) > for(i in 2:ns){wId[i-1] <- S[[i]][1]; aId[i-1] <- S[[i]][2]; + aNm[i-1] <- S[[i]][3]; iId[i-1] <- S[[i]][4]; iNm[i-1] <- S[[i]][5] } > wIn <- factor(wId,levels=Wlev) > aIn <- factor(aId); alev <- levels(aIn) > lw <- length(wlev); na <- length(alev) > aname <- vector(mode="character",length=na) > for(i in 1:nm){ + ina <- as.integer(aIn[i]) + if(aname[ina]=="") aname[ina] <- aNm[i] else + if(aname[ina]!=aNm[i]) cat("***",i,ina,aId[i],aname[ina],aNm[i],"\n",sep=" ") + }
There are authors that are using different names - see the list.
Now we are ready to export the WA network file in Pajek format
> Encoding(aname) <- "UTF-8" > net <- file("WA.net","w") > writeLines(paste("*vertices ",nw+na," ",nw,sep=""),net,useBytes=T) > for(i in 1:nw) writeLines(paste(i,' "',wname[i],'"',sep=""),net,useBytes=T) > for(i in 1:na) writeLines(paste(nw+i,' "',aname[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:nm) writeLines(paste(as.integer(wIn[i]),nw+as.integer(aIn[i]),sep=" "),net,useBytes=T) > close(net) >
There was some searching on Google to learn how to write out from R an UTF-8 encoded file.
In an text editor I added some comments and saved the file as a UTF-8 encoded with BOM (signature).
We prepare also a CSV file linking author's ID with his/her name
> lst <- file("authList.csv","w") > writeLines("index;authID;authName",lst,useBytes=T) > for(i in 1:na) writeLines(paste(i,alev[i],aname[i],sep=";"),lst,useBytes=T) > close(lst)
The procedures are similar to the procedures for computing WA natwork
> iname <- vector(mode="character",length=ni) > for(i in 1:nm){ + ini <- as.integer(iIn[i]) + if(iname[ini]=="") iname[ini] <- iNm[i] else + if(iname[ini]!=iNm[i]) cat("***",i,ini,iId[i],"\n ",iname[ini],"\n ",iNm[i],"\n",sep=" ") + }
There are no different institutions with the same ID.
Let's construct the AI network
> iName <- gsub("\"", "\'", iname) > Encoding(iName) <- "UTF-8" > net <- file("AI.net","w") > writeLines(paste("*vertices ",na+ni," ",na,sep=""),net,useBytes=T) > for(i in 1:na) writeLines(paste(i,' "',aname[i],'"',sep=""),net,useBytes=T) > for(i in 1:ni) writeLines(paste(na+i,' "',iName[i],'"',sep=""),net,useBytes=T) > writeLines("*arcs",net,useBytes=T) > for(i in 1:nm) writeLines(paste(as.integer(aIn[i]),na+as.integer(iIn[i]),sep=" "),net,useBytes=T) > close(net)
We first replaced ” in names of institutions with '.
And finally a list of institutions
> lst <- file("instList.csv","w") > writeLines("index;instID;instName",lst,useBytes=T) > for(i in 1:ni) writeLines(paste(i,ilev[i],iName[i],sep=";"),lst,useBytes=T) > close(lst)
The corrections were prepared on the file Renames.xlsx
. I sorted the data by index (first column) and added the header line. I saved new version to file Rename.xlsx
.
> setwd("C:/Users/batagelj/Documents/2017/malceva/elib") > library(readxl) > IN <- read_excel("./Rename.xlsx",sheet=5,col_types=rep("text",4)) > old <- IN[,3]; Encoding(old) <- "UTF-8" > new <- IN[,4]; Encoding(new) <- "UTF-8" > ind <- factor(new); lab <-levels(ind); nn <- length(lab) > lst <- file("instNew.csv","w") > writeLines("index;instName",lst,useBytes=T) > for(i in 1:nn) writeLines(paste(i,lab[i],sep=";"),lst,useBytes=T) > close(lst) > ni <- length(ind) > clu <- file("instShrink.clu","w") > writeLines(paste("*vertices ",ni,sep=""),clu) > writeLines(paste(as.integer(ind),sep="\n"),clu) > close(clu)