====== Creating Pajek two-mode network ======

June 10, 2023

====== Read the data and split info about authors ======

Read and rename raw data.

<code>
> wdir <- "C:/test/Natalija/data"
> setwd(wdir)
> SF <- read.csv("scopus_Kaist_rawdata.csv",header=TRUE)
> short <- c("au", "IDa", "tit", "y", "jour", "vol", "iss", "art", "beg",
+   "end", "np", "cite", "DOI", "URL", "type", "ver", "acc", "src", "IDe")
> colnames(SF) <- short
> str(SF)
'data.frame':   12543 obs. of  19 variables:
 $ au  : chr  "Ryu, S., Wang, J.E., Kim, J.-H., Ruffo, R., Jung, Y.H., Kim, D.K." "Lee, B.-Y., Kim, D.H., Park, J., ...
 $ IDa : chr  "57193059482; 56913501600; 57865565300; 6602426816; 22979935700; 55574223813" "56518539100; 55574225422; ...
 $ tit : chr  "A study on cobalt substitution in sodium manganese mixed-anion phosphates as positive electrode materials ...
 $ y   : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
 $ jour: chr  "Journal of Power Sources" "Science and Technology of Advanced Materials" "Journal of Web Engineering" ...
 $ vol : chr  "444" "20" "18" "229" ...
 $ iss : chr  "" "1" "8" "" ...
 $ art : chr  " 227274" "" "" " 116797" ...
 $ beg : chr  "" "758" "837" "" ...
 $ end : chr  "" "773" "864" "" ...
 $ np  : logi  NA NA NA NA NA NA ...
 $ cite: int  7 71 NA 16 5 25 6 8 18 10 ...
 $ DOI : chr  "10.1016/j.jpowsour.2019.227274" "10.1080/14686996.2019.1631716" "10.13052/jwe1540-9589.1884" ...
 $ URL : chr  "https://www.scopus.com/inward/record.url?eid=2-s2.0-85073156220&partnerID=40&md5=428f55ce083bb7" ...
 $ type: chr  "Article" "Review" "Article" "Article" ...
 $ ver : chr  "Final" "Final" "Final" "Final" ...
 $ acc : chr  "" "All Open Access, Gold, Green" "" "" ...
 $ src : chr  "Scopus" "Scopus" "Scopus" "Scopus" ...
 $ IDe : chr  "2-s2.0-85073156220" "2-s2.0-85068763946" "2-s2.0-85097943650" "2-s2.0-85071928006" ...
</code>
<code>
> dim(SF)
[1] 12543    19
</code>

Split the info about authors.

<code>
> n <- nrow(SF); C <- rep(NA,9*n); N <- length(C)
> A <- data.frame(work=C,IDa=C,au=C,y=C)
> k <- 0
> for(w in 1:n){
+   ans <- gsub("\\.","",strsplit(SF$au[w],"\\., ")[[1]])
+   ais <- strsplit(SF$IDa[w],"; ")[[1]]
+   if(length(ans)==length(ais)) {
+     for(i in 1:length(ans)){ k <- k+1
+       if(k > N) stop(paste("increase size, w=",w))
+       A$work[k] <- w; A$IDa[k] <- ais[i]; A$au[k] <- ans[i]; A$y[k] <- SF$y[w]
+     }
+   } else warning(paste("work ",w))
+ }
Warning messages:
1: work  213
2: work  319
3: work  390
4: work  4813
5: work  8499
6: work  9249
</code>

===== Improved code =====

The above code can be improved by adding some tracing.

<code>
> n <- nrow(SF); C <- rep(NA,15*n); N <- length(C); step <- 5000
> A <- data.frame(work=C,IDa=C,au=C)
> cat("Start",date(),", n =",n,"\n")
> k <- 0
> for(w in 1:n){
+   ans <- gsub("\\.","",strsplit(SF$au[w],"\\., ")[[1]])
+   ais <- strsplit(SF$IDa[w],"; ")[[1]]
+   if(length(ans)==length(ais)) {
+     for(i in 1:length(ans)){ k <- k+1
+       if(k %% step==0) cat(date()," w =",w," k =",k," k/w =",k/w,"\n"); flush.console()
+       if(k > N) stop(paste("increase size, w=",w))
+       A$work[k] <- w; A$IDa[k] <- ais[i]; A$au[k] <- ans[i]
+     }
+   } else warning(paste("work ",w))
+ }
> cat("Stop",date(),", k =",k,", k/n =",k/n,"\n")
Start Sun Jun 18 00:06:54 2023 , n = 12543
Sun Jun 18 00:07:07 2023  w = 807  k = 5000  k/w = 6.195787
Sun Jun 18 00:07:21 2023  w = 1762  k = 10000  k/w = 5.675369
Sun Jun 18 00:07:34 2023  w = 2684  k = 15000  k/w = 5.588674
Sun Jun 18 00:07:48 2023  w = 3720  k = 20000  k/w = 5.376344
Sun Jun 18 00:08:01 2023  w = 4203  k = 25000  k/w = 5.948132
Sun Jun 18 00:08:15 2023  w = 4856  k = 30000  k/w = 6.177924
Sun Jun 18 00:08:29 2023  w = 5565  k = 35000  k/w = 6.289308
Sun Jun 18 00:08:43 2023  w = 5811  k = 40000  k/w = 6.883497
Sun Jun 18 00:08:57 2023  w = 6023  k = 45000  k/w = 7.47136
Sun Jun 18 00:09:11 2023  w = 6213  k = 50000  k/w = 8.047642
Sun Jun 18 00:09:25 2023  w = 6522  k = 55000  k/w = 8.432996
Sun Jun 18 00:09:39 2023  w = 7240  k = 60000  k/w = 8.287293
Sun Jun 18 00:09:52 2023  w = 7448  k = 65000  k/w = 8.727175
Sun Jun 18 00:10:06 2023  w = 8074  k = 70000  k/w = 8.669804
Sun Jun 18 00:10:20 2023  w = 8397  k = 75000  k/w = 8.931761
Sun Jun 18 00:10:34 2023  w = 8977  k = 80000  k/w = 8.911663
Sun Jun 18 00:10:49 2023  w = 9956  k = 85000  k/w = 8.537565
Sun Jun 18 00:11:03 2023  w = 10714  k = 90000  k/w = 8.400224
Sun Jun 18 00:11:18 2023  w = 11717  k = 95000  k/w = 8.107877
Warning messages:
1: work  213
2: work  319
3: work  390
4: work  4813
5: work  8499
6: work  9249
Stop Sun Jun 18 00:11:29 2023 , k = 98987 , k/n = 7.891812
</code>

In the command ''C <- rep(NA,15*n)'' the number 15 is a guess at the average number of authors per paper. If the real average is larger, the program stops with the ''increase size'' message. A good approximation of the real average is the last reported k/w. Replace 15 with 1.3 * k/w (here 1.3 * 7.89 ≈ 10.3, so 11 would be a safe choice) and rerun the program. The value of the variable ''step'' controls the frequency of the trace lines.

====== Corrections of authors' info ======

There are some rows with different numbers of names and IDs. We inspect the raw data in these rows using the following code (changing ''w'')

<code>
> w <- 9249
> (SF$tit[w])
> (SF$au[w])
> (ans <- gsub("\\.","",strsplit(SF$au[w],"\\., ")[[1]]))
> (ais <- strsplit(SF$IDa[w],"; ")[[1]])
</code>

It turns out that the troublemakers are single-word names. Row 9249 describes an [[https://idus.us.es/bitstream/handle/11441/99979/1/out.pdf?sequence=1|article]] with 1271 coauthors.
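Instead of relying on the warnings from the splitting loop, the mismatching rows can also be listed in advance. A minimal sketch, assuming the data frame ''SF'' as read and renamed above:

<code>
# vectorized check (sketch): count the author names and the author IDs in
# each row and list the rows where the two counts differ
nNam <- sapply(strsplit(SF$au,  "\\., "), length)
nIDs <- sapply(strsplit(SF$IDa, "; "),    length)
which(nNam != nIDs)
</code>

This should return the same six rows (213, 319, 390, 4813, 8499, 9249) reported by the warnings above.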
We correct the data (adding a dot to the end of single-word names) and rerun the splitting code.

<code>
> SF$au[213] <- "Park, J., De Liu., Yi, M.Y., Santhanam, R."
> SF$au[319] <- "Datt, R., Suman., Bagui, A., Siddiqui, A., Sharma, R., Gupta, V., Yoo, S., Kumar, S., Singh, S.P."
> SF$au[390] <- "Fazl-i-Sattar., Ahmed, A., Ullah, H., Ullah, Z., Tariq, M., Ayub, K."
> SF$au[4813] <- "Samaad, T., Tahir, G.A., Mansoor-Ur-Rahman., Ashraf, M."
> SF$au[8499] <- "Park, H.C., Isnaeni., Gong, S., Cho, Y.-H."
> SF$au[9249] <- gsub("Pisano","Pisano.",SF$au[9249])
>
> n <- nrow(SF); C <- rep(NA,9*n); N <- length(C)
> A <- data.frame(work=C,IDa=C,au=C,y=C)
> k <- 0
> for(w in 1:n){
+   ans <- gsub("\\.","",strsplit(SF$au[w],"\\., ")[[1]])
+   ais <- strsplit(SF$IDa[w],"; ")[[1]]
+   if(length(ans)==length(ais)) {
+     for(i in 1:length(ans)){ k <- k+1
+       if(k > N) stop(paste("increase size, w=",w))
+       A$work[k] <- w; A$IDa[k] <- ais[i]; A$au[k] <- ans[i]; A$y[k] <- SF$y[w]
+     }
+   } else warning(paste("work ",w))
+ }
> k
[1] 100285
> A <- A[1:k,]
> dim(A)
[1] 100285      4
</code>

===== Some statistics =====

<code>
> Wd <- table(table(A$work))
> head(Wd)
   1    2    3    4    5    6 
 231 1916 2593 2270 1617 1075 
> length(Wd)
[1] 65
> tail(Wd)
1233 1235 1488 1497 1551 1558 
   1    1    1    1    1    1 
> x <- as.numeric(names(Wd))
> plot(x,Wd,log="xy",pch=16,cex=0.7,xlab="#authors",ylab="freq",
+   main="Kaist - #authors per paper distribution")
>
> Id <- table(table(A$IDa))
> head(Id)
    1     2     3     4     5     6 
16609  4108  2121  1108   661   496 
> tail(Id)
 80  86  93  96 103 105 
  2   1   1   1   1   1 
> x <- as.numeric(names(Id))
> plot(x,Id,log="xy",pch=16,cex=0.7,xlab="#works",ylab="freq",
+   main="Kaist - #works per author distribution")
</code>

There are 2270 works with 4 authors and a single work with 1558 authors. 16609 authors contributed to a single work, and there is an author that contributed to 105 works.

{{vlado:work:pics:kaist-ap.png?500}}
{{vlado:work:pics:kaist-wa.png?500}}

====== Matching author IDs and names ======

We attach to the author IDs the corresponding names.
<code>
> I <- factor(A$IDa); L <- levels(I); Ia <- as.integer(I)
> nA <- length(L)
> nA
[1] 28053
> Anam <- rep(NA,nA); names(Anam) <- L
> for(i in 1:nrow(A)){ nam <- A$au[i]; ID <- A$IDa[i]
+   if(is.na(Anam[ID])) Anam[ID] <- nam else
+   if(Anam[ID]!=nam) warning(paste(ID,":",nam,"/",Anam[ID]))
+ }
Warning messages:
1: 56298144400 : Yoo, KW / Yoo, K
2: 7202588575 : Chun, JH / Chun, J
3: 55182524000 : Choi, BG / Choi, B-G
4: 56298144400 : Yoo, KW / Yoo, K
5: 7202588575 : Chun, JH / Chun, J
6: 55182524000 : Choi, BG / Choi, B-G
7: 57205232256 : Kim, SH / Kim, S
8: 36965693000 : Kyhm, JH / Kyhm, J
9: 57206099583 : Kim, DJ / Kim, D
10: 57205232256 : Kim, S-H / Kim, S
11: 56178406700 : Lim, H-K / Lim, HK
12: 14056089600 : Lee, H-U / Lee, HU
13: 55494376500 : Park, Y-K / Park, YK
14: 56178406700 : Lim, H-K / Lim, HK
15: 56377217300 : Park, J / Park, JC
16: 36473564000 : Lee, DJ / Lee, D-J
17: 6602354436 : Do Heo, W / Heo, WD
18: 6602354436 : Do Heo, W / Heo, WD
19: 25634053100 : Lee, S / Lee, SB
20: 57221622000 : Lee, J / Lee, J-S
21: 8242246500 : Kim, S-C / Kim, SC
22: 7404469270 : Cho, Y / Cho, Y-H
23: 55928867600 : Lee, S-W / Lee, SW
24: 55857721300 : Yoo, D-E / Yoo, DE
25: 57211666172 : Shim, JE / Shim, J-E
26: 57197718957 : Seo, J / Seo, JH
27: 56949819000 : Cha, SL / Cha, S-L
28: 55807779600 : Han, J-H / Han, J
29: 7404786542 : Song, JD / Song, J
30: 57205232256 : Kim, S-H / Kim, S
31: 57204616683 : Jeon, G / Jeon, G-J
32: 57204766033 : Kim, MW / Kim, M-W
33: 6602354436 : Do Heo, W / Heo, WD
34: 47861080000 : Kim, J-H / Kim, J
35: 57206099583 : Kim, DJ / Kim, D
36: 57218106364 : Kim, D-H / Kim, D
37: 57202965763 : Kim, KM / Min Kim, K
38: 55523269400 : Hong, SC / Hong, S-C
39: 8549444100 : Lee, JR / Lee, J-R
40: 55928816500 : Park, H / Park, HK
41: 57200220938 : Lee, S / Lee, S-H
42: 57203690087 : Hwang, H / Hwang, HS
43: 57192500386 : Lee, HJ / Lee, H
44: 57189594982 : Sasikala, SP / Padmajan Sasikala, S
45: 57193226486 : Hoang Bui, VK / Bui, VKH
46: 55549849900 : Park, Y / Park, Y-J
47: 7501843237 : Chang, Y-K / Chang, YK
48: 57193537793 : Kim, GY / Kim, G
49: 57199467150 : Lee, DW / Lee, D-W
50: 18535939800 : Kang, IS / Kang, I-S
</code>

We see that some authors appear in the data under different names. This part could be **improved** by selecting a kind of canonical name (remove "-", prefer longer initials, etc.); this is done in the improved list of author names below.

Now we are ready to save the transformed data as a Pajek two-mode network ''WA.net'' and, additionally, the author names to the file ''authors.nam''.

<code>
> net <- file("WA.net","w",encoding="UTF-8")
> RN <- SF$IDe; nr <- length(RN)
> CN <- L; nc <- length(CN)
> U <- A$work; V <- Ia; w <- rep(1,length(U)); t <- A$y
> cat("% CSV2Pajek",date(),"\n*vertices",nr+nc,nr,"\n",file=net)
> for(i in 1:nr) cat(i,' "',RN[i],'"\n',sep="",file=net)
> for(i in 1:nc) cat(i+nr,' "',CN[i],'"\n',sep="",file=net)
> cat("*arcs\n",file=net)
> for(i in 1:length(U)) cat(U[i],V[i]+nr,w[i],"[",t[i],"]\n",file=net)
> close(net)
>
> nam <- file("authors.nam","w",encoding="UTF-8")
> cat("% CSV2Pajek",date(),"\n*vertices",nc,"\n",file=nam)
> for(i in 1:nc) cat(i,' "',Anam[i],'"\n',sep="",file=nam)
> close(nam)
</code>

Using ''authors.nam'' we can replace in Pajek the author IDs with the corresponding names and get more readable results.

The file ''authors.nam'' is encoded in UTF-8, but without the BOM (signature) required by Pajek. I added it in EmEditor by saving the file with the encoding "UTF-8 with Signature".
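The BOM can also be prepended directly from R, avoiding the external editor. A minimal sketch, assuming ''authors.nam'' has already been written as above (the next section writes ''authors2.nam'' with the BOM via ''writeBin'' in a similar way):

<code>
# sketch: prepend the UTF-8 BOM to the already written authors.nam
bytes <- readBin("authors.nam", what="raw", n=file.info("authors.nam")$size)
con <- file("authors.nam", "wb")
writeBin(c(charToRaw('\xEF\xBB\xBF'), bytes), con)
close(con)
</code>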
For the creation and analysis of co-authorship networks see [[https://github.com/bavla/SocNet/wiki/Coauthor|SocNet/Coauthor]] and [[https://github.com/bavla/biblio/blob/master/Pajek/macro/README.md|macros]]. For example, [[.:strict|strict co-authorship]].

===== Improved list of author names =====

I added the "canonical" author names and exported the list in the UTF-8-BOM encoding required by Pajek.

<code>
> stand <- function(s){
+   p <- strsplit(s,", ")[[1]]; k <- length(p)
+   if(k>1) p[k] <- gsub("-","",p[k]) else {p <- c(p,""); k <- 2}
+   return(list(k=k,p=p))
+ }
>
> I <- factor(A$IDa); L <- levels(I); Ia <- as.integer(I)
> Anam <- rep(NA,length(L)); names(Anam) <- L
> for(i in 1:nrow(A)){ ID <- A$IDa[i]
+   b <- stand(A$au[i]); nam <- paste(b$p,collapse=", ")
+   if(is.na(Anam[ID])) Anam[ID] <- nam else
+   if(Anam[ID]!=nam) {
+     a <- stand(Anam[ID])
+     na <- nchar(a$p[a$k]); nb <- nchar(b$p[b$k])
+     if(nb>na) { warning(paste(i,ID,":",A$au[i],"/",Anam[ID]))
+       Anam[ID] <- nam }
+   }
+ }
>
> nam <- file("authors2.nam","wb")
> writeBin(charToRaw('\xEF\xBB\xBF'),nam,endian="little")
> writeBin(charToRaw(paste("% CSV2Pajek",date(),"\n*vertices",nc,"\n")),nam,endian="little")
> for(i in 1:nc) writeBin(charToRaw(paste(i,' "',Anam[i],'"\n',sep="")),nam,endian="little")
> close(nam)
</code>

===== Distributions for all universities =====

September 1, 2023

Nataliya prepared Pajek *.net files for eight universities. I computed the (all) degrees and saved them to *.clu files. In the table of files F, V1 is the network file name, V2 is the total number of vertices (works + authors), and V3 is the number of works.

<code>
> F
             V1    V2    V3
1 WA_HKPoly.net 47165 15155
2 WA_HKUST1.net 37333  9225
3  WA_KAIST.net 40549 12543
4   WA_Maas.net 80334 13338
5    WA_NTY.net 82971 25192
6 WA_Pohang.net 21951  6179
7   WA_Pomp.net 36769  6054
8    WA_UTS.net 45329 13094
</code>

The following program produces plots of the author and work distributions for all universities. It saves the data in the list R for later processing.
> wdir <- "C:/Users/vlado/docs/papers/2023/Natalija/data2/nets" > setwd(wdir) > F <- read.csv("files.dat",header=FALSE,sep="") > R <- NULL > for(i in 1:nrow(F)){ + f <- gsub("net","clu",F[i,1]) + nam <- gsub("WA_","",gsub(".net","",F[i,1])) + c <- read.csv(f,header=FALSE,skip=1,sep="")$V1 + nw <- F$V3[i]; w <- c[1:nw] + a <- c[(nw+1):F$V2[i]]; na <- length(a) + cat("\n",i,nam,nw,na,"\n") + Wd <- table(w) + cat("authors\n"); print(head(Wd)); print(tail(Wd)) + x <- as.numeric(names(Wd)) + pdf(file=gsub(".net","-a.pdf",F[i,1])) + plot(x,Wd,log="xy",pch=16,cex=0.7,xlab="#authors",ylab="freq", + main=paste(nam,"- #authors per paper distribution")) + dev.off() + Ad <- table(a) + cat("works\n"); print(head(Ad)); print(tail(Ad)) + x <- as.numeric(names(Ad)) + pdf(file=gsub(".net","-x.pdf",F[i,1])) + plot(x,Ad,log="xy",pch=16,cex=0.7,xlab="#works",ylab="freq", + main=paste(nam,"- #works per author distribution")) + dev.off() + R <- append(R,list(nam=nam,nw=nw,na=na,Wd=Wd,Ad=Ad)) + } 1 HKPoly 15155 32010 authors w 0 1 2 3 4 5 3 607 2065 2970 2848 2170 w 988 989 1018 1043 1159 1211 1 1 2 2 1 1 works a 1 2 3 4 5 6 19605 4826 2142 1213 823 539 a 140 141 144 149 175 248 1 1 1 1 1 1 2 HKUST1 9225 28108 authors w 0 1 2 3 4 5 2 212 1144 1635 1581 1286 w 2949 2951 2952 2953 5096 5215 2 1 1 1 1 1 works a 1 2 3 4 5 6 14735 5111 1445 708 408 316 a 296 297 298 299 302 438 66 1421 17 3 1 1 3 KAIST 12543 28006 authors w 0 1 2 3 4 5 1 231 1916 2594 2272 1618 w 1227 1234 1484 1489 1545 1555 1 1 1 1 1 1 works a 1 2 3 4 5 6 16624 4105 2123 1108 660 498 a 80 86 93 96 103 105 2 1 1 1 1 1 4 Maas 13338 66996 authors w 0 1 2 3 4 5 10 832 1164 1352 1372 1454 w 1202 1261 1264 1265 1590 2582 1 1 1 1 1 1 works a 1 2 3 4 5 6 43452 10350 4822 2098 1382 932 a 94 100 102 104 120 122 1 1 1 1 1 1 5 NTY 25192 57779 authors w 0 1 2 3 4 5 7 1331 3175 4299 4307 3500 w 988 989 1018 1043 1159 1211 1 1 2 2 1 1 works a 1 2 3 4 5 6 36525 8709 3790 2049 1333 950 a 127 129 152 153 160 188 1 2 1 1 1 1 6 Pohang 6179 15772 authors w 1 2 3 4 5 6 98 606 949 998 909 653 w 72 75 106 155 187 382 1 1 1 1 1 1 works a 1 2 3 4 5 6 10072 2454 1046 618 340 287 a 74 75 77 88 91 169 1 1 1 1 1 1 7 Pomp 6054 30715 authors w 0 1 2 3 4 5 1 613 859 968 585 433 w 979 988 1018 1043 1159 1211 1 1 2 2 1 1 works a 1 2 3 4 5 6 19550 4995 1774 1028 718 493 a 76 79 81 85 114 117 1 1 1 1 1 1 8 UTS 13094 32235 authors w 0 1 2 3 4 5 1 883 1510 2320 2430 1983 w 988 989 1018 1043 1159 1211 1 1 2 2 1 1 works a 1 2 3 4 5 6 20874 4878 1901 1024 721 445 a 130 149 158 169 177 192 1 1 1 1 1 1 ===== Replacing IDs with names ===== The structure of ''NET'' files is the following file: ''WA_Pomp.net'' % CSV2Pajek Thu Jul 13 16:24:57 2023 *vertices 36769 6054 1 "2-s2.0-85077240176" IDs of works 2 "2-s2.0-85077129493" ... 6053 "2-s2.0-84941710968" 6054 "2-s2.0-84881717588" 6055 "10038927800" IDs of authors 6056 "10039034700" 6057 "10039087200" ... 36768 "9942846400" 36769 "9943079200" *arcs 1 26773 1 [ 2019 ] arcs with a time stamp 1 18945 1 [ 2019 ] ... The IDs are used because they uniquely identify the units - some of them can have the same name. To obtain the readable results the IDs have to be replaced with the corresponding names that are available in the ''NAM'' files. file: ''authors_Pomp.nam'' % CSV2Pajek Thu Jul 13 16:24:59 2023 *vertices 30715 1 "Kopisch-Obuch, FJ" 2 "Muro, D" 3 "Berking, M" ... 
30713 "Langenberg, C" 30714 "Minov, J" 30715 "Aliaga, RS" If you don't change IDs with names, they will be used in the results, and you can then use them in searching for additional information. To change IDs with names you have first to change IDs to the default labels Network/Create new network/Transform/Add/Vertex labels/Default and afterward, replace these with labels from the ''NAM'' file (select ''*.* (all files)'' to see them). Network/Create new network/Transform/Add/Vertex labels/From file(s) ====== To do ====== The works with a very large number of coauthors substantially increase the size of the corresponding collaboration networks but the contribution of a single author is very small. How to consider only the relevant authors? An option is [[.:trunc|truncated co-appearance networks]].