====== Networks ======

[[..:alex|OpenAlex]]; [[..:alex:paj|OpenAlex 2 Pajek]]

===== Functions OpenAlex =====

https://github.com/bavla/OpenAlex/blob/main/code/OpenAlex.R

<code>
# OpenAlex
# https://github.com/bavla/OpenAlex/tree/main/code
# http://vladowiki.fmf.uni-lj.si/doku.php?id=vlado:work:bib:alex
# by Vladimir Batagelj, March 2024
# source("https://raw.githubusercontent.com/bavla/OpenAlex/main/OpenAlex.R")

# Dictionaries are environments; keys(dict) lists the stored identifiers.
keys <- ls

# Create an empty dictionary (hashed environment without a parent).
#   size - initial size of the hash table
eDict <- function(size=10000L) new.env(hash=TRUE,parent=emptyenv(),size=size)

# Fetch the values stored under a vector of keys (vectorized get).
getVals <- Vectorize(get,vectorize.args="x")

# Convert a dictionary into a data frame with one row per key
# (row names are the keys), ordered by the integer index column.
#   dict - dictionary created by eDict
#   ind  - name of the index field ("wind", "sind" or "aind")
# An empty dictionary gives a zero-row data frame instead of an error.
dict2DF <- function(dict,ind) {
  k <- keys(dict)
  if(length(k)==0L) {
    V <- data.frame(integer(0)); names(V) <- ind
    return(V)
  }
  V <- as.data.frame(t(getVals(k,dict)))
  V[[ind]] <- as.integer(unname(V[[ind]]))
  V[order(V[[ind]]),]
}

# Register the work Wid in the global dictionary works and return its index.
#   Wid    - OpenAlex work identifier
#   sWname - short name of the work; "" for a work known only as a reference
# An empty stored name is replaced by sWname; any other difference between
# the stored and the new name is reported in the trace file WC$tr.
# The index is created as an integer so that its text form is never written
# in scientific notation (100000 instead of 1e+05).
putWork <- function(Wid,sWname=""){
  if(exists(Wid,envir=works,inherits=FALSE)){
    old <- works[[Wid]][["sWname"]]
    if(old!=sWname){
      if(old=="") {works[[Wid]]["sWname"] <- sWname} else {
        cat("W",length(works),old,sWname,"\n",file=WC$tr) }}
  } else works[[Wid]] <- c(wind=length(works)+1L,sWname=sWname)
  return(works[[Wid]]["wind"])
}

# Register the source Sid in the global dictionary srces and return its index.
#   Sid   - OpenAlex source identifier
#   Sname - display name of the source; NA if unknown
# A missing stored name is filled in by a later non-missing Sname; two
# different non-missing names are reported in the trace file WC$tr.
# Names are compared only when both are known, because a comparison with NA
# cannot be used as an if condition.
putSrc <- function(Sid,Sname=NA){
  if(length(Sname)==0L) Sname <- NA
  Sname <- as.character(Sname)[1L]
  if(exists(Sid,envir=srces,inherits=FALSE)){
    old <- srces[[Sid]][["Sname"]]
    if(is.na(old)) {
      if(!is.na(Sname)) srces[[Sid]]["Sname"] <- Sname
    } else if(!is.na(Sname) && old!=Sname) {
      cat("S",length(srces),old,Sname,"\n",file=WC$tr) }
  } else srces[[Sid]] <- c(sind=length(srces)+1L,Sname=Sname)
  return(srces[[Sid]]["sind"])
}

# Register the author Aid in the global dictionary auths and return its index.
#   Aid   - OpenAlex author identifier (or a generated "AnonN" label)
#   Aname - display name of the author; NA if unknown
# The short name sAname is stored together with the name; when a missing
# name is filled in later, the short name is filled in as well.
putAuth <- function(Aid,Aname=NA){
  if(length(Aname)==0L) Aname <- NA
  Aname <- as.character(Aname)[1L]
  sAnam <- if(is.na(Aname)) NA_character_ else sAname(Aname)
  if(exists(Aid,envir=auths,inherits=FALSE)){
    old <- auths[[Aid]][["Aname"]]
    if(is.na(old)) {
      if(!is.na(Aname)) auths[[Aid]][c("Aname","sAname")] <- c(Aname,sAnam)
    } else if(!is.na(Aname) && old!=Aname) {
      cat("A",length(auths),old,Aname,"\n",file=WC$tr) }
  } else auths[[Aid]] <- c(aind=length(auths)+1L,Aname=Aname,sAname=sAnam)
  return(auths[[Aid]]["aind"])
}

# Two-letter codes of work types used in the short names of non-articles.
# NOTE(review): "dissertation" and "dataset" share the code "DS" - confirm
# whether this is intended.
.Ty <- c("article"="AR","book-chapter"="BC","dissertation"="DS","book"="BK","dataset"="DS",
  "paratext"="PT","other"="OT","reference-entry"="RE","report"="RP","peer-review"="PR",
  "standard"="ST","editorial"="ED","erratum"="ER","grant"="GR","letter"="LT")

# Strip the first 21 characters (the length of "https://openalex.org/")
# from an identifier given as URL.
getID <- function(URLid) substring(URLid,22)

# Lower-case the strings in n and capitalize their first character.
firstup <- function(n) {n <- tolower(n); substr(n,1,1) <- toupper(substr(n,1,1)); n}

# Short name of a work built from the first author's name:
# up to 8 characters of the last word, "_", initials of the preceding words,
# "(year)", and then volume:firstPage for articles or the type code otherwise.
#   name - display name of the first author
#   ty   - work type; py - publication year; vl - volume; fp - first page
# A missing type is treated as a non-article instead of stopping the run.
# NOTE(review): for a one-word name 1:(k-1) is c(1,0), so the initial of that
# word is appended; kept unchanged so that existing labels stay the same.
Gname <- function(name,ty,py,vl,fp){
  L <- firstup(unlist(strsplit(name," "))); k <- length(L)
  H <- paste(substr(L[k],1,8),paste(substr(L[1:(k-1)],1,1),sep="",collapse=""),sep="_")
  if(!is.na(ty) && ty=="article") paste(H,"(",py,")",vl,":",fp,sep="") else
    paste(H,"(",py,")",.Ty[ty],sep="")
}

# Short author name: the last word of the name, a space, and the initials of
# the preceding words (same one-word behaviour as in Gname).
sAname <- function(name){
  L <- firstup(unlist(strsplit(name," "))); k <- length(L)
  paste(L[k],paste(substr(L[1:(k-1)],1,1),sep="",collapse=""))
}

# Initialize the global reading context WC and open the trace file trace.txt.
#   query - list of OpenAlex query parameters; paging is used when it
#           contains "search"
#   list  - vector of work identifiers to be fetched one by one
#   file  - name of an ndjson file with saved works
# WC$act holds the current state of the reader:
# "page", "list", "open", "file" or "stop".
openWorks <- function(query=NULL,list=NULL,file=NULL){
  WC <<- new.env(hash=TRUE,parent=emptyenv())
  WC$works <- "https://api.openalex.org/works"
  WC$Q <- query; WC$L <- list; WC$f <- file
  WC$n <- 0; WC$l <- 0; WC$m <- 0; WC$an <- 0
  WC$tr <- file("trace.txt","w",encoding="UTF-8")
  cat("% OpenAlex",date(),"\n",file=WC$tr)
  if(length(query[["search"]])>0) {
    WC$k <- 0; WC$nr <- 0; WC$act <- "page"
    if(length(query[["per_page"]])==0) WC$Q$per_page <- "200"
    WC$Q$cursor <- "*"
  } else if(length(list)>0) {
    WC$act <- "list"
  } else if(length(file)>0) {
    WC$act <- "open"
  } else WC$act <- "stop"
}

# Return the next work, or NULL when all inputs are exhausted.
# The inputs are read in the order: query pages, list of identifiers, file.
# At most 5 state changes / failed requests are allowed in one call;
# after that the function stops with "Too many errors".
nextWork <- function(){
# repeat{
  for(t in 1:5){
    switch(WC$act,
      "page" = {
#       if(WC$n==10) {WC$act <- "list"; next}
        WC$k <- WC$k + 1
        if(WC$k>WC$nr){
          # the current page is used up - request the next one
          WC$wd <- GET(WC$works,query=WC$Q)
          if(WC$wd$status_code!=200) {WC$act <- "list"
            cat(WC$n,"GET error\n"); flush.console(); next}
          WC$k <- 1
          WC$wc <- fromJSON(rawToChar(WC$wd$content))
          WC$Q$cursor <- WC$wc$meta$next_cursor
          if(is.null(WC$Q$cursor)) {WC$act <- "list"; next}
          WC$df <- WC$wc$results
          nr <- nrow(WC$df)
          # a page without rows has nothing to return - go on with the list
          if(is.null(nr) || nr==0) {WC$nr <- 0; WC$act <- "list"; next}
          WC$nr <- nr
#         cat(WC$k,wc$meta$count,WC$nr,"\n   ",WC$Q$cursor,"\n"); flush.console()
        }
        WC$n <- WC$n + 1
        return(WC$df[WC$k,])
      },
      "list" = {
        WC$l <- WC$l + 1
        if(WC$l>length(WC$L)) {WC$act <- "open"; next}
        url <- paste(WC$works,"/",WC$L[WC$l],sep="")
        WC$wd <- GET(url,query=list(select=WC$Q[["select"]]))
        if(WC$wd$status_code!=200) {cat(WC$n,"GET error\n")
          flush.console(); next}
#       cat("   >>>",WC$l,WC$L[WC$l],"\n"); flush.console()
        wc <- fromJSON(rawToChar(WC$wd$content))
        WC$n <- WC$n + 1
        return(wc)
      },
      "open" = {
        if(is.null(WC$f)) { WC$act <- "stop"; next }
        WC$ndj <- file(WC$f,open="r")
        WC$act <- "file"; next
      },
      "file" = {
        wc <- readLines(con=WC$ndj,n=1)
        if(length(wc)==0){
          close(WC$ndj); WC$act <- "stop"; next }
        WC$m <- WC$m + 1; WC$n <- WC$n + 1
        return(fromJSON(wc))
      },
      "stop" = { return(NULL) },
      stop(paste0("No handler for ",WC$act))
    )
  }
  stop("Too many errors")
}

# Process one work w returned by nextWork: register the work, its source,
# its references and its authors, and write
#   - a row of works.csv (connection wrk),
#   - the work-source link (wj), the citation links (ci)
#     and the work-author links (wa).
# Uses the global connections wrk, wj, ci, wa and the context WC.
# Missing (NULL) fields are written as NA so that the columns of works.csv
# do not shift; authorships are accepted both as a data frame and as a list
# holding one data frame.
processWork <- function(w) {
  nz <- function(x) if(length(x)==0L) NA else x
# cat("   Process:",WC$n,w$title,"\n"); flush.console()
  Wid <- getID(w$id); hit <- TRUE
  Sid <- nz(getID(w$primary_location$source$id))
  Sname <- nz(w$primary_location$source$display_name)
  pYear <- nz(w$publication_year); pDate <- nz(w$publication_date)
  type <- nz(w$type); lang <- nz(w$language)
  vol <- nz(w$biblio$volume); iss <- nz(w$biblio$issue)
  fPage <- nz(w$biblio$first_page); lPage <- nz(w$biblio$last_page)
  title <- nz(w$title)
  # ";" is the field separator and a line break would split the row
  tit <- gsub("[\r\n]+"," ",gsub(";",",",title))
  autsh <- w$authorships
  if(!is.data.frame(autsh) && is.list(autsh) && length(autsh)==1L) autsh <- autsh[[1]]
  auts <- if(is.data.frame(autsh) && nrow(autsh)>0L) autsh$author else NULL
  noAuth <- is.null(auts) || NROW(auts)==0L
  if(noAuth) {
    cat("W",WC$n,"no authors info\n",file=WC$tr)
    WC$an <- WC$an + 1; fAName <- paste("Anon",WC$an,sep="")
  } else fAName <- auts$display_name[1]
  sWname <- Gname(fAName,type,pYear,vol,fPage)
  u <- putWork(Wid,sWname)
# cat(u,Wid,hit,sWname,Sid,pYear,pDate,type,lang,vol,iss,fPage,lPage,fAName,tit,sep=";","\n"); flush.console()
  if(!is.na(Sid)) {j <- putSrc(Sid,Sname); cat(u,j,"\n",file=wj)}
  cat(u,Wid,hit,sWname,Sid,pYear,pDate,type,lang,vol,iss,fPage,lPage,fAName,tit,
    sep=";",file=wrk); cat("\n",file=wrk)
  refs <- w$referenced_works
  if(length(w$referenced_works)==1) refs <- w$referenced_works[[1]]
  for(wk in refs) {
    # a cited work gets a placeholder row (hit = FALSE) and a citation link
    vid <- getID(wk); v <- putWork(vid,"")
    cat(v,vid,FALSE,"",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,sep=";",file=wrk)
    cat("\n",file=wrk); cat(u,v,"\n",file=ci)
  }
  if(noAuth) {
    v <- putAuth(fAName,Aname=fAName); cat(u,v,"\n",file=wa)
  } else {
    for(a in seq_len(nrow(auts))) {
      Aid <- getID(auts$id[a]); v <- putAuth(Aid,Aname=auts$display_name[a])
      cat(u,v,"\n",file=wa)
    }
  }
}

# Close the trace file and remove the global context WC.
closeWorks <- function() {close(WC$tr); rm(WC,inherits=TRUE)}
</code>

===== Program OpenAlex2Pajek =====

https://github.com/bavla/OpenAlex/blob/main/code/OpenAlex2Pajek.R

<code>
# OpenAlex2Pajek
# source("OpenAlex2Pajek.R")

library(httr)
library(jsonlite)
source("https://raw.githubusercontent.com/bavla/OpenAlex/main/code/OpenAlex.R")
# source("OpenAlex.R")

# VBlist <- read.table("VladoWorks.csv")$V1
Q <- list(
  search="handball",
# filter="publication_year:2015",
  select="id,primary_location,publication_year,publication_date,type,language,biblio,title,authorships,countries_distinct_count,cited_by_count,referenced_works_count,referenced_works",
# select="id,title,countries_distinct_count,cited_by_count,referenced_works_count",
  per_page="200"
# per_page="3"
)

# Integers as plain digits; cat() alone would print 100000 as 1e+05.
fmtInt <- function(x) sprintf("%d",as.integer(x))

# Vertex label: the name, or the identifier when the name is missing or empty.
vLabel <- function(name,id) ifelse(is.na(name) | name=="",id,name)

# Write Pajek vertex lines  <number> "<label>"  to the connection con.
catVertices <- function(con,ids,labels) {
  if(length(ids)>0L) cat(paste0(fmtInt(ids),' "',labels,'"\n'),sep="",file=con)
}

# Write Pajek arc lines  <from> <to>  to the connection con.
catArcs <- function(con,from,to) {
  if(length(from)>0L) cat(paste0(fmtInt(from)," ",fmtInt(to)," \n"),sep="",file=con)
}

# Read a temporary file of links (a header line followed by index pairs).
# A file with the header line only gives an empty table.
readLinks <- function(fname) {
  if(length(readLines(fname,n=2L,warn=FALSE))<2L)
    return(data.frame(V1=integer(0),V2=integer(0)))
  read.csv(fname,sep="",header=FALSE,skip=1,encoding="UTF-8")
}

save <- TRUE; step <- 500
if(save) json <- file("save.ndjson","w",encoding="UTF-8")
cat("OpenAlex2Pajek - Start",date(),"\n")
ci <- file("Ci.tmp","w",encoding="UTF-8"); wa <- file("WA.tmp","w",encoding="UTF-8")
wj <- file("WJ.tmp","w",encoding="UTF-8"); wrk <- file("works.csv","w",encoding="UTF-8")
cat("% OpenAlex",date(),"\n",file=wa); cat("% OpenAlex",date(),"\n",file=wj)
cat("% OpenAlex",date(),"\n",file=ci); cat("% OpenAlex",date(),"\n",file=wrk)
cat("ind;Wid;hit;sWname;Sid;pYear;pDate;type;lang;vol;iss;fPage;lPage;fAName;title\n",file=wrk)
works <- eDict(); srces <- eDict(); auths <- eDict();
# openWorks(query=Q,list=VBlist,file="manual.ndjson")
# openWorks(query=Q,list=NULL,file="save.ndjson")
# openWorks(query=Q,list=VBlist,file=NULL)
openWorks(query=Q,list=NULL,file=NULL)
# print(ls.str(WC))

cat("*** OpenAlex2Pajek - Start",date(),"\n"); flush.console()
repeat{
  w <- nextWork()
  if(is.null(w)) break
  if(save) write(toJSON(w),file=json)
  if(WC$n %% step==0) cat(date()," n =",WC$n,"\n"); flush.console()
# tryCatch(
  processWork(w)
#, error=function(e){ cat("W",WC$n,w$id,"\n"); flush.console(); print(e)} )
}

cat("*** OpenAlex2Pajek - Stop",date(),"\n"); flush.console()
# print(ls.str(WC))
close(ci); close(wa); close(wj); close(wrk)
if(save) close(json)
cat("hits:",WC$n,"works:",length(works),"authors:",length(auths),
  "anon:",WC$an,"sources:",length(srces),"\n")

# Citation Cite
# one-mode network on works; Works.nam holds the short names of the works
U <- dict2DF(works,"wind")
n <- nrow(U)
net <- file("Cite.net","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=net)
nam <- file("Works.nam","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=nam)
cat("% OpenAlex2Pajek",date(),"\n*vertices",fmtInt(n),"\n",file=net)
cat("% OpenAlex2Pajek",date(),"\n*vertices",fmtInt(n),"\n",file=nam)
Ci <- readLinks("Ci.tmp")
catVertices(net,seq_len(n),row.names(U))
catVertices(nam,seq_len(n),vLabel(U[["sWname"]],row.names(U)))
cat("*arcs\n",file=net)
catArcs(net,Ci$V1,Ci$V2)
close(net); close(nam); rm(Ci)

# Authorship WA
# two-mode network works x authors; author vertices are numbered after works
A <- dict2DF(auths,"aind")
m <- nrow(A)
net <- file("WA.net","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=net)
nam <- file("Authors.nam","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=nam)
cat("% OpenAlex2Pajek",date(),"\n*vertices",fmtInt(n+m),fmtInt(n),"\n",file=net)
cat("% OpenAlex2Pajek",date(),"\n*vertices",fmtInt(m),"\n",file=nam)
WA <- readLinks("WA.tmp")
catVertices(net,seq_len(n),row.names(U))
catVertices(net,n+seq_len(m),row.names(A))
catVertices(nam,seq_len(m),vLabel(A[["sAname"]],row.names(A)))
cat("*arcs\n",file=net)
catArcs(net,WA$V1,n+WA$V2)
close(net); close(nam); rm(WA)

# Sources WJ
# two-mode network works x sources; source vertices are numbered after works
J <- dict2DF(srces,"sind")
m <- nrow(J)
net <- file("WJ.net","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=net)
nam <- file("Sources.nam","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=nam)
cat("% OpenAlex2Pajek",date(),"\n*vertices",fmtInt(n+m),fmtInt(n),"\n",file=net)
cat("% OpenAlex2Pajek",date(),"\n*vertices",fmtInt(m),"\n",file=nam)
WJ <- readLinks("WJ.tmp")
catVertices(net,seq_len(n),row.names(U))
catVertices(net,n+seq_len(m),row.names(J))
catVertices(nam,seq_len(m),vLabel(J[["Sname"]],row.names(J)))
cat("*arcs\n",file=net)
catArcs(net,WJ$V1,n+WJ$V2)
close(net); close(nam); rm(WJ)
# closeWorks()
</code>

===== Creating Handball Pajek networks =====

https://github.com/bavla/OpenAlex/raw/main/data/handball.zip

<code>
> source("OpenAlex2Pajek.R")
OpenAlex2Pajek - Start Mon Mar 18 05:34:58 2024
*** OpenAlex2Pajek - Start Mon Mar 18 05:34:58 2024
Mon Mar 18 05:35:07 2024 n = 500
Mon Mar 18 05:35:15 2024 n = 1000
Mon Mar 18 05:35:25 2024 n = 1500
Mon Mar 18 05:35:33 2024 n = 2000
Mon Mar 18 05:35:45 2024 n = 2500
Mon Mar 18 05:35:53 2024 n = 3000
...
Mon Mar 18 05:53:38 2024 n = 23500
Mon Mar 18 05:54:04 2024 n = 24000
Mon Mar 18 05:54:33 2024 n = 24500
Mon Mar 18 05:55:02 2024 n = 25000
Mon Mar 18 05:55:51 2024 n = 25500
*** OpenAlex2Pajek - Stop Mon Mar 18 05:56:06 2024
hits: 25861 works: 233471 authors: 52643 anon: 1325 sources: 5510
</code>

===== Changes =====

  - value val in putDict functions was changed from named vector to a named list (March 19, 2024)
  - added partitions pYear, hit, type, lang (version 1. March 22, 2024)

===== To do =====

  - add the type of work to the works dictionary (March 22, 2024)
  - 1e+05 -> 100000; 2e+05 -> 200000
  - add keywords network WK
  - add countries network WL (L - location)

\\
[[..:alex|OpenAlex]]; [[..:alex:paj|OpenAlex 2 Pajek]]