Networks

OpenAlex functions

https://github.com/bavla/OpenAlex/blob/main/code/OpenAlex.R

# OpenAlex
# https://github.com/bavla/OpenAlex/tree/main/code
# http://vladowiki.fmf.uni-lj.si/doku.php?id=vlado:work:bib:alex
# by Vladimir Batagelj, March 2024
# source("https://raw.githubusercontent.com/bavla/OpenAlex/main/OpenAlex.R")

# `keys(dict)` lists all keys stored in a dictionary environment (alias of base::ls).
keys <- ls

# Create an empty hashed environment used as a mutable dictionary;
# `size` is a hint for the initial hash-table size.
eDict <- function(size = 10000L) {
  new.env(hash = TRUE, parent = emptyenv(), size = size)
}

# Vectorized base::get(): look up each name in the character vector `x`
# inside a dictionary environment.  Results are simplified by Vectorize()
# (sapply rules), so equal-length stored values come back as a matrix
# with one column per key.
getVals <- Vectorize(get,vectorize.args="x")

# Convert a dictionary environment (see eDict) into a data frame.
# Each key becomes a row name and the fields of the stored named vectors
# become columns.  `ind` names the integer index column; rows are ordered
# by that index, so row i corresponds to the i-th inserted entry.
dict2DF <- function(dict,ind) {
  V <- as.data.frame(t(getVals(keys(dict),dict)))   # rows = keys, cols = fields
  V[[ind]] <- as.integer(unname(V[[ind]]))          # index back to integer (c() coerced it to character)
  return(V[order(V[[ind]]),])
}

# Register work `Wid` in the global dictionary `works` (or update its short
# name) and return its 1-based index.  A stored empty short name is filled
# in on a later call; a conflicting non-empty short name is logged to the
# trace connection WC$tr.
putWork <- function(Wid, sWname = "") {
  if (!exists(Wid, envir = works, inherits = FALSE)) {
    # first time we see this work
    works[[Wid]] <- c(wind = length(works) + 1, sWname = sWname)
  } else if (works[[Wid]]["sWname"] != sWname) {
    if (works[[Wid]]["sWname"] == "") {
      works[[Wid]]["sWname"] <- sWname
    } else {
      cat("W", length(works), works[[Wid]]["sWname"], sWname, "\n", file = WC$tr)
    }
  }
  return(works[[Wid]]["wind"])
}

# Register source `Sid` in the global dictionary `srces` (or fill in a
# missing name) and return its 1-based index.
#
# Sid   : OpenAlex source id (dictionary key).
# Sname : display name of the source; NA when unknown.
#
# Fix: the original compared the stored name with `!=` even when either
# side was NA, so `if (NA)` raised an error (e.g. on any repeated call
# with the default Sname=NA).  NA names are now handled explicitly.
putSrc <- function(Sid, Sname = NA) {
  if (exists(Sid, envir = srces, inherits = FALSE)) {
    old <- srces[[Sid]]["Sname"]
    if (!is.na(Sname)) {
      if (is.na(old)) {
        srces[[Sid]]["Sname"] <- Sname   # fill in a previously unknown name
      } else if (old != Sname) {
        cat("S", length(srces), old, Sname, "\n", file = WC$tr)   # log the conflict
      }
    }
  } else {
    srces[[Sid]] <- c(sind = length(srces) + 1, Sname = Sname)
  }
  return(srces[[Sid]]["sind"])
}

# Register author `Aid` in the global dictionary `auths` (or fill in a
# missing name) and return its 1-based index.  On first insertion a short
# name (sAname) is derived and stored next to the full display name; it is
# intentionally not recomputed on later updates.
#
# Aid   : OpenAlex author id (dictionary key).
# Aname : author display name; NA when unknown.
#
# Fix: the original compared names with `!=` even when one side was NA
# (errors on `if (NA)`), and used scalar ifelse(); both are handled now.
putAuth <- function(Aid, Aname = NA) {
  if (exists(Aid, envir = auths, inherits = FALSE)) {
    old <- auths[[Aid]]["Aname"]
    if (!is.na(Aname)) {
      if (is.na(old)) {
        auths[[Aid]]["Aname"] <- Aname   # fill in a previously unknown name
      } else if (old != Aname) {
        cat("A", length(auths), old, Aname, "\n", file = WC$tr)   # log the conflict
      }
    }
  } else {
    # short name only needed for new entries; sAname() is side-effect free
    sAnam <- if (is.na(Aname)) NA else sAname(Aname)
    auths[[Aid]] <- c(aind = length(auths) + 1, Aname = Aname, sAname = sAnam)
  }
  return(auths[[Aid]]["aind"])
}

# Two-letter type codes appended to short work names (see Gname) for
# non-article works.  NOTE(review): "dissertation" and "dataset" both map
# to "DS" - this looks like a typo (one of them probably needs its own
# code); confirm the intended codes before relying on type partitions.
.Ty <- c("article"="AR","book-chapter"="BC","dissertation"="DS","book"="BK","dataset"="DS",        
  "paratext"="PT","other"="OT","reference-entry"="RE","report"="RP","peer-review"="PR",    
  "standard"="ST","editorial"="ED","erratum"="ER","grant"="GR","letter"="LT")

# Strip the "https://openalex.org/" prefix (21 characters) from an entity
# URL, leaving the bare OpenAlex id such as "W2741809807".
getID <- function(URLid) substr(URLid, 22, nchar(URLid))

# Lower-case each string in `n`, then capitalise its first character
# ("HELLO" -> "Hello").  Vectorized; empty strings pass through unchanged.
firstup <- function(n) {
  low <- tolower(n)
  paste0(toupper(substr(low, 1, 1)), substring(low, 2))
}

# Build the short (Pajek) name of a work from the first author's name,
# work type, publication year, volume and first page:
#   "Smith_J(2020)5:10" for an article, "Smith_J(2020)BK" otherwise
# (the two-letter code comes from .Ty).  The surname is truncated to 8
# characters; given names contribute their initials.
#
# Fix: for single-token names the original `L[1:(k-1)]` fell into the 1:0
# trap and reused the surname initial; seq_len() yields no initials there.
Gname <- function(name, ty, py, vl, fp) {
  L <- firstup(unlist(strsplit(name, " ")))
  k <- length(L)
  inits <- paste(substr(L[seq_len(k - 1)], 1, 1), sep = "", collapse = "")
  H <- paste(substr(L[k], 1, 8), inits, sep = "_")
  if (ty == "article") paste(H, "(", py, ")", vl, ":", fp, sep = "") else
    paste(H, "(", py, ")", .Ty[ty], sep = "")
}

# Derive a short author name: surname followed by given-name initials,
# e.g. "John A. Smith" -> "Smith JA".
#
# Fixes: the result is now the explicit value of the last expression, and
# single-token names no longer duplicate their own initial (the 1:0 trap
# in `L[1:(k-1)]`) - they are returned as-is.
sAname <- function(name) {
  L <- firstup(unlist(strsplit(name, " ")))
  k <- length(L)
  if (k == 1) return(L[1])
  paste(L[k], paste(substr(L[seq_len(k - 1)], 1, 1), sep = "", collapse = ""))
}

# Initialise the global crawl state `WC` and choose the input mode for
# nextWork(): a search query ("page"), an explicit id list ("list"), a
# saved ndjson file ("open") or nothing ("stop").  Also opens trace.txt,
# where the put* functions log name conflicts.
openWorks <- function(query = NULL, list = NULL, file = NULL) {
  WC <<- new.env(hash = TRUE, parent = emptyenv())
  WC$works <- "https://api.openalex.org/works"
  WC$Q <- query
  WC$L <- list
  WC$f <- file
  # counters: hits, list position, file lines, anonymous authors
  WC$n <- 0; WC$l <- 0; WC$m <- 0; WC$an <- 0
  WC$tr <- file("trace.txt", "w", encoding = "UTF-8")
  cat("% OpenAlex", date(), "\n", file = WC$tr)
  if (length(query[["search"]]) > 0) {
    # cursor-paged download driven by the query
    WC$k <- 0
    WC$nr <- 0
    WC$act <- "page"
    if (length(query[["per_page"]]) == 0) WC$Q$per_page <- "200"
    WC$Q$cursor <- "*"
  } else if (length(list) > 0) {
    WC$act <- "list"
  } else if (length(file) > 0) {
    WC$act <- "open"
  } else {
    WC$act <- "stop"
  }
}

# Return the next work record (parsed JSON / a data-frame row) or NULL
# when all inputs are exhausted.  A state machine driven by WC$act:
#   "page" - cursor-paged download from the search query WC$Q,
#   "list" - fetch individual works listed by id in WC$L,
#   "open" - open the ndjson file WC$f, then
#   "file" - read it one JSON line at a time,
#   "stop" - nothing left.
# A failing or exhausted state falls through to the next source via
# `next`; after 5 consecutive non-returning iterations the function
# stops with an error.
nextWork <- function(){
  # repeat{
  for(t in 1:5){
    switch(WC$act,
      "page" = {
        # if(WC$n==10) {WC$act <- "list"; next}
        WC$k <- WC$k + 1
        if(WC$k>WC$nr){
          # current page used up - fetch the next one with the stored cursor
          WC$wd <- GET(WC$works,query=WC$Q)
          if(WC$wd$status_code!=200) {WC$act <- "list"
            cat(WC$n,"GET error\n"); flush.console(); next}
          WC$k <- 1
          WC$wc <- fromJSON(rawToChar(WC$wd$content))
          WC$Q$cursor <- WC$wc$meta$next_cursor
          # a missing cursor means the last page was reached
          if(is.null(WC$Q$cursor)) {WC$act <- "list"; next}
          WC$df <- WC$wc$results; WC$nr <- nrow(WC$df)
          # cat(WC$k,wc$meta$count,WC$nr,"\n   ",WC$Q$cursor,"\n"); flush.console()
        }
        WC$n <- WC$n + 1
        return(WC$df[WC$k,])    
      },
      "list" = {
        WC$l <- WC$l + 1 
        if(WC$l>length(WC$L)) {WC$act <- "open"; next}
        # fetch a single work by its OpenAlex id (local `works` is the URL)
        works <- paste(WC$works,"/",WC$L[WC$l],sep="")
        WC$wd <- GET(works,query=list(select=WC$Q[["select"]]))
        if(WC$wd$status_code!=200) {cat(WC$n,"GET error\n")
          flush.console(); next}
        # cat("   >>>",WC$l,WC$L[WC$l],"\n"); flush.console()
        wc <- fromJSON(rawToChar(WC$wd$content))
        WC$n <- WC$n + 1
        return(wc)
      },
      "open" = {
        if(is.null(WC$f)) { WC$act <- "stop"; next }
        WC$ndj <- file(WC$f,open="r")
        WC$act <- "file"; next 
      },
      "file" = {
        wc <- readLines(con=WC$ndj,n=1)
        if(length(wc)==0){ close(WC$ndj); WC$act <- "stop"; next }
        WC$m <- WC$m + 1; WC$n <- WC$n + 1
        return(fromJSON(wc))
      },
      "stop" = { return(NULL) },
      stop(paste0("No handler for ",WC$act))
    ) 
  }
  stop("Too many errors")
}

# Convert one work record `w` (parsed OpenAlex JSON) into output records:
# a works.csv row (connection wrk), a work-source arc (wj), citation arcs
# (ci) and work-author arcs (wa), registering the work, its source and
# its authors in the global dictionaries along the way.  Works without
# author information get a generated "AnonK" first author and are logged
# to the trace file.
processWork <- function(w) {
  # cat("   Process:",WC$n,w$title,"\n"); flush.console()
  Wid <- getID(w$id); hit <- TRUE
  Sid <- getID(w$primary_location$source$id)
  Sname <- w$primary_location$source$display_name 
  pYear <- w$publication_year; pDate <- w$publication_date
  type <- w$type; lang <- w$language
  vol <- w$biblio$volume; iss <- w$biblio$issue
  fPage <- w$biblio$first_page; lPage <- w$biblio$last_page
  title <- w$title; tit <- gsub(";",",",title)   # ";" is the CSV separator
  # NOTE(review): authorships apparently arrive either as a data frame or
  # wrapped in a one-element list depending on the input path (API page vs.
  # single work / ndjson line) - confirm against nextWork()'s return shapes
  autsh <- w$authorships[[1]]
  if(nrow(autsh)==0) { cat("W",WC$n,"no authors info\n",file=WC$tr)
    WC$an <- WC$an + 1; fAName <- paste("Anon",WC$an,sep="")
  } else { fAName <- w$authorships$author$display_name[1]
    if(length(w$authorships)==1) fAName <- w$authorships[[1]]$author$display_name[1]}
  sWname <- Gname(fAName,type,pYear,vol,fPage)
  u <- putWork(Wid,sWname)
  # cat(u,Wid,hit,sWname,Sid,pYear,pDate,type,lang,vol,iss,fPage,lPage,fAName,tit,sep=";","\n"); flush.console()
  if(!is.na(Sid)) {j <- putSrc(Sid,Sname); cat(u,j,"\n",file=wj)}
  cat(u,Wid,hit,sWname,Sid,pYear,pDate,type,lang,vol,iss,fPage,lPage,fAName,tit,
    sep=";",file=wrk); cat("\n",file=wrk)
  # referenced works: register each as a (possibly not yet downloaded)
  # work with hit=FALSE and write a citation arc u -> v
  refs <- w$referenced_works
  if(length(w$referenced_works)==1) refs <- w$referenced_works[[1]]
  for(wk in refs) {
    vid <- getID(wk); v <- putWork(vid,"")
    cat(v,vid,FALSE,"",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,sep=";",file=wrk)
    cat("\n",file=wrk); cat(u,v,"\n",file=ci) }
  # authorship arcs: either the single anonymous author or all real ones
  if(nrow(autsh)==0) {
    v <- putAuth(fAName,Aname=fAName); cat(u,v,"\n",file=wa)
  } else {
    auts <- w$authorships$author
    if(is.null(auts)) auts <- w$authorships[[1]]$author 
    for(a in 1:nrow(auts)) {
      Aid <- getID(auts$id[a]); v <- putAuth(Aid,Aname=auts$display_name[a])
      cat(u,v,"\n",file=wa) } }
}

# Close the trace connection and drop the global crawl state WC.
closeWorks <- function() {
  close(WC$tr)
  rm(WC, inherits = TRUE)
}

Program OpenAlex2Pajek

https://github.com/bavla/OpenAlex/blob/main/code/OpenAlex2Pajek.R

# OpenAlex2Pajek 
# source("OpenAlex2Pajek.R")

# Dependencies: httr for the REST calls, jsonlite for (de)serialising works.
library(httr)
library(jsonlite)
source("https://raw.githubusercontent.com/bavla/OpenAlex/main/code/OpenAlex.R")
# source("OpenAlex.R")

# VBlist <- read.table("VladoWorks.csv")$V1
# Query for the OpenAlex /works endpoint: full-text search for "handball",
# fetching only the fields processWork() needs, 200 works per page.
Q <- list(
  search="handball",
#  filter="publication_year:2015",
  select="id,primary_location,publication_year,publication_date,type,language,biblio,title,authorships,countries_distinct_count,cited_by_count,referenced_works_count,referenced_works",
#  select="id,title,countries_distinct_count,cited_by_count,referenced_works_count",
  per_page="200"
#  per_page="3"
)
# save: also dump every downloaded work to save.ndjson (one JSON per line);
# step: progress-report interval (number of processed works).
save <- TRUE; step <- 500
if(save) json <- file("save.ndjson","w",encoding="UTF-8")

cat("OpenAlex2Pajek - Start",date(),"\n")
# Temporary link files (citation, work-author, work-source arcs) and the
# works.csv table with one ";"-separated row per work.
ci <- file("Ci.tmp","w",encoding="UTF-8"); wa <- file("WA.tmp","w",encoding="UTF-8")
wj <- file("WJ.tmp","w",encoding="UTF-8"); wrk <- file("works.csv","w",encoding="UTF-8")
cat("% OpenAlex",date(),"\n",file=wa); cat("% OpenAlex",date(),"\n",file=wj)
cat("% OpenAlex",date(),"\n",file=ci); cat("% OpenAlex",date(),"\n",file=wrk)
cat("ind;Wid;hit;sWname;Sid;pYear;pDate;type;lang;vol;iss;fPage;lPage;fAName;title\n",file=wrk)

# Global dictionaries: id -> (index, name) for works, sources and authors.
works <- eDict(); srces <- eDict(); auths <- eDict();

# Choose exactly one input mode for openWorks (query / id list / ndjson file).
# openWorks(query=Q,list=VBlist,file="manual.ndjson")
# openWorks(query=Q,list=NULL,file="save.ndjson")
# openWorks(query=Q,list=VBlist,file=NULL)
openWorks(query=Q,list=NULL,file=NULL)
# print(ls.str(WC))
cat("*** OpenAlex2Pajek - Start",date(),"\n"); flush.console()
# Main loop: pull works one at a time and convert each into CSV/arc records.
repeat{
  w <- nextWork()
  if(is.null(w)) break
  if(save) write(toJSON(w),file=json)
  # NOTE(review): flush.console() sits outside the if() (the ";" ends it),
  # so the console is flushed after every work - harmless but likely
  # unintended; confirm before tidying.
  if(WC$n %% step==0) cat(date()," n =",WC$n,"\n"); flush.console()
#  tryCatch(
  processWork(w) #,
#    error=function(e){ cat("W",WC$n,w$id,"\n"); flush.console(); print(e)} )
}
cat("*** OpenAlex2Pajek - Stop",date(),"\n"); flush.console()
# print(ls.str(WC))
close(ci);  close(wa); close(wj); close(wrk)
if(save) close(json)
# Summary: downloaded hits vs. all registered entities (works includes
# cited-only works that were never downloaded themselves).
cat("hits:",WC$n,"works:",length(works),"authors:",length(auths),
  "anon:",WC$an,"sources:",length(srces),"\n")

# Citation Cite
U <- dict2DF(works,"wind")
n <- nrow(U)
net <- file("Cite.net","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=net)
nam <- file("Works.nam","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=nam)
cat("% OpenAlex2Pajek",date(),"\n*vertices",n,"\n",file=net)
cat("% OpenAlex2Pajek",date(),"\n*vertices",n,"\n",file=nam)
Ci <- read.csv("Ci.tmp",sep="",head=FALSE,skip=1,encoding="UTF-8")
for(i in 1:n){
  cat(i,' "',row.names(U)[i],'"\n',sep="",file=net)
  cat(i,' "',ifelse(U[["sWname"]][i]=="",row.names(U)[i],U[["sWname"]][i]),'"\n',sep="",file=nam)
}
cat("*arcs\n",file=net)
for(i in 1:nrow(Ci)) cat(Ci$V1[i],Ci$V2[i],"\n",file=net)
close(net); close(nam); rm(Ci)

# Authorship WA
A <- dict2DF(auths,"aind")
m <- nrow(A)
net <- file("WA.net","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=net)
nam <- file("Authors.nam","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=nam)
cat("% OpenAlex2Pajek",date(),"\n*vertices",n+m,n,"\n",file=net)
cat("% OpenAlex2Pajek",date(),"\n*vertices",m,"\n",file=nam)
WA <- read.csv("WA.tmp",sep="",head=FALSE,skip=1,encoding="UTF-8")
for(i in 1:n) cat(i,' "',row.names(U)[i],'"\n',sep="",file=net)
for(i in 1:m){
  cat(n+i,' "',row.names(A)[i],'"\n',sep="",file=net)
  cat(i,' "',ifelse(A[["sAname"]][i]=="",row.names(A)[i],A[["sAname"]][i]),'"\n',sep="",file=nam)
}
cat("*arcs\n",file=net)
for(i in 1:nrow(WA)) cat(WA$V1[i],n+WA$V2[i],"\n",file=net)
close(net); close(nam); rm(WA)

# Sources WJ
J <- dict2DF(srces,"sind")
m <- nrow(J)
net <- file("WJ.net","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=net)
nam <- file("Sources.nam","w",encoding="UTF-8"); cat('\xEF\xBB\xBF',file=nam)
cat("% OpenAlex2Pajek",date(),"\n*vertices",n+m,n,"\n",file=net)
cat("% OpenAlex2Pajek",date(),"\n*vertices",m,"\n",file=nam)
WJ <- read.csv("WJ.tmp",sep="",head=FALSE,skip=1,encoding="UTF-8")
for(i in 1:n) cat(i,' "',row.names(U)[i],'"\n',sep="",file=net)
for(i in 1:m){
  cat(n+i,' "',row.names(J)[i],'"\n',sep="",file=net)
  cat(i,' "',ifelse(J[["Sname"]][i]=="",row.names(J)[i],J[["Sname"]][i]),'"\n',sep="",file=nam)
}
cat("*arcs\n",file=net)
for(i in 1:nrow(WJ)) cat(WJ$V1[i],n+WJ$V2[i],"\n",file=net)
close(net); close(nam); rm(WJ)

# closeWorks() 

Creating Handball Pajek networks

https://github.com/bavla/OpenAlex/raw/main/data/handball.zip

> source("OpenAlex2Pajek.R")
OpenAlex2Pajek - Start Mon Mar 18 05:34:58 2024 
*** OpenAlex2Pajek - Start Mon Mar 18 05:34:58 2024 
Mon Mar 18 05:35:07 2024  n = 500 
Mon Mar 18 05:35:15 2024  n = 1000 
Mon Mar 18 05:35:25 2024  n = 1500 
Mon Mar 18 05:35:33 2024  n = 2000 
Mon Mar 18 05:35:45 2024  n = 2500 
Mon Mar 18 05:35:53 2024  n = 3000 
...
Mon Mar 18 05:53:38 2024  n = 23500 
Mon Mar 18 05:54:04 2024  n = 24000 
Mon Mar 18 05:54:33 2024  n = 24500 
Mon Mar 18 05:55:02 2024  n = 25000 
Mon Mar 18 05:55:51 2024  n = 25500 
*** OpenAlex2Pajek - Stop Mon Mar 18 05:56:06 2024 
hits: 25861 works: 233471 authors: 52643 anon: 1325 sources: 5510 

Changes

  1. the value `val` in the put* dictionary functions (putWork, putSrc, putAuth) was changed from a named vector to a named list (March 19, 2024)
  2. added partitions pYear, hit, type, lang (version 1. March 22, 2024)


To do

  1. add the type of work to the works dictionary (March 22, 2024)
  2. print large counts as plain integers instead of scientific notation (1e+05 → 100000; 2e+05 → 200000)
  3. add keywords network WK
  4. add countries network WL (L - location)


OpenAlex; OpenAlex 2 Pajek

vlado/work/bib/alex/nets.txt · Last modified: 2024/03/22 16:24 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki