S2ORC metadata to Pajek

Program

The Python program that transforms the S2ORC metadata.csv file into a collection of Pajek networks combines solutions from WoS2Pajek with lemmatization based on the NLTK library:
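
The program relies on several NLTK resources (tokenizer, POS tagger, WordNet, and the English stopword list). If they are not yet installed, a one-time download along the following lines should suffice (a minimal sketch; the set of packages is inferred from the calls used in the program):

import nltk
nltk.download('punkt')                        # nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')   # nltk.pos_tag
nltk.download('wordnet')                      # WordNetLemmatizer
nltk.download('stopwords')                    # stopwords.words("english")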

# meta2nets - metadata.csv to Pajek bibliographic networks
# S2ORC to Pajek 0.1
# by Vladimir Batagelj, December 11, 2020
 
wdir = "C:/Users/batagelj/Documents/2020/corona/test"
ddir = "C:/Users/batagelj/Documents/2020/corona/test"
import sys, os, re, datetime, csv, json, shutil, time
os.chdir(wdir)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
 
def get_wordnet_pos(word):
   """Map POS tag to first character lemmatize() accepts"""
   tag = nltk.pos_tag([word])[0][1][0].upper()
   tag_dict = {"J": wordnet.ADJ,
               "N": wordnet.NOUN,
               "V": wordnet.VERB,
               "R": wordnet.ADV}
   return tag_dict.get(tag, wordnet.NOUN)
 
def indAuthor(name):
# determines the index of an author
   global naut, aut, authors
   if name in aut:
     return aut[name]
   else:
     naut = naut + 1;
     aut[name] = naut
     authors.write(str(naut)+' "'+name+'"\n')
     return naut
 
def indJournal(name):
# determines the index of a journal
   global njr, jour, journals
   name = name.upper()
   if name in jour:
     return jour[name]
   else:
     njr = njr + 1;
     jour[name] = njr
     journals.write(str(njr)+' "'+name+'"\n')
     return njr
 
def indKeyword(name):
# determines the index of a keyword
   global nkw, keyw, keywords
   if name in keyw:
     return keyw[name]
   else:
     nkw = nkw + 1;
     keyw[name] = nkw
     keywords.write(str(nkw)+' "'+name+'"\n')
     return nkw
 
version = "S2ORC to Pajek 0.1"
print(version)
ts = datetime.datetime.now(); numrec = 0
print('{0}: {1}\n'.format("START",ts))
 
fromA = False; fromT = True; mstep = 5000; delfiles = False
works = open(wdir+'/works.tmp','w',encoding="utf-8-sig")
worksinfo = open(wdir+'/works.csv','w',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','w',encoding="utf-8-sig")
years  = open(wdir+'/years.tmp','w')
journals  = open(wdir+'/journals.tmp','w',encoding="utf-8-sig")
authlinks  = open(wdir+'/authlinks.tmp','w')
keywlinks  = open(wdir+'/keywlinks.tmp','w')
jourlinks  = open(wdir+'/jourlinks.tmp','w')
keywords  = open(wdir+'/keywords.tmp','w',encoding="utf-8-sig")
 
aut  = {}; naut = 0
keyw = {}; nkw  = 0
jour = {}; njr  = 1
jour['*****'] = njr
journals.write(str(njr)+' "*****"\n')
 
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
#add words that aren't in the NLTK stopwords list
add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']']
new_stopwords = stop_words.union(add_words)
 
with open('metadata.csv',newline='',encoding="utf-8") as csvfile:
   csvreader = csv.DictReader(csvfile,delimiter=',',quotechar='"')
   numrec = 0
   worksinfo.write("num|name|pubTime|ID|DOI|PMC|pubMed\n")
   for row in csvreader:
      numrec += 1
      # if numrec > 2000: break
      if (numrec % mstep) == 0:
         print('{0}: {1}'.format(numrec,datetime.datetime.now()))        
      years.write('{0}: {1} {2} {3}\n'.format(numrec,row["cord_uid"],
         row["publish_time"],row["source_x"]))
      Au = row["authors"].split(";")
      firstAu = Au[0].strip() if len(Au)>0 else "Anonymous" 
      name = firstAu.split(",")[0] if len(firstAu)>0 else "Anonymous" 
      worksinfo.write(str(numrec)+"|"+name+"|"+row["publish_time"]+"|"+\
         row['cord_uid']+"|"+row['doi']+"|"+row['pmcid']+"|"+row['pubmed_id']+"\n")
      works.write(str(numrec)+' "'+name+':'+row["publish_time"]+'"\n')
      #   row['cord_uid'])
      for s in Au:
         iauth = indAuthor(s.strip())
         authlinks.write("{0} {1}\n".format(numrec,iauth))
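      # build the text S from the title and/or abstract, as selected by fromT / fromA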
      S = (row["title"]+" "+row["abstract"] if fromA & fromT else\
          row["abstract"] if fromA else row["title"])\
          .lower().replace("/"," ").replace("-"," ")
      L = set([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
               w in nltk.word_tokenize(S)])
      C = set([w for w in L if w not in new_stopwords])
      for k in C:
         ikeyw = indKeyword(k)
         keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
      # works without a journal name are assigned to the "*****" placeholder (index 1)
      ijour = indJournal(row["journal"] if row["journal"] else "*****")
      jourlinks.write("{0} {1}\n".format(numrec,ijour))
 
authors.close(); journals.close(); keywords.close()
worksinfo.close(); works.close(); years.close()
authlinks.close(); keywlinks.close(); jourlinks.close()
 
print("number of works    ={0:7}".format(numrec))
print("number of authors  ={0:7}".format(naut))
print("number of journals ={0:7}".format(njr))
print("number of keywords ={0:7}".format(nkw))
 
tr = datetime.datetime.now()
print('{0}: {1}\n'.format(numrec,tr))
 
# time.sleep(3)
 
# works X authors network
print("works X authors  network: "+wdir+"/WA.net\n")
works  = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','r',encoding="utf-8-sig")
wa  = open(wdir+'/WA.net','w',encoding="utf-8-sig")
wa.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wa.write('*vertices '+str(numrec+naut)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wa)
works.close()
while True:
   line = authors.readline()
   if not line: break
   s = line.split(" ",1)
   wa.write(str(int(s[0])+numrec)+' '+s[1])
temp  = open(wdir+'/authlinks.tmp','r')
wa.write('*arcs\n')
while True:
   line = temp.readline()
   if not line: break
   s = line.split(" ")
   wa.write(s[0]+' '+str(int(s[1])+numrec)+'\n')
temp.close(); wa.close(); authors.close()
 
# works X journals network
print("works X journals  network: "+wdir+"/WJ.net\n")
works  = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
journals = open(wdir+'/journals.tmp','r',encoding="utf-8-sig")
wj  = open(wdir+'/WJ.net','w',encoding="utf-8-sig")
wj.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wj.write('*vertices '+str(numrec+njr)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wj)
works.close()
while True:
   line = journals.readline()
   if not line: break
   s = line.split(" ",1)
   wj.write(str(int(s[0])+numrec)+' '+s[1])
temp  = open(wdir+'/jourlinks.tmp','r')
wj.write('*arcs\n')
while True:
   line = temp.readline()
   if not line: break
   s = line.split(" ")
   wj.write(s[0]+' '+str(int(s[1])+numrec)+'\n')
temp.close(); wj.close(); journals.close()
 
# works X keywords network
print("works X keywords  network: "+wdir+"/WK.net\n")
works  = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
keywords = open(wdir+'/keywords.tmp','r',encoding="utf-8-sig")
wk  = open(wdir+'/WK.net','w',encoding="utf-8-sig")
wk.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wk.write('*vertices '+str(numrec+nkw)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wk)
works.close()
while True:
   line = keywords.readline()
   if not line: break
   s = line.split(" ",1)
   wk.write(str(int(s[0])+numrec)+' '+s[1])
temp  = open(wdir+'/keywlinks.tmp','r')
wk.write('*arcs\n')
while True:
   line = temp.readline()
   if not line: break
   s = line.split(" ")
   wk.write(s[0]+' '+str(int(s[1])+numrec)+'\n')
temp.close(); wk.close(); keywords.close()
 
if delfiles:
   try:
      os.remove(wdir+'/works.tmp')
      os.remove(wdir+'/authors.tmp');  os.remove(wdir+'/authlinks.tmp')      
      os.remove(wdir+'/keywords.tmp'); os.remove(wdir+'/keywlinks.tmp')
      os.remove(wdir+'/journals.tmp'); os.remove(wdir+'/jourlinks.tmp')
      # os.remove(wdir+'/years.tmp'); os.remove(wdir+'/works.csv')
   except OSError:
      print("unable to delete some temp files")
 
tf = datetime.datetime.now()
print('{0}: {1}\n'.format("END",tf))
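
Each of the produced files WA.net, WJ.net and WK.net is a two-mode Pajek network: the first numrec vertices are the works, the remaining vertices are the authors (journals, keywords), and every arc goes from a work to one of its authors (its journal, one of its keywords). A quick sanity check of a generated file can be made along the following lines (a minimal sketch; the function checkNet is only illustrative and assumes the networks are in the current working directory):

def checkNet(fname):
# report the declared and listed numbers of vertices and arcs of a two-mode Pajek file
   with open(fname, encoding="utf-8-sig") as net:
      lines = [line.rstrip("\n") for line in net if not line.startswith("%")]
   head = lines[0].split()               # "*vertices n+m n"
   nall = int(head[1]); nworks = int(head[2])
   iarcs = lines.index("*arcs")
   print(fname, "vertices:", nall, "works:", nworks,
         "listed vertices:", iarcs-1, "arcs:", len(lines)-iarcs-1)

checkNet("WA.net"); checkNet("WJ.net"); checkNet("WK.net")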

Running the program

Running the program on the first 1000 records, we get the following time stamps:

>>> 
===== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2nets.py =====
*** metadata to networks
0: 2020-12-12 03:33:48.161604
 
1001: 2020-12-12 03:34:03.557485
 
>>> 375*15/60
93.75
>>> 375*15/60/60
1.5625
>>> 

Since there are 375094 records (works), we get an estimate of about an hour and a half of processing time for the entire file.

The program is controlled by the settings in the line (example settings for the runs reported below are sketched after the list)

fromA = False; fromT = True; mstep = 5000; delfiles = False
  • fromA - use abstracts as a source of keywords
  • fromT - use titles as a source of keywords
  • mstep - print a progress report after every mstep records
  • delfiles - remove the auxiliary .tmp files at the end
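
For example, the run "Keywords from titles and abstracts" reported below corresponds to settings along these lines (whether delfiles was changed for that run is not recorded here):

# keywords from titles only (as in the listing above)
fromA = False; fromT = True; mstep = 5000; delfiles = False
# keywords from titles and abstracts
fromA = True;  fromT = True; mstep = 5000; delfiles = False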

Keywords from titles

>>> 
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2020-12-12 16:06:06.710062
 
5000: 2020-12-12 16:06:57.913991
10000: 2020-12-12 16:07:35.631148
15000: 2020-12-12 16:08:21.124750
20000: 2020-12-12 16:08:48.106293
25000: 2020-12-12 16:09:16.449914
30000: 2020-12-12 16:09:50.866883
35000: 2020-12-12 16:10:28.012007
.....
355000: 2020-12-12 17:02:19.552977
360000: 2020-12-12 17:03:06.782679
365000: 2020-12-12 17:03:53.694362
370000: 2020-12-12 17:04:40.436035
375000: 2020-12-12 17:05:27.778743
number of works    = 375094
number of authors  = 959851
number of journals =  28054
number of keywords =  97104
375094: 2020-12-12 17:05:28.742798
 
works X authors  network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
 
works X journals  network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
 
works X keywords  network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
 
END: 2020-12-12 17:06:47.844323
>>> 

The processing finished in about one hour.

Keyword pairs from titles

To extract keyword pairs (bigrams of consecutive words) instead of single keywords, the keyword part of the main loop is modified to keep the tokens as an ordered list (instead of a set) and to index each pair of consecutive words:

...
      L = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
               w in nltk.word_tokenize(S)]
      C = [w for w in L if w not in new_stopwords]
      for i, j in zip(C, C[1:]):
         k = i+"_"+j; ikeyw = indKeyword(k)
         keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
...
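
The construction zip(C, C[1:]) pairs every remaining word with its successor. A small illustration (the words are made up, not taken from the dataset):

C = ["novel", "coronavirus", "pneumonia", "wuhan"]
print([i+"_"+j for i, j in zip(C, C[1:])])
# ['novel_coronavirus', 'coronavirus_pneumonia', 'pneumonia_wuhan']
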
>>> 
==== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2netsB.py ====
S2ORC to Pajek 0.1
START: 2021-01-19 04:07:51.671188
 
5000: 2021-01-19 04:08:58.018983
10000: 2021-01-19 04:09:47.248799
15000: 2021-01-19 04:10:48.702314
20000: 2021-01-19 04:11:24.814379
25000: 2021-01-19 04:12:01.083454
...
365000: 2021-01-19 05:06:55.333874
370000: 2021-01-19 05:07:41.233500
375000: 2021-01-19 05:08:28.893226
number of works    = 375094
number of authors  = 959851
number of journals =  28054
number of keywords = 961905
375094: 2021-01-19 05:08:29.876282
 
works X authors  network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
 
works X journals  network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
 
works X keywords  network: C:/Users/batagelj/Documents/2020/corona/test/WK.net

Keywords from titles and abstracts

>>> 
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2021-01-19 12:58:15.730912
 
5000: 2021-01-19 13:15:27.950952
10000: 2021-01-19 13:24:55.726427
15000: 2021-01-19 13:37:37.047972
20000: 2021-01-19 13:45:37.731465
25000: 2021-01-19 13:50:29.352145
30000: 2021-01-19 13:58:49.430748
35000: 2021-01-19 14:06:32.506234
40000: 2021-01-19 14:18:40.728886
45000: 2021-01-19 14:32:11.334250
50000: 2021-01-19 14:45:29.242888
...
340000: 2021-01-20 01:53:19.811793
345000: 2021-01-20 02:05:23.890208
350000: 2021-01-20 02:17:29.090687
355000: 2021-01-20 02:30:21.463865
360000: 2021-01-20 02:42:46.636486
365000: 2021-01-20 02:55:18.021463
370000: 2021-01-20 03:08:13.192800
375000: 2021-01-20 03:22:03.156271
number of works    = 375094
number of authors  = 959851
number of journals =  28054
number of keywords = 431518
375094: 2021-01-20 03:22:19.701218
 
works X authors  network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
 
works X journals  network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
 
works X keywords  network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
 
END: 2021-01-20 03:32:32.804285
>>> 

Analyses
