====== S2ORC metadata to Pajek ======

===== Program =====

The following Python program transforms the [[https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html|S2ORC]] ''metadata.csv'' file into a collection of Pajek networks. It combines solutions from [[https://github.com/bavla/biblio/tree/master/WoS2Pajek|WoS2Pajek]] with [[pro:bib:lem:py|lemmatization]] based on the NLTK library:

<code python>
# meta2nets - metadata.csv to Pajek bibliographic networks
# S2ORC to Pajek 0.1
# by Vladimir Batagelj, December 11, 2020
wdir = "C:/Users/batagelj/Documents/2020/corona/test"
ddir = "C:/Users/batagelj/Documents/2020/corona/test"
import sys, os, re, datetime, csv, json, shutil, time
os.chdir(wdir)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def indAuthor(name):
    # determines the index of an author
    global naut, aut, authors
    if name in aut:
        return aut[name]
    else:
        naut = naut + 1; aut[name] = naut
        authors.write(str(naut)+' "'+name+'"\n')
        return naut

def indJournal(name):
    # determines the index of a journal
    global njr, jour, journals
    name = name.upper()
    if name in jour:
        return jour[name]
    else:
        njr = njr + 1; jour[name] = njr
        journals.write(str(njr)+' "'+name+'"\n')
        return njr

def indKeyword(name):
    # determines the index of a keyword
    global nkw, keyw, keywords
    if name in keyw:
        return keyw[name]
    else:
        nkw = nkw + 1; keyw[name] = nkw
        keywords.write(str(nkw)+' "'+name+'"\n')
        return nkw
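
# The conversion runs in two passes: the main loop below reads metadata.csv and writes
# vertex labels and links to temporary files (works.tmp, authors.tmp, journals.tmp,
# keywords.tmp and the corresponding *links.tmp files); these fragments are then
# assembled into the two-mode Pajek networks WA.net, WJ.net and WK.net, with the
# indices of the second-mode vertices (authors, journals, keywords) shifted by the
# number of works numrec.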

version = "S2ORC to Pajek 0.1"
print(version)
ts = datetime.datetime.now(); numrec = 0
print('{0}: {1}\n'.format("START",ts))
fromA = False; fromT = True; mstep = 5000; delfiles = False
works = open(wdir+'/works.tmp','w',encoding="utf-8-sig")
worksinfo = open(wdir+'/works.csv','w',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','w',encoding="utf-8-sig")
years = open(wdir+'/years.tmp','w')
journals = open(wdir+'/journals.tmp','w',encoding="utf-8-sig")
authlinks = open(wdir+'/authlinks.tmp','w')
keywlinks = open(wdir+'/keywlinks.tmp','w')
jourlinks = open(wdir+'/jourlinks.tmp','w')
keywords = open(wdir+'/keywords.tmp','w',encoding="utf-8-sig")
aut = {}; naut = 0
keyw = {}; nkw = 0
jour = {}; njr = 1
jour['*****'] = njr
journals.write(str(njr)+' "*****"\n')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# add words that aren't in the NLTK stopwords list
add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']']
new_stopwords = stop_words.union(add_words)

with open('metadata.csv',newline='',encoding="utf-8") as csvfile:
    csvreader = csv.DictReader(csvfile,delimiter=',',quotechar='"')
    numrec = 0
    worksinfo.write("num|name|pubTime|ID|DOI|PMC|pubMed\n")
    for row in csvreader:
        numrec += 1
        # if numrec > 2000: break
        if (numrec % mstep) == 0:
            print('{0}: {1}'.format(numrec,datetime.datetime.now()))
        years.write('{0}: {1} {2} {3}\n'.format(numrec,row["cord_uid"],
            row["publish_time"],row["source_x"]))
        Au = row["authors"].split(";")
        firstAu = Au[0].strip() if len(Au)>0 else "Anonymous"
        name = firstAu.split(",")[0] if len(firstAu)>0 else "Anonymous"
        worksinfo.write(str(numrec)+"|"+name+"|"+row["publish_time"]+"|"+\
            row['cord_uid']+"|"+row['doi']+"|"+row['pmcid']+"|"+row['pubmed_id']+"\n")
        works.write(str(numrec)+' "'+name+':'+row["publish_time"]+'"\n')  # row['cord_uid'])
        for s in Au:
            iauth = indAuthor(s.strip())
            authlinks.write("{0} {1}\n".format(numrec,iauth))
        S = (row["title"]+" "+row["abstract"] if fromA & fromT else\
             row["abstract"] if fromA else row["title"])\
             .lower().replace("/"," ").replace("-"," ")
        L = set([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
                 w in nltk.word_tokenize(S)])
        C = set([w for w in L if w not in new_stopwords])
        for k in C:
            ikeyw = indKeyword(k)
            keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
        ijour = indJournal(row["journal"])
        jourlinks.write("{0} {1}\n".format(numrec,ijour))

authors.close(); journals.close(); keywords.close()
worksinfo.close(); works.close(); years.close()
authlinks.close(); keywlinks.close(); jourlinks.close()
print("number of works ={0:7}".format(numrec))
print("number of authors ={0:7}".format(naut))
print("number of journals ={0:7}".format(njr))
print("number of keywords ={0:7}".format(nkw))
tr = datetime.datetime.now()
print('{0}: {1}\n'.format(numrec,tr))
# time.sleep(3)

# works X authors network
print("works X authors network: "+wdir+"/WA.net\n")
works = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','r',encoding="utf-8-sig")
wa = open(wdir+'/WA.net','w',encoding="utf-8-sig")
wa.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wa.write('*vertices '+str(numrec+naut)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wa)
works.close()
while True:
    line = authors.readline()
    if not line: break
    s = line.split(" ",1)
    wa.write(str(eval(s[0])+numrec)+' '+s[1])
temp = open(wdir+'/authlinks.tmp','r')
wa.write('*arcs\n')
while True:
    line = temp.readline()
    if not line: break
    s = line.split(" ")
    wa.write(s[0]+' '+str(eval(s[1])+numrec)+'\n')
temp.close(); wa.close(); authors.close()

# works X journals network
print("works X journals network: "+wdir+"/WJ.net\n")
works = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
journals = open(wdir+'/journals.tmp','r',encoding="utf-8-sig")
wj = open(wdir+'/WJ.net','w',encoding="utf-8-sig")
wj.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wj.write('*vertices '+str(numrec+njr)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wj)
works.close()
while True:
    line = journals.readline()
    if not line: break
    s = line.split(" ",1)
    wj.write(str(eval(s[0])+numrec)+' '+s[1])
temp = open(wdir+'/jourlinks.tmp','r')
wj.write('*arcs\n')
while True:
    line = temp.readline()
    if not line: break
    s = line.split(" ")
    wj.write(s[0]+' '+str(eval(s[1])+numrec)+'\n')
temp.close(); wj.close(); journals.close()

# works X keywords network
print("works X keywords network: "+wdir+"/WK.net\n")
works = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
keywords = open(wdir+'/keywords.tmp','r',encoding="utf-8-sig")
wk = open(wdir+'/WK.net','w',encoding="utf-8-sig")
wk.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wk.write('*vertices '+str(numrec+nkw)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wk)
works.close()
while True:
    line = keywords.readline()
    if not line: break
    s = line.split(" ",1)
    wk.write(str(eval(s[0])+numrec)+' '+s[1])
temp = open(wdir+'/keywlinks.tmp','r')
wk.write('*arcs\n')
while True:
    line = temp.readline()
    if not line: break
    s = line.split(" ")
    wk.write(s[0]+' '+str(eval(s[1])+numrec)+'\n')
temp.close(); wk.close(); keywords.close()

if delfiles:
    try:
        os.remove(wdir+'/works.tmp')
        os.remove(wdir+'/authors.tmp');  os.remove(wdir+'/authlinks.tmp')
        os.remove(wdir+'/keywords.tmp'); os.remove(wdir+'/keywlinks.tmp')
        os.remove(wdir+'/journals.tmp'); os.remove(wdir+'/jourlinks.tmp')
#       os.remove(wdir+'/years.tmp');   os.remove(wdir+'/works.csv')
    except:
        print("unable to delete some temp files")

tf = datetime.datetime.now()
print('{0}: {1}\n'.format("END",tf))
</code>
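
Each of the resulting files ''WA.net'', ''WJ.net'' and ''WK.net'' is a two-mode network in Pajek format: the first ''numrec'' vertices are the works, labelled with the first author's surname and the publication date, the remaining vertices are the authors (journals, keywords), and each arc links a work to one of its authors (its journal, one of its keywords). A toy sketch of the structure of ''WA.net'' with 3 works and 4 authors is given below; all names and dates are invented, and the ''%'' comment line written by the program is omitted:

<code>
*vertices 7 3
1 "Smith:2020-03-15"
2 "Lee:2020-04-02"
3 "Chen:2020-05-20"
4 "Smith, John"
5 "Jones, Ann"
6 "Lee, Mary"
7 "Chen, Wei"
*arcs
1 4
1 5
2 6
3 7
3 5
</code>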

===== Running the program =====

Running the program on the first 1000 records, we get the following time stamps:

<code>
>>> 
===== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2nets.py =====
*** metadata to networks
0: 2020-12-12 03:33:48.161604
1001: 2020-12-12 03:34:03.557485
>>> 375*15/60
93.75
>>> 375*15/60/60
1.5625
>>> 
</code>

Since there are 375094 records (works), we get an estimate of about an hour and a half for processing the entire file.

The program is controlled by the settings in the line

<code python>
fromA = False; fromT = True; mstep = 5000; delfiles = False
</code>

  * ''fromA'' - use abstracts as a source of keywords
  * ''fromT'' - use titles as a source of keywords
  * ''mstep'' - print a progress report after every ''mstep'' records
  * ''delfiles'' - remove the auxiliary (temporary) files at the end

With both ''fromA'' and ''fromT'' set to ''True'', the keywords are extracted from the combined titles and abstracts.
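
To see what the keyword pipeline controlled by these settings produces for a single record, here is a minimal stand-alone sketch of the steps used in the program (tokenization, POS-aware lemmatization, stop-word removal). The title is invented, and the NLTK data packages ''punkt'', ''averaged_perceptron_tagger'', ''wordnet'' and ''stopwords'' are assumed to be installed:

<code python>
# minimal sketch of the keyword extraction used in meta2nets, on an invented title
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
new_stopwords = set(stopwords.words("english")).union(
    ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']'])

title = "Clinical features of patients infected with a novel coronavirus"   # invented example
S = title.lower().replace("/"," ").replace("-"," ")
L = set(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(S))
C = set(w for w in L if w not in new_stopwords)
print(C)   # e.g. {'clinical', 'feature', 'patient', 'infect', 'novel', 'coronavirus'}
           # (the exact lemmas depend on the POS tagger)
</code>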

==== Keywords from titles ====

With the settings shown above (''fromA = False'', ''fromT = True'') the keywords are extracted from titles only:

<code>
>>> 
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2020-12-12 16:06:06.710062
5000: 2020-12-12 16:06:57.913991
10000: 2020-12-12 16:07:35.631148
15000: 2020-12-12 16:08:21.124750
20000: 2020-12-12 16:08:48.106293
25000: 2020-12-12 16:09:16.449914
30000: 2020-12-12 16:09:50.866883
35000: 2020-12-12 16:10:28.012007
.....
355000: 2020-12-12 17:02:19.552977
360000: 2020-12-12 17:03:06.782679
365000: 2020-12-12 17:03:53.694362
370000: 2020-12-12 17:04:40.436035
375000: 2020-12-12 17:05:27.778743
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 97104
375094: 2020-12-12 17:05:28.742798
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
END: 2020-12-12 17:06:47.844323
>>> 
</code>

The processing of the complete file finished in about one hour.

==== Keyword pairs from titles ====

In the variant ''meta2netsB.py'' consecutive pairs of lemmatized title words are used as keywords. The keyword part of the main loop is changed as follows; ''L'' and ''C'' are now lists, so the original word order is preserved, and adjacent remaining words are joined by ''_'':

<code python>
...
        L = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
             w in nltk.word_tokenize(S)]
        C = [w for w in L if w not in new_stopwords]
        for i, j in zip(C, C[1:]):
            k = i+"_"+j; ikeyw = indKeyword(k)
            keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
...
</code>

<code>
>>> 
==== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2netsB.py ====
S2ORC to Pajek 0.1
START: 2021-01-19 04:07:51.671188
5000: 2021-01-19 04:08:58.018983
10000: 2021-01-19 04:09:47.248799
15000: 2021-01-19 04:10:48.702314
20000: 2021-01-19 04:11:24.814379
25000: 2021-01-19 04:12:01.083454
...
365000: 2021-01-19 05:06:55.333874
370000: 2021-01-19 05:07:41.233500
375000: 2021-01-19 05:08:28.893226
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 961905
375094: 2021-01-19 05:08:29.876282
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
</code>

==== Keywords from titles and abstracts ====

Setting ''fromA = True'' and ''fromT = True'' extracts keywords from both titles and abstracts; the run is much longer (about 14 and a half hours):

<code>
>>> 
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2021-01-19 12:58:15.730912
5000: 2021-01-19 13:15:27.950952
10000: 2021-01-19 13:24:55.726427
15000: 2021-01-19 13:37:37.047972
20000: 2021-01-19 13:45:37.731465
25000: 2021-01-19 13:50:29.352145
30000: 2021-01-19 13:58:49.430748
35000: 2021-01-19 14:06:32.506234
40000: 2021-01-19 14:18:40.728886
45000: 2021-01-19 14:32:11.334250
50000: 2021-01-19 14:45:29.242888
...
340000: 2021-01-20 01:53:19.811793
345000: 2021-01-20 02:05:23.890208
350000: 2021-01-20 02:17:29.090687
355000: 2021-01-20 02:30:21.463865
360000: 2021-01-20 02:42:46.636486
365000: 2021-01-20 02:55:18.021463
370000: 2021-01-20 03:08:13.192800
375000: 2021-01-20 03:22:03.156271
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 431518
375094: 2021-01-20 03:22:19.701218
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
END: 2021-01-20 03:32:32.804285
>>> 
</code>

===== Analyses =====

[[https://github.com/bavla/Corona/tree/main/data|Pajek networks data]]

  * [[.:ana:aut|Authors]]
  * [[.:ana:day|Years and Days]]
  * [[.:ana:jrn|Journals]]
  * [[.:ana:kti|Keywords from titles]]
  * [[.:ana:kt2|Keyword pairs from titles]]
  * [[.:ana:kab|Keywords from titles and abstracts]]
  * [[.:ana:drv|Derived networks]]