====== S2ORC metadata to Pajek ======
===== Program =====
The Python program to transform the [[https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html|S2ORC]] ''metadata.csv'' file into a collection of Pajek networks combines solutions from [[https://github.com/bavla/biblio/tree/master/WoS2Pajek|WoS2Pajek]] with [[pro:bib:lem:py|lemmatization]] based on the NLTK library:
# meta2nets - metadata.csv to Pajek bibliographic networks
# S2ORC to Pajek 0.1
# by Vladimir Batagelj, December 11, 2020
# working directory: all temp and output files are written here
wdir = "C:/Users/batagelj/Documents/2020/corona/test"
# NOTE(review): ddir is never used below — metadata.csv is read from wdir; confirm intent
ddir = "C:/Users/batagelj/Documents/2020/corona/test"
import sys, os, re, datetime, csv, json, shutil, time
os.chdir(wdir)  # relative paths ('metadata.csv') now resolve against wdir
import nltk
from nltk.stem import WordNetLemmatizer   # lemmatizer used for keyword extraction
from nltk.corpus import wordnet           # POS constants accepted by lemmatize()
from nltk.corpus import stopwords         # English stop word list
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the word's Penn tag initial.

    Falls back to NOUN when the tag is not one of J/N/V/R, which is the
    default POS that WordNetLemmatizer.lemmatize() assumes anyway.
    """
    penn_initial = nltk.pos_tag([word])[0][1][0].upper()
    penn_to_wordnet = {"J": wordnet.ADJ,
                       "N": wordnet.NOUN,
                       "V": wordnet.VERB,
                       "R": wordnet.ADV}
    return penn_to_wordnet.get(penn_initial, wordnet.NOUN)
def indAuthor(name):
    """Return the vertex index of author *name*, registering it on first sight.

    New authors get the next free index, are recorded in the ``aut`` lookup
    table, and their vertex line is appended to the ``authors`` temp file.
    """
    global naut, aut, authors
    idx = aut.get(name)
    if idx is None:
        naut += 1
        aut[name] = idx = naut
        authors.write('{0} "{1}"\n'.format(naut, name))
    return idx
def indJournal(name):
    """Return the vertex index of journal *name*, adding it if new.

    Journal titles are matched case-insensitively (upper-cased before the
    lookup); a new journal's vertex line goes to the ``journals`` temp file.
    """
    global njr, jour, journals
    name = name.upper()
    idx = jour.get(name)
    if idx is None:
        njr += 1
        jour[name] = idx = njr
        journals.write('{0} "{1}"\n'.format(njr, name))
    return idx
def indKeyword(name):
    """Return the vertex index of keyword *name*, registering it if unseen.

    Uses the ``keyw`` lookup table; a new keyword's vertex line is appended
    to the ``keywords`` temp file.
    """
    global nkw, keyw, keywords
    idx = keyw.get(name)
    if idx is None:
        nkw += 1
        keyw[name] = idx = nkw
        keywords.write('{0} "{1}"\n'.format(nkw, name))
    return idx
version = "S2ORC to Pajek 0.1"
print(version)
ts = datetime.datetime.now(); numrec = 0
print('{0}: {1}\n'.format("START",ts))
# run controls:
#   fromA - use abstracts for keywords; fromT - use titles for keywords
#   mstep - progress report interval; delfiles - remove temp files at the end
fromA = False; fromT = True; mstep = 5000; delfiles = False
# temp files collecting vertex lines and (work, other-mode) link pairs;
# utf-8-sig adds a BOM so Pajek on Windows reads the labels correctly
works = open(wdir+'/works.tmp','w',encoding="utf-8-sig")
worksinfo = open(wdir+'/works.csv','w',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','w',encoding="utf-8-sig")
years = open(wdir+'/years.tmp','w')
journals = open(wdir+'/journals.tmp','w',encoding="utf-8-sig")
authlinks = open(wdir+'/authlinks.tmp','w')
keywlinks = open(wdir+'/keywlinks.tmp','w')
jourlinks = open(wdir+'/jourlinks.tmp','w')
keywords = open(wdir+'/keywords.tmp','w',encoding="utf-8-sig")
# name -> index lookup tables and counters for each vertex mode
aut = {}; naut = 0
keyw = {}; nkw = 0
# journals start at 1 because of the pre-registered "*****" placeholder
jour = {}; njr = 1
jour['*****'] = njr
journals.write(str(njr)+' "*****"\n')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
#add words that aren't in the NLTK stopwords list
add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']']
new_stopwords = stop_words.union(add_words)
# single pass over metadata.csv: emit vertex and link temp files per record
with open('metadata.csv',newline='',encoding="utf-8") as csvfile:
    csvreader = csv.DictReader(csvfile,delimiter=',',quotechar='"')
    numrec = 0
    worksinfo.write("num|name|pubTime|ID|DOI|PMC|pubMed\n")
    for row in csvreader:
        numrec += 1
        # if numrec > 2000: break
        # progress report every mstep records
        if (numrec % mstep) == 0:
            print('{0}: {1}'.format(numrec,datetime.datetime.now()))
        years.write('{0}: {1} {2} {3}\n'.format(numrec,row["cord_uid"],
            row["publish_time"],row["source_x"]))
        # the work's vertex label is "<first author's surname>:<publish date>"
        Au = row["authors"].split(";")
        firstAu = Au[0].strip() if len(Au)>0 else "Anonymous"
        name = firstAu.split(",")[0] if len(firstAu)>0 else "Anonymous"
        worksinfo.write(str(numrec)+"|"+name+"|"+row["publish_time"]+"|"+\
            row['cord_uid']+"|"+row['doi']+"|"+row['pmcid']+"|"+row['pubmed_id']+"\n")
        works.write(str(numrec)+' "'+name+':'+row["publish_time"]+'"\n')
        # row['cord_uid'])
        # register every co-author and link it to the current work
        for s in Au:
            iauth = indAuthor(s.strip())
            authlinks.write("{0} {1}\n".format(numrec,iauth))
        # keyword source text chosen by the fromA/fromT flags; slashes and
        # hyphens become spaces so compound terms split into tokens
        S = (row["title"]+" "+row["abstract"] if fromA & fromT else\
            row["abstract"] if fromA else row["title"])\
            .lower().replace("/"," ").replace("-"," ")
        # lemmatize the tokens, then drop stop words and punctuation
        L = set([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
            w in nltk.word_tokenize(S)])
        C = set([w for w in L if w not in new_stopwords])
        for k in C:
            ikeyw = indKeyword(k)
            keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
        # NOTE(review): an empty journal string gets its own vertex; the
        # pre-registered "*****" placeholder is never mapped to — verify intent
        ijour = indJournal(row["journal"])
        jourlinks.write("{0} {1}\n".format(numrec,ijour))
# flush and close all temp files before they are re-read below
authors.close(); journals.close(); keywords.close()
worksinfo.close(); works.close(); years.close()
authlinks.close(); keywlinks.close(); jourlinks.close()
# summary of vertex counts per mode
print("number of works ={0:7}".format(numrec))
print("number of authors ={0:7}".format(naut))
print("number of journals ={0:7}".format(njr))
print("number of keywords ={0:7}".format(nkw))
tr = datetime.datetime.now()
print('{0}: {1}\n'.format(numrec,tr))
# time.sleep(3)
# works X authors two-mode network: vertices 1..numrec are works,
# numrec+1..numrec+naut are authors; arcs are (work, author) pairs
print("works X authors network: "+wdir+"/WA.net\n")
wa = open(wdir+'/WA.net','w',encoding="utf-8-sig")
wa.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wa.write('*vertices '+str(numrec+naut)+' '+str(numrec)+'\n')
# work vertices are copied verbatim from the temp file
with open(wdir+'/works.tmp','r',encoding="utf-8-sig") as works:
    shutil.copyfileobj(works,wa)
# author vertices: shift each index by the number of works
with open(wdir+'/authors.tmp','r',encoding="utf-8-sig") as authors:
    for line in authors:
        num, rest = line.split(" ",1)
        # int(), not eval(): the field is a plain decimal vertex number
        wa.write(str(int(num)+numrec)+' '+rest)
wa.write('*arcs\n')
with open(wdir+'/authlinks.tmp','r') as temp:
    for line in temp:
        w, a = line.split(" ")
        wa.write(w+' '+str(int(a)+numrec)+'\n')
wa.close()
# works X journals two-mode network: vertices 1..numrec are works,
# numrec+1..numrec+njr are journals; arcs are (work, journal) pairs
print("works X journals network: "+wdir+"/WJ.net\n")
wj = open(wdir+'/WJ.net','w',encoding="utf-8-sig")
wj.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wj.write('*vertices '+str(numrec+njr)+' '+str(numrec)+'\n')
# work vertices are copied verbatim from the temp file
with open(wdir+'/works.tmp','r',encoding="utf-8-sig") as works:
    shutil.copyfileobj(works,wj)
# journal vertices: shift each index by the number of works
with open(wdir+'/journals.tmp','r',encoding="utf-8-sig") as journals:
    for line in journals:
        num, rest = line.split(" ",1)
        # int(), not eval(): the field is a plain decimal vertex number
        wj.write(str(int(num)+numrec)+' '+rest)
wj.write('*arcs\n')
with open(wdir+'/jourlinks.tmp','r') as temp:
    for line in temp:
        w, j = line.split(" ")
        wj.write(w+' '+str(int(j)+numrec)+'\n')
wj.close()
# works X keywords two-mode network: vertices 1..numrec are works,
# numrec+1..numrec+nkw are keywords; arcs are (work, keyword) pairs
print("works X keywords network: "+wdir+"/WK.net\n")
wk = open(wdir+'/WK.net','w',encoding="utf-8-sig")
wk.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wk.write('*vertices '+str(numrec+nkw)+' '+str(numrec)+'\n')
# work vertices are copied verbatim from the temp file
with open(wdir+'/works.tmp','r',encoding="utf-8-sig") as works:
    shutil.copyfileobj(works,wk)
# keyword vertices: shift each index by the number of works
with open(wdir+'/keywords.tmp','r',encoding="utf-8-sig") as keywords:
    for line in keywords:
        num, rest = line.split(" ",1)
        # int(), not eval(): the field is a plain decimal vertex number
        wk.write(str(int(num)+numrec)+' '+rest)
wk.write('*arcs\n')
with open(wdir+'/keywlinks.tmp','r') as temp:
    for line in temp:
        w, k = line.split(" ")
        wk.write(w+' '+str(int(k)+numrec)+'\n')
wk.close()
# optional removal of the intermediate files; cleanup is best-effort
if delfiles:
    try:
        os.remove(wdir+'/works.tmp')
        os.remove(wdir+'/authors.tmp'); os.remove(wdir+'/authlinks.tmp')
        os.remove(wdir+'/keywords.tmp'); os.remove(wdir+'/keywlinks.tmp')
        os.remove(wdir+'/journals.tmp'); os.remove(wdir+'/jourlinks.tmp')
        # os.remove(wdir+'/years.tmp'); os.remove(wdir+'/works.csv')
    except OSError:
        # narrowed from a bare except: only file-system errors are expected here
        print("unable to delete some temp files")
tf = datetime.datetime.now()
print('{0}: {1}\n'.format("END",tf))
===== Running the program =====
Running the program on the first 1000 records, we get the following time stamps.
>>>
===== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2nets.py =====
*** metadata to networks
0: 2020-12-12 03:33:48.161604
1001: 2020-12-12 03:34:03.557485
>>> 375*15/60
93.75
>>> 375*15/60/60
1.5625
>>>
Since there are 375094 records (works), we estimate about an hour and a half for processing the entire file.
The program is controlled by quantities in the line
fromA = False; fromT = True; mstep = 5000; delfiles = False
* ''fromA'' - use abstracts for keywords
* ''fromT'' - use titles for keywords
* ''mstep'' - print report after each ''mstep'' records
* ''delfiles'' - remove auxiliary files
==== Keywords from titles ====
>>>
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2020-12-12 16:06:06.710062
5000: 2020-12-12 16:06:57.913991
10000: 2020-12-12 16:07:35.631148
15000: 2020-12-12 16:08:21.124750
20000: 2020-12-12 16:08:48.106293
25000: 2020-12-12 16:09:16.449914
30000: 2020-12-12 16:09:50.866883
35000: 2020-12-12 16:10:28.012007
.....
355000: 2020-12-12 17:02:19.552977
360000: 2020-12-12 17:03:06.782679
365000: 2020-12-12 17:03:53.694362
370000: 2020-12-12 17:04:40.436035
375000: 2020-12-12 17:05:27.778743
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 97104
375094: 2020-12-12 17:05:28.742798
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
END: 2020-12-12 17:06:47.844323
>>>
The processing was finished in one hour.
==== Keyword pairs from titles ====
...
L = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
w in nltk.word_tokenize(S)]
C = [w for w in L if w not in new_stopwords]
for i, j in zip(C, C[1:]):
k = i+"_"+j; ikeyw = indKeyword(k)
keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
...
>>>
==== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2netsB.py ====
S2ORC to Pajek 0.1
START: 2021-01-19 04:07:51.671188
5000: 2021-01-19 04:08:58.018983
10000: 2021-01-19 04:09:47.248799
15000: 2021-01-19 04:10:48.702314
20000: 2021-01-19 04:11:24.814379
25000: 2021-01-19 04:12:01.083454
...
365000: 2021-01-19 05:06:55.333874
370000: 2021-01-19 05:07:41.233500
375000: 2021-01-19 05:08:28.893226
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 961905
375094: 2021-01-19 05:08:29.876282
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
==== Keywords from titles and abstracts ====
>>>
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2021-01-19 12:58:15.730912
5000: 2021-01-19 13:15:27.950952
10000: 2021-01-19 13:24:55.726427
15000: 2021-01-19 13:37:37.047972
20000: 2021-01-19 13:45:37.731465
25000: 2021-01-19 13:50:29.352145
30000: 2021-01-19 13:58:49.430748
35000: 2021-01-19 14:06:32.506234
40000: 2021-01-19 14:18:40.728886
45000: 2021-01-19 14:32:11.334250
50000: 2021-01-19 14:45:29.242888
...
340000: 2021-01-20 01:53:19.811793
345000: 2021-01-20 02:05:23.890208
350000: 2021-01-20 02:17:29.090687
355000: 2021-01-20 02:30:21.463865
360000: 2021-01-20 02:42:46.636486
365000: 2021-01-20 02:55:18.021463
370000: 2021-01-20 03:08:13.192800
375000: 2021-01-20 03:22:03.156271
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 431518
375094: 2021-01-20 03:22:19.701218
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
END: 2021-01-20 03:32:32.804285
>>>
===== Analyses =====
[[https://github.com/bavla/Corona/tree/main/data|Pajek networks data]]
* [[.:ana:aut|Authors]]
* [[.:ana:day|Years and Days]]
* [[.:ana:jrn|Journals]]
* [[.:ana:kti|Keywords from titles]]
* [[.:ana:kt2|Keyword pairs from titles]]
* [[.:ana:kab|Keywords from titles and abstracts]]
* [[.:ana:drv|Derived networks]]