====== S2ORC metadata to Pajek ======

===== Program =====

The following Python program transforms the [[https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html|S2ORC]] ''metadata.csv'' file into a collection of Pajek networks. It combines solutions from [[https://github.com/bavla/biblio/tree/master/WoS2Pajek|WoS2Pajek]] with [[pro:bib:lem:py|lemmatization]] based on the NLTK library:

<code python>
# meta2nets - metadata.csv to Pajek bibliographic networks
# S2ORC to Pajek 0.1
# by Vladimir Batagelj, December 11, 2020
wdir = "C:/Users/batagelj/Documents/2020/corona/test"
ddir = "C:/Users/batagelj/Documents/2020/corona/test"
import sys, os, re, datetime, csv, json, shutil, time
os.chdir(wdir)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def indAuthor(name):
    # determines the index of an author
    global naut, aut, authors
    if name in aut:
        return aut[name]
    else:
        naut = naut + 1; aut[name] = naut
        authors.write(str(naut)+' "'+name+'"\n')
        return naut

def indJournal(name):
    # determines the index of a journal
    global njr, jour, journals
    name = name.upper()
    if name in jour:
        return jour[name]
    else:
        njr = njr + 1; jour[name] = njr
        journals.write(str(njr)+' "'+name+'"\n')
        return njr

def indKeyword(name):
    # determines the index of a keyword
    global nkw, keyw, keywords
    if name in keyw:
        return keyw[name]
    else:
        nkw = nkw + 1; keyw[name] = nkw
        keywords.write(str(nkw)+' "'+name+'"\n')
        return nkw
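
# The conversion runs in two passes: the main loop below reads metadata.csv and writes
# vertex labels and links to temporary files (works.tmp, authors.tmp, journals.tmp,
# keywords.tmp and the corresponding *links.tmp files); these fragments are then
# assembled into the two-mode Pajek networks WA.net, WJ.net and WK.net, with the
# indices of the second-mode vertices (authors, journals, keywords) shifted by the
# number of works numrec.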

version = "S2ORC to Pajek 0.1"
print(version)
ts = datetime.datetime.now(); numrec = 0
print('{0}: {1}\n'.format("START",ts))
fromA = False; fromT = True; mstep = 5000; delfiles = False
works = open(wdir+'/works.tmp','w',encoding="utf-8-sig")
worksinfo = open(wdir+'/works.csv','w',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','w',encoding="utf-8-sig")
years = open(wdir+'/years.tmp','w')
journals = open(wdir+'/journals.tmp','w',encoding="utf-8-sig")
authlinks = open(wdir+'/authlinks.tmp','w')
keywlinks = open(wdir+'/keywlinks.tmp','w')
jourlinks = open(wdir+'/jourlinks.tmp','w')
keywords = open(wdir+'/keywords.tmp','w',encoding="utf-8-sig")
aut = {}; naut = 0
keyw = {}; nkw = 0
jour = {}; njr = 1
jour['*****'] = njr
journals.write(str(njr)+' "*****"\n')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# add words that aren't in the NLTK stopwords list
add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']']
new_stopwords = stop_words.union(add_words)

with open('metadata.csv',newline='',encoding="utf-8") as csvfile:
    csvreader = csv.DictReader(csvfile,delimiter=',',quotechar='"')
    numrec = 0
    worksinfo.write("num|name|pubTime|ID|DOI|PMC|pubMed\n")
    for row in csvreader:
        numrec += 1
        # if numrec > 2000: break
        if (numrec % mstep) == 0:
            print('{0}: {1}'.format(numrec,datetime.datetime.now()))
        years.write('{0}: {1} {2} {3}\n'.format(numrec,row["cord_uid"],
            row["publish_time"],row["source_x"]))
        Au = row["authors"].split(";")
        firstAu = Au[0].strip() if len(Au)>0 else "Anonymous"
        name = firstAu.split(",")[0] if len(firstAu)>0 else "Anonymous"
        worksinfo.write(str(numrec)+"|"+name+"|"+row["publish_time"]+"|"+\
            row['cord_uid']+"|"+row['doi']+"|"+row['pmcid']+"|"+row['pubmed_id']+"\n")
        works.write(str(numrec)+' "'+name+':'+row["publish_time"]+'"\n')  # row['cord_uid'])
        for s in Au:
            iauth = indAuthor(s.strip())
            authlinks.write("{0} {1}\n".format(numrec,iauth))
        S = (row["title"]+" "+row["abstract"] if fromA & fromT else\
             row["abstract"] if fromA else row["title"])\
             .lower().replace("/"," ").replace("-"," ")
        L = set([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
                 w in nltk.word_tokenize(S)])
        C = set([w for w in L if w not in new_stopwords])
        for k in C:
            ikeyw = indKeyword(k)
            keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
        ijour = indJournal(row["journal"])
        jourlinks.write("{0} {1}\n".format(numrec,ijour))

authors.close(); journals.close(); keywords.close()
worksinfo.close(); works.close(); years.close()
authlinks.close(); keywlinks.close(); jourlinks.close()
print("number of works ={0:7}".format(numrec))
print("number of authors ={0:7}".format(naut))
print("number of journals ={0:7}".format(njr))
print("number of keywords ={0:7}".format(nkw))
tr = datetime.datetime.now()
print('{0}: {1}\n'.format(numrec,tr))
# time.sleep(3)

# works X authors network
print("works X authors network: "+wdir+"/WA.net\n")
works = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
authors = open(wdir+'/authors.tmp','r',encoding="utf-8-sig")
wa = open(wdir+'/WA.net','w',encoding="utf-8-sig")
wa.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wa.write('*vertices '+str(numrec+naut)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wa)
works.close()
while True:
    line = authors.readline()
    if not line: break
    s = line.split(" ",1)
    wa.write(str(eval(s[0])+numrec)+' '+s[1])
temp = open(wdir+'/authlinks.tmp','r')
wa.write('*arcs\n')
while True:
    line = temp.readline()
    if not line: break
    s = line.split(" ")
    wa.write(s[0]+' '+str(eval(s[1])+numrec)+'\n')
temp.close(); wa.close(); authors.close()

# works X journals network
print("works X journals network: "+wdir+"/WJ.net\n")
works = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
journals = open(wdir+'/journals.tmp','r',encoding="utf-8-sig")
wj = open(wdir+'/WJ.net','w',encoding="utf-8-sig")
wj.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wj.write('*vertices '+str(numrec+njr)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wj)
works.close()
while True:
    line = journals.readline()
    if not line: break
    s = line.split(" ",1)
    wj.write(str(eval(s[0])+numrec)+' '+s[1])
temp = open(wdir+'/jourlinks.tmp','r')
wj.write('*arcs\n')
while True:
    line = temp.readline()
    if not line: break
    s = line.split(" ")
    wj.write(s[0]+' '+str(eval(s[1])+numrec)+'\n')
temp.close(); wj.close(); journals.close()

# works X keywords network
print("works X keywords network: "+wdir+"/WK.net\n")
works = open(wdir+'/works.tmp','r',encoding="utf-8-sig")
keywords = open(wdir+'/keywords.tmp','r',encoding="utf-8-sig")
wk = open(wdir+'/WK.net','w',encoding="utf-8-sig")
wk.write("% created by "+version+" "+datetime.datetime.now().ctime()+"\n")
wk.write('*vertices '+str(numrec+nkw)+' '+str(numrec)+'\n')
shutil.copyfileobj(works,wk)
works.close()
while True:
    line = keywords.readline()
    if not line: break
    s = line.split(" ",1)
    wk.write(str(eval(s[0])+numrec)+' '+s[1])
temp = open(wdir+'/keywlinks.tmp','r')
wk.write('*arcs\n')
while True:
    line = temp.readline()
    if not line: break
    s = line.split(" ")
    wk.write(s[0]+' '+str(eval(s[1])+numrec)+'\n')
temp.close(); wk.close(); keywords.close()

if delfiles:
    try:
        os.remove(wdir+'/works.tmp')
        os.remove(wdir+'/authors.tmp');  os.remove(wdir+'/authlinks.tmp')
        os.remove(wdir+'/keywords.tmp'); os.remove(wdir+'/keywlinks.tmp')
        os.remove(wdir+'/journals.tmp'); os.remove(wdir+'/jourlinks.tmp')
#       os.remove(wdir+'/years.tmp');   os.remove(wdir+'/works.csv')
    except:
        print("unable to delete some temp files")

tf = datetime.datetime.now()
print('{0}: {1}\n'.format("END",tf))
</code>
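
Each of the resulting files ''WA.net'', ''WJ.net'' and ''WK.net'' is a two-mode network in Pajek format: the first ''numrec'' vertices are the works, labelled with the first author's surname and the publication date, the remaining vertices are the authors (journals, keywords), and each arc links a work to one of its authors (its journal, one of its keywords). A toy sketch of the structure of ''WA.net'' with 3 works and 4 authors is given below; all names and dates are invented, and the ''%'' comment line written by the program is omitted:

<code>
*vertices 7 3
1 "Smith:2020-03-15"
2 "Lee:2020-04-02"
3 "Chen:2020-05-20"
4 "Smith, John"
5 "Jones, Ann"
6 "Lee, Mary"
7 "Chen, Wei"
*arcs
1 4
1 5
2 6
3 7
3 5
</code>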

===== Running the program =====

Running the program on the first 1000 records, we get the following time stamps:

<code>
>>> 
===== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2nets.py =====
*** metadata to networks
0: 2020-12-12 03:33:48.161604
1001: 2020-12-12 03:34:03.557485
>>> 375*15/60
93.75
>>> 375*15/60/60
1.5625
>>> 
</code>

Since there are 375094 records (works), we get an estimate of about an hour and a half for processing the entire file.

The program is controlled by the settings in the line

<code python>
fromA = False; fromT = True; mstep = 5000; delfiles = False
</code>

  * ''fromA'' - use abstracts as a source of keywords
  * ''fromT'' - use titles as a source of keywords
  * ''mstep'' - print a progress report after every ''mstep'' records
  * ''delfiles'' - remove the auxiliary (temporary) files at the end

With both ''fromA'' and ''fromT'' set to ''True'', the keywords are extracted from the combined titles and abstracts.
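
To see what the keyword pipeline controlled by these settings produces for a single record, here is a minimal stand-alone sketch of the steps used in the program (tokenization, POS-aware lemmatization, stop-word removal). The title is invented, and the NLTK data packages ''punkt'', ''averaged_perceptron_tagger'', ''wordnet'' and ''stopwords'' are assumed to be installed:

<code python>
# minimal sketch of the keyword extraction used in meta2nets, on an invented title
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
new_stopwords = set(stopwords.words("english")).union(
    ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']'])

title = "Clinical features of patients infected with a novel coronavirus"   # invented example
S = title.lower().replace("/"," ").replace("-"," ")
L = set(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(S))
C = set(w for w in L if w not in new_stopwords)
print(C)   # e.g. {'clinical', 'feature', 'patient', 'infect', 'novel', 'coronavirus'}
           # (the exact lemmas depend on the POS tagger)
</code>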

==== Keywords from titles ====

With the settings shown above (''fromA = False'', ''fromT = True'') the keywords are extracted from titles only:

<code>
>>> 
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2020-12-12 16:06:06.710062
5000: 2020-12-12 16:06:57.913991
10000: 2020-12-12 16:07:35.631148
15000: 2020-12-12 16:08:21.124750
20000: 2020-12-12 16:08:48.106293
25000: 2020-12-12 16:09:16.449914
30000: 2020-12-12 16:09:50.866883
35000: 2020-12-12 16:10:28.012007
.....
355000: 2020-12-12 17:02:19.552977
360000: 2020-12-12 17:03:06.782679
365000: 2020-12-12 17:03:53.694362
370000: 2020-12-12 17:04:40.436035
375000: 2020-12-12 17:05:27.778743
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 97104
375094: 2020-12-12 17:05:28.742798
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
END: 2020-12-12 17:06:47.844323
>>> 
</code>

The processing of the complete file finished in about one hour.

==== Keyword pairs from titles ====

In the variant ''meta2netsB.py'' consecutive pairs of lemmatized title words are used as keywords. The keyword part of the main loop is changed as follows; ''L'' and ''C'' are now lists, so the original word order is preserved, and adjacent remaining words are joined by ''_'':

<code python>
...
        L = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\
             w in nltk.word_tokenize(S)]
        C = [w for w in L if w not in new_stopwords]
        for i, j in zip(C, C[1:]):
            k = i+"_"+j; ikeyw = indKeyword(k)
            keywlinks.write("{0} {1}\n".format(numrec,ikeyw))
...
</code>

<code>
>>> 
==== RESTART: C:/Users/batagelj/Documents/2020/corona/test/meta2netsB.py ====
S2ORC to Pajek 0.1
START: 2021-01-19 04:07:51.671188
5000: 2021-01-19 04:08:58.018983
10000: 2021-01-19 04:09:47.248799
15000: 2021-01-19 04:10:48.702314
20000: 2021-01-19 04:11:24.814379
25000: 2021-01-19 04:12:01.083454
...
365000: 2021-01-19 05:06:55.333874
370000: 2021-01-19 05:07:41.233500
375000: 2021-01-19 05:08:28.893226
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 961905
375094: 2021-01-19 05:08:29.876282
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
</code>

==== Keywords from titles and abstracts ====

Setting ''fromA = True'' and ''fromT = True'' extracts keywords from both titles and abstracts; the run is much longer (about 14 and a half hours):

<code>
>>> 
===== RESTART: C:\Users\batagelj\Documents\2020\corona\test\meta2nets.py =====
S2ORC to Pajek 0.1
START: 2021-01-19 12:58:15.730912
5000: 2021-01-19 13:15:27.950952
10000: 2021-01-19 13:24:55.726427
15000: 2021-01-19 13:37:37.047972
20000: 2021-01-19 13:45:37.731465
25000: 2021-01-19 13:50:29.352145
30000: 2021-01-19 13:58:49.430748
35000: 2021-01-19 14:06:32.506234
40000: 2021-01-19 14:18:40.728886
45000: 2021-01-19 14:32:11.334250
50000: 2021-01-19 14:45:29.242888
...
340000: 2021-01-20 01:53:19.811793
345000: 2021-01-20 02:05:23.890208
350000: 2021-01-20 02:17:29.090687
355000: 2021-01-20 02:30:21.463865
360000: 2021-01-20 02:42:46.636486
365000: 2021-01-20 02:55:18.021463
370000: 2021-01-20 03:08:13.192800
375000: 2021-01-20 03:22:03.156271
number of works = 375094
number of authors = 959851
number of journals = 28054
number of keywords = 431518
375094: 2021-01-20 03:22:19.701218
works X authors network: C:/Users/batagelj/Documents/2020/corona/test/WA.net
works X journals network: C:/Users/batagelj/Documents/2020/corona/test/WJ.net
works X keywords network: C:/Users/batagelj/Documents/2020/corona/test/WK.net
END: 2021-01-20 03:32:32.804285
>>> 
</code>

===== Analyses =====

[[https://github.com/bavla/Corona/tree/main/data|Pajek networks data]]

  * [[.:ana:aut|Authors]]
  * [[.:ana:day|Years and Days]]
  * [[.:ana:jrn|Journals]]
  * [[.:ana:kti|Keywords from titles]]
  * [[.:ana:kt2|Keyword pairs from titles]]
  * [[.:ana:kab|Keywords from titles and abstracts]]
  * [[.:ana:drv|Derived networks]]