# ====== First partial conversion to Pajek files ======
#
# ===== Multirelational network =====
#
# First, using R
#
#   > setwd('C:/Users/Batagelj/data/panama/csvs')
#   > T <- read.csv('entities.csv',head=TRUE,sep=',')
#   > dim(T)
#   [1] 319150     21
#   > colnames(T)
#    [1] "name"                     "original_name"
#    [3] "former_name"              "jurisdiction"
#    [5] "jurisdiction_description" "company_type"
#    [7] "address"                  "internal_id"
#    [9] "incorporation_date"       "inactivation_date"
#   [11] "struck_off_date"          "dorm_date"
#   [13] "status"                   "service_provider"
#   [15] "ibcRUC"                   "country_codes"
#   [17] "countries"                "note"
#   [19] "valid_until"              "node_id"
#   [21] "sourceID"
#
# I determined sizes of data files
#
#   file                   rows   cols
#   -----------------------------------
#   Entities.csv         319150     21
#   Intermediaries.csv    23636      9
#   Officers.csv         345594      7
#   Addresses.csv        151054      7
#   -----------------------------------
#
# and names of attributes.
#
# Afterward, using a short python program, I produced a list of all nodes
# from files - **''panama.nod''**. It turned out that the sets are not
# disjoint - some officers are also intermediaries. An improved version of
# the initial program

import csv, sys, os, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 14. May 2016

# Directory holding the CSV files; the output files are written there too.
DATA_DIR = 'C:/Users/Batagelj/data/panama/csvs'

# Input files, in the order that defines the node type codes 1..4.
podatki = ['Entities.csv', 'Intermediaries.csv', 'Officers.csv', 'Addresses.csv']
# Zero-based column holding the node id in the corresponding file of podatki
# (19 is "node_id" in Entities.csv according to the R listing above; the other
# three indices are presumably node_id as well - their headers are not shown).
nodeInd = [19, 7, 5, 5]

# node id -> (Pajek vertex number, string of type codes seen for that id)
nodes = {}
# Open handle of panama.nod while main() runs; indNode() writes to it.
nod = None


def indNode(nodeId, ty):
    """Determine the Pajek number of a node and record its type.

    A node id seen for the first time gets the next free number, is stored
    with the type code ``str(ty)`` and is appended to the file ``nod`` as
    ``<number> "<nodeId>"``.  A node id that is already known keeps its
    number; a message is printed and the new type code is appended to its
    type string (so an id found in files 2 and 3 ends up with type "23").

    Returns the ``(number, types)`` tuple stored for the node.
    """
    if nodeId in nodes:
        print('*** Duplicated node', len(nodes), nodeId)
        (a, b) = nodes[nodeId]
        nodes[nodeId] = (a, b + str(ty))
    else:
        nodes[nodeId] = (len(nodes) + 1, str(ty))
        nod.write(str(len(nodes)) + ' "' + nodeId + '"\n')
    return nodes[nodeId]


def main(data_dir=DATA_DIR):
    """Build panama.nod (node list) and panama.clu (node types).

    Changes the working directory to ``data_dir``, reads the four CSV files
    listed in ``podatki`` and writes both output files there.  Exits through
    ``sys.exit`` with a message naming the offending file when the csv
    module reports a parse error.
    """
    global nod
    os.chdir(data_dir)
    nodes.clear()           # allow repeated runs within one interpreter
    t1 = datetime.datetime.now()
    print("nodeRead\nstarted: ", t1.ctime())
    # The with-statement closes both outputs on every exit path,
    # including the sys.exit() below.
    with open('panama.clu', 'w', encoding='utf-8') as clu, \
            open('panama.nod', 'w', encoding='utf-8') as nod:
        # The vertex count is not known yet; the "?" is replaced by hand.
        clu.write('% created by nodeRead:' + t1.ctime() + '\n*vertices ?\n')
        for ty, (fname, nInd) in enumerate(zip(podatki, nodeInd), start=1):
            with open(fname, newline='', encoding='utf-8') as dat:
                nodeReader = csv.reader(dat, delimiter=',', quotechar='"')
                try:
                    next(nodeReader, None)      # skip the header row
                    for row in nodeReader:
                        indNode(row[nInd], ty)
                except csv.Error as e:
                    # report the file being read, not the whole file list
                    sys.exit('file {}, line {}: {}'.format(
                        fname, nodeReader.line_num, e))
            print("type = ", ty, " n = ", len(nodes))
        # one line per node: Pajek number, original node id, type codes
        for (k, v) in sorted(nodes.items(), key=itemgetter(1)):
            clu.write("{0:<7} {1:<9} {2}\n".format(v[0], k, v[1]))
    t2 = datetime.datetime.now()
    print("\nfinished: ", t2.ctime())


if __name__ == "__main__":
    main()

# Output of the run:
#
# ============= RESTART: C:/Users/batagelj/data/panama/nodeRead.py =============
# nodeRead
# started:  Sun May 15 03:38:03 2016
# type =  1  n =  319150
# type =  2  n =  342786
# *** Duplicated node 342798 51122
# *** Duplicated node 342832 51149
# *** Duplicated node 342846 51162
# ...
# (end of the nodeRead output)
#
# *** Duplicated node 448544 70699
# *** Duplicated node 448657 70785
# *** Duplicated node 448769 70872
# *** Duplicated node 448783 70884
# type =  3  n =  687241
# type =  4  n =  838295
#
# finished:  Sun May 15 03:38:22 2016
#
# produces the list ''panama.nod'' and a partition ''panama.clu'' with
# classes (clusters)
#
#   Cluster                         Freq   Representative
#   ---------------------------------------------------
#   C1  Entities                  319150        1
#   C2  Intermediaries             22497   320290
#   C3  Officers                  344455   342787
#   C4  Addresses                 151054   687242
#   C5  Intermediaries+Officers     1139   319151
#   ---------------------------------------------------
#
# Both files were manually adapted to Pajek format.
#
# In the next step I transformed the file ''all_edges.csv'' into a
# corresponding Pajek's multirelational list of arcs:

import csv, sys, os, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 10-11, 15, 21. May 2016

# Directory holding panama.nod and all_edges.csv; panama.net is written there.
DATA_DIR = 'C:/Users/Batagelj/data/panama/csvs'

nodes = {}      # node id -> Pajek vertex number
rels = {}       # relation name -> Pajek relation number


def indNode(nodeId):
    """Determine the Pajek number of a node, numbering it if it is new."""
    if nodeId not in nodes:
        nodes[nodeId] = len(nodes) + 1
    return nodes[nodeId]


def indRel(nodeId):
    """Determine the Pajek number of a relation, numbering it if it is new."""
    if nodeId not in rels:
        rels[nodeId] = len(rels) + 1
    return rels[nodeId]


def _readNodes(fname):
    """Number the nodes listed in fname (lines of the form: number "id").

    Rows with fewer than two fields and rows whose first field starts with
    '*' or '%' (a Pajek "*vertices n" header or a comment) are skipped, so
    the file works both as written by nodeRead and after it was adapted to
    Pajek format.
    """
    with open(fname, newline='', encoding='utf-8') as nod:
        nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
        try:
            for row in nodReader:
                if len(row) < 2 or row[0][:1] in ('*', '%'):
                    continue
                indNode(row[1])
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(
                fname, nodReader.line_num, e))


def main(data_dir=DATA_DIR):
    """Convert all_edges.csv into the Pajek arc list panama.net.

    Changes the working directory to ``data_dir``, numbers the nodes from
    panama.nod and writes one line ``relation: source target`` per data row
    of all_edges.csv (columns node_1, rel_type, node_2).  Exits through
    ``sys.exit`` when the csv module reports a parse error.
    """
    os.chdir(data_dir)
    nodes.clear()           # allow repeated runs within one interpreter
    rels.clear()
    t1 = datetime.datetime.now()
    print("linkRead\nstarted: ", t1.ctime())
    _readNodes('panama.nod')
    known = len(nodes)
    podatki = 'all_edges.csv'
    m = 0
    head = None
    # The with-statement closes both files on every exit path.
    with open('panama.net', 'w', encoding='utf-8') as net, \
            open(podatki, newline='', encoding='utf-8') as dat:
        net.write('*arcs\n')
        linkReader = csv.reader(dat, delimiter=',', quotechar='"')
        try:
            head = next(linkReader, None)       # header row
            for row in linkReader:
                m = m + 1
                u = indNode(row[0]); v = indNode(row[2]); r = indRel(row[1])
                net.write(str(r) + ": " + str(u) + ' ' + str(v) + '\n')
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(
                podatki, linkReader.line_num, e))
    print("n = ", len(nodes), " m = ", m)
    if len(nodes) > known:
        # endpoints absent from panama.nod were numbered after the known nodes
        print('*** nodes missing from panama.nod:', len(nodes) - known)
    print(head)
    for (k, v) in sorted(rels.items(), key=itemgetter(1)):
        print('{0:>3} "{1}"'.format(v, k))
    t2 = datetime.datetime.now()
    print("\nfinished: ", t2.ctime())


if __name__ == "__main__":
    main()

# >>>
# ============= RESTART: C:\Users\batagelj\data\panama\linkRead.py =============
# linkRead
# started:  Sun May 15 04:13:23 2016
# n =  838295  m =  1265690
# ['node_1', 'rel_type', 'node_2']
# intermediary_of 1
# registered_address 3
# similar 4
# underlying 5
# officer_of 2
#
# finished:  Sun May 15 04:13:32 2016
#
# In a text editor we combine files ''panama.nod'' (list of nodes) and
# ''panama.net'' (list of relational arcs) into a Pajek multirelational
# network file ''panama.net''.
#
# Files ''panama.net'' and ''panama.clu'' are available at
# {{notes:zip:panama.zip}}
#
# ===== Structure of the Panama network =====
#
# To get an insight into the overall structure of the panama network I
# applied the Pajek's command
#
#   Operations/Network+Partition/Shrink Network
#
# The obtained reduced network has a matrix
#
#            C1     C5     C2      C3      C4   Label
#   ----------------------------------------------------
#   C1.     622      0      0       0   93623   #67028
#   C5.   51754      1      4      13    1193   #51122
#   C2.  310671      0     16      43    8165   #66951
#   C3.  538166     48    301   47579  213491   #51113
#   C4.       0      0      0       0       0   #2004267
#   ----------------------------------------------------
#
# As a multirelational network it is presented in the following figure
#
# {{notes:pics:relations.svg}}
#
# {{notes:pics:relations.pdf}}
#
# where relations are represented by the following colors
#
#   R1  intermediary_of     319121  red
#   R2  officer_of          581476  blue
#   R3  registered_address  317094  green
#   R4  similar              46761  pink
#   R5  underlying            1238  orange
#
# ===== Countries, labels and jurisdiction =====
#
# Nodes can be related to different countries. We get a multi-relational
# two-mode network Nodes X Countries **''countries.net''**. The information
# about the source file is preserved as relation number: 1 "entities",
# 2 "intermediaries", 3 "officers", 4 "addresses".
# To (some) Entities also the country of jurisdiction is assigned -
# partition **''juris.clu''**. Labels of nodes were saved on the file
# **''panama.nam''**.

import csv, sys, os, re, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 19. May 2016

# Directory holding panama.nod and the CSV files; outputs are written there.
DATA_DIR = 'C:/Users/Batagelj/data/panama/csvs'

# One entry per source file:
#   (file, relation number, node id column, label column,
#    country codes column, jurisdiction column or None)
# All column indices are zero-based.  For entities.csv they correspond to
# node_id, name, country_codes and jurisdiction in the R colnames listing;
# for Addresses.csv to node_id, address and country_codes in the header
# printed by the run below.  The Intermediaries/Officers headers are not
# shown on this page - presumably the same fields; verify against the data.
SOURCES = [
    ('entities.csv', 1, 19, 0, 15, 3),
    ('Intermediaries.csv', 2, 7, 0, 4, None),
    ('Officers.csv', 3, 5, 0, 3, None),
    ('Addresses.csv', 4, 5, 0, 3, None),
]
# Class number written to juris.clu for an entity with an empty jurisdiction.
NO_JURISDICTION = 999

nodes = {}      # node id -> Pajek vertex number
cnty = {}       # country code -> country number (1, 2, ...)


def indNode(nodeId):
    """Determine the Pajek number of a node, numbering it if it is new."""
    if nodeId not in nodes:
        nodes[nodeId] = len(nodes) + 1
    return nodes[nodeId]


def indCountry(nodeId):
    """Determine the number of a country code, numbering it if it is new."""
    if nodeId not in cnty:
        cnty[nodeId] = len(cnty) + 1
    return cnty[nodeId]


def _readNodes(fname):
    """Number the nodes listed in fname (lines of the form: number "id").

    Rows with fewer than two fields and rows whose first field starts with
    '*' or '%' (a Pajek "*vertices n" header or a comment) are skipped, so a
    file without a header no longer loses its first node.
    """
    with open(fname, newline='', encoding='utf-8') as nod:
        nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
        try:
            for row in nodReader:
                if len(row) < 2 or row[0][:1] in ('*', '%'):
                    continue
                indNode(row[1])
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(
                fname, nodReader.line_num, e))


def main(data_dir=DATA_DIR):
    """Write countries.net, juris.clu and panama.nam.

    Changes the working directory to ``data_dir`` and numbers the nodes
    from panama.nod (n nodes).  For every row of every file in ``SOURCES``
    the node label is stored and one arc ``relation: node n+country`` is
    written to countries.net per ';'-separated country code; for the
    entities the jurisdiction is numbered through the same country table
    and stored for juris.clu.

    Exits through ``sys.exit`` on a csv parse error or when a row refers to
    a node id that is not listed in panama.nod (its number would collide
    with the country vertices n+1, n+2, ...).
    """
    os.chdir(data_dir)
    nodes.clear()           # allow repeated runs within one interpreter
    cnty.clear()
    t1 = datetime.datetime.now()
    print("otherRead\nstarted: ", t1.ctime())
    _readNodes('panama.nod')
    n = len(nodes); label = [""] * n; juris = [0] * n
    head = None
    with open('countries.net', 'w', encoding='utf-8') as net:
        net.write('*arcs\n')
        for (data, r, idCol, labCol, cntyCol, jurCol) in SOURCES:
            m = 0           # arcs written for this source file
            with open(data, newline='', encoding='utf-8') as dat:
                datReader = csv.reader(dat, delimiter=',', quotechar='"')
                try:
                    head = next(datReader, None)        # header row
                    for row in datReader:
                        u = nodes.get(row[idCol])
                        if u is None:
                            sys.exit('file {}, line {}: node "{}" is not '
                                     'listed in panama.nod'.format(
                                         data, datReader.line_num, row[idCol]))
                        label[u - 1] = row[labCol]
                        if jurCol is not None:
                            juris[u - 1] = (indCountry(row[jurCol])
                                            if row[jurCol] else NO_JURISDICTION)
                        if row[cntyCol]:
                            for code in row[cntyCol].split(';'):
                                # countries are numbered after the n nodes
                                m = m + 1; v = n + indCountry(code)
                                net.write(str(r) + ": " + str(u) + ' '
                                          + str(v) + '\n')
                except csv.Error as e:
                    sys.exit('file {}, line {}: {}'.format(
                        data, datReader.line_num, e))
            print("r = ", r, " n = ", len(nodes), " m = ", m)
    print(head)             # header of the last file read
    for (k, v) in sorted(cnty.items(), key=itemgetter(1)):
        print("{0:>3} {1:>3} {2}".format(v, n + v, k))
    with open('juris.clu', 'w', encoding='utf-8') as clu:
        clu.write('% created by otherRead:' + t1.ctime()
                  + '\n*vertices ' + str(n) + '\n')
        for j in juris:
            clu.write("{0:<3}\n".format(j))
    with open('panama.nam', 'w', encoding='utf-8') as nam:
        nam.write('% created by otherRead:' + t1.ctime()
                  + '\n*vertices ' + str(n) + '\n')
        for k, name in enumerate(label, start=1):
            nam.write('{0:<7} "{1}"\n'.format(k, name))
    t2 = datetime.datetime.now()
    print("\nfinished: ", t2.ctime())


if __name__ == "__main__":
    main()

# >>>
# ============ RESTART: C:/Users/batagelj/data/panama/otherRead.py ============
# otherRead
# started:  Fri May 20 02:03:02 2016
# r =  1  n =  838295  m =  365643
# r =  2  n =  838295  m =  22860
# r =  3  n =  838295  m =  252197
# r =  4  n =  838295  m =  150162
# ['address', 'icij_id', 'valid_until', 'country_codes', 'countries', 'node_id', 'sourceID']
#   1 838296 XXX
#   2 838297 VGB
#   3 838298 CYP
#   4 838299 SGP
#   5 838300 USA
#   6 838301 RUS
#   7 838302 NLD
#   8 838303 GBR
#   9 838304 BVI
#  10 838305 HKG
# ...
# 235 838530 ETH
# 236 838531 GNB
# 237 838532 SLB
# 238 838533 GNQ
# 239 838534 REU
# 240 838535 SMR
#
# finished:  Fri May 20 02:03:18 2016
# >>>