This shows you the differences between two versions of the page.
— |
notes:net:first [2016/05/23 15:39] (current) vlado created |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== First partial conversion to Pajek files ====== | ||
+ | ===== Multirelational network ===== | ||
+ | |||
+ | First, using R | ||
+ | <code R> | ||
+ | > setwd('C:/Users/Batagelj/data/panama/csvs') | ||
+ | > T <- read.csv('entities.csv',head=TRUE,sep=',') | ||
+ | > dim(T) | ||
+ | [1] 319150 21 | ||
+ | > colnames(T) | ||
+ | [1] "name" "original_name" | ||
+ | [3] "former_name" "jurisdiction" | ||
+ | [5] "jurisdiction_description" "company_type" | ||
+ | [7] "address" "internal_id" | ||
+ | [9] "incorporation_date" "inactivation_date" | ||
+ | [11] "struck_off_date" "dorm_date" | ||
+ | [13] "status" "service_provider" | ||
+ | [15] "ibcRUC" "country_codes" | ||
+ | [17] "countries" "note" | ||
+ | [19] "valid_until" "node_id" | ||
+ | [21] "sourceID" | ||
+ | </code> | ||
+ | I determined sizes of data files | ||
+ | <code> | ||
+ | file rows cols | ||
+ | ----------------------------------- | ||
+ | Entities.csv 319150 21 | ||
+ | Intermediaries.csv 23636 9 | ||
+ | Officers.csv 345594 7 | ||
+ | Addresses.csv 151054 7 | ||
+ | ----------------------------------- | ||
+ | </code> | ||
+ | and names of attributes. | ||
+ | |||
+ | Afterward, using a short python program, I produced a list of all nodes from files - **''panama.nod''**. | ||
+ | |||
+ | It turned out that the sets are not disjoint - some officers are also intermediaries. An improved version of the initial program | ||
+ | <code python> | ||
+ | import csv, sys, os, time, datetime | ||
+ | from operator import itemgetter | ||
+ | # by Vladimir Batagelj, 14. May 2016 | ||
+ | os.chdir('C:/Users/Batagelj/data/panama/csvs') | ||
+ | |||
+ | def indNode(nodeId,ty): | ||
+ | # determines the Pajek's number of a node | ||
+ | if nodeId in nodes: | ||
+ | print('*** Duplicated node', len(nodes), nodeId) | ||
+ | (a,b) = nodes[nodeId] | ||
+ | nodes[nodeId] = (a,b+str(ty)) | ||
+ | else: | ||
+ | nodes[nodeId] = (len(nodes)+1,str(ty)) | ||
+ | nod.write(str(len(nodes))+' "'+nodeId+'"\n') | ||
+ | return nodes[nodeId] | ||
+ | |||
+ | t1 = datetime.datetime.now() | ||
+ | print("nodeRead\nstarted: ",t1.ctime()) | ||
+ | clu = open('panama.clu','w',encoding='utf-8') | ||
+ | nod = open('panama.nod','w',encoding='utf-8') | ||
+ | clu.write('% created by nodeRead:'+t1.ctime()+'\n*vertices ?\n') | ||
+ | nodes = {} | ||
+ | |||
+ | podatki = ['Entities.csv','Intermediaries.csv','Officers.csv','Addresses.csv'] | ||
+ | nodeInd = [19,7,5,5] | ||
+ | for ty in range(4): | ||
+ | nInd = nodeInd[ty] | ||
+ | with open(podatki[ty], newline='', encoding='utf-8') as dat: | ||
+ | nodeReader = csv.reader(dat, delimiter=',', quotechar='"') | ||
+ | try: | ||
+ | head = next(nodeReader) | ||
+ | for row in nodeReader: u = indNode(row[nInd],ty+1) | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | podatki, nodeReader.line_num, e)) | ||
+ | print("type = ",ty+1," n = ",len(nodes)) | ||
+ | |||
+ | for (k,v) in sorted(nodes.items(), key=itemgetter(1)): | ||
+ | clu.write("{0:<7} {1:<9} {2}\n".format(v[0],k,v[1])) | ||
+ | |||
+ | nod.close(); clu.close() | ||
+ | t2 = datetime.datetime.now() | ||
+ | print("\nfinished: ",t2.ctime()) | ||
+ | </code> | ||
+ | |||
+ | <code> | ||
+ | ============= RESTART: C:/Users/batagelj/data/panama/nodeRead.py ============= | ||
+ | nodeRead | ||
+ | started: Sun May 15 03:38:03 2016 | ||
+ | type = 1 n = 319150 | ||
+ | type = 2 n = 342786 | ||
+ | *** Duplicated node 342798 51122 | ||
+ | *** Duplicated node 342832 51149 | ||
+ | *** Duplicated node 342846 51162 | ||
+ | ... | ||
+ | *** Duplicated node 448544 70699 | ||
+ | *** Duplicated node 448657 70785 | ||
+ | *** Duplicated node 448769 70872 | ||
+ | *** Duplicated node 448783 70884 | ||
+ | type = 3 n = 687241 | ||
+ | type = 4 n = 838295 | ||
+ | |||
+ | finished: Sun May 15 03:38:22 2016 | ||
+ | </code> | ||
+ | produces the list ''panama.nod'' and a partition ''panama.clu'' with classes (clusters) | ||
+ | <code> | ||
+ | Cluster Freq Representative | ||
+ | --------------------------------------------------- | ||
+ | C1 Entities 319150 1 | ||
+ | C2 Intermediaries 22497 320290 | ||
+ | C3 Officers 344455 342787 | ||
+ | C4 Addresses 151054 687242 | ||
+ | C5 Intermediaries+Officers 1139 319151 | ||
+ | --------------------------------------------------- | ||
+ | </code> | ||
+ | Both files were manually adapted to Pajek format. | ||
+ | |||
+ | In the next step I transformed the file ''all_edges.csv'' into a corresponding Pajek's multirelational list of arcs: | ||
+ | <code python> | ||
+ | import csv, sys, os, time, datetime | ||
+ | from operator import itemgetter | ||
+ | # by Vladimir Batagelj, 10-11, 15, 21. May 2016 | ||
+ | os.chdir('C:/Users/Batagelj/data/panama/csvs') | ||
+ | |||
+ | def indNode(nodeId): | ||
+ | # determines the Pajek's number of a node | ||
+ | if not(nodeId in nodes): | ||
+ | nodes[nodeId] = len(nodes)+1 | ||
+ | return nodes[nodeId] | ||
+ | |||
+ | def indRel(nodeId): | ||
+ | # determines the Pajek's number of a relation | ||
+ | if not(nodeId in rels): | ||
+ | rels[nodeId] = len(rels)+1 | ||
+ | return rels[nodeId] | ||
+ | |||
+ | t1 = datetime.datetime.now() | ||
+ | print("linkRead\nstarted: ",t1.ctime()) | ||
+ | nodes = {} | ||
+ | with open('panama.nod', newline='', encoding='utf-8') as nod: | ||
+ | nodReader = csv.reader(nod, delimiter=' ', quotechar='"') | ||
+ | try: | ||
+ | for row in nodReader: | ||
+ | u = indNode(row[1]) | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | 'panama.nod', nodReader.line_num, e)) | ||
+ | |||
+ | podatki = 'all_edges.csv' | ||
+ | net = open('panama.net','w',encoding='utf-8') | ||
+ | net.write('*arcs\n') | ||
+ | rels = {} | ||
+ | with open(podatki, newline='', encoding='utf-8') as dat: | ||
+ | linkReader = csv.reader(dat, delimiter=',', quotechar='"') | ||
+ | m = 0 | ||
+ | try: | ||
+ | head = next(linkReader) | ||
+ | for row in linkReader: | ||
+ | m = m+1 | ||
+ | u = indNode(row[0]); v = indNode(row[2]); r = indRel(row[1]) | ||
+ | net.write(str(r)+": "+str(u)+' '+str(v)+'\n') | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | podatki, linkReader.line_num, e)) | ||
+ | print("n = ",len(nodes)," m = ",m) | ||
+ | print(head) | ||
+ | for (k,v) in sorted(rels.items(), key=itemgetter(1)): | ||
+ | print('{0:>3} "{1}"'.format(v,k)) | ||
+ | |||
+ | net.close() | ||
+ | t2 = datetime.datetime.now() | ||
+ | print("\nfinished: ",t2.ctime()) | ||
+ | </code> | ||
+ | |||
+ | <code> | ||
+ | >>> | ||
+ | ============= RESTART: C:\Users\batagelj\data\panama\linkRead.py ============= | ||
+ | linkRead | ||
+ | started: Sun May 15 04:13:23 2016 | ||
+ | n = 838295 m = 1265690 | ||
+ | ['node_1', 'rel_type', 'node_2'] | ||
+ | intermediary_of 1 | ||
+ | registered_address 3 | ||
+ | similar 4 | ||
+ | underlying 5 | ||
+ | officer_of 2 | ||
+ | |||
+ | finished: Sun May 15 04:13:32 2016 | ||
+ | </code> | ||
+ | |||
+ | In a text editor we combine files ''panama.nod'' (list of nodes) and ''panama.net'' (list of relational arcs) into a Pajek multirelational network file ''panama.net''. | ||
+ | |||
+ | Files ''panama.net'' and ''panama.clu'' are available at | ||
+ | |||
+ | {{notes:zip:panama.zip}} | ||
+ | |||
+ | ===== Structure of the Panama network ===== | ||
+ | |||
+ | To get an insight to the overall structure of the panama network I applied the Pajek's command | ||
+ | <code> | ||
+ | Operations/Network+Partition/Shrink Network | ||
+ | </code> | ||
+ | The obtained reduced network has a matrix | ||
+ | <code> | ||
+ | C1 C5 C2 C3 C4 Label | ||
+ | ---------------------------------------------------- | ||
+ | C1. 622 0 0 0 93623 #67028 | ||
+ | C5. 51754 1 4 13 1193 #51122 | ||
+ | C2. 310671 0 16 43 8165 #66951 | ||
+ | C3. 538166 48 301 47579 213491 #51113 | ||
+ | C4. 0 0 0 0 0 #2004267 | ||
+ | ---------------------------------------------------- | ||
+ | </code> | ||
+ | As a multirelational network it is presented in the following figure | ||
+ | |||
+ | {{notes:pics:relations.svg}} | ||
+ | |||
+ | {{notes:pics:relations.pdf}} | ||
+ | |||
+ | where relations are represented by the following colors | ||
+ | <code> | ||
+ | R1 intermediary_of 319121 red | ||
+ | R2 officer_of 581476 blue | ||
+ | R3 registered_address 317094 green | ||
+ | R4 similar 46761 pink | ||
+ | R5 underlying 1238 orange | ||
+ | </code> | ||
+ | |||
+ | ===== Countries, labels and jurisdiction ===== | ||
+ | |||
+ | Nodes can be related to different countries. We get a multi-relational two-mode network Nodes X Countries **''countries.net''**. The information about the source file is preserved as relation number: 1 "entities", 2 "intermediaries", 3 "officers", 4 "addresses". To (some) Entities also the country of jurisdiction is assigned - partition **''juris.clu''**. Labels of nodes were saved on the file **''panama.nam''**. | ||
+ | <code python> | ||
+ | import csv, sys, os, re, time, datetime | ||
+ | from operator import itemgetter | ||
+ | # by Vladimir Batagelj, 19. May 2016 | ||
+ | os.chdir('C:/Users/Batagelj/data/panama/csvs') | ||
+ | |||
+ | def indNode(nodeId): | ||
+ | # determines the Pajek's number of a node | ||
+ | if not(nodeId in nodes): | ||
+ | nodes[nodeId] = len(nodes)+1 | ||
+ | return nodes[nodeId] | ||
+ | |||
+ | def indCountry(nodeId): | ||
+ | # determines the Pajek's number of a country | ||
+ | if not(nodeId in cnty): | ||
+ | cnty[nodeId] = len(cnty)+1 | ||
+ | return cnty[nodeId] | ||
+ | |||
+ | t1 = datetime.datetime.now() | ||
+ | print("otherRead\nstarted: ",t1.ctime()) | ||
+ | nodes = {} | ||
+ | with open('panama.nod', newline='', encoding='utf-8') as nod: | ||
+ | nodReader = csv.reader(nod, delimiter=' ', quotechar='"') | ||
+ | try: | ||
+ | # head = next(nodReader); head = next(nodReader) | ||
+ | for row in nodReader: | ||
+ | u = indNode(row[1]) | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | 'panama.nod', nodReader.line_num, e)) | ||
+ | |||
+ | # T <- read.csv('entities.csv',head=TRUE,sep=',') | ||
+ | cnty = {}; n = len(nodes); label = [""]*n; juris = [0]*n | ||
+ | net = open('countries.net','w',encoding='utf-8') | ||
+ | net.write('*arcs\n') | ||
+ | data = 'entities.csv'; r = 1 | ||
+ | with open(data, newline='', encoding='utf-8') as dat: | ||
+ | datReader = csv.reader(dat, delimiter=',', quotechar='"') | ||
+ | m = 0 | ||
+ | try: | ||
+ | head = next(datReader) | ||
+ | for row in datReader: | ||
+ | u = indNode(row[19]); label[u-1] = row[0] | ||
+ | juris[u-1] = indCountry(row[3]) if len(row[3])>0 else 999 | ||
+ | if len(row[15])>0: | ||
+ | L = re.split(';',row[15]) | ||
+ | for e in L: | ||
+ | m = m+1; v = n+indCountry(e) | ||
+ | net.write(str(r)+": "+str(u)+' '+str(v)+'\n') | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | data, datReader.line_num, e)) | ||
+ | print("r = ",r," n = ",len(nodes)," m = ",m) | ||
+ | |||
+ | data = 'Intermediaries.csv'; r = 2 | ||
+ | with open(data, newline='', encoding='utf-8') as dat: | ||
+ | datReader = csv.reader(dat, delimiter=',', quotechar='"') | ||
+ | m = 0 | ||
+ | try: | ||
+ | head = next(datReader) | ||
+ | for row in datReader: | ||
+ | u = indNode(row[7]); label[u-1] = row[0] | ||
+ | if len(row[4])>0: | ||
+ | L = re.split(';',row[4]) | ||
+ | for e in L: | ||
+ | m = m+1; v = n+indCountry(e) | ||
+ | net.write(str(r)+": "+str(u)+' '+str(v)+'\n') | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | data, datReader.line_num, e)) | ||
+ | print("r = ",r," n = ",len(nodes)," m = ",m) | ||
+ | |||
+ | data = 'Officers.csv'; r = 3 | ||
+ | with open(data, newline='', encoding='utf-8') as dat: | ||
+ | datReader = csv.reader(dat, delimiter=',', quotechar='"') | ||
+ | m = 0 | ||
+ | try: | ||
+ | head = next(datReader) | ||
+ | for row in datReader: | ||
+ | u = indNode(row[5]); label[u-1] = row[0] | ||
+ | if len(row[3])>0: | ||
+ | L = re.split(';',row[3]) | ||
+ | for e in L: | ||
+ | m = m+1; v = n+indCountry(e) | ||
+ | net.write(str(r)+": "+str(u)+' '+str(v)+'\n') | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | data, datReader.line_num, e)) | ||
+ | print("r = ",r," n = ",len(nodes)," m = ",m) | ||
+ | |||
+ | data = 'Addresses.csv'; r = 4 | ||
+ | with open(data, newline='', encoding='utf-8') as dat: | ||
+ | datReader = csv.reader(dat, delimiter=',', quotechar='"') | ||
+ | m = 0 | ||
+ | try: | ||
+ | head = next(datReader) | ||
+ | for row in datReader: | ||
+ | u = indNode(row[5]); label[u-1] = row[0] | ||
+ | if len(row[3])>0: | ||
+ | L = re.split(';',row[3]) | ||
+ | for e in L: | ||
+ | m = m+1; v = n+indCountry(e) | ||
+ | net.write(str(r)+": "+str(u)+' '+str(v)+'\n') | ||
+ | except csv.Error as e: | ||
+ | sys.exit('file {}, line {}: {}'.format( | ||
+ | data, datReader.line_num, e)) | ||
+ | print("r = ",r," n = ",len(nodes)," m = ",m) | ||
+ | |||
+ | net.close() | ||
+ | print(head) | ||
+ | for (k,v) in sorted(cnty.items(), key=itemgetter(1)): | ||
+ | print("{0:>3} {1:>3} {2}".format(v,n+v,k)) | ||
+ | |||
+ | clu = open('juris.clu','w',encoding='utf-8') | ||
+ | clu.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n') | ||
+ | for k in range(n): | ||
+ | clu.write("{0:<3}\n".format(juris[k])) | ||
+ | clu.close() | ||
+ | |||
+ | nam = open('panama.nam','w',encoding='utf-8') | ||
+ | nam.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n') | ||
+ | for k in range(n): | ||
+ | nam.write('{0:<7} "{1}"\n'.format(k+1,label[k])) | ||
+ | nam.close() | ||
+ | |||
+ | t2 = datetime.datetime.now() | ||
+ | print("\nfinished: ",t2.ctime()) | ||
+ | </code> | ||
+ | |||
+ | <code> | ||
+ | >>> | ||
+ | ============ RESTART: C:/Users/batagelj/data/panama/otherRead.py ============ | ||
+ | otherRead | ||
+ | started: Fri May 20 02:03:02 2016 | ||
+ | r = 1 n = 838295 m = 365643 | ||
+ | r = 2 n = 838295 m = 22860 | ||
+ | r = 3 n = 838295 m = 252197 | ||
+ | r = 4 n = 838295 m = 150162 | ||
+ | ['address', 'icij_id', 'valid_until', 'country_codes', 'countries', 'node_id', 'sourceID'] | ||
+ | 1 838296 XXX | ||
+ | 2 838297 VGB | ||
+ | 3 838298 CYP | ||
+ | 4 838299 SGP | ||
+ | 5 838300 USA | ||
+ | 6 838301 RUS | ||
+ | 7 838302 NLD | ||
+ | 8 838303 GBR | ||
+ | 9 838304 BVI | ||
+ | 10 838305 HKG | ||
+ | ... | ||
+ | 235 838530 ETH | ||
+ | 236 838531 GNB | ||
+ | 237 838532 SLB | ||
+ | 238 838533 GNQ | ||
+ | 239 838534 REU | ||
+ | 240 838535 SMR | ||
+ | |||
+ | finished: Fri May 20 02:03:18 2016 | ||
+ | >>> | ||
+ | </code> |