Differences

This shows you the differences between two versions of the page.

Link to this comparison view

notes:net:first [2016/05/23 15:39] (current)
vlado created
Line 1: Line 1:
 +====== First partial conversion to Pajek files ======
  
 +===== Multirelational network =====
 +
 +First, using R 
 +<code R>
 +> setwd('C:/Users/Batagelj/data/panama/csvs')
 +> T <- read.csv('entities.csv',head=TRUE,sep=',')
 +> dim(T)
 +[1] 319150     21
 +> colnames(T)
 + [1] "name"                     "original_name"           
 + [3] "former_name"              "jurisdiction"            
 + [5] "jurisdiction_description" "company_type"            
 + [7] "address"                  "internal_id"             
 + [9] "incorporation_date"       "inactivation_date"       
 +[11] "struck_off_date"          "dorm_date"               
 +[13] "status"                   "service_provider"        
 +[15] "ibcRUC"                   "country_codes"           
 +[17] "countries"                "note"                    
 +[19] "valid_until"              "node_id"                 
 +[21] "sourceID"                
 +</code>
 +I determined sizes of data files
 +<code>
 +file                    rows   cols
 +-----------------------------------
 +Entities.csv          319150     21
 +Intermediaries.csv     23636      9
 +Officers.csv          345594      7
 +Addresses.csv         151054      7
 +-----------------------------------
 +</code>
 +and names of attributes.
 +
 +Afterward, using a short python program, I produced a list of all nodes from files - **''panama.nod''**.
 +
 +It turned out that the sets are not disjoint - some officers are also intermediaries. An improved version of the initial program 
 +<code python>
 +import csv, sys, os, time, datetime
 +from operator import itemgetter
 +# by Vladimir Batagelj, 14. May 2016
 +os.chdir('C:/Users/Batagelj/data/panama/csvs')
 +
 +def indNode(nodeId,ty):
 +# determines the Pajek's number of a node
 +   if nodeId in nodes:
 +      print('*** Duplicated node', len(nodes), nodeId)
 +      (a,b) = nodes[nodeId]
 +      nodes[nodeId] = (a,b+str(ty))
 +   else:
 +      nodes[nodeId] = (len(nodes)+1,str(ty))
 +      nod.write(str(len(nodes))+' "'+nodeId+'"\n')
 +   return nodes[nodeId]
 +
 +t1 = datetime.datetime.now()
 +print("nodeRead\nstarted: ",t1.ctime())
 +clu = open('panama.clu','w',encoding='utf-8')
 +nod = open('panama.nod','w',encoding='utf-8')
 +clu.write('% created by nodeRead:'+t1.ctime()+'\n*vertices ?\n')
 +nodes = {}
 +
 +podatki = ['Entities.csv','Intermediaries.csv','Officers.csv','Addresses.csv']
 +nodeInd = [19,7,5,5] 
 +for ty in range(4):
 +   nInd = nodeInd[ty]
 +   with open(podatki[ty], newline='', encoding='utf-8') as dat:
 +      nodeReader = csv.reader(dat, delimiter=',', quotechar='"')
 +      try:
 +         head = next(nodeReader)
 +         for row in nodeReader: u = indNode(row[nInd],ty+1)
 +      except csv.Error as e:
 +         sys.exit('file {}, line {}: {}'.format(
 +            podatki, nodeReader.line_num, e))
 +   print("type = ",ty+1,"     n = ",len(nodes))
 +
 +for (k,v) in sorted(nodes.items(), key=itemgetter(1)):
 +   clu.write("{0:<7} {1:<9} {2}\n".format(v[0],k,v[1]))
 +
 +nod.close(); clu.close()
 +t2 = datetime.datetime.now()
 +print("\nfinished: ",t2.ctime())
 +</code>
 +
 +<code>  
 +============= RESTART: C:/Users/batagelj/data/panama/nodeRead.py =============
 +nodeRead
 +started:  Sun May 15 03:38:03 2016
 +type =  1      n =  319150
 +type =  2      n =  342786
 +*** Duplicated node 342798 51122
 +*** Duplicated node 342832 51149
 +*** Duplicated node 342846 51162
 +...
 +*** Duplicated node 448544 70699
 +*** Duplicated node 448657 70785
 +*** Duplicated node 448769 70872
 +*** Duplicated node 448783 70884
 +type =  3      n =  687241
 +type =  4      n =  838295
 +
 +finished:  Sun May 15 03:38:22 2016
 +</code>
 +produces the list ''panama.nod'' and a partition ''panama.clu'' with classes (clusters)
 +<code>
 +    Cluster                     Freq  Representative
 + ---------------------------------------------------
 + C1 Entities                  319150          1
 + C2 Intermediaries             22497     320290
 + C3 Officers                  344455     342787
 + C4 Addresses                 151054     687242
 + C5 Intermediaries+Officers     1139     319151
 + ---------------------------------------------------
 +</code>
 +Both files were manually adapted to Pajek format.
 +
 +In the next step I transformed the file ''all_edges.csv'' into a corresponding Pajek's multirelational list of arcs: 
 +<code python>
 +import csv, sys, os, time, datetime
 +from operator import itemgetter
 +# by Vladimir Batagelj, 10-11, 15, 21. May 2016
 +os.chdir('C:/Users/Batagelj/data/panama/csvs')
 +
 +def indNode(nodeId):
 +# determines the Pajek's number of a node
 +   if not(nodeId in nodes):
 +      nodes[nodeId] = len(nodes)+1
 +   return nodes[nodeId]
 +
 +def indRel(nodeId):
 +# determines the Pajek's number of a relation
 +   if not(nodeId in rels):
 +      rels[nodeId] = len(rels)+1
 +   return rels[nodeId]
 +
 +t1 = datetime.datetime.now()
 +print("linkRead\nstarted: ",t1.ctime())
 +nodes = {}
 +with open('panama.nod', newline='', encoding='utf-8') as nod:
 +   nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
 +   try:
 +      for row in nodReader:
 +         u = indNode(row[1])
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         'panama.nod', nodReader.line_num, e))
 +      
 +podatki = 'all_edges.csv'
 +net = open('panama.net','w',encoding='utf-8')
 +net.write('*arcs\n')
 +rels = {}
 +with open(podatki, newline='', encoding='utf-8') as dat:
 +   linkReader = csv.reader(dat, delimiter=',', quotechar='"')
 +   m = 0
 +   try:
 +      head = next(linkReader)
 +      for row in linkReader:
 +         m = m+1
 +         u = indNode(row[0]); v = indNode(row[2]); r = indRel(row[1])
 +         net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         podatki, linkReader.line_num, e))
 +print("n = ",len(nodes),"   m = ",m)
 +print(head)
 +for (k,v) in sorted(rels.items(), key=itemgetter(1)):
 +   print('{0:>3} "{1}"'.format(v,k))
 +
 +net.close()
 +t2 = datetime.datetime.now()
 +print("\nfinished: ",t2.ctime())
 +</code>
 +
 +<code>
 +>>> 
 +============= RESTART: C:\Users\batagelj\data\panama\linkRead.py =============
 +linkRead
 +started:  Sun May 15 04:13:23 2016
 +n =  838295    m =  1265690
 +['node_1', 'rel_type', 'node_2']
 +intermediary_of 1
 +registered_address 3
 +similar 4
 +underlying 5
 +officer_of 2
 +
 +finished:  Sun May 15 04:13:32 2016
 +</code>
 +
 +In a text editor we combine files ''panama.nod'' (list of nodes) and ''panama.net'' (list of relational arcs) into a Pajek multirelational network file ''panama.net''. 
 +
 +Files ''panama.net'' and ''panama.clu'' are available at
 +
 +{{notes:zip:panama.zip}}
 +
 +===== Structure of the Panama network =====
 +
 +To get an insight to the overall structure of the panama network I applied the Pajek's command
 +<code>
 +Operations/Network+Partition/Shrink Network
 +</code>
 +The obtained reduced network has a matrix
 +<code>
 +            C1     C5     C2     C3     C4  Label
 +----------------------------------------------------
 +  C1.      622      0      0      0  93623  #67028    
 +  C5.    51754      1      4     13   1193  #51122    
 +  C2.   310671      0     16     43   8165  #66951   
 +  C3.   538166     48    301  47579 213491  #51113   
 +  C4.        0      0      0      0      0  #2004267
 +----------------------------------------------------
 +</code>
 +As a multirelational network it is presented in the following figure
 +
 +{{notes:pics:relations.svg}}
 +
 +{{notes:pics:relations.pdf}}
 +
 +where relations are represented by the following colors
 +<code>
 +R1 intermediary_of       319121   red
 +R2 officer_of            581476   blue
 +R3 registered_address    317094   green
 +R4 similar                46761   pink
 +R5 underlying              1238   orange
 +</code>
 +
 +===== Countries, labels and jurisdiction =====
 +
 +Nodes can be related to different countries. We get a multi-relational two-mode network Nodes X Countries  **''countries.net''**. The information about the source file is preserved as relation number: 1 "entities", 2 "intermediaries", 3 "officers", 4 "addresses". To (some) Entities also the country of jurisdiction is assigned - partition  **''juris.clu''**. Labels of nodes were saved on the file **''panama.nam''**. 
 +<code python>
 +import csv, sys, os, re, time, datetime
 +from operator import itemgetter
 +# by Vladimir Batagelj, 19. May 2016
 +os.chdir('C:/Users/Batagelj/data/panama/csvs')
 +
 +def indNode(nodeId):
 +# determines the Pajek's number of a node
 +   if not(nodeId in nodes):
 +      nodes[nodeId] = len(nodes)+1
 +   return nodes[nodeId]
 +
 +def indCountry(nodeId):
 +# determines the Pajek's number of a country
 +   if not(nodeId in cnty):
 +      cnty[nodeId] = len(cnty)+1
 +   return cnty[nodeId]
 +
 +t1 = datetime.datetime.now()
 +print("otherRead\nstarted: ",t1.ctime())
 +nodes = {}
 +with open('panama.nod', newline='', encoding='utf-8') as nod:
 +   nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
 +   try:
 +#      head = next(nodReader); head = next(nodReader)
 +      for row in nodReader:
 +         u = indNode(row[1])
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         'panama.nod', nodReader.line_num, e))
 +      
 +# T <- read.csv('entities.csv',head=TRUE,sep=',')      
 +cnty = {}; n = len(nodes); label = [""]*n; juris = [0]*n
 +net = open('countries.net','w',encoding='utf-8')
 +net.write('*arcs\n')
 +data = 'entities.csv'; r = 1
 +with open(data, newline='', encoding='utf-8') as dat:
 +   datReader = csv.reader(dat, delimiter=',', quotechar='"')
 +   m = 0
 +   try:
 +      head = next(datReader)
 +      for row in datReader:
 +         u = indNode(row[19]); label[u-1] = row[0]
 +         juris[u-1] = indCountry(row[3]) if len(row[3])>0 else 999
 +         if len(row[15])>0:
 +            L = re.split(';',row[15])
 +            for e in L:
 +               m = m+1; v = n+indCountry(e)
 +               net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         data, datReader.line_num, e))
 +print("r = ",r,"   n = ",len(nodes),"   m = ",m)
 +
 +data = 'Intermediaries.csv'; r = 2
 +with open(data, newline='', encoding='utf-8') as dat:
 +   datReader = csv.reader(dat, delimiter=',', quotechar='"')
 +   m = 0
 +   try:
 +      head = next(datReader)
 +      for row in datReader:
 +         u = indNode(row[7]); label[u-1] = row[0]
 +         if len(row[4])>0:
 +            L = re.split(';',row[4])
 +            for e in L:
 +               m = m+1; v = n+indCountry(e)
 +               net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         data, datReader.line_num, e))
 +print("r = ",r,"   n = ",len(nodes),"   m = ",m)
 +
 +data = 'Officers.csv'; r = 3
 +with open(data, newline='', encoding='utf-8') as dat:
 +   datReader = csv.reader(dat, delimiter=',', quotechar='"')
 +   m = 0
 +   try:
 +      head = next(datReader)
 +      for row in datReader:
 +         u = indNode(row[5]); label[u-1] = row[0]
 +         if len(row[3])>0:
 +            L = re.split(';',row[3])
 +            for e in L:
 +               m = m+1; v = n+indCountry(e)
 +               net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         data, datReader.line_num, e))
 +print("r = ",r,"   n = ",len(nodes),"   m = ",m)
 +
 +data = 'Addresses.csv'; r = 4
 +with open(data, newline='', encoding='utf-8') as dat:
 +   datReader = csv.reader(dat, delimiter=',', quotechar='"')
 +   m = 0
 +   try:
 +      head = next(datReader)
 +      for row in datReader:
 +         u = indNode(row[5]); label[u-1] = row[0]
 +         if len(row[3])>0:
 +            L = re.split(';',row[3])
 +            for e in L:
 +               m = m+1; v = n+indCountry(e)
 +               net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
 +   except csv.Error as e:
 +      sys.exit('file {}, line {}: {}'.format(
 +         data, datReader.line_num, e))
 +print("r = ",r,"   n = ",len(nodes),"   m = ",m)
 +
 +net.close()
 +print(head)
 +for (k,v) in sorted(cnty.items(), key=itemgetter(1)):
 +   print("{0:>3} {1:>3} {2}".format(v,n+v,k))
 +
 +clu = open('juris.clu','w',encoding='utf-8')
 +clu.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n')
 +for k in range(n):
 +   clu.write("{0:<3}\n".format(juris[k]))
 +clu.close()
 +
 +nam = open('panama.nam','w',encoding='utf-8')
 +nam.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n')
 +for k in range(n):
 +   nam.write('{0:<7} "{1}"\n'.format(k+1,label[k]))
 +nam.close()
 +
 +t2 = datetime.datetime.now()
 +print("\nfinished: ",t2.ctime())
 +</code>
 +
 +<code>
 +>>> 
 +============ RESTART: C:/Users/batagelj/data/panama/otherRead.py ============
 +otherRead
 +started:  Fri May 20 02:03:02 2016
 +r =  1    n =  838295    m =  365643
 +r =  2    n =  838295    m =  22860
 +r =  3    n =  838295    m =  252197
 +r =  4    n =  838295    m =  150162
 +['address', 'icij_id', 'valid_until', 'country_codes', 'countries', 'node_id', 'sourceID']
 +  1 838296 XXX
 +  2 838297 VGB
 +  3 838298 CYP
 +  4 838299 SGP
 +  5 838300 USA
 +  6 838301 RUS
 +  7 838302 NLD
 +  8 838303 GBR
 +  9 838304 BVI
 + 10 838305 HKG
 +...
 +235 838530 ETH
 +236 838531 GNB
 +237 838532 SLB
 +238 838533 GNQ
 +239 838534 REU
 +240 838535 SMR
 +
 +finished:  Fri May 20 02:03:18 2016
 +>>> 
 +</code>
notes/net/first.txt · Last modified: 2016/05/23 15:39 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki