First partial conversion to Pajek files

Multirelational network

First, using R

> setwd('C:/Users/Batagelj/data/panama/csvs')
> T <- read.csv('entities.csv',head=TRUE,sep=',')
> dim(T)
[1] 319150     21
> colnames(T)
 [1] "name"                     "original_name"           
 [3] "former_name"              "jurisdiction"            
 [5] "jurisdiction_description" "company_type"            
 [7] "address"                  "internal_id"             
 [9] "incorporation_date"       "inactivation_date"       
[11] "struck_off_date"          "dorm_date"               
[13] "status"                   "service_provider"        
[15] "ibcRUC"                   "country_codes"           
[17] "countries"                "note"                    
[19] "valid_until"              "node_id"                 
[21] "sourceID"                

I determined sizes of data files

file                    rows   cols
-----------------------------------
Entities.csv          319150     21
Intermediaries.csv     23636      9
Officers.csv          345594      7
Addresses.csv         151054      7
-----------------------------------

and names of attributes.

Afterwards, using a short Python program, I produced a list of all nodes from these files - panama.nod.

It turned out that the sets are not disjoint - some officers are also intermediaries. An improved version of the initial program

import csv, sys, os, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 14. May 2016
# All files below are opened by relative name, so make the data directory
# the current working directory first.
os.chdir('C:/Users/Batagelj/data/panama/csvs')
 
def indNode(nodeId,ty):
   """Register one occurrence of nodeId in the file of type ty.

   The global dict nodes maps a node id to the pair (Pajek number, types),
   where types is the concatenation of the str(ty) values the id was seen
   with. A new id gets the next free number and is appended to the global
   file nod as the line: number "nodeId". A repeated id is reported on
   stdout and only has its types string extended.
   Returns the (number, types) pair stored for nodeId.
   """
   tag = str(ty)
   known = nodes.get(nodeId)
   if known is None:
      # first occurrence: numbers are handed out consecutively from 1
      number = len(nodes)+1
      nodes[nodeId] = (number, tag)
      nod.write(str(number)+' "'+nodeId+'"\n')
   else:
      print('*** Duplicated node', len(nodes), nodeId)
      number, types = known
      nodes[nodeId] = (number, types+tag)
   return nodes[nodeId]
 
# Number the node ids of the four CSV files in order of first appearance
# (written to panama.nod) and record for every node the file type(s) it
# occurs in (written to panama.clu).
t1 = datetime.datetime.now()
print("nodeRead\nstarted: ",t1.ctime())
nodes = {}   # node id -> (Pajek number, digits of the file types containing it)

podatki = ['Entities.csv','Intermediaries.csv','Officers.csv','Addresses.csv']
nodeInd = [19,7,5,5]   # index of the node id column in the corresponding file

# indNode() writes to the global nod. The with statement closes both output
# files even when sys.exit() aborts the run.
with open('panama.clu','w',encoding='utf-8') as clu, \
     open('panama.nod','w',encoding='utf-8') as nod:
   # '?' stands in for the number of vertices
   clu.write('% created by nodeRead:'+t1.ctime()+'\n*vertices ?\n')
   for ty, (fileName, nInd) in enumerate(zip(podatki, nodeInd), start=1):
      with open(fileName, newline='', encoding='utf-8') as dat:
         nodeReader = csv.reader(dat, delimiter=',', quotechar='"')
         try:
            next(nodeReader)   # skip the header row
            for row in nodeReader: indNode(row[nInd],ty)
         except csv.Error as e:
            # name the file that is being read, not the whole list of files
            sys.exit('file {}, line {}: {}'.format(
               fileName, nodeReader.line_num, e))
      print("type = ",ty,"     n = ",len(nodes))

   # one line per node, ordered by Pajek number: number, node id, types
   for (k,v) in sorted(nodes.items(), key=itemgetter(1)):
      clu.write("{0:<7} {1:<9} {2}\n".format(v[0],k,v[1]))

t2 = datetime.datetime.now()
print("\nfinished: ",t2.ctime())
  
============= RESTART: C:/Users/batagelj/data/panama/nodeRead.py =============
nodeRead
started:  Sun May 15 03:38:03 2016
type =  1      n =  319150
type =  2      n =  342786
*** Duplicated node 342798 51122
*** Duplicated node 342832 51149
*** Duplicated node 342846 51162
...
*** Duplicated node 448544 70699
*** Duplicated node 448657 70785
*** Duplicated node 448769 70872
*** Duplicated node 448783 70884
type =  3      n =  687241
type =  4      n =  838295

finished:  Sun May 15 03:38:22 2016

produces the list panama.nod and a partition panama.clu with classes (clusters)

    Cluster                     Freq  Representative
 ---------------------------------------------------
 C1 Entities                  319150          1
 C2 Intermediaries             22497     320290
 C3 Officers                  344455     342787
 C4 Addresses                 151054     687242
 C5 Intermediaries+Officers     1139     319151
 ---------------------------------------------------

Both files were manually adapted to Pajek format.

In the next step I transformed the file all_edges.csv into the corresponding Pajek multirelational list of arcs:

import csv, sys, os, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 10-11, 15, 21. May 2016
# All files below are opened by relative name, so make the data directory
# the current working directory first.
os.chdir('C:/Users/Batagelj/data/panama/csvs')
 
def indNode(nodeId):
   """Return the Pajek number of nodeId in the global dict nodes.

   An id seen for the first time is assigned the next free number
   (current size of nodes plus one).
   """
   # setdefault keeps an existing number and stores the new one only for a new id
   return nodes.setdefault(nodeId, len(nodes)+1)
 
def indRel(nodeId):
   """Return the Pajek number of the relation named nodeId.

   Relations are kept in the global dict rels and numbered 1, 2, ... in the
   order in which their names are first requested.
   """
   try:
      return rels[nodeId]
   except KeyError:
      # unseen relation name: give it the next free number
      number = rels[nodeId] = len(rels)+1
      return number
 
# Translate all_edges.csv into a Pajek multirelational list of arcs (panama.net).
t1 = datetime.datetime.now()
print("linkRead\nstarted: ",t1.ctime())

# Rebuild the node numbering from panama.nod: the second space-separated
# field of every line is taken as the node id, and ids are numbered in the
# order in which they are read.
nodes = {}
with open('panama.nod', newline='', encoding='utf-8') as nod:
   nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
   try:
      for row in nodReader:
         indNode(row[1])
   except csv.Error as e:
      sys.exit('file {}, line {}: {}'.format(
         'panama.nod', nodReader.line_num, e))

podatki = 'all_edges.csv'
rels = {}   # relation name -> relation number, in order of first appearance
# The with statement closes panama.net even when sys.exit() aborts the run.
with open('panama.net','w',encoding='utf-8') as net:
   net.write('*arcs\n')
   with open(podatki, newline='', encoding='utf-8') as dat:
      linkReader = csv.reader(dat, delimiter=',', quotechar='"')
      m = 0   # number of arcs written
      try:
         head = next(linkReader)   # header row, printed below
         for row in linkReader:
            m = m+1
            # columns: 0 = start node, 1 = relation name, 2 = end node
            u = indNode(row[0]); v = indNode(row[2]); r = indRel(row[1])
            net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
      except csv.Error as e:
         sys.exit('file {}, line {}: {}'.format(
            podatki, linkReader.line_num, e))
print("n = ",len(nodes),"   m = ",m)
print(head)
# list the relations by their number
for (k,v) in sorted(rels.items(), key=itemgetter(1)):
   print('{0:>3} "{1}"'.format(v,k))

t2 = datetime.datetime.now()
print("\nfinished: ",t2.ctime())
>>> 
============= RESTART: C:\Users\batagelj\data\panama\linkRead.py =============
linkRead
started:  Sun May 15 04:13:23 2016
n =  838295    m =  1265690
['node_1', 'rel_type', 'node_2']
intermediary_of 1
registered_address 3
similar 4
underlying 5
officer_of 2

finished:  Sun May 15 04:13:32 2016

In a text editor we combine files panama.nod (list of nodes) and panama.net (list of relational arcs) into a Pajek multirelational network file panama.net.

Files panama.net and panama.clu are available at

panama.zip

Structure of the Panama network

To get an insight into the overall structure of the Panama network I applied the Pajek command

Operations/Network+Partition/Shrink Network

The obtained reduced network has a matrix

            C1     C5     C2     C3     C4  Label
----------------------------------------------------
  C1.      622      0      0      0  93623  #67028    
  C5.    51754      1      4     13   1193  #51122    
  C2.   310671      0     16     43   8165  #66951   
  C3.   538166     48    301  47579 213491  #51113   
  C4.        0      0      0      0      0  #2004267
----------------------------------------------------

As a multirelational network it is presented in the following figure

relations.svg

relations.pdf

where relations are represented by the following colors

R1 intermediary_of       319121   red
R2 officer_of            581476   blue
R3 registered_address    317094   green
R4 similar                46761   pink
R5 underlying              1238   orange

Countries, labels and jurisdiction

Nodes can be related to different countries. We get a multirelational two-mode network Nodes X Countries countries.net. The information about the source file is preserved as the relation number: 1 “entities”, 2 “intermediaries”, 3 “officers”, 4 “addresses”. The country of jurisdiction is also assigned to (some) Entities - partition juris.clu. Labels of nodes were saved to the file panama.nam.

import csv, sys, os, re, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 19. May 2016
# All files below are opened by relative name, so make the data directory
# the current working directory first.
os.chdir('C:/Users/Batagelj/data/panama/csvs')
 
def indNode(nodeId):
   """Look up the Pajek number of nodeId in the global dict nodes.

   An unknown id is added with the next free number (size of nodes plus one).
   """
   number = nodes.get(nodeId)
   if number is None:
      # numbers start at 1, so a stored number is never None
      number = nodes[nodeId] = len(nodes)+1
   return number
 
def indCountry(nodeId):
   """Return the number of the country code nodeId in the global dict cnty.

   Codes are numbered 1, 2, ... in the order in which they are first seen.
   """
   # setdefault leaves an already numbered code untouched
   return cnty.setdefault(nodeId, len(cnty)+1)
 
def _countryArcs(data, r, idCol, cntyCol, jurisCol=None):
   """Write the node -> country arcs of the CSV file data as relation r.

   For every data row the text of column 0 is stored as the node's label.
   For each code in the ';'-separated column cntyCol one arc
   "r: node country" is appended to the global file net; country vertices
   are numbered after the n nodes. When jurisCol is given, the number of
   that column's code is stored in juris (999 when the column is empty).
   Prints the relation number, node count and arc count, and returns the
   header row of the file. Exits through sys.exit() on a csv.Error.
   """
   m = 0   # arcs written for this file
   with open(data, newline='', encoding='utf-8') as dat:
      datReader = csv.reader(dat, delimiter=',', quotechar='"')
      try:
         head = next(datReader)
         for row in datReader:
            # label and juris hold n entries, so an id that is missing from
            # panama.nod (number > n) raises IndexError here
            u = indNode(row[idCol]); label[u-1] = row[0]
            if jurisCol is not None:
               juris[u-1] = indCountry(row[jurisCol]) if row[jurisCol] else 999
            if row[cntyCol]:
               for code in row[cntyCol].split(';'):
                  m = m+1; v = n+indCountry(code)
                  net.write(str(r)+": "+str(u)+' '+str(v)+'\n')
      except csv.Error as e:
         sys.exit('file {}, line {}: {}'.format(
            data, datReader.line_num, e))
   print("r = ",r,"   n = ",len(nodes),"   m = ",m)
   return head

# Build the two-mode network Nodes X Countries (countries.net), the
# jurisdiction partition (juris.clu) and the node labels (panama.nam).
t1 = datetime.datetime.now()
print("otherRead\nstarted: ",t1.ctime())

# Rebuild the node numbering from panama.nod (second field = node id).
nodes = {}
with open('panama.nod', newline='', encoding='utf-8') as nod:
   nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
   try:
      # NOTE(review): if panama.nod carries header lines (e.g. after it was
      # adapted to Pajek format), skip them here with next(nodReader) - confirm.
      for row in nodReader:
         indNode(row[1])
   except csv.Error as e:
      sys.exit('file {}, line {}: {}'.format(
         'panama.nod', nodReader.line_num, e))

cnty = {}; n = len(nodes); label = [""]*n; juris = [0]*n

# (file, node id column, country codes column, jurisdiction column or None);
# the position in the list, counted from 1, is the relation number
sources = [
   ('entities.csv',       19, 15, 3),
   ('Intermediaries.csv',  7,  4, None),
   ('Officers.csv',        5,  3, None),
   ('Addresses.csv',       5,  3, None),
]
# The with statement closes countries.net even when sys.exit() aborts the run.
with open('countries.net','w',encoding='utf-8') as net:
   net.write('*arcs\n')
   for r, (data, idCol, cntyCol, jurisCol) in enumerate(sources, start=1):
      head = _countryArcs(data, r, idCol, cntyCol, jurisCol)

print(head)   # header row of the last file read
# country number, its vertex number in countries.net, country code
for (k,v) in sorted(cnty.items(), key=itemgetter(1)):
   print("{0:>3} {1:>3} {2}".format(v,n+v,k))

with open('juris.clu','w',encoding='utf-8') as clu:
   clu.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n')
   for j in juris:
      clu.write("{0:<3}\n".format(j))

with open('panama.nam','w',encoding='utf-8') as nam:
   nam.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n')
   for k, name in enumerate(label, start=1):
      nam.write('{0:<7} "{1}"\n'.format(k,name))

t2 = datetime.datetime.now()
print("\nfinished: ",t2.ctime())
>>> 
============ RESTART: C:/Users/batagelj/data/panama/otherRead.py ============
otherRead
started:  Fri May 20 02:03:02 2016
r =  1    n =  838295    m =  365643
r =  2    n =  838295    m =  22860
r =  3    n =  838295    m =  252197
r =  4    n =  838295    m =  150162
['address', 'icij_id', 'valid_until', 'country_codes', 'countries', 'node_id', 'sourceID']
  1 838296 XXX
  2 838297 VGB
  3 838298 CYP
  4 838299 SGP
  5 838300 USA
  6 838301 RUS
  7 838302 NLD
  8 838303 GBR
  9 838304 BVI
 10 838305 HKG
...
235 838530 ETH
236 838531 GNB
237 838532 SLB
238 838533 GNQ
239 838534 REU
240 838535 SMR

finished:  Fri May 20 02:03:18 2016
>>> 
notes/net/first.txt · Last modified: 2016/05/23 15:39 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki