====== First partial conversion to Pajek files ======
===== Multirelational network =====
First, using R
> setwd('C:/Users/Batagelj/data/panama/csvs')
> T <- read.csv('entities.csv',head=TRUE,sep=',')
> dim(T)
[1] 319150 21
> colnames(T)
[1] "name" "original_name"
[3] "former_name" "jurisdiction"
[5] "jurisdiction_description" "company_type"
[7] "address" "internal_id"
[9] "incorporation_date" "inactivation_date"
[11] "struck_off_date" "dorm_date"
[13] "status" "service_provider"
[15] "ibcRUC" "country_codes"
[17] "countries" "note"
[19] "valid_until" "node_id"
[21] "sourceID"
I determined sizes of data files
file rows cols
-----------------------------------
Entities.csv 319150 21
Intermediaries.csv 23636 9
Officers.csv 345594 7
Addresses.csv 151054 7
-----------------------------------
and names of attributes.
Afterward, using a short Python program, I produced a list of all nodes from these files - **''panama.nod''**.
It turned out that the sets are not disjoint - some officers are also intermediaries. An improved version of the initial program
import csv, sys, os, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 14. May 2016
# make the directory with the CSV files the working directory, so that
# the relative file names used below are resolved there
os.chdir('C:/Users/Batagelj/data/panama/csvs')
def indNode(nodeId,ty):
    # Returns the pair (Pajek's number, types) of a node.
    # A node seen for the first time gets the next free number and is
    # appended to the node list file nod; for a node already known the
    # duplicate is reported and ty is appended to its string of types.
    tag = str(ty)
    try:
        number, types = nodes[nodeId]
    except KeyError:
        entry = (len(nodes)+1, tag)
        nodes[nodeId] = entry
        nod.write(str(entry[0])+' "'+nodeId+'"\n')
        return entry
    print('*** Duplicated node', len(nodes), nodeId)
    entry = (number, types+tag)
    nodes[nodeId] = entry
    return entry
# nodeRead: gives consecutive Pajek numbers to the nodes found in the four
# CSV files and writes the node list (panama.nod) and the partition of the
# nodes by source file (panama.clu).
t1 = datetime.datetime.now()
print("nodeRead\nstarted: ",t1.ctime())
nodes = {}   # node id -> (Pajek's number, string of source file types)
podatki = ['Entities.csv','Intermediaries.csv','Officers.csv','Addresses.csv']
nodeInd = [19,7,5,5]   # index of the node id column in each file
# 'with' closes (and flushes) both output files also when sys.exit is raised
with open('panama.clu','w',encoding='utf-8') as clu, \
        open('panama.nod','w',encoding='utf-8') as nod:
    # the number of vertices is written as '?' in the header
    clu.write('% created by nodeRead:'+t1.ctime()+'\n*vertices ?\n')
    for ty, (fname, nInd) in enumerate(zip(podatki, nodeInd), start=1):
        with open(fname, newline='', encoding='utf-8') as dat:
            nodeReader = csv.reader(dat, delimiter=',', quotechar='"')
            try:
                next(nodeReader)   # skip the header row
                for row in nodeReader:
                    indNode(row[nInd], ty)
            except csv.Error as e:
                # report the file being read, not the whole list of files
                sys.exit('file {}, line {}: {}'.format(
                    fname, nodeReader.line_num, e))
        print("type = ",ty," n = ",len(nodes))
    # one line per node, ordered by Pajek's number: number, node id, types
    for (k,v) in sorted(nodes.items(), key=itemgetter(1)):
        clu.write("{0:<7} {1:<9} {2}\n".format(v[0],k,v[1]))
t2 = datetime.datetime.now()
print("\nfinished: ",t2.ctime())
============= RESTART: C:/Users/batagelj/data/panama/nodeRead.py =============
nodeRead
started: Sun May 15 03:38:03 2016
type = 1 n = 319150
type = 2 n = 342786
*** Duplicated node 342798 51122
*** Duplicated node 342832 51149
*** Duplicated node 342846 51162
...
*** Duplicated node 448544 70699
*** Duplicated node 448657 70785
*** Duplicated node 448769 70872
*** Duplicated node 448783 70884
type = 3 n = 687241
type = 4 n = 838295
finished: Sun May 15 03:38:22 2016
produces the list ''panama.nod'' and a partition ''panama.clu'' with classes (clusters)
Cluster Freq Representative
---------------------------------------------------
C1 Entities 319150 1
C2 Intermediaries 22497 320290
C3 Officers 344455 342787
C4 Addresses 151054 687242
C5 Intermediaries+Officers 1139 319151
---------------------------------------------------
Both files were manually adapted to Pajek format.
In the next step I transformed the file ''all_edges.csv'' into a corresponding Pajek's multirelational list of arcs:
import csv, sys, os, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 10-11, 15, 21. May 2016
# make the directory with the CSV files the working directory, so that
# the relative file names used below are resolved there
os.chdir('C:/Users/Batagelj/data/panama/csvs')
def indNode(nodeId):
    # Returns the Pajek's number of a node; a node seen for the
    # first time gets the next free number.
    return nodes.setdefault(nodeId, len(nodes)+1)
def indRel(nodeId):
    # Returns the Pajek's number of a relation; a relation seen for the
    # first time gets the next free number.
    try:
        return rels[nodeId]
    except KeyError:
        number = rels[nodeId] = len(rels)+1
        return number
# linkRead: converts all_edges.csv into a Pajek list of relational arcs
# (panama.net), using the node numbering given by panama.nod.
t1 = datetime.datetime.now()
print("linkRead\nstarted: ",t1.ctime())
nodes = {}   # node id -> Pajek's number
# rebuild the numbering: the second field of each line of panama.nod is
# the node id, the nodes are numbered in the order of the lines
with open('panama.nod', newline='', encoding='utf-8') as nod:
    nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
    try:
        for row in nodReader:
            indNode(row[1])
    except csv.Error as e:
        sys.exit('file {}, line {}: {}'.format(
            'panama.nod', nodReader.line_num, e))
podatki = 'all_edges.csv'
rels = {}    # relation name -> Pajek's relation number
m = 0        # number of arcs written
# 'with' closes (and flushes) panama.net also when sys.exit is raised
with open('panama.net','w',encoding='utf-8') as net:
    net.write('*arcs\n')
    with open(podatki, newline='', encoding='utf-8') as dat:
        linkReader = csv.reader(dat, delimiter=',', quotechar='"')
        try:
            head = next(linkReader)   # header row, printed at the end
            for row in linkReader:
                m += 1
                # row = (node_1, rel_type, node_2); an id that is not in
                # panama.nod would silently get a new number here
                u = indNode(row[0]); v = indNode(row[2]); r = indRel(row[1])
                net.write('{0}: {1} {2}\n'.format(r, u, v))
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(
                podatki, linkReader.line_num, e))
print("n = ",len(nodes)," m = ",m)
print(head)
# list of relations ordered by their numbers
for (k,v) in sorted(rels.items(), key=itemgetter(1)):
    print('{0:>3} "{1}"'.format(v,k))
t2 = datetime.datetime.now()
print("\nfinished: ",t2.ctime())
>>>
============= RESTART: C:\Users\batagelj\data\panama\linkRead.py =============
linkRead
started: Sun May 15 04:13:23 2016
n = 838295 m = 1265690
['node_1', 'rel_type', 'node_2']
intermediary_of 1
registered_address 3
similar 4
underlying 5
officer_of 2
finished: Sun May 15 04:13:32 2016
In a text editor we combine files ''panama.nod'' (list of nodes) and ''panama.net'' (list of relational arcs) into a Pajek multirelational network file ''panama.net''.
Files ''panama.net'' and ''panama.clu'' are available at
{{notes:zip:panama.zip}}
===== Structure of the Panama network =====
To get an insight into the overall structure of the Panama network I applied Pajek's command
Operations/Network+Partition/Shrink Network
The obtained reduced network has a matrix
C1 C5 C2 C3 C4 Label
----------------------------------------------------
C1. 622 0 0 0 93623 #67028
C5. 51754 1 4 13 1193 #51122
C2. 310671 0 16 43 8165 #66951
C3. 538166 48 301 47579 213491 #51113
C4. 0 0 0 0 0 #2004267
----------------------------------------------------
As a multirelational network it is presented in the following figure
{{notes:pics:relations.svg}}
{{notes:pics:relations.pdf}}
where relations are represented by the following colors
R1 intermediary_of 319121 red
R2 officer_of 581476 blue
R3 registered_address 317094 green
R4 similar 46761 pink
R5 underlying 1238 orange
===== Countries, labels and jurisdiction =====
Nodes can be related to different countries. We get a multi-relational two-mode network Nodes X Countries **''countries.net''**. The information about the source file is preserved as the relation number: 1 "entities", 2 "intermediaries", 3 "officers", 4 "addresses". (Some) Entities also have a country of jurisdiction assigned - partition **''juris.clu''**. Labels of nodes were saved to the file **''panama.nam''**.
import csv, sys, os, re, time, datetime
from operator import itemgetter
# by Vladimir Batagelj, 19. May 2016
# make the directory with the CSV files the working directory, so that
# the relative file names used below are resolved there
os.chdir('C:/Users/Batagelj/data/panama/csvs')
def indNode(nodeId):
    # Returns the Pajek's number of a node; a node seen for the
    # first time gets the next free number.
    number = nodes.get(nodeId)
    if number is None:
        number = len(nodes)+1
        nodes[nodeId] = number
    return number
def indCountry(nodeId):
    # Returns the Pajek's number of a country; a country seen for the
    # first time gets the next free number.
    if nodeId in cnty:
        return cnty[nodeId]
    cnty[nodeId] = len(cnty)+1
    return cnty[nodeId]
def _countryArcs(net, data, r, idCol, cntyCol, jurisCol=None):
    # Writes the arcs node -> country of one CSV file to net as relation r.
    #   data      name of the CSV file; its first row is a header
    #   r         relation number written in front of each arc
    #   idCol     index of the column with the node id
    #   cntyCol   index of the column with ';'-separated country codes
    #   jurisCol  index of the jurisdiction column, None if not used
    # Stores row[0] as the label of the node and, if jurisCol is given,
    # the jurisdiction class of the node (999 for an empty field).
    # Prints the summary line of the file and returns its header row;
    # exits the program on csv.Error.
    # Assumes that every node id occurs in panama.nod - otherwise the
    # index u-1 falls outside label and juris.
    m = 0
    with open(data, newline='', encoding='utf-8') as dat:
        datReader = csv.reader(dat, delimiter=',', quotechar='"')
        try:
            head = next(datReader)
            for row in datReader:
                u = indNode(row[idCol]); label[u-1] = row[0]
                if jurisCol is not None:
                    # jurisdictions are numbered in the same table as countries
                    juris[u-1] = indCountry(row[jurisCol]) if row[jurisCol] else 999
                if row[cntyCol]:
                    for code in row[cntyCol].split(';'):
                        # countries are numbered after the n nodes
                        m += 1; v = n+indCountry(code)
                        net.write('{0}: {1} {2}\n'.format(r, u, v))
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(
                data, datReader.line_num, e))
    print("r = ",r," n = ",len(nodes)," m = ",m)
    return head

# otherRead: produces the two-mode network Nodes X Countries
# (countries.net), the jurisdiction partition (juris.clu) and the node
# labels (panama.nam).
t1 = datetime.datetime.now()
print("otherRead\nstarted: ",t1.ctime())
nodes = {}   # node id -> Pajek's number
# rebuild the numbering: the second field of each line of panama.nod is
# the node id, the nodes are numbered in the order of the lines
with open('panama.nod', newline='', encoding='utf-8') as nod:
    nodReader = csv.reader(nod, delimiter=' ', quotechar='"')
    try:
        for row in nodReader:
            indNode(row[1])
    except csv.Error as e:
        sys.exit('file {}, line {}: {}'.format(
            'panama.nod', nodReader.line_num, e))
cnty = {}; n = len(nodes); label = [""]*n; juris = [0]*n
# (file, node id column, country codes column, jurisdiction column);
# the position in the list + 1 is the relation number
# NOTE(review): 'entities.csv' is lower case here but 'Entities.csv' in
# nodeRead - confirm the real file name before running on a
# case-sensitive file system
sources = [('entities.csv',       19, 15, 3),
           ('Intermediaries.csv',  7,  4, None),
           ('Officers.csv',        5,  3, None),
           ('Addresses.csv',       5,  3, None)]
# 'with' closes (and flushes) the output files also when sys.exit is raised
with open('countries.net','w',encoding='utf-8') as net:
    net.write('*arcs\n')
    for r, (data, idCol, cntyCol, jurisCol) in enumerate(sources, start=1):
        head = _countryArcs(net, data, r, idCol, cntyCol, jurisCol)
print(head)   # header row of the last file read
# country number, its vertex number in countries.net, country code
for (k,v) in sorted(cnty.items(), key=itemgetter(1)):
    print("{0:>3} {1:>3} {2}".format(v,n+v,k))
with open('juris.clu','w',encoding='utf-8') as clu:
    clu.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n')
    for j in juris:
        clu.write("{0:<3}\n".format(j))
with open('panama.nam','w',encoding='utf-8') as nam:
    nam.write('% created by otherRead:'+t1.ctime()+'\n*vertices '+str(n)+'\n')
    for k, lab in enumerate(label, start=1):
        nam.write('{0:<7} "{1}"\n'.format(k,lab))
t2 = datetime.datetime.now()
print("\nfinished: ",t2.ctime())
>>>
============ RESTART: C:/Users/batagelj/data/panama/otherRead.py ============
otherRead
started: Fri May 20 02:03:02 2016
r = 1 n = 838295 m = 365643
r = 2 n = 838295 m = 22860
r = 3 n = 838295 m = 252197
r = 4 n = 838295 m = 150162
['address', 'icij_id', 'valid_until', 'country_codes', 'countries', 'node_id', 'sourceID']
1 838296 XXX
2 838297 VGB
3 838298 CYP
4 838299 SGP
5 838300 USA
6 838301 RUS
7 838302 NLD
8 838303 GBR
9 838304 BVI
10 838305 HKG
...
235 838530 ETH
236 838531 GNB
237 838532 SLB
238 838533 GNQ
239 838534 REU
240 838535 SMR
finished: Fri May 20 02:03:18 2016
>>>