Constructing networks

# Analysis of Citi Bike data
# by Vladimir Batagelj, December 2016
 
# wdir = 'E:/data/bike/test'
wdir = 'E:/data/bike/CitiBike/csv'
cdir = 'c:/users/batagelj/work/python/graph/chart'
# ftrip = 'test.csv'
# ftrip = '2013-07 - Citi Bike trip data.csv'
# ftrip = '201609-citibike-tripdata.csv'
# ftrip = '201608-citibike-tripdata.csv'
# dtFormat = "%Y-%m-%d %H:%M:%S"
dtFormat = "%m/%d/%Y %H:%M:%S"
 
import sys, os, json, pickle
# sys.path = [gdir]+sys.path; 
os.chdir(wdir)
from datetime import *
import csv
import webbrowser
 
def TQshow(tq,cdir,TQmax,Tmin,Tmax,w,h,tit,
   fill='steelblue',xLab=70,yLab=40):
   TQ = [ list(q) for q in tq ]
   js = open(cdir+'/barData.js','w')
   js.write('var barData = '+str(TQ)+';\n')
   js.write('var TQmax = '+str(TQmax)+';\n')
   js.write('var Tmin = '+str(Tmin)+';\n')
   js.write('var Tmax = '+str(Tmax)+';\n')
   js.write('var Width = '+str(w)+';\n')
   js.write('var Height = '+str(h)+';\n')
   js.write('var Title = "'+tit+'";\n')
   js.write('var Rfill = "'+fill+'";\n')
   js.write('var xLab = "'+str(xLab)+'";\n')
   js.write('var yLab = "'+str(yLab)+'";\n')
   js.close()  
# https://pymotw.com/3/webbrowser/
# import webbrowser
# b = webbrowser.get('google-chrome')
# b = webbrowser.get('mozilla')
   b = webbrowser.get('windows-default')
#   b.open('c:/users/batagelj/work/python/graph/chart/barChart.html')
   b.open(cdir+'/TQchart.html')
 
def Stind(snam,sla,slo):
   if snam in S: sin = S[snam][0]
   #   sin, Sla, Slo = S[snam]; ok = True
   #   if abs(sla-Sla) > 0.0005: ok = False # print('lat', sla, Sla)
   #   if abs(slo-Slo) > 0.0005: ok = False # print('lon', slo, Slo)
   #   if not ok:
   #      print(tripsreader.line_num,snam,sla,Sla,slo,Slo,file=rep);
   #      sys.exit()
   else:
      sin = len(S) 
      S[snam] = [sin,sla,slo]
   return sin
 
def addStart(u,v,t):
   key = (u,v)
   if not(key in G): G[key] = [0]*48
   i = 2*t.hour+int(30<=t.minute)
   G[key][i] += 1
 
def Table(L):
   T = {}
   for a in L:
      if not(a in T): T[a] = 0
      T[a] += 1
   return T
 
DirCiti = [ 
 "2013-07 - Citi Bike trip data.csv", "2013-08 - Citi Bike trip data.csv",
 "2013-09 - Citi Bike trip data.csv", "2013-10 - Citi Bike trip data.csv",
 "2013-11 - Citi Bike trip data.csv", "2013-12 - Citi Bike trip data.csv",
 "2014-01 - Citi Bike trip data.csv", "2014-02 - Citi Bike trip data.csv",
 "2014-03 - Citi Bike trip data.csv", "2014-04 - Citi Bike trip data.csv",
 "2014-05 - Citi Bike trip data.csv", "2014-06 - Citi Bike trip data.csv",
 "2014-07 - Citi Bike trip data.csv", "2014-08 - Citi Bike trip data.csv",
 "201409-citibike-tripdata.csv"     , "201410-citibike-tripdata.csv"     ,
 "201411-citibike-tripdata.csv"     , "201412-citibike-tripdata.csv"     ,
 "201501-citibike-tripdata.csv"     , "201502-citibike-tripdata.csv"     ,
 "201503-citibike-tripdata.csv"     , "201504-citibike-tripdata.csv"     ,
 "201505-citibike-tripdata.csv"     , "201506-citibike-tripdata.csv"     ,
 "201507-citibike-tripdata.csv"     , "201508-citibike-tripdata.csv"     ,
 "201509-citibike-tripdata.csv"     , "201510-citibike-tripdata.csv"     ,
 "201511-citibike-tripdata.csv"     , "201512-citibike-tripdata.csv"     ,
 "201601-citibike-tripdata.csv"     , "201602-citibike-tripdata.csv"     ,
 "201603-citibike-tripdata.csv"     , "201604-citibike-tripdata.csv"     ,
 "201605-citibike-tripdata.csv"     , "201606-citibike-tripdata.csv"     ,
 "201607-citibike-tripdata.csv"     , "201608-citibike-tripdata.csv"     ,
 "201609-citibike-tripdata.csv"     ]
 
print("*** Bikes: Stations\n")
Utype = {"Customer":1,"Subscriber":2}
G = {}
# rep = open('report.txt','wt')
if os.path.isfile('./stations.pickle'):
    with open('stations.pickle', 'rb') as sPick: S = pickle.load(sPick)
else: S = {}
print("# of stations = ",len(S))
modul = 50000; nRec = 0
t1 = datetime.now()
print("started: ",t1.ctime(),"\n")
tripFiles = DirCiti[27:]
for ftrip in tripFiles:
    print("tripFile = ",ftrip) #; print(ftrip,file=rep);
    with open(ftrip, newline='') as csvfile:
       tripsreader = csv.reader(csvfile, delimiter=',', quotechar='"')
       header = next(tripsreader)
    #   print(header)
       for trip in tripsreader:
    #      print(', '.join(trip),"\n")
          if tripsreader.line_num % modul == 0:
             print(".",sep="",end="",flush=True)
          dur = int(trip[0])
          bt = datetime.strptime(trip[1], dtFormat)
          et = datetime.strptime(trip[2], dtFormat)
          bwd = bt.isoweekday(); ewd = et.isoweekday()
          bs = int(trip[3]); bn = trip[4]
          bla = float(trip[5]); blo = float(trip[6])
          es = int(trip[7]); en = trip[8]
          ela = float(trip[9]); elo = float(trip[10])
          bid = int(trip[11])
          uty = Utype.get(trip[12])
          y = trip[13]; uy = 0 if y in ['','\\N'] else int(y)
          ug = int(trip[14])
          bind = Stind(bn,bla,blo)
          eind = Stind(en,ela,elo)
          addStart(bind,eind,bt)
    #      print(dur,(et-bt).seconds,bid,uy,ug)
       print("\n# of records = ",tripsreader.line_num)
       nRec += tripsreader.line_num
    t2 = datetime.now()
    print("processed: ",t2.ctime(),"\n")
 
print("\ntotal # of records = ",nRec)       
t3 = datetime.now()
print("\ncomputed: ",t3.ctime(),"\ntime used: ", t3-t1)
print("# of stations = ",len(S))
print("\npickle")
with open('stations.pickle','wb') as sPick:
    pickle.dump(S, sPick, pickle.HIGHEST_PROTOCOL)
with open('tripStart.pickle','wb') as sPick:
    pickle.dump(G, sPick, pickle.HIGHEST_PROTOCOL)
 
print("\nnetwork")
W = [ sum(L) for k, L in G.items() ]
w = Table(W)
# ws = [ (k, w[k]) for k in sorted(w) ]
# wC = ws[100:]; swC = sum([v for k,v in wC ])
list = open('tripDist.csv','wt')
print('trips,links', file=list)
for trip in sorted(w): print(trip,w[trip],sep=',',file=list)
list.close() #; rep.close()
Sname = { v[0]: k for k,v in S.items() }
tresh = 1250
sos = open('SOs.csv','wt')
F = ['f'+str(i) for i in range(48)]
print('u,v,uName,vName,',','.join(F),sep='',file=sos)
for k in G:
   sk = sum(G[k])
   if sk>=tresh:
      u,v = k; un = Sname[u]; vn = Sname[v]
      print(sk,u,v,'"'+un+'"','"'+vn+'"',
            ','.join([str(a) for a in G[k]]),sep=',',file=sos)
sos.close()
 
print("\nstations")
StSumO = [ 0 ] * len(S); StSumI = [ 0 ] * len(S)
for i in range(len(S)): StSumO[i] = [0]*48
for i in range(len(S)): StSumI[i] = [0]*48
for k in G:
   u,v = k; V = G[k]
   for i in range(48): 
      StSumO[u-1][i] = StSumO[u-1][i]+V[i]; StSumI[v-1][i] = StSumI[v-1][i]+V[i]
sos = open('SO2s.csv','wt')
O = ['o'+str(i) for i in range(48)]; I = ['i'+str(i) for i in range(48)]
print('v,Name,',','.join(O+I),sep='',file=sos)
for s in range(len(S)):
   so = sum(StSumO[s]); si = sum(StSumI[s])
   print(s+1,'"'+Sname[s]+'"',so,si,
         ','.join([str(a) for a in (StSumO[s]+StSumI[s])]),
         sep=',',file=sos)
sos.close()
 
t4 = datetime.now()
print("\ncomputed: ",t4.ctime(),"\ntime used: ", t4-t3)

Excel reads CSV files with sep=';'.

>>> 
==================== RESTART: E:\data\Bike\py\stations.py ====================
*** Bikes: Stations

# of stations =  678
started:  Thu Dec  8 05:14:56 2016 

tripFile =  201510-citibike-tripdata.csv
........................
# of records =  1212278
processed:  Thu Dec  8 05:16:08 2016 

tripFile =  201511-citibike-tripdata.csv
...................
# of records =  987246
processed:  Thu Dec  8 05:17:06 2016 

tripFile =  201512-citibike-tripdata.csv
................
# of records =  804126
processed:  Thu Dec  8 05:17:53 2016 

tripFile =  201601-citibike-tripdata.csv
..........
# of records =  509479
processed:  Thu Dec  8 05:18:23 2016 

tripFile =  201602-citibike-tripdata.csv
...........
# of records =  560875
processed:  Thu Dec  8 05:18:56 2016 

tripFile =  201603-citibike-tripdata.csv
..................
# of records =  919922
processed:  Thu Dec  8 05:19:51 2016 

tripFile =  201604-citibike-tripdata.csv
....................
# of records =  1013150
processed:  Thu Dec  8 05:20:52 2016 

tripFile =  201605-citibike-tripdata.csv
........................
# of records =  1212281
processed:  Thu Dec  8 05:22:05 2016 

tripFile =  201606-citibike-tripdata.csv
.............................
# of records =  1460319
processed:  Thu Dec  8 05:23:32 2016 

tripFile =  201607-citibike-tripdata.csv
...........................
# of records =  1380111
processed:  Thu Dec  8 05:24:54 2016 

tripFile =  201608-citibike-tripdata.csv
...............................
# of records =  1557664
processed:  Thu Dec  8 05:26:28 2016 

tripFile =  201609-citibike-tripdata.csv
................................
# of records =  1648857
processed:  Thu Dec  8 05:28:06 2016 


total # of records =  13266308

computed:  Thu Dec  8 05:28:06 2016 
time used:  0:13:10.561218
# of stations =  678

pickle

network

stations

computed:  Thu Dec  8 05:28:17 2016 
time used:  0:00:10.800618
>>> 

Correction

To compute in(v;k) we should consider links finishing in station v in the k-th half hour. Change addStart(bind,eind,bt).

nRec += tripsreader.line_num - 1 ??? Add the total records counter.

notes/net/bikes/citi1.txt · Last modified: 2016/12/12 05:41 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki