====== ENRON ====== There are two partially cleaned versions of the raw data: [[https://www.cs.cmu.edu/~./enron/|CMU]] / [[http://www.cs.cmu.edu/~enron/enron_mail_20150507.tgz|tgz]] and [[http://bailando.sims.berkeley.edu/enron_email.html|UC Berkeley]]. There is also a version with private information removed, [[http://info.nuix.com/Enron.html|Enron]] from [[http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set|EDRM]]. Some preprocessed data are available on [[http://research.cs.queensu.ca/home/skill/otherforms.html|David Skillicorn]]'s page. ===== Networks ===== Some networks derived from the Enron data are available: * [[http://konect.uni-koblenz.de/networks/enron|Konect]] - labels ? * [[http://snap.stanford.edu/data/email-Enron.html|Snap]] - no labels * Conrad Lee: [[http://sociograph.blogspot.si/2011/04/communication-networks-part-1-enron-e.html|Enron]] (see **Download**) * Y. Park: [[http://cis.jhu.edu/~parky/Enron/|Scan Statistics on Enron Graphs]] - 184 nodes, temporal * Sujit Pal: [[https://github.com/sujitpal/mlia-examples/tree/master/src/enron_network|Enron network / GitHub]] - nodes labeled with e-mail, arcs weighted with frequency * [[https://cran.r-project.org/web/packages/igraphdata/igraphdata.pdf|igraph / R - Enron]] - limited to 184 nodes * [[http://odds.cs.stonybrook.edu/enroninc-dataset/|EnronInc]] ===== Analyses ===== * [[http://homes.cs.washington.edu/~jheer/|Jeffrey M. 
Heer]], [[http://hci.stanford.edu/~jheer/projects/enron/v1/|Enron 1]], [[http://homes.cs.washington.edu/~jheer//projects/enron/|Enron 2]] * Damon Wade: [[https://www.linkedin.com/pulse/visualizing-email-networks-damon-wade|Visualizing Email Networks]] * Philip Starritt: [[http://www.philipstarritt.com/enron|Data Mining the Enron Email Dataset]] * Sujit Pal: [[http://sujitpal.blogspot.si/2013/11/using-graph-centrality-metrics-for.html|Using Graph Centrality Metrics for Crime Fighting]] ===== igraph/R - Enron ===== > library(igraph) > library(igraphdata) > help(igraphdata) > data(package="igraphdata") > data(enron) > help(enron) > arcs <- E(enron) > length(arcs) [1] 125409 > arcs[1] Edge sequence: e e [1] 25 -> 154 > arcs[125409] Edge sequence: e e [125409] 18 -> 18 > nodes <- V(enron) > length(nodes) [1] 184 > enron IGRAPH D--- 184 125409 -- Enron email network + attr: LDC_names (g/c), LDC_desc (g/c), name (g/c), Citation (g/c), | Email (v/c), Name (v/c), Note (v/c), Time (e/c), Reciptype (e/c), | Topic (e/n), LDC_topic (e/n) + edges: [1] 25->154 25->154 30-> 30 30-> 30 30-> 30 30-> 30 39-> 39 52-> 67 [9] 52-> 67 52-> 67 52-> 67 61->100 61->100 61->163 61->163 61->166 [17] 61->166 61->170 64-> 59 64-> 59 64-> 64 64-> 64 64->147 64->147 [25] 64->164 64->164 64->168 66-> 66 66-> 66 67->129 67->129 67->129 [33] 67->129 93-> 10 93-> 10 93-> 10 93-> 10 93-> 39 93-> 39 93-> 93 [41] 93-> 93 93-> 93 93-> 93 93->124 93->124 100-> 61 100-> 61 115->115 + ... 
omitted several edges > enron$name [1] "Enron email network" > vertex_attr_names(enron) [1] "Email" "Name" "Note" > edge_attr_names(enron) [1] "Time" "Reciptype" "Topic" "LDC_topic" > graph_attr_names(enron) [1] "LDC_names" "LDC_desc" "name" "Citation" > vertex_attr(enron,'Name', index = c(1)) [1] "Albert Meyers" > vertex_attr(enron,'Name', index = c(2)) [1] "Thomas Martin" > vertex_attr(enron,'Email', index = c(1,2)) [1] "albert.meyers" "a..martin" > vertex_attr(enron,'Note', index = c(1,2)) [1] "Employee, Specialist" "Vice President" > edge_attr(enron,'Time', index = 1:3) [1] "1979-12-31 21:00:00" "1979-12-31 21:00:00" "1979-12-31 21:00:00" > edge_attr(enron,'Topic', index = 1:3) [1] 1 1 3 > edge_attr(enron,'LDC_topic', index = 1:3) [1] 0 -1 -1 > edge_attr(enron,'Reciptype', index = 1:3) [1] "to" "to" "cc" > graph_attr(enron,'LDC_names') [1] "Calif_analysis" "Calif_bankruptcy" "Calif_utilities" [4] "Calif_crisis_legal" "Calif_enron" "Calif_federal" [7] "Newsfeed_Calif" "Calif_legis" "Daily_business" [10] "Educational" "EnronOnline" "Kitchen_daily" [13] "Kitchen_fortune" "Energy_newsfeed" "General_newsfeed" [16] "Downfall" "Downfall_newsfeed" "Broadband" [19] "Federal_gov" "FERC_DOE" "College Football" [22] "Pro Football" "India_General" "India_Dabhol" [25] "Nine_eleven" "Nine_Eleven_Analysis" "Dynegy" [28] "Sempra" "Duke" "El Paso" [31] "Pipelines" "World_energy" > a <- arcs[1] > a + 1/125409 edge: [1] 25->154 > ends(enron,a) [,1] [,2] [1,] 25 154 >
==== iGraph -> Pajek ====
  # Export the igraphdata `enron` graph to two files in `out_dir`:
  #   enron.net     - vertices labelled with Name, plus one arc line per edge
  #                   carrying the LDC_topic relation number, the elapsed
  #                   minutes since `start`, and the recipient type as label
  #   enronNote.clu - one integer code per vertex, the factor code of Note
  library(igraph)
  library(igraphdata)
  data(enron)
  #
  # Convert "YYYY-MM-DD HH:MM:SS" strings to minutes since the Unix epoch.
  # d:  character vector of time stamps
  # tz: zone used for parsing. A fixed zone keeps differences between two
  #     stamps independent of the local zone and its daylight-saving rules.
  #     NOTE(review): the zone of the Time attribute is not stated here;
  #     confirm that plain wall-clock differences are what is wanted.
  minutes <- function(d, tz = "UTC") {
    as.numeric(as.POSIXct(d, format = "%Y-%m-%d %H:%M:%OS", tz = tz)) / 60
  }
  #
  # Output directory; paths are built with file.path() instead of setwd()
  # so the session's working directory is left untouched.
  out_dir <- "D:/Data/Enron/iGraph"
  #
  nodes <- V(enron)
  arcs <- E(enron)
  n <- length(nodes)
  m <- length(arcs)
  notes <- factor(nodes$Note)
  fNotes <- levels(notes)
  rTypes <- factor(arcs$Reciptype)
  fTypes <- levels(rTypes)
  tops <- factor(arcs$LDC_topic)
  fTops <- levels(tops)
  #
  # Relation names are matched to the levels of LDC_topic by position.
  # NOTE(review): this assumes the sorted levels line up with
  # 'no match', 'outlier', LDC_names[1], LDC_names[2], ... - confirm.
  tNames <- c('no match', 'outlier', graph_attr(enron, 'LDC_names'))
  #
  # Per-arc values computed once, outside the output loop.
  X <- ends(enron, arcs, names = FALSE)
  start <- "1979-12-31 20:00:00"
  s <- minutes(start)
  elapsed <- minutes(arcs$Time) - s
  rel <- as.integer(tops)
  rtype <- as.character(rTypes)
  #
  net <- file(file.path(out_dir, "enron.net"), "w")
  clu <- file(file.path(out_dir, "enronNote.clu"), "w")
  tryCatch({
    # --- enronNote.clu: comment line with the code legend, then the codes
    cat('%', sprintf("  %d %s", seq_along(fNotes), fNotes), '\n',
        sep = '', file = clu)
    cat('*vertices ', n, '\n', sep = '', file = clu)
    # trailing blank after each code kept so the file matches earlier output
    writeLines(paste0(as.integer(notes), ' '), clu)
    # --- enron.net: vertices
    cat('*vertices ', n, '\n', sep = '', file = net)
    writeLines(sprintf('%d "%s"', seq_len(n), nodes$Name), net)
    # --- enron.net: comment line listing the recipient types
    cat('% Types:', sprintf(" %d %s", seq_along(fTypes), fTypes), '\n',
        sep = '', file = net)
    # --- enron.net: one named relation header per LDC_topic level
    writeLines(
      sprintf('*arcs :%d "%s"', seq_along(fTops), tNames[seq_along(fTops)]),
      net
    )
    # --- enron.net: arcs; cat() is kept for the numeric fields so their
    # formatting stays exactly as before
    cat('*arcs\n', file = net)
    for (a in seq_len(m)) {
      cat(rel[a], ': ', X[a, 1], ' ', X[a, 2], ' 1 [', elapsed[a],
          '] l "', rtype[a], '"\n', sep = '', file = net)
    }
  }, finally = {
    # close both files even when writing fails part-way
    close(net)
    close(clu)
  })

{{pajek:data:zip:enron184.zip}} http://kateto.net/networks-r-igraph