====== Statistics ======
===== Combinatorics =====
> combn(letters[1:4], 2)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "a" "a" "a" "b" "b" "c"
[2,] "b" "c" "d" "c" "d" "d"
> combn(letters[1:4], 3)
[,1] [,2] [,3] [,4]
[1,] "a" "a" "a" "b"
[2,] "b" "b" "c" "c"
[3,] "c" "d" "d" "d"
> utils:::menuInstallPkgs()
package ‘combinat’ successfully unpacked and MD5 sums checked
> library(combinat)
> permn(letters[1:4])
[[1]]
[1] "a" "b" "c" "d"
[[2]]
[1] "a" "b" "d" "c"
[[3]]
[1] "a" "d" "b" "c"
https://www.topcoder.com/blog/generating-combinations/
https://www.geeksforgeeks.org/heaps-algorithm-for-generating-permutations/
https://www.topcoder.com/blog/generating-permutations/
> startSubsets <- function(n){.n <<- n; .s <<- -1}
>
> nextSubset <- function(){
+ .s <<- .s+1
+ S <- as.logical(intToBits(.s)[1:(.n+1)])
+ if(S[.n+1]) {.s <<- -1; return(NULL)}
+ return(S[1:.n])
+ }
#.Machine$integer.max
> startSubsets(3)
> repeat {S <- nextSubset(); if(is.null(S)) break; cat("{",letters[1:3][S],'}\n')}
{ }
{ a }
{ b }
{ a b }
{ c }
{ a c }
{ b c }
{ a b c }
[[https://www4.uwsp.edu/math/nwodarz/Math209Files/209-0809F-L10-Section06_03-AlgorithmsForGeneratingPermutationsAndCombinations-Notes.pdf|Permutations]] in increasing lexicographic order.
> startPerm <- function(n){.n <<- n; .s <<- 1:n; .first <<- TRUE }
> startPerm <- function(n){.n <<- n; .s <<- 1:n; .first <<- TRUE }
> nextPerm <- function(){
+ if(.first){ .first <<- FALSE; return(.s) }
+ m <- .n-1
+ while(.s[m]>.s[m+1]) m <- m-1
+ k <- .n
+ while(.s[m]>.s[k]) k <- k-1
+ .t <<- .s[m]; .s[m] <<- .s[k]; .s[k] <<- .t
+ p <- m+1; q <- .n
+ while(p
> startPerm(4)
> tryCatch({for(i in 1:30) cat(i,nextPerm(),"\n")}, error=function(err){})
1 1 2 3 4
2 1 2 4 3
3 1 3 2 4
4 1 3 4 2
5 1 4 2 3
6 1 4 3 2
7 2 1 3 4
8 2 1 4 3
9 2 3 1 4
10 2 3 4 1
11 2 4 1 3
12 2 4 3 1
13 3 1 2 4
14 3 1 4 2
15 3 2 1 4
16 3 2 4 1
17 3 4 1 2
18 3 4 2 1
19 4 1 2 3
20 4 1 3 2
21 4 2 1 3
22 4 2 3 1
23 4 3 1 2
24 4 3 2 1
NULL
>
https://blogs.msdn.microsoft.com/gpalem/2013/03/28/make-vectorize-your-friend-in-r/
===== Autocorrelation =====
==== Simple example ====
> library(statnet)
> g <- c(
+ 0, 1, 0, 1, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 1, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 1, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 0, 0, 0, 0, 0, 0, 0, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0 )
> G <- symmetrize(matrix(g,byrow=TRUE,nrow=9))
> rownames(G) <- colnames(G) <- c("A","B","C","D","E","F","G","H","I")
> plot.sociomatrix(G)
> gplot(G,gmode="graph",displaylabels=TRUE)
> a1 <- c( 1, 2, 3, 2, 3, 4, 3, 4, 5 )
> gplot(G,gmode="graph",displaylabels=TRUE,vertex.cex=a1)
> nacf(G,a1,type="geary",mode="graph")[2]
1
0.3333333
> nacf(G,a1,type="moran",mode="graph")[2]
1
0.5
> a2 <- c(3, 4, 3, 4, 3, 2, 1, 2, 5 )
> nacf(G,a2,type="geary",mode="graph")[2]
1
1
> nacf(G,a2,type="moran",mode="graph")[2]
1
-0.25
> a3 <- c(4, 1, 4, 2, 5, 2, 3, 3, 3 )
> nacf(G,a3,type="geary",mode="graph")[2]
1
1.833333
> nacf(G,a3,type="moran",mode="graph")[2]
1
-0.875
==== Florentine families ====
> library(statnet)
> data(florentine)
> fw <- flomarriage %v% "wealth"
> gplot(flomarriage,displaylabels=TRUE,vertex.cex=0.025*fw,
+ gmode="graph",main='Florentine families')
> I <- nacf(flomarriage,fw,type="moran",mode="graph")
> I
0 1 2 3 4 5
1.00000000 -0.31073529 0.06531299 -0.06045322 -0.06267282 0.01039729
6 7 8 9 10 11
0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
12 13 14 15
0.00000000 0.00000000 0.00000000 0.00000000
> C <- nacf(flomarriage,fw,type="geary",mode="graph")
> C
0 1 2 3 4 5
0.000000000 1.683607336 0.811642725 0.899673648 0.826831324 0.006496392
6 7 8 9 10 11
0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
12 13 14 15
0.000000000 0.000000000 0.000000000 0.000000000
> I[2]
1
-0.3107353
> C[2]
1
1.683607
===== CUG =====
> library(statnet)
> data(florentine)
> rSize <- cug.test(flobusiness,centralization,
+ FUN.arg=list(FUN=betweenness),mode="graph",cmode="size")
> rEdges <- cug.test(flobusiness,centralization,
+ FUN.arg=list(FUN=betweenness),mode="graph",cmode="edges")
> rDyad <- cug.test(flobusiness,centralization,
+ FUN.arg=list(FUN=betweenness),mode="graph",cmode="dyad.census")
> names(rSize)
[1] "obs.stat" "rep.stat" "mode" "diag" "cmode" "plteobs" "pgteobs"
[8] "reps"
> rSize
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: graph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.2057143
Pr(X>=Obs): 0.001
Pr(X<=Obs): 0.999
>
> # Aggregate results
> Betweenness <- c(rSize$obs.stat,rEdges$obs.stat,rDyad$obs.stat)
> PctGreater <- c(rSize$pgteobs,rEdges$pgteobs,rDyad$pgteobs)
> PctLess <- c(rSize$plteobs,rEdges$plteobs,rDyad$plteobs)
> report <- cbind(Betweenness, PctGreater, PctLess)
> rownames(report) <- c("Size","Edges","Dyads")
> report
Betweenness PctGreater PctLess
Size 0.2057143 0.001 0.999
Edges 0.2057143 0.713 0.289
Dyads 0.2057143 0.738 0.263
> gplot(flobusiness,gmode="graph")
> gplot(flobusiness,gmode="graph",displaylabels=TRUE)
>
> par(mfrow=c(1,3))
> plot(rSize, main="Betweenness \nConditioned on Size" )
> plot(rEdges, main="Betweenness \nConditioned on Edges" )
> plot(rDyad, main="Betweenness \nConditioned on Dyads" )
> par(mfrow=c(1,1))
===== QAP =====
> help(qaptest)
> gcor(flobusiness,flomarriage)
[1] 0.3718679
> (rCor <- qaptest(list(flobusiness,flomarriage),gcor,
+ g1=1,g2=2,reps=1000))
QAP Test Results
Estimated p-values:
p(f(perm) >= f(d)): 0.001
p(f(perm) <= f(d)): 1
>
> plot(rCor, xlim=c(-0.25, 0.4))
===== Preparing data =====
C:\Users\batagelj\Documents\papers\2018\moskva\NetR\doc\R\grund\
https://raw.githubusercontent.com/bavla/netstat/master/data/LGang/
The LGang data includes attributes and the co-offending network of a youth gang. Data were collected by James Densley (Metropolitan University) and Thomas Grund (UCD).
> setwd("C:/Users/batagelj/Documents/papers/2018/moskva/NetR/nets")
> library(sna)
> list.files()
[1] "campnet" "grund" "padgett" "rivers" "test"
> # LG.mat <- read.csv("./grund/LGang-matrix.csv",header=TRUE)
> LG.mat <- read.csv("https://raw.githubusercontent.com/bavla/netstat/master/data/LGang/LGang-matrix.csv",header=TRUE)
> head(LG.mat)
ID X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26
1 1 0 1 1 2 1 1 2 3 2 2 3 1 0 0 0 0 1 1 0 2 2 3 1 0 1 0
...
> LG.mat <- LG.mat[,-1] # remove ID
> LG.mat[LG.mat==1]<-0 # recode
> LG.mat[LG.mat>=2]<-1
> LG.net <- network(as.matrix(LG.mat),directed=FALSE) # convert to network
> # LG.attrib <- read.csv("./grund/LGang-attributes.csv")
> LG.attrib <- read.csv("https://raw.githubusercontent.com/bavla/netstat/master/data/LGang/LGang-attributes.csv")
> head(LG.attrib)
ID year age birthplace residence arrests convictions prison music rank rankrev core
1 1 1989 20 West Africa 0 16 4 1 1 1 5 1
2 2 1989 20 Caribbean 0 16 7 1 0 2 4 1
3 3 1990 19 Caribbean 0 12 4 1 0 2 4 1
4 4 1988 21 Caribbean 0 8 1 0 0 2 4 1
5 5 1985 24 Caribbean 0 11 3 0 0 2 4 1
6 6 1984 25 UK 1 17 10 0 0 2 4 1
> LG.net %v% "ethnicity" <- as.character(LG.attrib$birthplace)
> LG.net %v% "prison" <- LG.attrib$prison
> plot(LG.net)
> plot(LG.net,vertex.col="ethnicity")
> plot(LG.net,vertex.col="prison")
> prisonsize<- 1 + LG.net %v% "prison" # display as vertex size
> plot(LG.net, vertex.cex=prisonsize, vertex.col="ethnicity")
===== Changes of transitivity with density =====
> # changes for different densities
> (d<-seq(from=0.05,to=0.85,by=0.1))
[1] 0.05 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85
> den <-lapply(d, function(x) gden(rgraph(100,m=50, tprob= x)))
> class(den)
[1] "list"
> length(den)
[1] 9
> class(den[[1]])
[1] "numeric"
> den[[1]]
[1] 0.05161616 0.05282828 0.04979798 0.04727273 0.05161616 0.05111111 0.05252525 0.04969697 0.04777778
[10] 0.04969697 0.04919192 0.05181818 0.05424242 0.04939394 0.04909091 0.05000000 0.04727273 0.04626263
[19] 0.04959596 0.05474747 0.04858586 0.04888889 0.05181818 0.05171717 0.04868687 0.05060606 0.05343434
[28] 0.04585859 0.04777778 0.04969697 0.05242424 0.05161616 0.04989899 0.04929293 0.05090909 0.04898990
[37] 0.04808081 0.05161616 0.04616162 0.04949495 0.05020202 0.04737374 0.04737374 0.04656566 0.05171717
[46] 0.05080808 0.05040404 0.04939394 0.05313131 0.05060606
> library(beanplot)
> beanplot(den, names = d)
> trans <-lapply(d, function(x) gtrans(rgraph(100,m=50, tprob= x), measure="weakcensus"))
> beanplot(trans, names = d)
> # changes of betweenness with density
> d <- seq(0.05,0.5,by=0.05)
> n <- 100
> r <- 50
>
> bw <- lapply(d,
+ function(x){ temp <- vector('list',r)
+ for (i in 1:r) temp[[i]] <- betweenness(rgraph(n,tprob=x),rescale=TRUE)
+ return(temp)
+ })
> beanplot(dat, names=d, what=c(1,1,1,0))
===== Testing =====
==== Transitivity ====
Are there fewer unclosed triangles in the LGang network than one would expect by chance, given the size and number of edges that are there?
> (LG.trans <- gtrans(LG.net))
[1] 0.3635371
> (LG.density <- gden(LG.net))
[1] 0.092942
> (LG.size <- network.size(LG.net))
[1] 54
> (LG.nedges <- network.edgecount(LG.net))
[1] 133
> ERgphs <- rgnm(n=200, nv=LG.size, m=LG.nedges, mode="graph")
> ERgph1 <- network(ERgphs[1,,],directed=F)
> plot(ERgph1)
> plot(LG.net)
> gden(ERgph1)
[1] 0.092942
> gden(LG.net)
[1] 0.092942
> ERtrans <- gtrans(ERgphs)
> plot(density(ERtrans))
> plot(density(ERtrans),xlim=c(0,0.38))
> abline(v=LG.trans,col="red",lw=2)
> # packed test
> cg <- cug.test(LG.net,"gtrans",mode="graph",cmode=c("edges"),reps=10000)
> cg
Univariate Conditional Uniform Graph Test
Conditioning Method: edges
Graph Type: graph
Diagonal Used: FALSE
Replications: 10000
Observed Value: 0.3635371
Pr(X>=Obs): 0
Pr(X<=Obs): 1
> plot(cg)
The probability for a random network with the same number of edges as the LGang network to have more triangles than the LGang network is practically 0. So, relative to a random network with the same amount of edges there would appear to be high clustering in the LGang network.
> cg2 <- cug.test(LG.net,gtrans,cmode="dyad.census")
> cg2
Univariate Conditional Uniform Graph Test
Conditioning Method: dyad.census
Graph Type: digraph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.3635371
Pr(X>=Obs): 0
Pr(X<=Obs): 1
> plot(cg2)
===== Permutation test QAP =====
> ety <- get.vertex.attribute(LG.net,"ethnicity")
> ety
[1] "West Africa" "Caribbean" "Caribbean" "Caribbean" "Caribbean" "UK" "East Africa"
[8] "West Africa" "West Africa" "West Africa" "West Africa" "UK" "UK" "UK"
...
> Mety <- 1 * outer(ety, ety, "==") # 1 * for conversion to integer
> Mety[1:10,1:10]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 0 0 0 0 0 0 1 1 1
[2,] 0 1 1 1 1 0 0 0 0 0
[3,] 0 1 1 1 1 0 0 0 0 0
[4,] 0 1 1 1 1 0 0 0 0 0
[5,] 0 1 1 1 1 0 0 0 0 0
[6,] 0 0 0 0 0 1 0 0 0 0
[7,] 0 0 0 0 0 0 1 0 0 0
[8,] 1 0 0 0 0 0 0 1 1 1
[9,] 1 0 0 0 0 0 0 1 1 1
[10,] 1 0 0 0 0 0 0 1 1 1
> gcor(LG.net,Mety)
[1] 0.1249278
> LG.perm <- network(rmperm(LG.net),directed=F)
> gden(LG.perm)
[1] 0.092942
> gden(LG.net)
[1] 0.092942
> list.vertex.attributes(LG.net)
[1] "ethnicity" "na" "prison" "vertex.names"
> list.vertex.attributes(LG.perm)
[1] "na" "vertex.names"
> LG.net %v% "vertex.names"
[1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" "X13" "X14" "X15" "X16" "X17"
[18] "X18" "X19" "X20" "X21" "X22" "X23" "X24" "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34"
[35] "X35" "X36" "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" "X49" "X50" "X51"
[52] "X52" "X53" "X54"
> LG.perm %v% "vertex.names"
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
[35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
> LG.perm %v% "vertex.names" <- LG.net %v% "vertex.names"
> plot(LG.net,displaylabels=TRUE)
> plot(LG.perm,displaylabels=TRUE)
> gcor(LG.net, LG.perm)
[1] 0.03845129
> gcor(LG.perm, Mety)
[1] -0.03857968
> qap <- qaptest(list(LG.net,Mety),g1=1,g2=2,FUN=gcor,reps=5000)
> summary(qap)
QAP Test Results
Estimated p-values:
p(f(perm) >= f(d)): 2e-04
p(f(perm) <= f(d)): 0.9998
Test Diagnostics:
Test Value (f(d)): 0.1249278
Replications: 5000
Distribution Summary:
Min: -0.09659847
1stQ: -0.02275637
Med: -0.001658628
Mean: 0.0001832054
3rdQ: 0.01943912
Max: 0.1354767
> plot(qap)
Our actual test-statistic (here, correlation) is much higher than what we would see if we were to scramble the network.
> Netr1<-rgraph(100, tprob=0.1)
> Netr2<-rgraph(100, tprob=0.5)
> plot(network(Netr1))
>
> qap<-qaptest(list(Netr1,Netr2),g1=1,g2=2,FUN=gcor,reps=5000)
> summary(qap)
QAP Test Results
Estimated p-values:
p(f(perm) >= f(d)): 0.3958
p(f(perm) <= f(d)): 0.6132
Test Diagnostics:
Test Value (f(d)): 0.002507697
Replications: 5000
Distribution Summary:
Min: -0.03736669
1stQ: -0.006954022
Med: -0.0001956514
Mean: -0.0001938942
3rdQ: 0.006562719
Max: 0.03562371
> plot(qap)
Our observed correlation falls right into the distribution of correlations we would expect by chance, controlling for the underlying structure. Hence, the correlation between the two random networks is not significant.
===== Regression =====
> ties <- as.matrix(LG.net)
> diag(ties) <- NA
> ties <- ties[upper.tri(ties)]
> Maty <- Mety
> Maty <- Maty[upper.tri(Maty)]
> dyads <- data.frame(ties,Maty)
> dyads[1:10,]
ties Maty
1 0 0
2 0 0
3 1 1
4 1 0
5 0 1
6 1 1
7 0 0
8 0 1
9 1 1
10 1 1
> results <- glm(ties~Maty,family="binomial",data=dyads)
> summary(results)
Call:
glm(formula = ties ~ Maty, family = "binomial", data = dyads)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.5679 -0.5679 -0.3794 -0.3794 2.3096
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.5953 0.1239 -20.946 < 2e-16 ***
Maty 0.8523 0.1844 4.622 3.8e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 885.19 on 1430 degrees of freedom
Residual deviance: 864.48 on 1429 degrees of freedom
AIC: 868.48
Number of Fisher Scoring iterations: 5
>
The null hypothesis is that networks are random with the right number of edges (so the dyads are independent). The x1 estimate (=beta coefficient) is positive at 0.85 and this effect is significant with p = 4.14...e-06 = very small p. Basically, the regression says that a tie between two gang members is Exp(b) = 2.34 times more likely when the two gang members have the same ethnicity. There is clear evidence for ethnic homophily in co-offending.
help(netlogit)
> dyQap <- netlogit(LG.net,Mety,mode="graph",nullhyp="qapy",reps=100)
> dyQap
Network Logit Model
Coefficients:
Estimate Exp(b) Pr(<=b) Pr(>=b) Pr(>=|b|)
(intercept) -2.5952547 0.07462687 0.91 0.09 0.91
x1 0.8522854 2.34500000 1.00 0.00 0.00
Goodness of Fit Statistics:
Null deviance: 1983.787 on 1431 degrees of freedom
Residual deviance: 864.4812 on 1429 degrees of freedom
Chi-Squared test of fit improvement:
1119.306 on 2 degrees of freedom, p-value 0
AIC: 868.4812 BIC: 879.0135
Pseudo-R^2 Measures:
(Dn-Dr)/(Dn-Dr+dfn): 0.4388909
(Dn-Dr)/Dn: 0.5642268
>
==== Interactive coordinates ====
> xy <- gplot(LG.net,displaylabels=TRUE,label.cex=0.7,usearrows=FALSE,interactive=TRUE)
> gplot(LG.net,coord=xy,displaylabels=TRUE,label.cex=0.7,usearrows=FALSE)
====== ======
\\