====== tf-idf 2020 ======

[[https://github.com/bavla/SocNet/wiki/TfIdfY|Bavla/SocNet/tf-idf]]

===== Months 2020 partition =====

We first create a partition of the records by month of 2020; cluster 0 collects the records that are not dated in 2020.


# month20(d): month code of the date string(s) d, expected to look like
# "YYYY-MM..." (year in characters 1-4, month in characters 6-7).
#   d     - character vector of dates (a single date when called via sapply)
#   value - integer vector of the same length as d: the month 1..12 when the
#           year is 2020 and the month is valid, otherwise 0 (other year,
#           missing date, or a date that cannot be parsed)
> month20 <- function(d){
+   d <- as.character(d)
+   # malformed fields become NA here; the coercion warning is expected noise
+   year  <- suppressWarnings(as.integer(substr(d,1,4)))
+   month <- suppressWarnings(as.integer(substr(d,6,7)))
+   # %in% never yields NA, so unusable dates simply fail the test
+   ok <- year %in% 2020L & month %in% 1:12
+   out <- integer(length(d))
+   out[ok] <- month[ok]
+   out
+ }
# Month code of every record in Y (one call of month20 per date).
> nMonths20 <- sapply(Y$date, FUN=month20)
# Any NA month code is counted as cluster 0 (not assigned to a month).
> nMonths20 <- replace(nMonths20, is.na(nMonths20), 0)
# Number of records per month code.
> table(nMonths20)
nMonths20
     0      1      2      3      4      5      6      7      8      9     10     11 
235599   1035   1904   4858  12318  18219  17813  18027  16707  17877  16412  12518 
    12 
  1807 
# Save the month codes as a Pajek partition file.
> vector2clu(nMonths20,Clu="months20.clu")

In Pajek we extend it to a two-mode partition by assigning cluster 99 to all nodes of the second set K (the network WK has Rows=375094, Cols=97104):


read months20.clu 
partition/create constant partition [97104,99]
select months20 as First and constant as Second
partitions/fuse partitions

===== Total frequencies =====

In Pajek:


read or select WK
network/create partition/degree/input
network/2-mode network/partition in 2 modes
select indegree as First, 2-mode as Second
partitions/extract (Second from First) [2]
save total.clu

===== Month k frequencies =====

In Pajek:


select WK, select fusion partition
operations/network+partition/extract/subnetwork induced selected [yes][k,99]
network/2-mode network/partition in 2 modes
network/create partition/degree/input
select 2-mode as Second
partitions/extract (Second from First) [2]
save m(k).clu

===== Creating frequencies data frame =====

The list of keywords is in the file ''keywords.tmp''. It contains some isolated occurrences of the character ''\'' that cause trouble in ''read.table''; each of them has to be replaced with ''\\'' in an editor. In R:


# Frequency files saved from Pajek: total.clu plus one file per month.
# NOTE: wdir must already be defined when this is run.
> L <- c("total",sprintf("m%02d",1:12))
> Freq <- paste0(wdir,"/tfIdf/",L,".clu")

# Keyword labels are taken from the second column of keywords.tmp.
> K <- read.table('keywords.tmp',skip=0,colClasses="character")[,2]
# Each .clu file is read without its first line; its first column becomes one
# column of TF (tot, m01, ..., m12), with the keywords as row names.
> TF <- data.frame(
+   setNames(lapply(Freq,function(f) read.table(f,skip=1)[,1]),c("tot",L[-1])),
+   row.names=K)
> head(TF)
             tot m01 m02 m03 m04 m05 m06 m07 m08 m09 m10 m11 m12
proven        59   0   1   1   3   4   3   2   4   4   0   2   0
saudi        739   4   7   6  20  25  23  26  35  32  30  24   4
clinical   14239  42 102 267 509 726 667 680 658 689 610 502  54
pneumoniae   284   3   2   2   5   2   9  11   9  11   7   8   1
mycoplasma   254   0   1   2   5   2   5  11   8   7   4   2   0
jeddah        20   0   0   0   0   0   0   0   1   0   1   0   0
> save(TF, file = "tfIdf2020.RData")

===== Word clouds =====

C:\Users\batagelj\Documents\papers\2021\twoMode\data


# Load the saved frequency table TF (rows = keywords; column 1 = total,
# columns 2..13 = the twelve months).
> wdir <- "C:/Users/batagelj/Documents/2020/corona/MetaTit"
> setwd(wdir)
> load("tfIdf2020.RData")
> dim(TF)
[1] 97104    13
# P[k] = number of months in which keyword k has a non-zero frequency.
# rowSums/colSums replace apply(..., sum): same values, computed in compiled
# code and without the integer overflow risk of sum() on integer columns.
> P <- rowSums(sign(TF[,2:13]))
# S[c] = sum of column c of TF.
> S <- colSums(TF)
> library(wordcloud)
> N <- row.names(TF)
# declare the keyword labels as UTF-8 encoded
> Encoding(N) <- "UTF-8"

Repeat the following for each month j = 1, ..., 12:


# tf-idf of month j for every keyword:
#   (frequency in month j / S[j+1]) * log(12 / P),
# with 0 for keywords that do not occur in month j.
> j <- 1
> tf <- TF[,j+1]
> di <- ifelse(tf==0,0,tf/S[j+1]*log(12/P))
> names(di) <- N
# list the 50 keywords with the largest values
> or <- order(di,decreasing=TRUE)
> di[or][1:50]
# sizes passed to wordcloud: square root of the value, scaled and rounded
> ff <- round(10^6*sqrt(di))
> set.seed(1234) # for reproducibility 
> wordcloud(words=N, freq=ff, # min.freq = 1,
+   max.words=100,
+   random.order=FALSE,
+   rot.per=0.35,
+   colors=brewer.pal(8, "Dark2"))

January, February

{{notes:imfm:corona:pics:fig01.png?400}} {{notes:imfm:corona:pics:fig02.png?400}}

March, April

{{notes:imfm:corona:pics:fig03.png?400}} {{notes:imfm:corona:pics:fig04.png?400}}

May, June

{{notes:imfm:corona:pics:fig05.png?400}} {{notes:imfm:corona:pics:fig06.png?400}}

July, August

{{notes:imfm:corona:pics:fig07.png?400}} {{notes:imfm:corona:pics:fig08.png?400}}

September, October

{{notes:imfm:corona:pics:fig09.png?400}} {{notes:imfm:corona:pics:fig10.png?400}}

November, December

{{notes:imfm:corona:pics:fig11.png?400}} {{notes:imfm:corona:pics:fig12.png?400}}

===== wordcloud2 =====

https://www.r-graph-gallery.com/196-the-wordcloud2-library.html


# d[,j] = tf-idf values of all keywords for month j (one column per month),
# built in one step instead of growing the matrix with cbind in a loop.
> d <- vapply(1:12,
+   function(j) ifelse(TF[,j+1]==0,0,TF[,j+1]/S[j+1]*log(12/P)),
+   numeric(nrow(TF)))
> library(wordcloud2)
> library(htmlwidgets)
> library(webshot)
#> webshot::install_phantomjs()
# For every month: draw the word cloud as an HTML widget, save it to
# tmp<j>.html and take screenshots of that page as fig<j>.pdf and fig<j>.png.
> for(j in 1:12){
+   df <- data.frame(N=N,f=d[,j])
#>   wc <- wordcloud2(df,size=1.5)
#>   wc <- wordcloud2(df,size=2,minRotation=-pi/2,maxRotation=-pi/2)
+   wc <- wordcloud2(df,size=1.2,minRotation=0,maxRotation=0)
+   tmp <- paste0("tmp",j,".html")
+   # named pdfFile/pngFile so the base devices pdf() and png() are not masked
+   pdfFile <- paste0("fig",j,".pdf")
+   pngFile <- paste0("fig",j,".png")
+   saveWidget(wc,tmp,selfcontained=FALSE)
+   # delay=5: wait five seconds before the screenshot is taken
+   webshot(tmp,pdfFile,delay=5,vwidth=480,vheight=480)
+   webshot(tmp,pngFile,delay=5,vwidth=480,vheight=480)
+ }

The differences between the values are too large. In some word clouds ''Covid-19'' dominates, which produces unreadable results:

{{notes:imfm:corona:pics:fig5c.png?600}}

To make the pictures more uniform while still conveying the ordering of the keywords, I assigned to each keyword ''key'' the value ''const/sqrt(1+index(key))'', where ''index(key)'' is the position of ''key'' in the list of keywords sorted in decreasing order of their tf-idf values.

  * [[.:wc0|Word clouds: monthly frequencies]]
  * [[.:wca|Word clouds: monthly tf-idfs]]
  * [[.:wcd|Word clouds: monthly differences (observed-expected)]]
  * [[.:wch|Word clouds: monthly chi square]]