====== tf-idf 2020 ====== [[https://github.com/bavla/SocNet/wiki/TfIdfY|Bavla/SocNet/tf-idf]] ===== Months 2020 partition ===== We first create a months partition for 2020 > month20 <- function(d){ + tryCatch( + if(as.integer(substr(d,1,4)) == 2020)return(as.integer(substr(d,6,7))) + else return(0), + error=function(ermsg) return(0) + ) + } > nMonths20 <- sapply(Y$date,month20) > nMonths20[is.na(nMonths20)] <- 0 > table(nMonths20) nMonths20 0 1 2 3 4 5 6 7 8 9 10 11 235599 1035 1904 4858 12318 18219 17813 18027 16707 17877 16412 12518 12 1807 > vector2clu(nMonths20,Clu="months20.clu") In Pajek we extend it to a two-mode partition with cluster 99 for the second set K (WK Rows=375094, Cols=97104) read months20.clu partition/create constant partition [97104,99] select months20 as First and constant as Second partitions/fuse partitions ===== Total frequencies ===== In Pajek read or select WK network/create partition/degree/input network/2-mode network/partition in 2 modes select indegree as First, 2-mode as Second partitions/extract (Second from First) [2] save total.clu ===== Month k frequencies ===== In Pajek select WK, select fusion partition operations/network+partition/extract/subnetwork induced selected [yes][k,99] network/2-mode network/partition in 2 modes network/create partition/degree/input select 2-mode as Second partitions/extract (Second from First) [2] save m(k).clu ===== Creating frequencies data frame ===== The list of keywords is in the file ''keywords.tmp''. It contains some isolated occurrences of the character ''\'' that cause trouble in read.table - each of them has to be replaced with ''\\'' using a text editor. 
In R > L <- c("total","m01","m02","m03","m04","m05","m06","m07","m08","m09","m10","m11","m12") > Freq <- paste(wdir,"/tfIdf/",L,".clu",sep="") > K <- read.table('keywords.tmp',skip=0,colClasses="character")[,2] > TF <- data.frame(row.names=K, + tot=read.table(Freq[1],skip=1)[,1], + m01=read.table(Freq[2],skip=1)[,1], + m02=read.table(Freq[3],skip=1)[,1], + m03=read.table(Freq[4],skip=1)[,1], + m04=read.table(Freq[5],skip=1)[,1], + m05=read.table(Freq[6],skip=1)[,1], + m06=read.table(Freq[7],skip=1)[,1], + m07=read.table(Freq[8],skip=1)[,1], + m08=read.table(Freq[9],skip=1)[,1], + m09=read.table(Freq[10],skip=1)[,1], + m10=read.table(Freq[11],skip=1)[,1], + m11=read.table(Freq[12],skip=1)[,1], + m12=read.table(Freq[13],skip=1)[,1] ) > head(TF) tot m01 m02 m03 m04 m05 m06 m07 m08 m09 m10 m11 m12 proven 59 0 1 1 3 4 3 2 4 4 0 2 0 saudi 739 4 7 6 20 25 23 26 35 32 30 24 4 clinical 14239 42 102 267 509 726 667 680 658 689 610 502 54 pneumoniae 284 3 2 2 5 2 9 11 9 11 7 8 1 mycoplasma 254 0 1 2 5 2 5 11 8 7 4 2 0 jeddah 20 0 0 0 0 0 0 0 1 0 1 0 0 > save(TF, file = "tfIdf2020.RData") ===== Word clouds ===== C:\Users\batagelj\Documents\papers\2021\twoMode\data > wdir <- "C:/Users/batagelj/Documents/2020/corona/MetaTit" > setwd(wdir) > load("tfIdf2020.RData") > dim(TF) [1] 97104 13 > P <- apply(sign(TF[,2:13]), 1, sum) > S <- apply(TF, 2, sum) > library(wordcloud) > N <- row.names(TF) > Encoding(N) <- "UTF-8" For j from 1 to 12 > j <- 1 > di <- ifelse(TF[,j+1]==0,0,TF[,j+1]/S[j+1]*log(12/P)) > or <- order(di,decreasing=TRUE) > names(di) <- N > di[or][1:50] > ff <- round(10^6*sqrt(di)) > set.seed(1234) # for reproducibility > wordcloud(words=N,freq=ff,# min.freq = 1, + max.words=100, random.order=FALSE, rot.per=0.35, + colors=brewer.pal(8, "Dark2")) January, February {{notes:imfm:corona:pics:fig01.png?400}} {{notes:imfm:corona:pics:fig02.png?400}} March, April {{notes:imfm:corona:pics:fig03.png?400}} {{notes:imfm:corona:pics:fig04.png?400}} May, June 
{{notes:imfm:corona:pics:fig05.png?400}} {{notes:imfm:corona:pics:fig06.png?400}} July, August {{notes:imfm:corona:pics:fig07.png?400}} {{notes:imfm:corona:pics:fig08.png?400}} September, October {{notes:imfm:corona:pics:fig09.png?400}} {{notes:imfm:corona:pics:fig10.png?400}} November, December {{notes:imfm:corona:pics:fig11.png?400}} {{notes:imfm:corona:pics:fig12.png?400}} ===== wordcloud2 ===== https://www.r-graph-gallery.com/196-the-wordcloud2-library.html > d <- c() > for(j in 1:12) d <- cbind(d,ifelse(TF[,j+1]==0,0,TF[,j+1]/S[j+1]*log(12/P))) > library(wordcloud2) > library(htmlwidgets) > library(webshot) #> webshot::install_phantomjs() > for(j in 1:12){ > df <- data.frame(N=N,f=d[,j]) #> wc <- wordcloud2(df,size=1.5) #> wc <- wordcloud2(df,size=2,minRotation=-pi/2,maxRotation=-pi/2) > wc <- wordcloud2(df,size=1.2,minRotation=0,maxRotation=0) > tmp <- paste("tmp",j,".html",sep="") > pdf <- paste("fig",j,".pdf",sep="") > png <- paste("fig",j,".png",sep="") > saveWidget(wc,tmp,selfcontained=FALSE) > webshot(tmp,pdf,delay=5,vwidth=480,vheight=480) > webshot(tmp,png,delay=5,vwidth=480,vheight=480) > } The differences between the values are too large: in some word clouds ''Covid-19'' dominates, which produces unreadable results {{notes:imfm:corona:pics:fig5c.png?600}} To make the pictures more uniform and to convey the ordering of the keywords, I assigned to each keyword ''key'' the value ''const/sqrt(1+index(key))'', where ''index(key)'' is the position of the keyword ''key'' in the list of keywords sorted in decreasing order of their tf-idf values. * [[.:wc0|Word clouds: monthly frequencies]] * [[.:wca|Word clouds: monthly tf-idfs]] * [[.:wcd|Word clouds: monthly differences (observed-expected)]] * [[.:wch|Word clouds: monthly chi square]]