tf-idf 2020

Months 2020 partition

We first create a months partition for 2020

> # month20: month number (1-12) of dates that fall in 2020, 0 otherwise.
> #   d - vector of date strings; characters 1-4 are read as the year and
> #       characters 6-7 as the month (e.g. "2020-03-15")
> # Returns an integer vector of length(d): the month when the year is 2020,
> # and 0 when the year differs or the year or month cannot be parsed.
> month20 <- function(d){
+   d <- as.character(d)
+   # unparsable pieces become NA; the coercion warning is expected here
+   year <- suppressWarnings(as.integer(substr(d,1,4)))
+   month <- suppressWarnings(as.integer(substr(d,6,7)))
+   in2020 <- !is.na(year) & year == 2020L & !is.na(month)
+   # everything that is not a readable 2020 date goes to cluster 0
+   month[!in2020] <- 0L
+   month
+ }
> # one month code per record; numeric(1) makes vapply stop with an error if
> # month20 ever returns anything other than a single number
> nMonths20 <- vapply(Y$date,month20,numeric(1))
> # records whose month could not be read are put into cluster 0 as well
> nMonths20[is.na(nMonths20)] <- 0
> table(nMonths20)
nMonths20
     0      1      2      3      4      5      6      7      8      9     10     11 
235599   1035   1904   4858  12318  18219  17813  18027  16707  17877  16412  12518 
    12 
  1807 
> vector2clu(nMonths20,Clu="months20.clu")

In Pajek we extend it to a two-mode partition, with cluster 99 for the second set K (WK: Rows=375094, Cols=97104).

read months20.clu 
partition/create constant partition [97104,99]
select months20 as First and constant as Second
partitions/fuse partitions

Total frequencies

In Pajek

read or select WK
network/create partition/degree/input
network/2-mode network/partition in 2 modes
select indegree as First, 2-mode as Second
partitions/extract (Second from First) [2]
save total.clu

Month k frequencies

In Pajek

select WK, select fusion partition
operations/network+partition/extract/subnetwork induced selected [yes][k,99]
network/2-mode network/partition in 2 modes
network/create partition/degree/input
select 2-mode as Second
partitions/extract (Second from First) [2]
save m(k).clu

Creating frequencies data frame

The list of keywords is in the file keywords.tmp. It contains some single occurrences of the character \ that cause trouble in read.table - each of them has to be replaced with \\ using a text editor.

In R

> # names of the partition files saved from Pajek (total.clu, m01.clu, ...);
> # wdir must already be set (it is assigned in the Word clouds section)
> L <- c("total",
+   "m01","m02","m03","m04","m05","m06",
+   "m07","m08","m09","m10","m11","m12")
> Freq <- paste0(wdir,"/tfIdf/",L,".clu")

> # keyword labels: second column of keywords.tmp, read as character
> K <- read.table("keywords.tmp",colClasses="character")[,2]
> # one column per partition file: the first line of each .clu file is skipped
> # and the first column of the remainder is taken; rows are labelled by keyword
> TF <- data.frame(
+   setNames(
+     lapply(Freq[1:13],function(f) read.table(f,skip=1)[,1]),
+     c("tot","m01","m02","m03","m04","m05","m06",
+       "m07","m08","m09","m10","m11","m12")),
+   row.names=K)
> head(TF)
             tot m01 m02 m03 m04 m05 m06 m07 m08 m09 m10 m11 m12
proven        59   0   1   1   3   4   3   2   4   4   0   2   0
saudi        739   4   7   6  20  25  23  26  35  32  30  24   4
clinical   14239  42 102 267 509 726 667 680 658 689 610 502  54
pneumoniae   284   3   2   2   5   2   9  11   9  11   7   8   1
mycoplasma   254   0   1   2   5   2   5  11   8   7   4   2   0
jeddah        20   0   0   0   0   0   0   0   1   0   1   0   0
> save(TF, file = "tfIdf2020.RData")

Word clouds

C:\Users\batagelj\Documents\papers\2021\twoMode\data

> wdir <- "C:/Users/batagelj/Documents/2020/corona/MetaTit"
> setwd(wdir)
> load("tfIdf2020.RData")
> dim(TF)
[1] 97104    13
> # P: for each keyword, the number of month columns (2..13) with a positive
> # count (counts are assumed to be non-negative)
> P <- rowSums(sign(TF[,2:13]))
> # S: column totals of TF; S[1] belongs to tot, S[j+1] to month j
> S <- colSums(TF)
> library(wordcloud)
> # keyword labels, declared as UTF-8 (presumably so that non-ASCII keywords
> # are displayed correctly)
> N <- row.names(TF)
> Encoding(N) <- "UTF-8"

For j from 1 to 12

> j <- 1
> # tf-idf weight of every keyword in month j: share of the month's total
> # count times log(12/P); keywords with zero count in the month get 0
> di <- ifelse(TF[,j+1]==0, 0, TF[,j+1]/S[j+1]*log(12/P))
> names(di) <- N
> # show the 50 keywords with the largest weight
> or <- order(di,decreasing=TRUE)
> di[or[1:50]]
> # rounded, rescaled square roots of the weights serve as word frequencies
> ff <- round(1e6*sqrt(di))
> set.seed(1234) # for reproducibility 
> wordcloud(
+   words=N, freq=ff, # min.freq = 1,
+   max.words=100,
+   random.order=FALSE,
+   rot.per=0.35,
+   colors=brewer.pal(8, "Dark2")
+ )

January, February

March, April

May, June

July, August

September, October

November, December

wordcloud2

https://www.r-graph-gallery.com/196-the-wordcloud2-library.html

> # d[,j] holds the tf-idf weights of all keywords for month j; the columns are
> # built in one pass and bound once instead of growing d inside a loop
> d <- do.call(cbind,
+   lapply(1:12,function(j) ifelse(TF[,j+1]==0,0,TF[,j+1]/S[j+1]*log(12/P))))
> library(wordcloud2)
> library(htmlwidgets)
> library(webshot)
#> webshot::install_phantomjs()
> # for every month: build the word cloud widget, save it as HTML and take
> # PDF and PNG snapshots of that page
> for(j in 1:12){
+   df <- data.frame(N=N,f=d[,j])
+   # alternative layouts that were tried:
+   #   wc <- wordcloud2(df,size=1.5)
+   #   wc <- wordcloud2(df,size=2,minRotation=-pi/2,maxRotation=-pi/2)
+   wc <- wordcloud2(df,size=1.2,minRotation=0,maxRotation=0)
+   tmp <- paste0("tmp",j,".html")
+   # named pdfFile/pngFile so that grDevices::pdf() and png() are not shadowed
+   pdfFile <- paste0("fig",j,".pdf")
+   pngFile <- paste0("fig",j,".png")
+   saveWidget(wc,tmp,selfcontained=FALSE)
+   # delay gives the widget time to render before the snapshot is taken
+   webshot(tmp,pdfFile,delay=5,vwidth=480,vheight=480)
+   webshot(tmp,pngFile,delay=5,vwidth=480,vheight=480)
+ }

The differences between the values are too large. In some word clouds Covid-19 dominates, which makes the results unreadable.

To make the pictures more uniform while still conveying the ordering of the keywords, I assigned to each keyword key the value const/sqrt(1+index(key)), where index(key) is the position of key in the list of keywords sorted in decreasing order of their tf-idf values.

notes/imfm/corona/ana/tfidf.txt · Last modified: 2021/01/06 05:12 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki