====== Friends ======
[[http://www.tvsubtitles.net/tvshow-65-1.html|Friends subtitles for all seasons]].
Download the SRT files.
===== Program in Python =====
# TF-IDF word clouds for Friends episodes
# wdir: working directory; the script changes into it below with os.chdir,
# so relative output file names end up there
wdir = "C:/.../term/test"
# ddir: directory that is scanned for the episode subtitle (.srt) files
ddir = "C:/.../term/Friends"
import sys, os, re, datetime, csv, json, shutil, time, math
from os import listdir
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# fontTools is not referenced directly in the visible code
# NOTE(review): presumably needed by WordCloud.to_svg(embed_font=True) - confirm
import fontTools
os.chdir(wdir)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
def get_wordnet_pos(word):
    """Return the WordNet POS constant that lemmatize() accepts for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag picks adjective, noun, verb or adverb.  Every other
    tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as a noun
    return wordnet.NOUN
# get frequencies for the episode on file Ffile = Ffiles[fix]
def getFreqs(Ffile):
    """Count lemmatized, non-stop-word tokens of one subtitle file.

    Ffile is a file name inside the directory ``ddir``.  Empty lines, purely
    numeric lines and lines containing "-->" are dropped (presumably the SRT
    cue numbers and timing lines); the remaining text is lower-cased,
    tokenized with NLTK, lemmatized and filtered against ``new_stopwords``.

    Returns a Counter mapping each remaining lemma to its number of
    occurrences in the file.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte order
    # mark; with plain "utf-8" the mark sticks to the first cue number, which
    # then no longer passes isnumeric() and ends up in the counts.
    path = os.path.join(ddir, Ffile)
    with open(path, newline='', encoding="utf-8-sig") as infile:
        stripped = [line.strip() for line in infile]
    text = ' '.join(
        s.lower() for s in stripped
        if s and not s.isnumeric() and "-->" not in s
    )
    lemmas = (lemmatizer.lemmatize(w, get_wordnet_pos(w))
              for w in nltk.word_tokenize(text))
    return Counter(w for w in lemmas if w not in new_stopwords)
# draw a word cloud for frequency dictionary D = dict(W[i])
def makeWordCloud(D):
    """Build a word cloud from the word -> weight mapping D and show it.

    The cloud uses a white background and at most 200 words; it is displayed
    in a matplotlib window without axes.
    """
    cloud = WordCloud(max_words=200, background_color="white")
    cloud.generate_from_frequencies(D)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# save a word cloud for frequency dictionary D = dict(W[i]) in selected
# format determined by the file extension
# { 'pdf', 'eps', 'svg', 'png', 'jpg' }
def saveWordCloud(D, wcfile, width=600, height=400, max_words=200):
    """Build a word cloud from the word -> weight mapping D and save it.

    wcfile is the output file name; width and height give the size of the
    picture and max_words limits the number of words drawn.
    """
    settings = dict(
        background_color="white",
        width=width,
        height=height,
        max_words=max_words,
    )
    cloud = WordCloud(**settings)
    cloud.generate_from_frequencies(D)
    cloud.to_file(wcfile)
# save a word cloud for frequency dictionary D = dict(W[i]) in SVG
def svgWordCloud(D, wcfile, width=600, height=400, max_words=200):
    """Build a word cloud from the word -> weight mapping D and write it as SVG.

    wcfile is the output file name; width and height give the size of the
    picture and max_words limits the number of words drawn.  The font is
    embedded in the SVG (embed_font=True).
    """
    wc = WordCloud(background_color="white", width=width, height=height,
                   max_words=max_words)
    wc.generate_from_frequencies(D)
    # Render first, so a failure in to_svg does not leave an empty file behind.
    svg = wc.to_svg(embed_font=True)
    # Explicit UTF-8 keeps non-ASCII words writable regardless of the platform
    # default encoding; the with-block closes the file even if write() fails.
    with open(wcfile, "w", encoding="utf-8") as f:
        f.write(svg)
# -----
# Main script: count lemma frequencies for every episode file in ddir, then
# turn them into TF-IDF weights.  Results stay in module-level names:
#   Ffiles - episode file names; the index into this list identifies an episode
#   CL[i]  - Counter of lemma frequencies of episode i
#   CS[i]  - total number of counted tokens of episode i
#   T      - frequencies summed over all episodes, CT their grand total
#   TS     - for each lemma the number of episodes it occurs in
#   W[i]   - list of (lemma, weight) pairs of episode i
version = "Friends 0.1"
print(version)
ts = datetime.datetime.now()
print('{0}: {1}\n'.format("START", ts))
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# add stop words that aren't in the NLTK stopwords list
add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']',
             "...", "-", "--", "'d", "'m", "'s", "'re", "'ve", "'ll",
             "....", "<", ">", "/i", "``", "''", "n't", "ca", "gon", "na",
             "u", "l", "wh"]
new_stopwords = stop_words.union(add_words)
# list of Friends episode files; os.listdir returns names in arbitrary order,
# so sort them to keep the episode indices reproducible between runs
Ffiles = sorted(f for f in os.listdir(ddir) if f.endswith(".srt"))
tr = datetime.datetime.now()
print('{0}: {1}\n'.format("COUNT", tr))
# count frequencies
nf = len(Ffiles)
CL = [Counter() for _ in range(nf)]   # independent placeholders, no aliasing
CS = [0] * nf
T = Counter()
TS = Counter()
for fix, fname in enumerate(Ffiles):
    CL[fix] = getFreqs(fname)
    CS[fix] = sum(CL[fix].values())
    T += CL[fix]
    # every distinct lemma of the episode adds 1 to its document frequency
    TS += Counter(set(CL[fix]))
    print(fix, '/', CS[fix], ':', fname, '\n', CL[fix].most_common(20))
CT = sum(T.values())
print(-1, '/', CT, ':', 'Total', '\n', T.most_common(20))
ti = datetime.datetime.now()
print('{0}: {1}\n'.format("TF-IDF", ti))
# compute TF-IDF weights
# to improve readability weights are multiplied with 10000 and rounded
W = [[] for i in range(nf)]
for i, (freqs, total) in enumerate(zip(CL, CS)):
    # weight = term frequency in the episode * log(nf / document frequency)
    W[i] = [(key, round(10000*count/total*math.log(nf/TS[key])))
            for key, count in freqs.items()]
    print(i, '/', total, ':', Ffiles[i], '\n',
          Counter(dict(W[i])).most_common(20))
tf = datetime.datetime.now()
print('{0}: {1}\n'.format("END", tf))
===== Computing frequencies for each episode =====
>>>
RESTART: C:\...\test\friends.pyw
Friends 0.1
START: 2021-06-20 04:02:45.572279
COUNT: 2021-06-20 04:02:45.592280
0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt
[('yeah', 39), ('know', 33), ('okay', 32), ('oh', 30), ('well', 25), ('think', 24), ('get', 23), ('ross', 21), ('right', 21), ('hey', 19), ('go', 19), ('really', 18), ('talk', 17), ('joey', 17), ('say', 17), ('good', 17), ('mean', 15), ('like', 14), ('mike', 13), ('charlie', 13)]
1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt
[('know', 42), ('oh', 29), ('okay', 23), ('ross', 23), ('get', 21), ('yeah', 20), ('think', 20), ('well', 17), ('hey', 16), ('adopt', 16), ('one', 15), ('would', 15), ('fine', 15), ('go', 14), ('like', 14), ('see', 13), ('right', 13), ('god', 13), ('say', 11), ('want', 10)]
2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt
[('get', 35), ('okay', 32), ('yeah', 24), ('know', 23), ('oh', 21), ('well', 20), ('right', 19), ('like', 17), ('chandler', 15), ('go', 14), ('hey', 13), ('look', 13), ('could', 13), ('one', 12), ('back', 12), ('two', 12), ('say', 11), ('really', 11), ('mean', 11), ('sorry', 11)]
3 / 1457 : Friends - [10x04] - The One with the Cake.srt
[('emma', 45), ('get', 37), ('go', 33), ('okay', 32), ('oh', 27), ('hey', 24), ('birthday', 23), ('know', 20), ('well', 19), ('one', 19), ('cake', 19), ('right', 17), ('yeah', 14), ('come', 13), ('could', 13), ('guy', 12), ('party', 11), ('like', 11), ('joey', 11), ('let', 11)]
4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt
[('know', 33), ('oh', 23), ('get', 22), ('well', 21), ('okay', 20), ('yeah', 20), ('think', 18), ('guy', 18), ('want', 18), ('hi', 16), ('right', 15), ('like', 15), ('would', 14), ('go', 14), ('amy', 14), ('baby', 13), ('love', 13), ('god', 13), ('propose', 13), ('help', 12)]
5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt
[('oh', 33), ('get', 28), ('know', 22), ('like', 19), ('well', 19), ('hey', 18), ('think', 18), ('okay', 18), ('lie', 18), ('right', 17), ('yeah', 16), ('go', 15), ('want', 14), ('say', 14), ('give', 14), ('tell', 14), ('one', 12), ('sorry', 12), ('watch', 12), ('really', 11)]
..........
232 / 1455 : Friends - [9x22] - The One With The Donor.srt
[('know', 27), ('get', 26), ('okay', 23), ('well', 21), ('go', 21), ('-l', 20), ('want', 20), ('guy', 20), ('right', 17), ('like', 17), ('think', 17), ('oh', 15), ('see', 15), ('yeah', 14), ('really', 13), ('good', 12), ('mean', 12), ('great', 11), ('sperm', 11), ('lt', 11)]
233 / 1758 : Friends - [9x23] - The One in Barbados.srt
[('go', 33), ('know', 30), ('get', 28), ('-i', 22), ('mike', 22), ('oh', 22), ('phoebe', 22), ('say', 21), ('okay', 21), ('weii', 20), ('aii', 19), ('want', 19), ('david', 19), ('right', 15), ('yeah', 15), ('reaiiy', 15), ('couid', 15), ('-yeah', 14), ('hey', 13), ('guy', 13)]
234 / 1270 : Friends - [9x24] - The One in Barbados.srt
[('know', 28), ('okay', 24), ('go', 18), ('-i', 18), ('right', 17), ('get', 17), ('think', 16), ('-oh', 15), ('mean', 14), ('aii', 14), ('oh', 14), ('-no', 14), ('weii', 11), ('say', 11), ('-you', 11), ('iike', 11), ('-yeah', 10), ('wouid', 9), ('-weii', 9), ('time', 9)]
-1 / 268819 : Total
[('get', 5629), ('know', 5263), ('go', 4223), ('okay', 3257), ('right', 2931), ('oh', 2927), ('want', 2610), ('think', 2457), ('like', 2433), ('say', 2432), ('come', 2161), ('one', 2156), ('guy', 2142), ('see', 2031), ('well', 1991), ('yeah', 1943), ('look', 1795), ('really', 1589), ('make', 1560), ('good', 1552)]
TF-IDF: 2021-06-20 04:14:55.765044
The complete listing of [[.:friends:freq|Frequencies top 20]].
===== Computing TF-IDF weights =====
0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt
[('charlie', 310), ('mike', 192), ('salon', 160), ('wall', 126), ('barbados', 107), ('precious', 106), ('susie', 96), ('nonchalant', 80), ('glimpse', 80), ('cornrows', 80), ('scalp', 70), ('grandma', 68), ('shampoo', 54), ('propose', 53), ('kiss', 52), ('appreciate', 46), ('girlfriend', 44), ('emma', 42), ('switch', 41), ('pushup', 40)]
1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt
[('adopt', 442), ('owen', 234), ('fajitas', 223), ('frank', 220), ('charlie', 187), ('triplet', 166), ('colleen', 134), ('adoption', 120), ('jr.', 103), ('batch', 100), ('margarita', 90), ('carriage', 89), ('leslie', 80), ('frog', 78), ('intuitive', 78), ('chill', 71), ('flan', 71), ('barbados', 60), ('mitt', 60), ('alice', 57)]
2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt
[('amanda', 382), ('tan', 207), ('bra', 169), ('spray-on', 144), ('sprayed', 144), ('scrappy', 144), ('mississippi', 143), ('spray', 135), ('pedicure', 125), ('mobile', 125), ('count', 112), ('pat', 96), ('monica.', 96), ('1992.', 96), ('dodge', 84), ('power', 81), ('accident', 81), ('british', 76), ('soldier', 76), ('popped', 71)]
3 / 1457 : Friends - [10x04] - The One with the Cake.srt
[('emma', 588), ('birthday', 254), ('cake', 252), ('vermont', 145), ('maxim', 112), ('prepared', 111), ('bunny', 106), ('frost', 84), ('race', 79), ('8th', 76), ('bakery', 76), ('monologue', 75), ('party', 73), ('audition', 73), ('robot', 67), ('awake', 65), ('naptime', 65), ('testicle', 65), ('forth', 65), ('nighty-night', 65)]
4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt
[('amy', 372), ('ella', 264), ('propose', 217), ('b.', 188), ('anniversary', 164), ('thesaurus', 151), ('myron', 151), ('letter', 145), ('recommendation', 131), ('falafel', 113), ('emma', 105), ('screen', 97), ('knicks', 82), ('one-year', 75), ('smart', 75), ('scoreboard.', 75), ('ask.', 75), ('full-sized', 75), ('aortic', 75), ('brockovich', 75)]
5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt
[('gladys', 350), ('grant', 233), ('benji', 195), ('tape', 170), ('charlie', 163), ('lie', 157), ('boss', 155), ('hobart', 136), ('boscodictiasaur', 117), ('benjamin', 102), ('ex-boyfriend', 93), ('nobel', 93), ('proposal', 82), ('chameleon', 78), ('likewise', 78), ('li', 78), ('biely', 78), ('ichiban', 78), ('men.', 78), ('reilly', 78)]
..........
232 / 1455 : Friends - [9x22] - The One With The Donor.srt
[('-l', 266), ('zack', 263), ('sperm', 239), ('lt', 144), ('donor', 131), ('keynote', 120), ('surrogacy', 113), ('beet', 113), ('charlie', 112), ('speaker', 106), ('mike', 97), ('lf', 92), ('insemination', 75), ('sherman', 75), ('-barbados', 75), ('shopping', 74), ('slutty', 70), ('inhospitable', 66), ('ravioli', 66), ('paleontology', 65)]
233 / 1758 : Friends - [9x23] - The One in Barbados.srt
[('david', 321), ('mike', 253), ('weii', 204), ('aii', 181), ('speech', 156), ('reaiiy', 153), ('couid', 151), ('iove', 117), ('wiii', 116), ('iike', 114), ('propose', 110), ('advice', 108), ('-weii', 102), ('thong', 100), ('meddie', 93), ('pharmacist', 88), ('rain', 86), ('chandier', 84), ('barbados', 83), ('shouid', 83)]
234 / 1270 : Friends - [9x24] - The One in Barbados.srt
[('aii', 185), ('weii', 156), ('iike', 145), ('-weii', 141), ('wouid', 127), ('homo', 121), ('iook', 115), ('erectus', 113), ('ping-pong', 113), ('piay', 110), ('reaiiy', 99), ('pooi', 87), ('shake', 87), ('herbivore', 86), ('carnivore', 86), ('erectus.', 86), ('forfeit', 86), ('comeback', 86), ('41', 86), ('iittie', 85)]
>>>
Complete listing of [[.:friends:tfidf|TF-IDF top 20]].
===== Notes =====
- There are still some irregularities in the tables, for example words starting with a hyphen such as ''-barbados''. Should such special characters be replaced with a space, or removed?
- It seems that the data were obtained with an OCR tool that had problems distinguishing between i and l: reaiiy -> really, iittie -> little, weii -> well, wiii -> will, .... Should these be corrected, or should some other source be found?
- The language used is colloquial rather than literary. Is some kind of normalization needed?
===== Word clouds =====
==== Displaying word cloud on the screen ====
# show the word clouds for W[3], W[7] and W[141] on the screen, one after another
i = 3; makeWordCloud(dict(W[i]))
i = 7; makeWordCloud(dict(W[i]))
makeWordCloud(dict(W[141]))
Friends 003 - The One with the Cake
{{notes:text:pics:friends003.png}}
Friends 007 - The One with the Late Thanksgiving
{{notes:text:pics:friends007.png}}
Friends 141 - The One Where Joey Loses His Insurance
{{notes:text:pics:friends141.png}}
==== Saving word cloud to file ====
Unfortunately, for vector formats (EPS, PDF, SVG) a bitmap picture is saved rather than real vector graphics.
# save the word cloud for W[141] as a 1200 x 800 PNG file
saveWordCloud(dict(W[141]),"Friends-141.png",width=1200,height=800)
{{notes:text:pics:friends-141.png?900}}
It is now easy to produce word clouds for all episodes in a subdirectory ''png'' and to inspect them using a picture viewer (e.g. IrfanView).
# save a 900 x 600 PNG word cloud for every episode into ./png
# create the target directory first; saving fails if it does not exist
os.makedirs("./png", exist_ok=True)
for i in range(nf):
    wcfile = "./png/Friends-" + str(i).zfill(3) + ".png"
    # progress indicator: print every tenth index
    if i % 10 == 0:
        print(i)
    saveWordCloud(dict(W[i]), wcfile, width=900, height=600)
==== Saving word cloud to SVG file ====
The word cloud can be saved as **REAL** vector graphics in SVG format:
# write the word cloud for W[141] to an SVG file (900 x 600)
svgWordCloud(dict(W[141]),"Friends-141.svg",width=900,height=600)
In a web browser it displays correctly. I still have problems converting it to PDF using Inkscape.
===== Analysis with the alternative subtitles files =====
Complete listing of [[.:friends:tfidf2|TF-IDF top 20]].
{{notes:text:pics:friends2-123.png}}
{{notes:text:pics:friends2-071.png?500}}
{{notes:text:pics:friends2-154.png?500}}
{{notes:text:pics:friends2-228.png?500}}
{{notes:text:pics:friends2-233.png?500}}
===== Some links =====
- https://github.com/amueller/word_cloud/
- https://www.python-course.eu/python_wordcloud_tutorial.php
- https://www.codegrepper.com/code-examples/python/counter+python+collections+get+top+10