Friends

Friends subtitles for all seasons. Download the SRT files.

Program in Python

# TF-IDF word clouds for Friends episodes
# wdir: working directory; the script changes into it with os.chdir, so
#       relative output names such as "Friends-141.png" are resolved there
# ddir: directory that is listed for the *.srt subtitle files and from which
#       getFreqs reads them
wdir = "C:/.../term/test"
ddir = "C:/.../term/Friends"
import sys, os, re, datetime, csv, json, shutil, time, math
from os import listdir
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import fontTools
 
os.chdir(wdir)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
 
def get_wordnet_pos(word):
   """Return the WordNet POS constant to hand to lemmatize() for word.

   The word is tagged on its own with nltk.pos_tag and only the first
   letter of the resulting tag is inspected: J -> adjective, V -> verb,
   R -> adverb.  N and every other tag are treated as a noun.
   """
   tagged = nltk.pos_tag([word])
   initial = tagged[0][1][0].upper()
   if initial == "J":
      return wordnet.ADJ
   if initial == "V":
      return wordnet.VERB
   if initial == "R":
      return wordnet.ADV
   # "N" and all remaining tags share the noun default
   return wordnet.NOUN
 
# get frequencies for the episode on file Ffile = Ffiles[fix]
def getFreqs(Ffile):
   """Count the lemmatized, non-stop-word tokens of one subtitle file.

   Ffile is a file name inside ddir.  Empty lines, purely numeric lines
   and lines containing "-->" are dropped (in an SRT file these are
   presumably the cue numbers and the time ranges).  The remaining lines
   are lower-cased, joined with spaces, tokenized with nltk.word_tokenize,
   lemmatized with the module-level lemmatizer and filtered against
   new_stopwords.

   Returns a Counter mapping each remaining token to its frequency.
   """
   # "utf-8-sig" reads plain UTF-8 exactly as "utf-8" does, but also drops
   # a leading byte-order mark.  With plain "utf-8" the BOM stays glued to
   # the first line, which then fails isnumeric() and is counted as a word.
   with open(os.path.join(ddir, Ffile), encoding="utf-8-sig") as infile:
      stripped = [line.strip() for line in infile]
   kept = [s.lower() for s in stripped
           if s and not s.isnumeric() and "-->" not in s]
   text = ' '.join(kept)
   lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(w))
             for w in nltk.word_tokenize(text)]
   return Counter(w for w in lemmas if w not in new_stopwords)
 
def makeWordCloud(D):
   """Show a word cloud for the frequency dictionary D on the screen.

   D maps words to weights, e.g. D = dict(W[i]).  At most 200 words are
   drawn on a white background; the axes are hidden and plt.show() is
   called to display the picture.
   """
   cloud = WordCloud(max_words=200, background_color="white")
   cloud.generate_from_frequencies(D)
   plt.imshow(cloud, interpolation="bilinear")
   plt.axis("off")
   plt.show()
 
def saveWordCloud(D,wcfile,width=600,height=400,max_words=200):
   """Render a word cloud for the frequency dictionary D into wcfile.

   D          word -> weight mapping, e.g. D = dict(W[i])
   wcfile     output file name; the original note lists the extensions
              'pdf', 'eps', 'svg', 'png' and 'jpg' - presumably the image
              writer picks the format from the extension (not verified here)
   width      picture width passed to WordCloud
   height     picture height passed to WordCloud
   max_words  upper limit on the number of words drawn
   """
   settings = dict(background_color="white", width=width, height=height,
                   max_words=max_words)
   cloud = WordCloud(**settings)
   cloud.generate_from_frequencies(D)
   cloud.to_file(wcfile)
 
# save a word cloud for frequency dictionary D = dict(W[i]) in SVG
def svgWordCloud(D,wcfile,width=600,height=400,max_words=200):
   """Write a word cloud for the frequency dictionary D as SVG text.

   D          word -> weight mapping, e.g. D = dict(W[i])
   wcfile     name of the SVG file to create (an existing file is replaced)
   width      picture width passed to WordCloud
   height     picture height passed to WordCloud
   max_words  upper limit on the number of words drawn
   """
   wc = WordCloud(background_color="white",width=width,height=height,
                  max_words=max_words)
   wc.generate_from_frequencies(D)
   # Build the markup first so that a failure in to_svg() does not leave an
   # empty, truncated output file behind.
   svg = wc.to_svg(embed_font=True)
   # The with-block closes the file even if write() raises.  The explicit
   # UTF-8 encoding keeps non-ASCII words from failing under a narrower
   # platform default encoding.
   with open(wcfile, "w", encoding="utf-8") as f:
      f.write(svg)
 
# -----
# banner and start time stamp
version = "Friends 0.1"
print(version)
ts = datetime.datetime.now()
print('{0}: {1}\n'.format("START", ts))

# shared lemmatizer and the NLTK English stop word list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# extra tokens to ignore that the NLTK list does not contain:
# punctuation, contraction fragments produced by the tokenizer,
# markup residue and a few stray fragments
add_words = [
   '!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']',
   "...", "-", "--", "'d", "'m", "'s", "'re", "'ve", "'ll",
   "....", "<", ">", "/i", "``", "''", "n't", "ca", "gon", "na",
   "u", "l", "wh",
]
new_stopwords = stop_words | set(add_words)
 
# list of Friends episode files
# os.listdir returns names in arbitrary order, but episodes are later
# addressed by their position in this list (W[141], Friends-141.png, ...),
# so sort the names to keep the numbering stable between runs and machines.
Ffiles = sorted(f for f in os.listdir(ddir) if f.endswith(".srt"))
 
tr = datetime.datetime.now()
print('{0}: {1}\n'.format("COUNT",tr))

# count frequencies
#   CL[k] - Counter of token frequencies of episode k
#   CS[k] - total number of counted tokens in episode k
#   T     - token frequencies summed over all episodes
#   TS    - for each token, the number of episodes it occurs in
nf = len(Ffiles)
CL = []
CS = []
T = Counter()
TS = Counter()
for fix, episode in enumerate(Ffiles):
   freqs = getFreqs(episode)
   CL.append(freqs)
   CS.append(sum(freqs.values()))
   T.update(freqs)
   # iterating the keys counts every distinct token once per episode
   TS.update(freqs.keys())
   print(fix,'/',CS[fix],':',Ffiles[fix],'\n',CL[fix].most_common(20))
CT = sum(T.values())
print(-1,'/',CT,':','Total','\n',T.most_common(20))
 
ti = datetime.datetime.now()
print('{0}: {1}\n'.format("TF-IDF",ti))

# compute TF-IDF weights
#   weight = (count of the word in the episode / CS[i]) * log(nf / TS[word])
# to improve readability the weights are multiplied by 10000 and rounded
# W[i] is the list of (word, weight) pairs of episode i
W = []
for i in range(nf):
   W.append([ (key, round(10000*count/CS[i]*math.log(nf/TS[key])))
              for key, count in CL[i].items() ])
   print(i,'/',CS[i],':',Ffiles[i],'\n',Counter(dict(W[i])).most_common(20))

tf = datetime.datetime.now()
print('{0}: {1}\n'.format("END",tf))

Computing frequencies for each episode

>>> 
 RESTART: C:\...\test\friends.pyw 
Friends 0.1
START: 2021-06-20 04:02:45.572279

COUNT: 2021-06-20 04:02:45.592280

0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt 
 [('yeah', 39), ('know', 33), ('okay', 32), ('oh', 30), ('well', 25), ('think', 24), ('get', 23), ('ross', 21), ('right', 21), ('hey', 19), ('go', 19), ('really', 18), ('talk', 17), ('joey', 17), ('say', 17), ('good', 17), ('mean', 15), ('like', 14), ('mike', 13), ('charlie', 13)]
1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt 
 [('know', 42), ('oh', 29), ('okay', 23), ('ross', 23), ('get', 21), ('yeah', 20), ('think', 20), ('well', 17), ('hey', 16), ('adopt', 16), ('one', 15), ('would', 15), ('fine', 15), ('go', 14), ('like', 14), ('see', 13), ('right', 13), ('god', 13), ('say', 11), ('want', 10)]
2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt 
 [('get', 35), ('okay', 32), ('yeah', 24), ('know', 23), ('oh', 21), ('well', 20), ('right', 19), ('like', 17), ('chandler', 15), ('go', 14), ('hey', 13), ('look', 13), ('could', 13), ('one', 12), ('back', 12), ('two', 12), ('say', 11), ('really', 11), ('mean', 11), ('sorry', 11)]
3 / 1457 : Friends - [10x04] - The One with the Cake.srt 
 [('emma', 45), ('get', 37), ('go', 33), ('okay', 32), ('oh', 27), ('hey', 24), ('birthday', 23), ('know', 20), ('well', 19), ('one', 19), ('cake', 19), ('right', 17), ('yeah', 14), ('come', 13), ('could', 13), ('guy', 12), ('party', 11), ('like', 11), ('joey', 11), ('let', 11)]
4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt 
 [('know', 33), ('oh', 23), ('get', 22), ('well', 21), ('okay', 20), ('yeah', 20), ('think', 18), ('guy', 18), ('want', 18), ('hi', 16), ('right', 15), ('like', 15), ('would', 14), ('go', 14), ('amy', 14), ('baby', 13), ('love', 13), ('god', 13), ('propose', 13), ('help', 12)]
5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt 
 [('oh', 33), ('get', 28), ('know', 22), ('like', 19), ('well', 19), ('hey', 18), ('think', 18), ('okay', 18), ('lie', 18), ('right', 17), ('yeah', 16), ('go', 15), ('want', 14), ('say', 14), ('give', 14), ('tell', 14), ('one', 12), ('sorry', 12), ('watch', 12), ('really', 11)]
..........
232 / 1455 : Friends - [9x22] - The One With The Donor.srt 
 [('know', 27), ('get', 26), ('okay', 23), ('well', 21), ('go', 21), ('-l', 20), ('want', 20), ('guy', 20), ('right', 17), ('like', 17), ('think', 17), ('oh', 15), ('see', 15), ('yeah', 14), ('really', 13), ('good', 12), ('mean', 12), ('great', 11), ('sperm', 11), ('lt', 11)]
233 / 1758 : Friends - [9x23] - The One in Barbados.srt 
 [('go', 33), ('know', 30), ('get', 28), ('-i', 22), ('mike', 22), ('oh', 22), ('phoebe', 22), ('say', 21), ('okay', 21), ('weii', 20), ('aii', 19), ('want', 19), ('david', 19), ('right', 15), ('yeah', 15), ('reaiiy', 15), ('couid', 15), ('-yeah', 14), ('hey', 13), ('guy', 13)]
234 / 1270 : Friends - [9x24] - The One in Barbados.srt 
 [('know', 28), ('okay', 24), ('go', 18), ('-i', 18), ('right', 17), ('get', 17), ('think', 16), ('-oh', 15), ('mean', 14), ('aii', 14), ('oh', 14), ('-no', 14), ('weii', 11), ('say', 11), ('-you', 11), ('iike', 11), ('-yeah', 10), ('wouid', 9), ('-weii', 9), ('time', 9)]
-1 / 268819 : Total 
 [('get', 5629), ('know', 5263), ('go', 4223), ('okay', 3257), ('right', 2931), ('oh', 2927), ('want', 2610), ('think', 2457), ('like', 2433), ('say', 2432), ('come', 2161), ('one', 2156), ('guy', 2142), ('see', 2031), ('well', 1991), ('yeah', 1943), ('look', 1795), ('really', 1589), ('make', 1560), ('good', 1552)]
TF-IDF: 2021-06-20 04:14:55.765044

The complete listing of Frequencies top 20.

Computing TF-IDF weights

0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt 
 [('charlie', 310), ('mike', 192), ('salon', 160), ('wall', 126), ('barbados', 107), ('precious', 106), ('susie', 96), ('nonchalant', 80), ('glimpse', 80), ('cornrows', 80), ('scalp', 70), ('grandma', 68), ('shampoo', 54), ('propose', 53), ('kiss', 52), ('appreciate', 46), ('girlfriend', 44), ('emma', 42), ('switch', 41), ('pushup', 40)]
1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt 
 [('adopt', 442), ('owen', 234), ('fajitas', 223), ('frank', 220), ('charlie', 187), ('triplet', 166), ('colleen', 134), ('adoption', 120), ('jr.', 103), ('batch', 100), ('margarita', 90), ('carriage', 89), ('leslie', 80), ('frog', 78), ('intuitive', 78), ('chill', 71), ('flan', 71), ('barbados', 60), ('mitt', 60), ('alice', 57)]
2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt 
 [('amanda', 382), ('tan', 207), ('bra', 169), ('spray-on', 144), ('sprayed', 144), ('scrappy', 144), ('mississippi', 143), ('spray', 135), ('pedicure', 125), ('mobile', 125), ('count', 112), ('pat', 96), ('monica.', 96), ('1992.', 96), ('dodge', 84), ('power', 81), ('accident', 81), ('british', 76), ('soldier', 76), ('popped', 71)]
3 / 1457 : Friends - [10x04] - The One with the Cake.srt 
 [('emma', 588), ('birthday', 254), ('cake', 252), ('vermont', 145), ('maxim', 112), ('prepared', 111), ('bunny', 106), ('frost', 84), ('race', 79), ('8th', 76), ('bakery', 76), ('monologue', 75), ('party', 73), ('audition', 73), ('robot', 67), ('awake', 65), ('naptime', 65), ('testicle', 65), ('forth', 65), ('nighty-night', 65)]
4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt 
 [('amy', 372), ('ella', 264), ('propose', 217), ('b.', 188), ('anniversary', 164), ('thesaurus', 151), ('myron', 151), ('letter', 145), ('recommendation', 131), ('falafel', 113), ('emma', 105), ('screen', 97), ('knicks', 82), ('one-year', 75), ('smart', 75), ('scoreboard.', 75), ('ask.', 75), ('full-sized', 75), ('aortic', 75), ('brockovich', 75)]
5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt 
 [('gladys', 350), ('grant', 233), ('benji', 195), ('tape', 170), ('charlie', 163), ('lie', 157), ('boss', 155), ('hobart', 136), ('boscodictiasaur', 117), ('benjamin', 102), ('ex-boyfriend', 93), ('nobel', 93), ('proposal', 82), ('chameleon', 78), ('likewise', 78), ('li', 78), ('biely', 78), ('ichiban', 78), ('men.', 78), ('reilly', 78)]
..........
232 / 1455 : Friends - [9x22] - The One With The Donor.srt 
 [('-l', 266), ('zack', 263), ('sperm', 239), ('lt', 144), ('donor', 131), ('keynote', 120), ('surrogacy', 113), ('beet', 113), ('charlie', 112), ('speaker', 106), ('mike', 97), ('lf', 92), ('insemination', 75), ('sherman', 75), ('-barbados', 75), ('shopping', 74), ('slutty', 70), ('inhospitable', 66), ('ravioli', 66), ('paleontology', 65)]
233 / 1758 : Friends - [9x23] - The One in Barbados.srt 
 [('david', 321), ('mike', 253), ('weii', 204), ('aii', 181), ('speech', 156), ('reaiiy', 153), ('couid', 151), ('iove', 117), ('wiii', 116), ('iike', 114), ('propose', 110), ('advice', 108), ('-weii', 102), ('thong', 100), ('meddie', 93), ('pharmacist', 88), ('rain', 86), ('chandier', 84), ('barbados', 83), ('shouid', 83)]
234 / 1270 : Friends - [9x24] - The One in Barbados.srt 
 [('aii', 185), ('weii', 156), ('iike', 145), ('-weii', 141), ('wouid', 127), ('homo', 121), ('iook', 115), ('erectus', 113), ('ping-pong', 113), ('piay', 110), ('reaiiy', 99), ('pooi', 87), ('shake', 87), ('herbivore', 86), ('carnivore', 86), ('erectus.', 86), ('forfeit', 86), ('comeback', 86), ('41', 86), ('iittie', 85)]
>>> 

Complete listing of TF-IDF top 20.

Notes

  1. There are still some irregularities in the tables. For example, some words start with a hyphen: -barbados, … Should such special characters be replaced with a space or removed?
  2. It seems that the data were obtained with an OCR tool that had problems distinguishing between i and l: reaiiy → really, iittie → little, weii → well, wiii → will, … Should these be corrected, or should another source be found?
  3. The language used is colloquial rather than literary. Should some kind of normalization be applied?

Word clouds

Displaying word cloud on the screen

# W[i] is a list of (word, weight) pairs; dict() turns it into the
# word -> weight mapping that makeWordCloud expects
i = 3; makeWordCloud(dict(W[i]))
i = 7; makeWordCloud(dict(W[i]))
makeWordCloud(dict(W[141]))

Friends 003 - The One with the Cake

Friends 007 - The One with the Late Thanksgiving

Friends 141 - The One Where Joey Loses His Insurance

Saving word cloud to file

Unfortunately, for the vector formats (EPS, PDF, SVG) a bitmap picture is saved instead of real vector graphics.

# save the word cloud of the episode with index 141 as a PNG picture,
# overriding the default width=600 and height=400
saveWordCloud(dict(W[141]),"Friends-141.png",width=1200,height=800)

It is now easy to produce PNG word clouds for all episodes in a subdirectory and to inspect them with a picture viewer (e.g. IrfanView).

# the pictures go into the subdirectory ./png; create it if it is missing,
# because the target folder must exist before the first picture is written
os.makedirs("png", exist_ok=True)
for i in range(nf):
   # zero-padded episode index, e.g. ./png/Friends-007.png
   wcfile = "./png/Friends-"+str(i).zfill(3)+".png"
   # progress indicator: print every tenth index
   if i % 10 == 0:
      print(i)
   saveWordCloud(dict(W[i]),wcfile,width=900,height=600)

Saving word cloud to SVG file

The word cloud can be saved as real vector graphics in the SVG format:

# write the word cloud of the episode with index 141 as SVG text
# (svgWordCloud uses WordCloud.to_svg with embed_font=True)
svgWordCloud(dict(W[141]),"Friends-141.svg",width=900,height=600)

In a web browser it displays correctly. I still have problems converting it to PDF using Inkscape.

Analysis with the alternative subtitles files

Complete listing of TF-IDF top 20.

Some links

notes/text/friends.txt · Last modified: 2021/06/22 23:32 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki