Friends subtitles for all seasons. Download the SRT files.
# TF-IDF word clouds for Friends episodes
#
# Reads every .srt subtitle file in `ddir`, counts lemmatized non-stop-word
# tokens per episode, derives TF-IDF weights per episode and offers helpers
# for displaying / saving the weights as word clouds.
import sys
import os
import re
import datetime
import csv
import json
import shutil
import time
import math
from functools import lru_cache
from os import listdir
from collections import Counter

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import fontTools
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

wdir = "C:/.../term/test"      # working directory (outputs are written here)
ddir = "C:/.../term/Friends"   # directory holding the .srt subtitle files
os.chdir(wdir)

# First letter of a POS tag -> WordNet POS constant accepted by lemmatize().
_TAG_TO_WORDNET = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}


# The tag is looked up for the word in isolation, so it depends on the word
# only; caching avoids re-running the tagger for every repeated token.  The
# key space is the corpus vocabulary, hence the unbounded cache.
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts.

    The word is tagged on its own with nltk.pos_tag; tags whose first letter
    is not J, N, V or R fall back to wordnet.NOUN.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return _TAG_TO_WORDNET.get(tag, wordnet.NOUN)


# get frequencies for the episode on file Ffile = Ffiles[fix]
def getFreqs(Ffile):
    """Return a Counter of lemmatized, non-stop-word tokens of one episode.

    Ffile is a file name relative to `ddir`.  Empty lines, purely numeric
    lines and lines containing "-->" are skipped; the remaining lines are
    lower-cased, joined with spaces, tokenized with nltk.word_tokenize and
    lemmatized.  Lemmas contained in `new_stopwords` are dropped.

    Relies on the module-level `lemmatizer` and `new_stopwords`, which must
    be defined before the first call.
    """
    # NOTE(review): a file starting with a UTF-8 BOM would keep its first
    # counter line as a token; consider encoding="utf-8-sig" - confirm
    # against the actual subtitle files before changing.
    kept = []
    with open(os.path.join(ddir, Ffile), newline='', encoding="utf-8") as infile:
        for raw in infile:
            s = raw.strip()
            if not s or s.isnumeric() or "-->" in s:
                continue
            kept.append(s.lower())
    text = ' '.join(kept)
    # Lemmatize first, then filter: the stop-word test applies to the lemma.
    lemmas = (lemmatizer.lemmatize(w, get_wordnet_pos(w))
              for w in nltk.word_tokenize(text))
    return Counter(w for w in lemmas if w not in new_stopwords)


def _buildWordCloud(D, **options):
    """Create a white-background WordCloud from frequency dictionary D."""
    wc = WordCloud(background_color="white", **options)
    wc.generate_from_frequencies(D)
    return wc


# draw a word cloud for frequency dictionary D = dict(W[i])
def makeWordCloud(D):
    """Display the word cloud (at most 200 words) of D with matplotlib."""
    wc = _buildWordCloud(D, max_words=200)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# save a word cloud for frequency dictionary D = dict(W[i]) in selected
# format determined by the file extension
# { 'pdf', 'eps', 'svg', 'png', 'jpg' }
def saveWordCloud(D,wcfile,width=600,height=400,max_words=200):
    """Save the word cloud of D to wcfile via WordCloud.to_file.

    width and height give the canvas size, max_words the maximum number of
    words drawn.
    """
    wc = _buildWordCloud(D, width=width, height=height, max_words=max_words)
    wc.to_file(wcfile)


# save a word cloud for frequency dictionary D = dict(W[i]) in SVG
def svgWordCloud(D,wcfile,width=600,height=400,max_words=200):
    """Write the word cloud of D to wcfile as SVG text with embedded font.

    width and height give the canvas size, max_words the maximum number of
    words drawn.
    """
    wc = _buildWordCloud(D, width=width, height=height, max_words=max_words)
    # Explicit UTF-8 so non-ASCII words do not depend on the platform's
    # default codec; the context manager closes the file even on error.
    with open(wcfile, "w", encoding="utf-8") as f:
        f.write(wc.to_svg(embed_font=True))


# -----
version = "Friends 0.1"
print(version)
ts = datetime.datetime.now()
print(f"START: {ts}\n")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# add stop words that aren't in the NLTK stopwords list
add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']',
             "...", "-", "--", "'d", "'m", "'s", "'re", "'ve", "'ll",
             "....", "<", ">", "/i", "``", "''", "n't", "ca", "gon", "na",
             "u", "l", "wh"]
new_stopwords = stop_words.union(add_words)

# list of Friends episode files; sorted because os.listdir returns entries
# in arbitrary order and episode indices (e.g. W[141]) must be reproducible
Ffiles = sorted(f for f in os.listdir(ddir) if f.endswith(".srt"))

tr = datetime.datetime.now()
print(f"COUNT: {tr}\n")

# count frequencies
#   CL[k] - token Counter of episode k      CS[k] - number of tokens in k
#   T     - token counts over all episodes  TS    - document frequency
nf = len(Ffiles)
CL = [Counter() for _ in range(nf)]   # independent Counters, no aliasing
CS = [0]*nf
T = Counter()
TS = Counter()
for fix, Ffile in enumerate(Ffiles):
    CL[fix] = getFreqs(Ffile)
    CS[fix] = sum(CL[fix].values())
    T.update(CL[fix])
    TS.update(CL[fix].keys())         # each distinct token counts once per episode
    print(fix,'/',CS[fix],':',Ffile,'\n',CL[fix].most_common(20))
CT = sum(T.values())
print(-1,'/',CT,':','Total','\n',T.most_common(20))

ti = datetime.datetime.now()
print(f"TF-IDF: {ti}\n")

# compute TF-IDF weights
# to improve readability weights are multiplied with 10000 and rounded
W = [[] for _ in range(nf)]
for i in range(nf):
    W[i] = [(key, round(10000*count/CS[i]*math.log(nf/TS[key])))
            for key, count in CL[i].items()]
    print(i,'/',CS[i],':',Ffiles[i],'\n',Counter(dict(W[i])).most_common(20))

tf = datetime.datetime.now()
print(f"END: {tf}\n")
>>> RESTART: C:\...\test\friends.pyw Friends 0.1 START: 2021-06-20 04:02:45.572279 COUNT: 2021-06-20 04:02:45.592280 0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt [('yeah', 39), ('know', 33), ('okay', 32), ('oh', 30), ('well', 25), ('think', 24), ('get', 23), ('ross', 21), ('right', 21), ('hey', 19), ('go', 19), ('really', 18), ('talk', 17), ('joey', 17), ('say', 17), ('good', 17), ('mean', 15), ('like', 14), ('mike', 13), ('charlie', 13)] 1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt [('know', 42), ('oh', 29), ('okay', 23), ('ross', 23), ('get', 21), ('yeah', 20), ('think', 20), ('well', 17), ('hey', 16), ('adopt', 16), ('one', 15), ('would', 15), ('fine', 15), ('go', 14), ('like', 14), ('see', 13), ('right', 13), ('god', 13), ('say', 11), ('want', 10)] 2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt [('get', 35), ('okay', 32), ('yeah', 24), ('know', 23), ('oh', 21), ('well', 20), ('right', 19), ('like', 17), ('chandler', 15), ('go', 14), ('hey', 13), ('look', 13), ('could', 13), ('one', 12), ('back', 12), ('two', 12), ('say', 11), ('really', 11), ('mean', 11), ('sorry', 11)] 3 / 1457 : Friends - [10x04] - The One with the Cake.srt [('emma', 45), ('get', 37), ('go', 33), ('okay', 32), ('oh', 27), ('hey', 24), ('birthday', 23), ('know', 20), ('well', 19), ('one', 19), ('cake', 19), ('right', 17), ('yeah', 14), ('come', 13), ('could', 13), ('guy', 12), ('party', 11), ('like', 11), ('joey', 11), ('let', 11)] 4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt [('know', 33), ('oh', 23), ('get', 22), ('well', 21), ('okay', 20), ('yeah', 20), ('think', 18), ('guy', 18), ('want', 18), ('hi', 16), ('right', 15), ('like', 15), ('would', 14), ('go', 14), ('amy', 14), ('baby', 13), ('love', 13), ('god', 13), ('propose', 13), ('help', 12)] 5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt [('oh', 33), ('get', 28), ('know', 22), ('like', 19), ('well', 19), ('hey', 18), ('think', 18), ('okay', 
18), ('lie', 18), ('right', 17), ('yeah', 16), ('go', 15), ('want', 14), ('say', 14), ('give', 14), ('tell', 14), ('one', 12), ('sorry', 12), ('watch', 12), ('really', 11)] .......... 232 / 1455 : Friends - [9x22] - The One With The Donor.srt [('know', 27), ('get', 26), ('okay', 23), ('well', 21), ('go', 21), ('-l', 20), ('want', 20), ('guy', 20), ('right', 17), ('like', 17), ('think', 17), ('oh', 15), ('see', 15), ('yeah', 14), ('really', 13), ('good', 12), ('mean', 12), ('great', 11), ('sperm', 11), ('lt', 11)] 233 / 1758 : Friends - [9x23] - The One in Barbados.srt [('go', 33), ('know', 30), ('get', 28), ('-i', 22), ('mike', 22), ('oh', 22), ('phoebe', 22), ('say', 21), ('okay', 21), ('weii', 20), ('aii', 19), ('want', 19), ('david', 19), ('right', 15), ('yeah', 15), ('reaiiy', 15), ('couid', 15), ('-yeah', 14), ('hey', 13), ('guy', 13)] 234 / 1270 : Friends - [9x24] - The One in Barbados.srt [('know', 28), ('okay', 24), ('go', 18), ('-i', 18), ('right', 17), ('get', 17), ('think', 16), ('-oh', 15), ('mean', 14), ('aii', 14), ('oh', 14), ('-no', 14), ('weii', 11), ('say', 11), ('-you', 11), ('iike', 11), ('-yeah', 10), ('wouid', 9), ('-weii', 9), ('time', 9)] -1 / 268819 : Total [('get', 5629), ('know', 5263), ('go', 4223), ('okay', 3257), ('right', 2931), ('oh', 2927), ('want', 2610), ('think', 2457), ('like', 2433), ('say', 2432), ('come', 2161), ('one', 2156), ('guy', 2142), ('see', 2031), ('well', 1991), ('yeah', 1943), ('look', 1795), ('really', 1589), ('make', 1560), ('good', 1552)] TF-IDF: 2021-06-20 04:14:55.765044
The complete listing of the top 20 frequencies.
0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt [('charlie', 310), ('mike', 192), ('salon', 160), ('wall', 126), ('barbados', 107), ('precious', 106), ('susie', 96), ('nonchalant', 80), ('glimpse', 80), ('cornrows', 80), ('scalp', 70), ('grandma', 68), ('shampoo', 54), ('propose', 53), ('kiss', 52), ('appreciate', 46), ('girlfriend', 44), ('emma', 42), ('switch', 41), ('pushup', 40)] 1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt [('adopt', 442), ('owen', 234), ('fajitas', 223), ('frank', 220), ('charlie', 187), ('triplet', 166), ('colleen', 134), ('adoption', 120), ('jr.', 103), ('batch', 100), ('margarita', 90), ('carriage', 89), ('leslie', 80), ('frog', 78), ('intuitive', 78), ('chill', 71), ('flan', 71), ('barbados', 60), ('mitt', 60), ('alice', 57)] 2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt [('amanda', 382), ('tan', 207), ('bra', 169), ('spray-on', 144), ('sprayed', 144), ('scrappy', 144), ('mississippi', 143), ('spray', 135), ('pedicure', 125), ('mobile', 125), ('count', 112), ('pat', 96), ('monica.', 96), ('1992.', 96), ('dodge', 84), ('power', 81), ('accident', 81), ('british', 76), ('soldier', 76), ('popped', 71)] 3 / 1457 : Friends - [10x04] - The One with the Cake.srt [('emma', 588), ('birthday', 254), ('cake', 252), ('vermont', 145), ('maxim', 112), ('prepared', 111), ('bunny', 106), ('frost', 84), ('race', 79), ('8th', 76), ('bakery', 76), ('monologue', 75), ('party', 73), ('audition', 73), ('robot', 67), ('awake', 65), ('naptime', 65), ('testicle', 65), ('forth', 65), ('nighty-night', 65)] 4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt [('amy', 372), ('ella', 264), ('propose', 217), ('b.', 188), ('anniversary', 164), ('thesaurus', 151), ('myron', 151), ('letter', 145), ('recommendation', 131), ('falafel', 113), ('emma', 105), ('screen', 97), ('knicks', 82), ('one-year', 75), ('smart', 75), ('scoreboard.', 75), ('ask.', 75), ('full-sized', 75), ('aortic', 75), 
('brockovich', 75)] 5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt [('gladys', 350), ('grant', 233), ('benji', 195), ('tape', 170), ('charlie', 163), ('lie', 157), ('boss', 155), ('hobart', 136), ('boscodictiasaur', 117), ('benjamin', 102), ('ex-boyfriend', 93), ('nobel', 93), ('proposal', 82), ('chameleon', 78), ('likewise', 78), ('li', 78), ('biely', 78), ('ichiban', 78), ('men.', 78), ('reilly', 78)] .......... 232 / 1455 : Friends - [9x22] - The One With The Donor.srt [('-l', 266), ('zack', 263), ('sperm', 239), ('lt', 144), ('donor', 131), ('keynote', 120), ('surrogacy', 113), ('beet', 113), ('charlie', 112), ('speaker', 106), ('mike', 97), ('lf', 92), ('insemination', 75), ('sherman', 75), ('-barbados', 75), ('shopping', 74), ('slutty', 70), ('inhospitable', 66), ('ravioli', 66), ('paleontology', 65)] 233 / 1758 : Friends - [9x23] - The One in Barbados.srt [('david', 321), ('mike', 253), ('weii', 204), ('aii', 181), ('speech', 156), ('reaiiy', 153), ('couid', 151), ('iove', 117), ('wiii', 116), ('iike', 114), ('propose', 110), ('advice', 108), ('-weii', 102), ('thong', 100), ('meddie', 93), ('pharmacist', 88), ('rain', 86), ('chandier', 84), ('barbados', 83), ('shouid', 83)] 234 / 1270 : Friends - [9x24] - The One in Barbados.srt [('aii', 185), ('weii', 156), ('iike', 145), ('-weii', 141), ('wouid', 127), ('homo', 121), ('iook', 115), ('erectus', 113), ('ping-pong', 113), ('piay', 110), ('reaiiy', 99), ('pooi', 87), ('shake', 87), ('herbivore', 86), ('carnivore', 86), ('erectus.', 86), ('forfeit', 86), ('comeback', 86), ('41', 86), ('iittie', 85)] >>>
Complete listing of TF-IDF top 20.
-barbados
, … Should such special characters be replaced with a space, or removed?
i = 3; makeWordCloud(dict(W[i]))
i = 7; makeWordCloud(dict(W[i]))
makeWordCloud(dict(W[141]))
Friends 003 - The One with the Cake
Friends 007 - The One with the Late Thanksgiving
Friends 141 - The One Where Joey Loses His Insurance
Unfortunately, for the vector formats (EPS, PDF, SVG) only a bitmap picture is saved.
# Write the TF-IDF word cloud of episode 141 to a 1200x800 PNG file.
saveWordCloud(
    dict(W[141]),
    "Friends-141.png",
    width=1200,
    height=800,
)
It is now easy to save word clouds for all episodes as PNG files in the subdirectory png
and to inspect them using a picture viewer (e.g. IrfanView).
# Save one 900x600 PNG word cloud per episode as ./png/Friends-NNN.png.
# Nothing else creates the target directory, so make sure it exists first.
os.makedirs("./png", exist_ok=True)
for i, weights in enumerate(W):
    wcfile = "./png/Friends-"+str(i).zfill(3)+".png"
    if i % 10 == 0:
        print(i)  # progress indicator every 10 episodes
    saveWordCloud(dict(weights), wcfile, width=900, height=600)
The word cloud can also be saved as real vector graphics in SVG format:
# Write the TF-IDF word cloud of episode 141 to a 900x600 SVG file.
svgWordCloud(
    dict(W[141]),
    "Friends-141.svg",
    width=900,
    height=600,
)
In a web browser it displays fine. I still have problems converting it to PDF using Inkscape.