====== Friends ====== [[http://www.tvsubtitles.net/tvshow-65-1.html|Friends subtitles for all seasons]]. Download the SRT files. ===== Program in Python ===== # TF-IDF word clouds for Friends episodes wdir = "C:/.../term/test" ddir = "C:/.../term/Friends" import sys, os, re, datetime, csv, json, shutil, time, math from os import listdir from collections import Counter from wordcloud import WordCloud import matplotlib.pyplot as plt import fontTools os.chdir(wdir) import nltk from nltk.stem import WordNetLemmatizer from nltk.corpus import wordnet from nltk.corpus import stopwords def get_wordnet_pos(word): """Map POS tag to first character lemmatize() accepts""" tag = nltk.pos_tag([word])[0][1][0].upper() tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV} return tag_dict.get(tag, wordnet.NOUN) # get frequencies for the episode on file Ffile = Ffiles[fix] def getFreqs(Ffile): with open(ddir+"/"+Ffile,newline='',encoding="utf-8") as infile: lines = infile.readlines() S = [ s.lower() for s in [ l.strip() for l in lines ] if not((len(s)==0) or s.isnumeric() or ("-->" in s)) ] SS = ' '.join(S) L = [ lemmatizer.lemmatize(w, get_wordnet_pos(w)) for\ w in nltk.word_tokenize(SS)] K = [w for w in L if w not in new_stopwords] return Counter(K) # draw a word cloud for frequency dictionary D = dict(W[i]) def makeWordCloud(D): wc = WordCloud(background_color="white", max_words=200) wc.generate_from_frequencies(D) plt.imshow(wc, interpolation="bilinear") plt.axis("off") plt.show() # save a word cloud for frequency dictionary D = dict(W[i]) in selected # format determined by the file extension # { 'pdf', 'eps', 'svg', 'png', 'jpg' } def saveWordCloud(D,wcfile,width=600,height=400,max_words=200): wc = WordCloud(background_color="white",width=width,height=height,\ max_words=max_words) wc.generate_from_frequencies(D) wc.to_file(wcfile) # save a word cloud for frequency dictionary D = dict(W[i]) in SVG def 
svgWordCloud(D,wcfile,width=600,height=400,max_words=200): wc = WordCloud(background_color="white",width=width,height=height,\ max_words=max_words) wc.generate_from_frequencies(D) f = open(wcfile,"w+") f.write(wc.to_svg(embed_font=True)) f.close() # ----- version = "Friends 0.1" print(version) ts = datetime.datetime.now() print('{0}: {1}\n'.format("START",ts)) lemmatizer = WordNetLemmatizer() stop_words = set(stopwords.words("english")) # add stop words that aren't in the NLTK stopwords list add_words = ['!', '?', ',', ':', '&', '%', '.', '’', '(', ')', '[', ']',\ "...", "-", "--", "'d", "'m", "'s", "'re", "'ve", "'ll",\ "....", "<", ">", "/i", "``", "''", "n't", "ca", "gon", "na",\ "u", "l", "wh" ] new_stopwords = stop_words.union(add_words) # list of Friends episode files Ffiles = [f for f in os.listdir(ddir) if f.endswith(".srt")] tr = datetime.datetime.now() print('{0}: {1}\n'.format("COUNT",tr)) # count frequencies nf = len(Ffiles); CL = [[]]*nf; CS = [0]*nf T = Counter(); TS = Counter() for fix in range(nf): CL[fix] = getFreqs(Ffiles[fix]) CS[fix] = sum(CL[fix].values()) T += CL[fix] TS += Counter(set(CL[fix])) print(fix,'/',CS[fix],':',Ffiles[fix],'\n',CL[fix].most_common(20)) CT = sum(T.values()) print(-1,'/',CT,':','Total','\n',T.most_common(20)) ti = datetime.datetime.now() print('{0}: {1}\n'.format("TF-IDF",ti)) # compute TF-IDF weights # to improve readability weights are multiplied with 10000 and rounded W = [ [] for i in range(nf) ] for i in range(nf): W[i] = [ (key, round(10000*CL[i][key]/CS[i]*math.log(nf/TS[key])))\ for key in CL[i] ] print(i,'/',CS[i],':',Ffiles[i],'\n',Counter(dict(W[i])).most_common(20)) tf = datetime.datetime.now() print('{0}: {1}\n'.format("END",tf)) ===== Computing frequencies for each episode ===== >>> RESTART: C:\...\test\friends.pyw Friends 0.1 START: 2021-06-20 04:02:45.572279 COUNT: 2021-06-20 04:02:45.592280 0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt [('yeah', 39), ('know', 33), ('okay', 32), 
('oh', 30), ('well', 25), ('think', 24), ('get', 23), ('ross', 21), ('right', 21), ('hey', 19), ('go', 19), ('really', 18), ('talk', 17), ('joey', 17), ('say', 17), ('good', 17), ('mean', 15), ('like', 14), ('mike', 13), ('charlie', 13)] 1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt [('know', 42), ('oh', 29), ('okay', 23), ('ross', 23), ('get', 21), ('yeah', 20), ('think', 20), ('well', 17), ('hey', 16), ('adopt', 16), ('one', 15), ('would', 15), ('fine', 15), ('go', 14), ('like', 14), ('see', 13), ('right', 13), ('god', 13), ('say', 11), ('want', 10)] 2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt [('get', 35), ('okay', 32), ('yeah', 24), ('know', 23), ('oh', 21), ('well', 20), ('right', 19), ('like', 17), ('chandler', 15), ('go', 14), ('hey', 13), ('look', 13), ('could', 13), ('one', 12), ('back', 12), ('two', 12), ('say', 11), ('really', 11), ('mean', 11), ('sorry', 11)] 3 / 1457 : Friends - [10x04] - The One with the Cake.srt [('emma', 45), ('get', 37), ('go', 33), ('okay', 32), ('oh', 27), ('hey', 24), ('birthday', 23), ('know', 20), ('well', 19), ('one', 19), ('cake', 19), ('right', 17), ('yeah', 14), ('come', 13), ('could', 13), ('guy', 12), ('party', 11), ('like', 11), ('joey', 11), ('let', 11)] 4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt [('know', 33), ('oh', 23), ('get', 22), ('well', 21), ('okay', 20), ('yeah', 20), ('think', 18), ('guy', 18), ('want', 18), ('hi', 16), ('right', 15), ('like', 15), ('would', 14), ('go', 14), ('amy', 14), ('baby', 13), ('love', 13), ('god', 13), ('propose', 13), ('help', 12)] 5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt [('oh', 33), ('get', 28), ('know', 22), ('like', 19), ('well', 19), ('hey', 18), ('think', 18), ('okay', 18), ('lie', 18), ('right', 17), ('yeah', 16), ('go', 15), ('want', 14), ('say', 14), ('give', 14), ('tell', 14), ('one', 12), ('sorry', 12), ('watch', 12), ('really', 11)] .......... 
232 / 1455 : Friends - [9x22] - The One With The Donor.srt [('know', 27), ('get', 26), ('okay', 23), ('well', 21), ('go', 21), ('-l', 20), ('want', 20), ('guy', 20), ('right', 17), ('like', 17), ('think', 17), ('oh', 15), ('see', 15), ('yeah', 14), ('really', 13), ('good', 12), ('mean', 12), ('great', 11), ('sperm', 11), ('lt', 11)] 233 / 1758 : Friends - [9x23] - The One in Barbados.srt [('go', 33), ('know', 30), ('get', 28), ('-i', 22), ('mike', 22), ('oh', 22), ('phoebe', 22), ('say', 21), ('okay', 21), ('weii', 20), ('aii', 19), ('want', 19), ('david', 19), ('right', 15), ('yeah', 15), ('reaiiy', 15), ('couid', 15), ('-yeah', 14), ('hey', 13), ('guy', 13)] 234 / 1270 : Friends - [9x24] - The One in Barbados.srt [('know', 28), ('okay', 24), ('go', 18), ('-i', 18), ('right', 17), ('get', 17), ('think', 16), ('-oh', 15), ('mean', 14), ('aii', 14), ('oh', 14), ('-no', 14), ('weii', 11), ('say', 11), ('-you', 11), ('iike', 11), ('-yeah', 10), ('wouid', 9), ('-weii', 9), ('time', 9)] -1 / 268819 : Total [('get', 5629), ('know', 5263), ('go', 4223), ('okay', 3257), ('right', 2931), ('oh', 2927), ('want', 2610), ('think', 2457), ('like', 2433), ('say', 2432), ('come', 2161), ('one', 2156), ('guy', 2142), ('see', 2031), ('well', 1991), ('yeah', 1943), ('look', 1795), ('really', 1589), ('make', 1560), ('good', 1552)] TF-IDF: 2021-06-20 04:14:55.765044 The complete listing of [[.:friends:freq|Frequencies top 20]]. 
===== Computing TF-IDF weights ===== 0 / 1369 : Friends - [10x01] - The One After Joey and Rachel Kiss.srt [('charlie', 310), ('mike', 192), ('salon', 160), ('wall', 126), ('barbados', 107), ('precious', 106), ('susie', 96), ('nonchalant', 80), ('glimpse', 80), ('cornrows', 80), ('scalp', 70), ('grandma', 68), ('shampoo', 54), ('propose', 53), ('kiss', 52), ('appreciate', 46), ('girlfriend', 44), ('emma', 42), ('switch', 41), ('pushup', 40)] 1 / 1224 : Friends - [10x02] - The One Where Ross Is Fine.srt [('adopt', 442), ('owen', 234), ('fajitas', 223), ('frank', 220), ('charlie', 187), ('triplet', 166), ('colleen', 134), ('adoption', 120), ('jr.', 103), ('batch', 100), ('margarita', 90), ('carriage', 89), ('leslie', 80), ('frog', 78), ('intuitive', 78), ('chill', 71), ('flan', 71), ('barbados', 60), ('mitt', 60), ('alice', 57)] 2 / 1141 : Friends - [10x03] - The One with Ross's Tan.srt [('amanda', 382), ('tan', 207), ('bra', 169), ('spray-on', 144), ('sprayed', 144), ('scrappy', 144), ('mississippi', 143), ('spray', 135), ('pedicure', 125), ('mobile', 125), ('count', 112), ('pat', 96), ('monica.', 96), ('1992.', 96), ('dodge', 84), ('power', 81), ('accident', 81), ('british', 76), ('soldier', 76), ('popped', 71)] 3 / 1457 : Friends - [10x04] - The One with the Cake.srt [('emma', 588), ('birthday', 254), ('cake', 252), ('vermont', 145), ('maxim', 112), ('prepared', 111), ('bunny', 106), ('frost', 84), ('race', 79), ('8th', 76), ('bakery', 76), ('monologue', 75), ('party', 73), ('audition', 73), ('robot', 67), ('awake', 65), ('naptime', 65), ('testicle', 65), ('forth', 65), ('nighty-night', 65)] 4 / 1450 : Friends - [10x05] - The One Where Rachel's Sister Babysits.srt [('amy', 372), ('ella', 264), ('propose', 217), ('b.', 188), ('anniversary', 164), ('thesaurus', 151), ('myron', 151), ('letter', 145), ('recommendation', 131), ('falafel', 113), ('emma', 105), ('screen', 97), ('knicks', 82), ('one-year', 75), ('smart', 75), ('scoreboard.', 75), ('ask.', 75), 
('full-sized', 75), ('aortic', 75), ('brockovich', 75)] 5 / 1403 : Friends - [10x06] - The One with Ross' Grant.srt [('gladys', 350), ('grant', 233), ('benji', 195), ('tape', 170), ('charlie', 163), ('lie', 157), ('boss', 155), ('hobart', 136), ('boscodictiasaur', 117), ('benjamin', 102), ('ex-boyfriend', 93), ('nobel', 93), ('proposal', 82), ('chameleon', 78), ('likewise', 78), ('li', 78), ('biely', 78), ('ichiban', 78), ('men.', 78), ('reilly', 78)] .......... 232 / 1455 : Friends - [9x22] - The One With The Donor.srt [('-l', 266), ('zack', 263), ('sperm', 239), ('lt', 144), ('donor', 131), ('keynote', 120), ('surrogacy', 113), ('beet', 113), ('charlie', 112), ('speaker', 106), ('mike', 97), ('lf', 92), ('insemination', 75), ('sherman', 75), ('-barbados', 75), ('shopping', 74), ('slutty', 70), ('inhospitable', 66), ('ravioli', 66), ('paleontology', 65)] 233 / 1758 : Friends - [9x23] - The One in Barbados.srt [('david', 321), ('mike', 253), ('weii', 204), ('aii', 181), ('speech', 156), ('reaiiy', 153), ('couid', 151), ('iove', 117), ('wiii', 116), ('iike', 114), ('propose', 110), ('advice', 108), ('-weii', 102), ('thong', 100), ('meddie', 93), ('pharmacist', 88), ('rain', 86), ('chandier', 84), ('barbados', 83), ('shouid', 83)] 234 / 1270 : Friends - [9x24] - The One in Barbados.srt [('aii', 185), ('weii', 156), ('iike', 145), ('-weii', 141), ('wouid', 127), ('homo', 121), ('iook', 115), ('erectus', 113), ('ping-pong', 113), ('piay', 110), ('reaiiy', 99), ('pooi', 87), ('shake', 87), ('herbivore', 86), ('carnivore', 86), ('erectus.', 86), ('forfeit', 86), ('comeback', 86), ('41', 86), ('iittie', 85)] >>> Complete listing of [[.:friends:tfidf|TF-IDF top 20]]. ===== Notes ===== - there are still some irregularities in the tables. For example, words starting with - : ''-barbados'', ... Replace such special characters with space or remove them? 
- it seems that the data were obtained with an OCR tool that had problems distinguishing between capital I and lowercase l (after lowercasing, a misread ll shows up as ii) : reaiiy -> really, iittie -> little, weii -> well, wiii -> will, .... Correct them, or find some other source? - the language used is colloquial rather than literary. Some kind of normalization? ===== Word clouds ===== ==== Displaying word cloud on the screen ==== i = 3; makeWordCloud(dict(W[i])) i = 7; makeWordCloud(dict(W[i])) makeWordCloud(dict(W[141])) Friends 003 - The One with the Cake {{notes:text:pics:friends003.png}} Friends 007 - The One with the Late Thanksgiving {{notes:text:pics:friends007.png}} Friends 141 - The One Where Joey Loses His Insurance {{notes:text:pics:friends141.png}} ==== Saving word cloud to file ==== Unfortunately, for vector formats (EPS, PDF, SVG) only a bitmap picture is saved. saveWordCloud(dict(W[141]),"Friends-141.png",width=1200,height=800) {{notes:text:pics:friends-141.png?900}} It is now easy to produce word clouds for all episodes in a subdirectory ''PNG'' and inspect them using a picture viewer (e.g. IrfanView). for i in range(nf): wcfile = "./png/Friends-"+str(i).zfill(3)+".png" if(i % 10 == 0): print(i) saveWordCloud(dict(W[i]),wcfile,width=900,height=600) ==== Saving word cloud to SVG file ==== The word cloud can be saved as **REAL** vector graphics in SVG format svgWordCloud(dict(W[141]),"Friends-141.svg",width=900,height=600) In a web browser, it displays OK. I still have problems converting it to PDF using Inkscape. ===== Analysis with the alternative subtitles files ===== Complete listing of [[.:friends:tfidf2|TF-IDF top 20]]. 
{{notes:text:pics:friends2-123.png}} {{notes:text:pics:friends2-071.png?500}} {{notes:text:pics:friends2-154.png?500}} {{notes:text:pics:friends2-228.png?500}} {{notes:text:pics:friends2-233.png?500}} ===== Some links ===== - https://github.com/amueller/word_cloud/ - https://www.python-course.eu/python_wordcloud_tutorial.php - https://www.codegrepper.com/code-examples/python/counter+python+collections+get+top+10