4. May 2012
In July 2011 I collected the title + inventors info for each US patent from the NBER 2006 list. There was a mistake in the data collection program - not all titles were completely extracted (apparently because the title was taken from a single line of the returned HTML page, so titles continuing on the following lines were truncated).
# Collecting title + inventors info from US patent Office
# http://patft.uspto.gov/netahtml/PTO/srchnum.htm
# Vladimir Batagelj, July 22, 2011

# fetch a patent page, retrying up to 10 times with a 1 minute pause
readURL <- function(page,repo,save){
  e <- NULL
  for(a in 1:10){
    stran <- tryCatch(readLines(con<-url(page),warn=FALSE,n=80),
      error = function(e) e, finally=close(con))
    ok <- class(stran)=="character"
    if(ok) return(stran)
    if(a<6) cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n',file=repo)
    if(a<10) {
      cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n')
      flush.console()
    }
    Sys.sleep(60)
  }
  cat("Problems on the Internet ...\nClosing",date(),'\n',file=repo)
  cat("Problems on the Internet ...\nClosing",date(),'\n')
  close(repo); close(save); stop("Too many retries")
}

setwd("D:/Data/nber/titles")
repo <- file("report-3200Mb.txt","w")      # run report
save <- file("titles-3200Mb.dat","w")      # collected titles + inventors
nums <- read.csv("nber06.nam",sep=' ',header=FALSE,skip=1,
  stringsAsFactors=FALSE)$V2               # patent numbers from the NBER list
url1 <- 'http://patft.uspto.gov/netacgi/nph-Parser?Sect2=PTO1&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&d=PALL&RefSrch=yes&Query=PN%2F'
k <- 3144511; K <- 3200000                 # range of patents in this run
cat('% NBER - patent titles\n% started at,',k,'-',K,':',date(),'\n\n',file=repo)
while(k < K){
  if(k %% 100==0) {                        # progress report
    if(k %% 5000==0) {
      cat('\n',k,' ',date(),' ',sep=''); flush.console()
      cat(k,' ',date(),'\n',sep='',file=repo)
    }
    cat('.'); flush.console()
  }
  k <- k+1; patNum <- nums[k]
  stran <- readURL(paste(url1,patNum,sep=''),repo,save)
  # the title is on the line containing <font size="+1">
  it <- grep('<font size=\"+1\">',stran,fixed=TRUE)
  if (length(it)>0){
    ti <- it[1]; str <- stran[ti]; lt <- nchar(str)
    jt <- regexpr('> ',str); tit <- substr(str,jt+2,lt)
    cat(k,'=',patNum,'="',tit,'"\n',sep='',file=save)
  } else {
    cat(k,patNum,' - missing title\n',file=repo)
  }
  # the inventors are on the line following >Inventors:
  ii <- grep('>Inventors:',stran,fixed=TRUE)
  if (length(ii)>0){
    ti <- ii[1]; str <- stran[ti+1]; lt <- nchar(str)
    jt <- regexpr('</TD>',str); inv <- substr(str,1,jt-1)
    cat(inv,'\n',sep='',file=save)
  } else {
    cat(k,patNum,' - missing inventors\n',file=repo)
  }
}
close(repo); close(save)
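Each successfully processed patent contributes two lines to titles-3200Mb.dat: the title record, and the inventors line cut off at </TD> (still carrying its HTML tags). A sketch of one record, with hypothetical values:

3144512=7654321="Some patent title"
 <TD ALIGN="LEFT" WIDTH="90%"> <B>Doe; John A.</B> (Boston, MA)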
Because of this mistake in the 2011 collection of the additional US patents data, I decided to repeat the collection, this time gathering the "headers" - the part of each patent's description from the title up to the "References Cited" field. In the listing below the bounds are set for a small test run (k = 1, K = 10); for the real runs they were adjusted to the part of the list being collected.
# Collecting header data from US patent Office, adapted from
# Collecting title + inventors info from US patent Office
# http://patft.uspto.gov/netahtml/PTO/srchnum.htm
# Vladimir Batagelj, July 22, 2011
# April 15, 2012

# fetch a patent page (the first nr lines; nr=-1 reads the whole page),
# retrying up to 10 times with a 1 minute pause
readURL <- function(page,repo,save,nr=150){
  e <- NULL
  for(a in 1:10){
    stran <- tryCatch(readLines(con<-url(page),warn=FALSE,n=nr),
      error = function(e) e, finally=close(con))
    ok <- class(stran)=="character"
    if(ok) return(stran)
    if(a<6) cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n',file=repo)
    if(a<10) {
      cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n')
      flush.console()
    }
    Sys.sleep(60)
  }
  cat("Problems on the Internet ...\nClosing",date(),'\n',file=repo)
  cat("Problems on the Internet ...\nClosing",date(),'\n')
  close(repo); close(save); stop("Too many retries")
}

setwd("E:/Data/nber/titles")
repo <- file("report-1M.txt","w")
save <- file("titles-1M.dat","w")
nums <- read.csv("nber06.nam",sep=' ',header=FALSE,skip=1,
  stringsAsFactors=FALSE)$V2
url1 <- 'http://patft.uspto.gov/netacgi/nph-Parser?Sect2=PTO1&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&d=PALL&RefSrch=yes&Query=PN%2F'
k <- 1; K <- 10                            # test range; adjusted for real runs
cat('% NBER - patent titles\n% started at,',k,'-',K,':',date(),'\n\n',file=repo)
while(k <= K){
  if(k %% 100==0) {                        # progress report
    if(k %% 5000==0) {
      cat('\n',k,' ',date(),' ',sep=''); flush.console()
      cat(k,' ',date(),'\n',sep='',file=repo)
    }
    cat('.'); flush.console()
  }
  patNum <- nums[k]
  stran <- readURL(paste(url1,patNum,sep=''),repo,save)
  it <- grep('<font size=\"+1\">',stran,fixed=TRUE)
  if (length(it)>0){
    jt <- grep('References Cited',stran,fixed=TRUE)
    if (length(jt)<1){
      # header longer than 150 lines - re-read the whole page
      # cat(k,patNum,' - long record\n',sep='',file=repo)
      stran <- readURL(paste(url1,patNum,sep=''),repo,save,-1)
      jt <- grep('References Cited',stran,fixed=TRUE)
    }
    if (length(jt)>0){
      # save the header: all lines from the title to References Cited
      cat(k,'=',patNum,'=',' ',stran[it[1]:(jt[1]-1)],'\n',sep='',file=save)
    } else {
      cat(k,patNum,' - record length\n',file=repo)
    }
  } else {
    cat(k,patNum,' - missing title\n',file=repo)
  }
  k <- k+1
}
close(repo); close(save)
Reports of data collection runs
The collected data are stored in the files headers-*.dat (headers-200.dat, headers-400.dat, ..., headers-3250.dat; see the vector parts in the extraction program below).
On April 16-21, 2012 I collected the headers from the US patents site. For each patent the header is a single string with the following structure:
199997=4130810= <font size="+1"> Solid state power combiner</font> <BR><BR><CENTER><B>Abstract</B></CENTER><P> An improved solid state transmitter (and elements therefor) adapted particularly well to pulsed operation at radio frequencies is disclosed. Such transmitter includes the combination of: A crystal-controlled oscillator producing a continuous wave output signal which, ultimately, determines the frequency of each transmitted pulse; a first oscillatory circuit, including a resonant cavity and at least one normally quiescent coaxial oscillator incorporating an IMPATT diode; a second oscillatory circuit, including a resonant cavity and a plurality of normally quiescent coaxial oscillators, each one of such oscillators incorporating an IMPATT diode; and an improved modulator for periodically actuating all of the IMPATT diodes in such a manner that a pulsed output of the first oscillatory circuit is produced which remains locked to the then existing continuous wave signal out of the crystal-controlled oscillator and the pulsed outputs of the coaxial oscillators in the second oscillatory circuit similarly are locked.The improved elements disclosed, in addition to the improved modulator, include various configurations of coaxial oscillators which are easier to align than known oscillators of such type or which allow a greater number of such oscillators to be coupled to a resonant cavity than was possible according to the prior art. Finally, an improved probe and tuning arrangement for a resonant cavity is disclosed. </P><HR> <TABLE WIDTH="100%"> <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Inventors: </TD> <TD ALIGN="LEFT" WIDTH="90%"> <B>Wallace; Ronald M.</B> (Braintree, MA) </TD></TR> <TR> <TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Assignee:</TD> <TD ALIGN="LEFT" WIDTH="90%"><B>Raytheon Company</B> (Lexington, MA)<BR></TD></TR> <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%" NOWRAP>Appl. No.:</TD> <TD ALIGN="LEFT" WIDTH="90%"> <B> 05/814,744</B></TD></TR> <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Filed: </TD> <TD ALIGN="LEFT" WIDTH="90%"> <B>June 30, 1977</B></TD></TR> </TABLE><HR><p> <TABLE WIDTH="100%"> <TR><TD VALIGN=TOP ALIGN="LEFT" WIDTH="40%"><B>Current U.S. Class:</B></TD> <TD VALIGN=TOP ALIGN="RIGHT" WIDTH="80%"><B>331/107P</B> ; 331/56</TD></TR> <TR><TD VALIGN=TOP ALIGN="LEFT" WIDTH="40%"><B>Current International Class: </B></TD> <TD VALIGN=TOP ALIGN="RIGHT" WIDTH="80%">H03B 9/00 (20060101); H03B 9/14 (20060101); H03B 007/06 ()</TD></TR> <TR><TD VALIGN=TOP ALIGN="LEFT" WIDTH="40%"><B>Field of Search: </B></TD> <TD ALIGN="RIGHT" VALIGN="TOP" WIDTH="80%"> 331/56,17R </TD></TR> </TABLE>
Here is a test version of the extraction (title, abstract, inventors) program:
setwd("F:/Nber") dat <- file("test.dat","r") repeat { line <- readLines(dat,n=1) if(length(line)==0) break lt <- regexpr('<font size=',line); rt <- regexpr('</font>',line) head <- substr(line,1,lt-1); tit <- substr(line,lt+16,rt-1) cat(head,' : ',tit,'\n') la <- regexpr('<B>Abstract</B>',line); ra <- regexpr('</P><HR>',line) abstract <- substr(line,la+27,ra-1) cat(abstract,'\n') rest <- substr(line,ra+33,nchar(line)) li <- regexpr('Inventors: </TD>',rest); ri <- regexpr('</TR>',rest) invent <- substr(rest,li+46,ri-6) cat(invent,'\n') } close(dat)
And here is the version that goes through all the collected data files:
cat('% patent data extraction\n% ',date(),'\n\n')
setwd("G:/Nber")
ti <- file("titles.dat","w")
ab <- file("abstracts.dat","w")
iv <- file("inventors.dat","w")
k <- 0
parts <- c( 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800,
  2000, 2250, 2500, 2750, 3000, 3250 )
for(num in parts){
  name <- paste("headers-",num,".dat",sep="")
  dat <- file(name,"r"); cat("\n\n***** ",name,"\n")
  repeat {
    line <- readLines(dat,n=1)
    if(length(line)==0) break
    if(k %% 1000==0) {
      if(k %% 50000==0) {cat('\n',k,' ',date(),'\n',sep=''); flush.console()}
      cat('.'); flush.console()
    }
    k <- k+1
    lt <- regexpr('<font size=',line); rt <- regexpr('</font>',line)
    head <- substr(line,1,lt-1); tit <- substr(line,lt+16,rt-1)
    cat(head,tit,'\n',file=ti)
    if(nchar(tit)==0) cat('\n',head," *** empty title\n")
    la <- regexpr('<B>Abstract</B>',line); ra <- regexpr('</P><HR>',line)
    abstract <- substr(line,la+27,ra-1)
    cat(head,abstract,'\n',file=ab)
    if(nchar(abstract)==0) cat('\n',head," *** empty abstract\n")
    rest <- substr(line,ra+33,nchar(line))
    li <- regexpr('Inventors: </TD>',rest); ri <- regexpr('</TR>',rest)
    invent <- substr(rest,li+46,ri-6)
    cat(head,invent,'\n',file=iv)
    if(nchar(invent)==0) cat('\n',head," *** empty inventors\n")
  }
  close(dat)
}
cat('% job completed\n% ',date(),'\n\n')
close(ti); close(ab); close(iv)
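For the example header shown above, the corresponding line written to titles.dat has the form given below (the prefix 199997=4130810= is the head kept by the program; the same prefix starts the matching lines in abstracts.dat and inventors.dat):

199997=4130810=  Solid state power combiner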
We can use the titles to obtain a description of a patent by "keywords". I produced two such descriptions. In the first, the delimiter is the space character:
>>> title = 'Wheel spinning and vehicle conveying apparatus for automatic wheel washers'
>>> names = [ x for x in title.split(" ") if len(x)>0 ]
>>> names
['Wheel', 'spinning', 'and', 'vehicle', 'conveying', 'apparatus', 'for', 'automatic', 'wheel', 'washers']
>>> title = "Method for preparation of (s)-(+)-and(r)-(-)10,11-dihydro-10-hydrodoxy-5h-dibenz/b,f/azephine-5-car- boxamide "
>>> names = [ x for x in title.split(" ") if len(x)>0 ]
>>> names
['Method', 'for', 'preparation', 'of', '(s)-(+)-and(r)-(-)10,11-dihydro-10-hydrodoxy-5h-dibenz/b,f/azephine-5-car-', 'boxamide']
In the second, the words are delimited by any run of non-alphanumeric characters:
>>> import re
>>> sp = re.compile(r'\W+')
>>> names = [ x for x in sp.split(title) if len(x)>0 ]
>>> names
['Method', 'for', 'preparation', 'of', 's', 'and', 'r', '10', '11', 'dihydro', '10', 'hydrodoxy', '5h', 'dibenz', 'b', 'f', 'azephine', '5', 'car', 'boxamide']
In both cases the words were then lemmatized using the MontyLingua library, and stopwords were removed.
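For illustration, the lemmatize helper used in both programs below reduces a word to its lemma and drops stopwords. A sketch of an interactive session, assuming ML and stopwords have been prepared as in run(); the shown results are only expected values - the exact output depends on the MontyLingua version and the StopWords.dat list:

>>> lemmatize(ML, 'Washers', stopwords)    # expected lemma (illustrative)
['washer']
>>> lemmatize(ML, 'the', stopwords)        # a stopword - nothing remains
[]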
#!/usr/bin/python
###########################################################################
#
#   tit2lemma - Transforming patent titles.dat file into Pajek's
#      WK.net file
#
#   tit2lemma.run(MLdir,resrc,titles,WKfile)
#
#   The program is using the MontyLingua library. It requires
#   also the StopWords.dat file in directory resources - see the
#   program WoS2Pajek.
#
#   Vladimir Batagelj, 23. April 2012
#
#   based on KW2lemma:
#   Vladimir Batagelj, 7. March 2011 (Berlin)
#   1. 12. June 2011 first working version
###########################################################################
import sys, os, string, datetime

def lemmatize(ML,ab,stopwords):
    # tokenize, POS-tag and lemmatize the string ab; return the list of
    # distinct lemmas that are not stopwords
    sLto = [ML.tokenize(st) for st in ML.split_sentences(ab.lower())]
    sLta = [ML.tag_tokenized(t) for t in sLto]
    lem = [ML.lemmatise_tagged(t) for t in sLta]
    lemas = [s.split('/')[2] for s in string.join(lem).split(' ')]
    return list(set(dropList(lemas,stopwords)))

def dropList(mylist, rmlist):
    def testfun(somestring, checklist=rmlist):
        return somestring not in checklist
    mylist = filter(testfun, mylist)
    return mylist

def infLemma(name):
    # determines the lemma's number; new lemmas are appended to lemmas.txt
    global nlem, lemmas, nod, m
    if name in lemmas:
        return lemmas[name]
    else:
        nlem += 1; lemmas[name] = nlem
        nod.write(str(m+nlem)+' "'+name+'"\n')
        return nlem

# ---------------------------------------------------------

def run(MLdir,resrc,titles,WKfile):
    global nlem, lemmas, nod, m
    import MontyLingua
    ML = MontyLingua.MontyLingua()
    stopwords = open(resrc+'StopWords.dat', 'r').read().lower().split()
    stopwords = ['.',',',';','(',')','[',']','"','=','?','!',':','-','s','']+stopwords
    t1 = datetime.datetime.now()
    print "\n*** Patents titles 2 lemmas\n"
    print "started: "+t1.ctime()+"\n"
    net = open(WKfile,'w')
    nod = open("lemmas.txt",'w')
    dat = open(titles,'r')
    net.write('*arcs\n')
    lemmas = {}; nlem = 0; k = 0; m = 3210774
    while True:
        k += 1
#       if k>10: break
        if k % 100000 == 0: print k
        s = dat.readline()
        if not s: break
        i = string.find(s,"="); j = string.find(s,"=",i+1)
        num = s[:i]; vid = s[i:j]; tit = s[j+1:].strip()
        names = [ x for x in tit.split(" ") if len(x)>0 ]
        for name in names:
            lema = lemmatize(ML,name,stopwords)
            if len(lema)==0: group = 0
            else: group = m+infLemma(lema[0])
            net.write(num+" "+str(group)+'\n')
    dat.close(); net.close(); nod.close()
    t2 = datetime.datetime.now()
    print "finished: "+t2.ctime()
    print "time used: ", t2-t1
    print "***"

def process():
    MLdir = r'c:\Python27\Lib\site-packages\MontyLingua-2.1\Python'
    sys.path.append(MLdir)
    wdir = r'G:\Nber'; sys.path.append(wdir)
#   wdir = os.path.dirname(__file__)
    resrc = os.path.join(wdir, "resources/")
    WKfile = wdir+'\\WK.net'
    titles = wdir+'\\titles.dat'
    run(MLdir,resrc,titles,WKfile)
#
# Run tit2lemma
#
global comlin, version, copyR
version = "tit2lemma 0.1"
copyR = "by V. Batagelj, April 23, 2012"
if __name__ == '__main__':
    comlin = True
    print "\n***", version, "\n"+copyR+"\n"
#   for (i,x) in enumerate(sys.argv): print i,x
    if len(sys.argv) == 5:
        for x in sys.argv[1:]: print x
        print "------------------------"
        MLdir = sys.argv[1]
        resrc = sys.argv[2]
        titles = sys.argv[3]
        WKfile = sys.argv[4]
        sys.path.append(MLdir)
        run(MLdir,resrc,titles,WKfile)
    else: process()
    print
    try: a = input("Close console?")
    except: pass
else:
    comlin = False
    print "Module tit2lemma imported.\n"
    print "***", version, copyR+"\n"
    process()
    print "\nTo rerun, type:"
    print "   reload(tit2lemma)"
#- End ---------------------------------------------------------------------
#!/usr/bin/python
###########################################################################
#
#   tit2lem - Transforming patent titles.dat file into Pajek's
#      WKb.net file (splitting by delimiters)
#
#   tit2lem.run(MLdir,resrc,titles,WKfile)
#
#   The program is using the MontyLingua library. It requires
#   also the StopWords.dat file in directory resources - see the
#   program WoS2Pajek.
#
#   Vladimir Batagelj, 24. April 2012
#
#   based on KW2lemma:
#   Vladimir Batagelj, 7. March 2011 (Berlin)
#   1. 12. June 2011 first working version
###########################################################################
import sys, os, string, datetime

def lemmatize(ML,ab,stopwords):
    # tokenize, POS-tag and lemmatize the string ab; return the list of
    # distinct lemmas that are not stopwords
    sLto = [ML.tokenize(st) for st in ML.split_sentences(ab.lower())]
    sLta = [ML.tag_tokenized(t) for t in sLto]
    lem = [ML.lemmatise_tagged(t) for t in sLta]
    lemas = [s.split('/')[2] for s in string.join(lem).split(' ')]
    return list(set(dropList(lemas,stopwords)))

def dropList(mylist, rmlist):
    def testfun(somestring, checklist=rmlist):
        return somestring not in checklist
    mylist = filter(testfun, mylist)
    return mylist

def infLemma(name):
    # determines the lemma's number; new lemmas are appended to lem.txt
    global nlem, lemmas, nod, m
    if name in lemmas:
        return lemmas[name]
    else:
        nlem += 1; lemmas[name] = nlem
        nod.write(str(m+nlem)+' "'+name+'"\n')
        return nlem

# ---------------------------------------------------------

def run(MLdir,resrc,titles,WKfile):
    global nlem, lemmas, nod, m
    import MontyLingua, re
    ML = MontyLingua.MontyLingua()
    stopwords = open(resrc+'StopWords.dat', 'r').read().lower().split()
    stopwords = ['.',',',';','(',')','[',']','"','=','?','!',':','-','s','']+stopwords
    t1 = datetime.datetime.now()
    print "\n*** Patents titles 2 lemmas\n"
    print "started: "+t1.ctime()+"\n"
    net = open(WKfile,'w')
    nod = open("lem.txt",'w')
    dat = open(titles,'r')
    net.write('*arcs\n')
    sp = re.compile(r'\W+')
    lemmas = {}; nlem = 0; k = 0; m = 3210774
    while True:
        k += 1
#       if k>10: break
        if k % 100000 == 0: print k,datetime.datetime.now().ctime()
        s = dat.readline()
        if not s: break
        i = string.find(s,"="); j = string.find(s,"=",i+1)
        num = s[:i]; vid = s[i:j]; tit = s[j+1:].strip()
#       names = [ x for x in tit.split(" ") if len(x)>0 ]
        names = [ x for x in sp.split(tit) if len(x)>0 ]
        for name in names:
            lema = lemmatize(ML,name,stopwords)
            if len(lema)==0: group = 0
            else: group = m+infLemma(lema[0])
            net.write(num+" "+str(group)+'\n')
    dat.close(); net.close(); nod.close()
    t2 = datetime.datetime.now()
    print "finished: "+t2.ctime()
    print "time used: ", t2-t1
    print "***"

def process():
    MLdir = r'c:\Python27\Lib\site-packages\MontyLingua-2.1\Python'
    sys.path.append(MLdir)
    wdir = r'G:\Nber'; sys.path.append(wdir)
#   wdir = os.path.dirname(__file__)
    resrc = os.path.join(wdir, "resources/")
    WKfile = wdir+'\\WKb.net'
    titles = wdir+'\\titles.dat'
    run(MLdir,resrc,titles,WKfile)
#
# Run tit2lem
#
global comlin, version, copyR
version = "tit2lem 0.1"
copyR = "by V. Batagelj, April 24, 2012"
if __name__ == '__main__':
    comlin = True
    print "\n***", version, "\n"+copyR+"\n"
#   for (i,x) in enumerate(sys.argv): print i,x
    if len(sys.argv) == 5:
        for x in sys.argv[1:]: print x
        print "------------------------"
        MLdir = sys.argv[1]
        resrc = sys.argv[2]
        titles = sys.argv[3]
        WKfile = sys.argv[4]
        sys.path.append(MLdir)
        run(MLdir,resrc,titles,WKfile)
    else: process()
    print
    try: a = input("Close console?")
    except: pass
else:
    comlin = False
    print "Module tit2lem imported.\n"
    print "***", version, copyR+"\n"
    process()
    print "\nTo rerun, type:"
    print "   reload(tit2lem)"
#- End ---------------------------------------------------------------------
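The result is a two-mode patents × lemmas network. For the first example title above ("Wheel spinning and vehicle conveying apparatus ..."), and assuming it belongs to the patent with sequence number 1, the beginnings of WKb.net and lem.txt could look roughly as follows (the lemma vertex numbers start at m+1 = 3210775, presumably just above the patent vertex numbers; the concrete lemmas and numbers are illustrative only):

WKb.net:
*arcs
1 3210775
1 3210776
...

lem.txt:
3210775 "wheel"
3210776 "spin"
...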