4. May 2012
In July 2011 I collected the title + inventors info for each US patent from the NBER 2006 list. There was a mistake in the data collection program - not all titles were completely extracted (apparently because the title was taken from a single line of the returned HTML page, so titles continuing on the following lines were truncated).
# Collecting title + inventors info from US patent Office
# http://patft.uspto.gov/netahtml/PTO/srchnum.htm
# Vladimir Batagelj, July 22, 2011

# fetch a patent page, retrying up to 10 times with a 1 minute pause
readURL <- function(page,repo,save){
  e <- NULL
  for(a in 1:10){
    stran <- tryCatch(readLines(con<-url(page),warn=FALSE,n=80),
      error = function(e) e, finally=close(con))
    ok <- class(stran)=="character"
    if(ok) return(stran)
    if(a<6) cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n',file=repo)
    if(a<10) {
      cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n')
      flush.console()
    }
    Sys.sleep(60)
  }
  cat("Problems on the Internet ...\nClosing",date(),'\n',file=repo)
  cat("Problems on the Internet ...\nClosing",date(),'\n')
  close(repo); close(save); stop("Too many retries")
}

setwd("D:/Data/nber/titles")
repo <- file("report-3200Mb.txt","w")      # run report
save <- file("titles-3200Mb.dat","w")      # collected titles + inventors
nums <- read.csv("nber06.nam",sep=' ',header=FALSE,skip=1,
  stringsAsFactors=FALSE)$V2               # patent numbers from the NBER list
url1 <- 'http://patft.uspto.gov/netacgi/nph-Parser?Sect2=PTO1&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&d=PALL&RefSrch=yes&Query=PN%2F'
k <- 3144511; K <- 3200000                 # range of patents in this run
cat('% NBER - patent titles\n% started at,',k,'-',K,':',date(),'\n\n',file=repo)
while(k < K){
  if(k %% 100==0) {                        # progress report
    if(k %% 5000==0) {
      cat('\n',k,' ',date(),' ',sep=''); flush.console()
      cat(k,' ',date(),'\n',sep='',file=repo)
    }
    cat('.'); flush.console()
  }
  k <- k+1; patNum <- nums[k]
  stran <- readURL(paste(url1,patNum,sep=''),repo,save)
  # the title is on the line containing <font size="+1">
  it <- grep('<font size=\"+1\">',stran,fixed=TRUE)
  if (length(it)>0){
    ti <- it[1]; str <- stran[ti]; lt <- nchar(str)
    jt <- regexpr('> ',str); tit <- substr(str,jt+2,lt)
    cat(k,'=',patNum,'="',tit,'"\n',sep='',file=save)
  } else {
    cat(k,patNum,' - missing title\n',file=repo)
  }
  # the inventors are on the line following >Inventors:
  ii <- grep('>Inventors:',stran,fixed=TRUE)
  if (length(ii)>0){
    ti <- ii[1]; str <- stran[ti+1]; lt <- nchar(str)
    jt <- regexpr('</TD>',str); inv <- substr(str,1,jt-1)
    cat(inv,'\n',sep='',file=save)
  } else {
    cat(k,patNum,' - missing inventors\n',file=repo)
  }
}
close(repo); close(save)
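Each successfully processed patent contributes two lines to titles-3200Mb.dat: the title record, and the inventors line cut off at </TD> (still carrying its HTML tags). A sketch of one record, with hypothetical values:

3144512=7654321="Some patent title"
 <TD ALIGN="LEFT" WIDTH="90%"> <B>Doe; John A.</B> (Boston, MA)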
Because of this mistake in the 2011 collection of the additional US patents data, I decided to repeat the collection, this time gathering the "headers" - the part of each patent's description from the title up to the "References Cited" field. In the listing below the bounds are set for a small test run (k = 1, K = 10); for the real runs they were adjusted to the part of the list being collected.
# Collecting header data from US patent Office, adapted from
# Collecting title + inventors info from US patent Office
# http://patft.uspto.gov/netahtml/PTO/srchnum.htm
# Vladimir Batagelj, July 22, 2011
# April 15, 2012

# fetch a patent page (the first nr lines; nr=-1 reads the whole page),
# retrying up to 10 times with a 1 minute pause
readURL <- function(page,repo,save,nr=150){
  e <- NULL
  for(a in 1:10){
    stran <- tryCatch(readLines(con<-url(page),warn=FALSE,n=nr),
      error = function(e) e, finally=close(con))
    ok <- class(stran)=="character"
    if(ok) return(stran)
    if(a<6) cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n',file=repo)
    if(a<10) {
      cat('\n*** class = ',class(stran),'\nretry',a,':',date(),'\n')
      flush.console()
    }
    Sys.sleep(60)
  }
  cat("Problems on the Internet ...\nClosing",date(),'\n',file=repo)
  cat("Problems on the Internet ...\nClosing",date(),'\n')
  close(repo); close(save); stop("Too many retries")
}

setwd("E:/Data/nber/titles")
repo <- file("report-1M.txt","w")
save <- file("titles-1M.dat","w")
nums <- read.csv("nber06.nam",sep=' ',header=FALSE,skip=1,
  stringsAsFactors=FALSE)$V2
url1 <- 'http://patft.uspto.gov/netacgi/nph-Parser?Sect2=PTO1&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&d=PALL&RefSrch=yes&Query=PN%2F'
k <- 1; K <- 10                            # test range; adjusted for real runs
cat('% NBER - patent titles\n% started at,',k,'-',K,':',date(),'\n\n',file=repo)
while(k <= K){
  if(k %% 100==0) {                        # progress report
    if(k %% 5000==0) {
      cat('\n',k,' ',date(),' ',sep=''); flush.console()
      cat(k,' ',date(),'\n',sep='',file=repo)
    }
    cat('.'); flush.console()
  }
  patNum <- nums[k]
  stran <- readURL(paste(url1,patNum,sep=''),repo,save)
  it <- grep('<font size=\"+1\">',stran,fixed=TRUE)
  if (length(it)>0){
    jt <- grep('References Cited',stran,fixed=TRUE)
    if (length(jt)<1){
      # header longer than 150 lines - re-read the whole page
      # cat(k,patNum,' - long record\n',sep='',file=repo)
      stran <- readURL(paste(url1,patNum,sep=''),repo,save,-1)
      jt <- grep('References Cited',stran,fixed=TRUE)
    }
    if (length(jt)>0){
      # save the header: all lines from the title to References Cited
      cat(k,'=',patNum,'=',' ',stran[it[1]:(jt[1]-1)],'\n',sep='',file=save)
    } else {
      cat(k,patNum,' - record length\n',file=repo)
    }
  } else {
    cat(k,patNum,' - missing title\n',file=repo)
  }
  k <- k+1
}
close(repo); close(save)
Reports of data collection runs
The collected data are stored in the files headers-*.dat (headers-200.dat, headers-400.dat, ..., headers-3250.dat; see the vector parts in the extraction program below).
On April 16-21, 2012 I collected the headers from the US patents site. For each patent the header is a single string with the following structure:
199997=4130810= <font size="+1"> Solid state power combiner</font> <BR><BR><CENTER><B>Abstract</B></CENTER><P> An improved solid state transmitter (and elements therefor) adapted particularly well to pulsed operation at radio frequencies is disclosed. Such transmitter includes the combination of: A crystal-controlled oscillator producing a continuous wave output signal which, ultimately, determines the frequency of each transmitted pulse; a first oscillatory circuit, including a resonant cavity and at least one normally quiescent coaxial oscillator incorporating an IMPATT diode; a second oscillatory circuit, including a resonant cavity and a plurality of normally quiescent coaxial oscillators, each one of such oscillators incorporating an IMPATT diode; and an improved modulator for periodically actuating all of the IMPATT diodes in such a manner that a pulsed output of the first oscillatory circuit is produced which remains locked to the then existing continuous wave signal out of the crystal-controlled oscillator and the pulsed outputs of the coaxial oscillators in the second oscillatory circuit similarly are locked.The improved elements disclosed, in addition to the improved modulator, include various configurations of coaxial oscillators which are easier to align than known oscillators of such type or which allow a greater number of such oscillators to be coupled to a resonant cavity than was possible according to the prior art. Finally, an improved probe and tuning arrangement for a resonant cavity is disclosed. </P><HR> <TABLE WIDTH="100%"> <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Inventors: </TD> <TD ALIGN="LEFT" WIDTH="90%"> <B>Wallace; Ronald M.</B> (Braintree, MA) </TD></TR> <TR> <TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Assignee:</TD> <TD ALIGN="LEFT" WIDTH="90%"><B>Raytheon Company</B> (Lexington, MA)<BR></TD></TR> <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%" NOWRAP>Appl. No.:</TD> <TD ALIGN="LEFT" WIDTH="90%"> <B> 05/814,744</B></TD></TR> <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Filed: </TD> <TD ALIGN="LEFT" WIDTH="90%"> <B>June 30, 1977</B></TD></TR> </TABLE><HR><p> <TABLE WIDTH="100%"> <TR><TD VALIGN=TOP ALIGN="LEFT" WIDTH="40%"><B>Current U.S. Class:</B></TD> <TD VALIGN=TOP ALIGN="RIGHT" WIDTH="80%"><B>331/107P</B> ; 331/56</TD></TR> <TR><TD VALIGN=TOP ALIGN="LEFT" WIDTH="40%"><B>Current International Class: </B></TD> <TD VALIGN=TOP ALIGN="RIGHT" WIDTH="80%">H03B 9/00 (20060101); H03B 9/14 (20060101); H03B 007/06 ()</TD></TR> <TR><TD VALIGN=TOP ALIGN="LEFT" WIDTH="40%"><B>Field of Search: </B></TD> <TD ALIGN="RIGHT" VALIGN="TOP" WIDTH="80%"> 331/56,17R </TD></TR> </TABLE>
Here is a test version of the extraction (title, abstract, inventors) program:
setwd("F:/Nber") dat <- file("test.dat","r") repeat { line <- readLines(dat,n=1) if(length(line)==0) break lt <- regexpr('<font size=',line); rt <- regexpr('</font>',line) head <- substr(line,1,lt-1); tit <- substr(line,lt+16,rt-1) cat(head,' : ',tit,'\n') la <- regexpr('<B>Abstract</B>',line); ra <- regexpr('</P><HR>',line) abstract <- substr(line,la+27,ra-1) cat(abstract,'\n') rest <- substr(line,ra+33,nchar(line)) li <- regexpr('Inventors: </TD>',rest); ri <- regexpr('</TR>',rest) invent <- substr(rest,li+46,ri-6) cat(invent,'\n') } close(dat)
And here is the version that goes through all the collected data files:
cat('% patent data extraction\n% ',date(),'\n\n')
setwd("G:/Nber")
ti <- file("titles.dat","w")
ab <- file("abstracts.dat","w")
iv <- file("inventors.dat","w")
k <- 0
parts <- c( 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800,
  2000, 2250, 2500, 2750, 3000, 3250 )
for(num in parts){
  name <- paste("headers-",num,".dat",sep="")
  dat <- file(name,"r"); cat("\n\n***** ",name,"\n")
  repeat {
    line <- readLines(dat,n=1)
    if(length(line)==0) break
    if(k %% 1000==0) {
      if(k %% 50000==0) {cat('\n',k,' ',date(),'\n',sep=''); flush.console()}
      cat('.'); flush.console()
    }
    k <- k+1
    lt <- regexpr('<font size=',line); rt <- regexpr('</font>',line)
    head <- substr(line,1,lt-1); tit <- substr(line,lt+16,rt-1)
    cat(head,tit,'\n',file=ti)
    if(nchar(tit)==0) cat('\n',head," *** empty title\n")
    la <- regexpr('<B>Abstract</B>',line); ra <- regexpr('</P><HR>',line)
    abstract <- substr(line,la+27,ra-1)
    cat(head,abstract,'\n',file=ab)
    if(nchar(abstract)==0) cat('\n',head," *** empty abstract\n")
    rest <- substr(line,ra+33,nchar(line))
    li <- regexpr('Inventors: </TD>',rest); ri <- regexpr('</TR>',rest)
    invent <- substr(rest,li+46,ri-6)
    cat(head,invent,'\n',file=iv)
    if(nchar(invent)==0) cat('\n',head," *** empty inventors\n")
  }
  close(dat)
}
cat('% job completed\n% ',date(),'\n\n')
close(ti); close(ab); close(iv)
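For the example header shown above, the corresponding line written to titles.dat has the form given below (the prefix 199997=4130810= is the head kept by the program; the same prefix starts the matching lines in abstracts.dat and inventors.dat):

199997=4130810=  Solid state power combiner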
We can use the titles to obtain a description of a patent by "keywords". I produced two such descriptions. In the first, the delimiter is the space character:
>>> title = 'Wheel spinning and vehicle conveying apparatus for automatic wheel washers'
>>> names = [ x for x in title.split(" ") if len(x)>0 ]
>>> names
['Wheel', 'spinning', 'and', 'vehicle', 'conveying', 'apparatus', 'for', 'automatic', 'wheel', 'washers']
>>> title = "Method for preparation of (s)-(+)-and(r)-(-)10,11-dihydro-10-hydrodoxy-5h-dibenz/b,f/azephine-5-car- boxamide "
>>> names = [ x for x in title.split(" ") if len(x)>0 ]
>>> names
['Method', 'for', 'preparation', 'of', '(s)-(+)-and(r)-(-)10,11-dihydro-10-hydrodoxy-5h-dibenz/b,f/azephine-5-car-', 'boxamide']
In the second, the words are delimited by any run of non-alphanumeric characters:
>>> import re
>>> sp = re.compile(r'\W+')
>>> names = [ x for x in sp.split(title) if len(x)>0 ]
>>> names
['Method', 'for', 'preparation', 'of', 's', 'and', 'r', '10', '11', 'dihydro', '10', 'hydrodoxy', '5h', 'dibenz', 'b', 'f', 'azephine', '5', 'car', 'boxamide']
In both cases the words were then lemmatized using the MontyLingua library, and stopwords were removed.
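For illustration, the lemmatize helper used in both programs below reduces a word to its lemma and drops stopwords. A sketch of an interactive session, assuming ML and stopwords have been prepared as in run(); the shown results are only expected values - the exact output depends on the MontyLingua version and the StopWords.dat list:

>>> lemmatize(ML, 'Washers', stopwords)    # expected lemma (illustrative)
['washer']
>>> lemmatize(ML, 'the', stopwords)        # a stopword - nothing remains
[]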
#!/usr/bin/python
###########################################################################
#
#   tit2lemma - Transforming patent titles.dat file into Pajek's
#      WK.net file
#
#   tit2lemma.run(MLdir,resrc,titles,WKfile)
#
#   The program is using the MontyLingua library. It requires
#   also the StopWords.dat file in directory resources - see the
#   program WoS2Pajek.
#
#   Vladimir Batagelj, 23. April 2012
#
#   based on KW2lemma:
#   Vladimir Batagelj, 7. March 2011 (Berlin)
#   1. 12. June 2011 first working version
###########################################################################
import sys, os, string, datetime

def lemmatize(ML,ab,stopwords):
    # tokenize, POS-tag and lemmatize the string ab; return the list of
    # distinct lemmas that are not stopwords
    sLto = [ML.tokenize(st) for st in ML.split_sentences(ab.lower())]
    sLta = [ML.tag_tokenized(t) for t in sLto]
    lem = [ML.lemmatise_tagged(t) for t in sLta]
    lemas = [s.split('/')[2] for s in string.join(lem).split(' ')]
    return list(set(dropList(lemas,stopwords)))

def dropList(mylist, rmlist):
    def testfun(somestring, checklist=rmlist):
        return somestring not in checklist
    mylist = filter(testfun, mylist)
    return mylist

def infLemma(name):
    # determines the lemma's number; new lemmas are appended to lemmas.txt
    global nlem, lemmas, nod, m
    if name in lemmas:
        return lemmas[name]
    else:
        nlem += 1; lemmas[name] = nlem
        nod.write(str(m+nlem)+' "'+name+'"\n')
        return nlem

# ---------------------------------------------------------

def run(MLdir,resrc,titles,WKfile):
    global nlem, lemmas, nod, m
    import MontyLingua
    ML = MontyLingua.MontyLingua()
    stopwords = open(resrc+'StopWords.dat', 'r').read().lower().split()
    stopwords = ['.',',',';','(',')','[',']','"','=','?','!',':','-','s','']+stopwords
    t1 = datetime.datetime.now()
    print "\n*** Patents titles 2 lemmas\n"
    print "started: "+t1.ctime()+"\n"
    net = open(WKfile,'w')
    nod = open("lemmas.txt",'w')
    dat = open(titles,'r')
    net.write('*arcs\n')
    lemmas = {}; nlem = 0; k = 0; m = 3210774
    while True:
        k += 1
#       if k>10: break
        if k % 100000 == 0: print k
        s = dat.readline()
        if not s: break
        i = string.find(s,"="); j = string.find(s,"=",i+1)
        num = s[:i]; vid = s[i:j]; tit = s[j+1:].strip()
        names = [ x for x in tit.split(" ") if len(x)>0 ]
        for name in names:
            lema = lemmatize(ML,name,stopwords)
            if len(lema)==0: group = 0
            else: group = m+infLemma(lema[0])
            net.write(num+" "+str(group)+'\n')
    dat.close(); net.close(); nod.close()
    t2 = datetime.datetime.now()
    print "finished: "+t2.ctime()
    print "time used: ", t2-t1
    print "***"

def process():
    MLdir = r'c:\Python27\Lib\site-packages\MontyLingua-2.1\Python'
    sys.path.append(MLdir)
    wdir = r'G:\Nber'; sys.path.append(wdir)
#   wdir = os.path.dirname(__file__)
    resrc = os.path.join(wdir, "resources/")
    WKfile = wdir+'\\WK.net'
    titles = wdir+'\\titles.dat'
    run(MLdir,resrc,titles,WKfile)
#
# Run tit2lemma
#
global comlin, version, copyR
version = "tit2lemma 0.1"
copyR = "by V. Batagelj, April 23, 2012"
if __name__ == '__main__':
    comlin = True
    print "\n***", version, "\n"+copyR+"\n"
#   for (i,x) in enumerate(sys.argv): print i,x
    if len(sys.argv) == 5:
        for x in sys.argv[1:]: print x
        print "------------------------"
        MLdir = sys.argv[1]
        resrc = sys.argv[2]
        titles = sys.argv[3]
        WKfile = sys.argv[4]
        sys.path.append(MLdir)
        run(MLdir,resrc,titles,WKfile)
    else: process()
    print
    try: a = input("Close console?")
    except: pass
else:
    comlin = False
    print "Module tit2lemma imported.\n"
    print "***", version, copyR+"\n"
    process()
    print "\nTo rerun, type:"
    print "   reload(tit2lemma)"
#- End ---------------------------------------------------------------------
#!/usr/bin/python
###########################################################################
#
#   tit2lem - Transforming patent titles.dat file into Pajek's
#      WKb.net file (splitting by delimiters)
#
#   tit2lem.run(MLdir,resrc,titles,WKfile)
#
#   The program is using the MontyLingua library. It requires
#   also the StopWords.dat file in directory resources - see the
#   program WoS2Pajek.
#
#   Vladimir Batagelj, 24. April 2012
#
#   based on KW2lemma:
#   Vladimir Batagelj, 7. March 2011 (Berlin)
#   1. 12. June 2011 first working version
###########################################################################
import sys, os, string, datetime

def lemmatize(ML,ab,stopwords):
    # tokenize, POS-tag and lemmatize the string ab; return the list of
    # distinct lemmas that are not stopwords
    sLto = [ML.tokenize(st) for st in ML.split_sentences(ab.lower())]
    sLta = [ML.tag_tokenized(t) for t in sLto]
    lem = [ML.lemmatise_tagged(t) for t in sLta]
    lemas = [s.split('/')[2] for s in string.join(lem).split(' ')]
    return list(set(dropList(lemas,stopwords)))

def dropList(mylist, rmlist):
    def testfun(somestring, checklist=rmlist):
        return somestring not in checklist
    mylist = filter(testfun, mylist)
    return mylist

def infLemma(name):
    # determines the lemma's number; new lemmas are appended to lem.txt
    global nlem, lemmas, nod, m
    if name in lemmas:
        return lemmas[name]
    else:
        nlem += 1; lemmas[name] = nlem
        nod.write(str(m+nlem)+' "'+name+'"\n')
        return nlem

# ---------------------------------------------------------

def run(MLdir,resrc,titles,WKfile):
    global nlem, lemmas, nod, m
    import MontyLingua, re
    ML = MontyLingua.MontyLingua()
    stopwords = open(resrc+'StopWords.dat', 'r').read().lower().split()
    stopwords = ['.',',',';','(',')','[',']','"','=','?','!',':','-','s','']+stopwords
    t1 = datetime.datetime.now()
    print "\n*** Patents titles 2 lemmas\n"
    print "started: "+t1.ctime()+"\n"
    net = open(WKfile,'w')
    nod = open("lem.txt",'w')
    dat = open(titles,'r')
    net.write('*arcs\n')
    sp = re.compile(r'\W+')
    lemmas = {}; nlem = 0; k = 0; m = 3210774
    while True:
        k += 1
#       if k>10: break
        if k % 100000 == 0: print k,datetime.datetime.now().ctime()
        s = dat.readline()
        if not s: break
        i = string.find(s,"="); j = string.find(s,"=",i+1)
        num = s[:i]; vid = s[i:j]; tit = s[j+1:].strip()
#       names = [ x for x in tit.split(" ") if len(x)>0 ]
        names = [ x for x in sp.split(tit) if len(x)>0 ]
        for name in names:
            lema = lemmatize(ML,name,stopwords)
            if len(lema)==0: group = 0
            else: group = m+infLemma(lema[0])
            net.write(num+" "+str(group)+'\n')
    dat.close(); net.close(); nod.close()
    t2 = datetime.datetime.now()
    print "finished: "+t2.ctime()
    print "time used: ", t2-t1
    print "***"

def process():
    MLdir = r'c:\Python27\Lib\site-packages\MontyLingua-2.1\Python'
    sys.path.append(MLdir)
    wdir = r'G:\Nber'; sys.path.append(wdir)
#   wdir = os.path.dirname(__file__)
    resrc = os.path.join(wdir, "resources/")
    WKfile = wdir+'\\WKb.net'
    titles = wdir+'\\titles.dat'
    run(MLdir,resrc,titles,WKfile)
#
# Run tit2lem
#
global comlin, version, copyR
version = "tit2lem 0.1"
copyR = "by V. Batagelj, April 24, 2012"
if __name__ == '__main__':
    comlin = True
    print "\n***", version, "\n"+copyR+"\n"
#   for (i,x) in enumerate(sys.argv): print i,x
    if len(sys.argv) == 5:
        for x in sys.argv[1:]: print x
        print "------------------------"
        MLdir = sys.argv[1]
        resrc = sys.argv[2]
        titles = sys.argv[3]
        WKfile = sys.argv[4]
        sys.path.append(MLdir)
        run(MLdir,resrc,titles,WKfile)
    else: process()
    print
    try: a = input("Close console?")
    except: pass
else:
    comlin = False
    print "Module tit2lem imported.\n"
    print "***", version, copyR+"\n"
    process()
    print "\nTo rerun, type:"
    print "   reload(tit2lem)"
#- End ---------------------------------------------------------------------
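The result is a two-mode patents × lemmas network. For the first example title above ("Wheel spinning and vehicle conveying apparatus ..."), and assuming it belongs to the patent with sequence number 1, the beginnings of WKb.net and lem.txt could look roughly as follows (the lemma vertex numbers start at m+1 = 3210775, presumably just above the patent vertex numbers; the concrete lemmas and numbers are illustrative only):

WKb.net:
*arcs
1 3210775
1 3210776
...

lem.txt:
3210775 "wheel"
3210776 "spin"
...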