This shows you the differences between two versions of the page.
notes:net:uspt [2015/07/16 22:29] vlado created |
notes:net:uspt [2017/03/22 01:45] (current) vlado [Word clouds] |
||
---|---|---|---|
Line 134: | Line 134: | ||
{{notes:pics:lcd.svg}} | {{notes:pics:lcd.svg}} | ||
+ | |||
+ | ===== Titles / version 2017 ===== | ||
+ | |||
+ | PatNames.R | ||
+ | <code R> | ||
+ | # PatNames - branje naslovov patentov z US Patent Office | ||
+ | # http://patft.uspto.gov/netahtml/PTO/srchnum.htm | ||
+ | # Nataša Kejžar, 9. april 2004 | ||
+ | # 'zlikal' V.B., 27. november 2004 | ||
+ | # 'posodobil' V.B., 21. marec 2017 | ||
+ | # ------------------------------------------------------------- | ||
+ | # na datoteki numbers.txt pripravimo seznam številk patentov, | ||
+ | # v obliki s priložene datoteke stevilke.txt. Program PatNames | ||
+ | # bo na datoteko titles.txt izpisal ustrezne naslove. | ||
+ | # !!! pred uporabo je potrebno naložiti paket XML | ||
+ | # ------------------------------------------------------------- | ||
+ | |||
# doloci.opis - build the description for one line of numbers.txt.
#
# Argument:
#   num  one input line: either a patent number or a group marker
#        (any line containing the text "island").
# Value: a character vector
#   - group marker:       paste('---', num), no web lookup is made
#   - image-only page:    c(num, two-line "full text is not available" note)
#   - title not located:  c(num, two-line "title not found" note)
#   - otherwise:          c(num, title, date) picked from the parsed page
# Side effects: echoes the processed line to the console and downloads
# the patent page with htmlTreeParse() (package XML has to be loaded).
doloci.opis <- function(num) {
  # group markers are passed through; there is nothing to look up
  if (grepl("island", num, fixed = TRUE)) {
    cat("Island ", num, "\n")
    flush.console()
    return(paste('---', num))
  }

  cat(num, "\n")
  flush.console()

  # first compose the query address; the number appears three times
  # NOTE(review): confirm that this PatFT endpoint is still served
  num.c <- as.character(num)
  url.s <- paste0(
    "http://patft.uspto.gov/netacgi/nph-Parser?",
    "Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml",
    "%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=",
    num.c, ".PN.&OS=PN/", num.c, "&RS=PN/", num.c
  )

  # fetch the page and parse it; silence only this call so that the
  # 'There were 50 or more warnings ...' message does not appear and the
  # session warning level is left untouched even when the download fails
  hp <- suppressWarnings(htmlTreeParse(url.s))
  html.s <- unlist(hp$children$html$children)

  # pages that carry the "Images" marker have no full text to read
  if (any(grepl('\"Images\"', html.s, fixed = TRUE))) {
    return(c(num, '*** Full text is not available,',
             ' see image version'))
  }

  # the title is looked for 4 elements after the 2nd element matching
  # "hr", the date immediately before that element
  i <- grep("hr", html.s, fixed = TRUE)[2]
  if (is.na(i) || i + 4 > length(html.s)) {
    # unexpected page layout: report it instead of emitting NA values
    return(c(num, '*** Title not found,',
             ' page layout not recognized'))
  }

  naslov <- unname(html.s[i + 4])
  datum <- unname(html.s[i - 1])
  c(num, naslov, datum)
}
+ | |||
+ | # library(XML) | ||
+ | # setwd("C:/Users/batagelj/work/R/patents") | ||
+ | # | ||
+ | ## preberemo številke iz znakovne datoteke v num | ||
+ | # num <- readLines("./numbers.txt") | ||
+ | # | ||
+ | ## določi in izpiši ustrezni seznam naslovov | ||
+ | # writeLines(unlist(sapply(num,doloci.opis)),"./titles.txt") | ||
+ | </code> | ||
+ | |||
+ | numbers.txt | ||
+ | <code> | ||
+ | island 3 | ||
+ | 5010649 | ||
+ | 4926557 | ||
+ | 4426780 | ||
+ | 4347666 | ||
+ | 4245454 | ||
+ | 4254550 | ||
+ | 4151646 | ||
+ | 4095338 | ||
+ | 4134204 | ||
+ | 4104796 | ||
+ | 4047299 | ||
+ | 4007525 | ||
+ | 3892037 | ||
+ | 3928911 | ||
+ | 3664102 | ||
+ | 3693255 | ||
+ | 3826068 | ||
+ | island 1 | ||
+ | 4604106 | ||
+ | 4215999 | ||
+ | island 2 | ||
+ | 4229186 | ||
+ | 4252102 | ||
+ | </code> | ||
+ | |||