This commit is contained in:
Rumps 2016-02-21 14:07:05 +00:00
parent 5fc68ec7a2
commit 7e48cbb971
6 changed files with 9753 additions and 7 deletions

View File

@ -1,2 +1,26 @@
# PatentSlurp
Patent slurper for Dr Lars Hass, LUMS
# TODO
1) Add a stripper for redundant XML tags
2) Harvest the data below from the Google dumps, 2001-2015:
-------------------------------------------------------------------------------
              storage   display    value
variable name   type    format     label      variable label
-------------------------------------------------------------------------------
sta             str2    %2s                   assg/state
cnt             str3    %3s                   assg/country
assgnum         byte    %8.0g                 assg/assignee seq. number (imc)
cty             str72   %72s                  assg/city
pdpass          long    %12.0g                Unique assignee number
ptype           str1    %9s                   patent type
patnum          long    %12.0g                patent number
-------------------------------------------------------------------------------
3) Compare data with NBER data (http://eml.berkeley.edu/~bhhall/NBER06.html)
4) ...
# Useful Tools
http://codebeautify.org/xmlviewer
http://www.regexr.com/

4841
firstfew.xml Normal file

File diff suppressed because it is too large Load Diff

0
firstfew.xml~ Normal file
View File

4841
firstfewbackup.xml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -6,12 +6,25 @@ use warnings;
use XML::Parser;
use XML::SimpleObject;
# NOTE(review): this span looks like a diff hunk rendered without +/- markers,
# so an XML::Parser-based title printer and a line-filtering pass appear
# interleaved -- confirm against the real file which statements are current.

# Hard-coded XML file consumed by the XML::Parser code below.
my $file = 'ipg150106.xml';
# File to filter, taken from the first command-line argument.
my( $filename ) = @ARGV;
# Parse $file into a tree and wrap the result in an XML::SimpleObject.
my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
# Read every line of $filename into @contents; die (no message) if it cannot
# be opened.
open IN, '<', $filename or die;
my @contents = <IN>;
close IN;
# Print the 'invention-title' value of each 'us-patent-grant' child, one per
# line.
foreach my $patent ($xso->child('us-patent-grant')) {
print $patent->child('invention-title')->{VALUE};
print "\n";
}
# Keep only the lines that do not begin with "<?xml" (matched
# case-insensitively).
@contents = grep !/^\<\?xml/igm, @contents;
# Write the remaining lines back to the same $filename, replacing its previous
# contents.
open OUT, '>', $filename or die;
print OUT @contents;
close OUT;
#
#my $file = 'ipg150106.xml';
#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
#foreach my $patent ($xso->child('us-patent-grant')) {
#print $patent->child('invention-title')->{VALUE};
# print "\n";
#}

27
step1.pl Normal file
View File

@ -0,0 +1,27 @@
#!/usr/bin/perl
use 5.010;
use strict;
use warnings;
# Set-up phase of the script:
#   1. take the file name from the first command-line argument,
#   2. count the newline characters in that file into $linesNum,
#   3. open "<name>.xml" for reading on MYINPUTFILE,
#   4. create "<name>CLEAN.xml" on MYOUTFILE and write the XML declaration
#      and DOCTYPE header to it.
# MYINPUTFILE and MYOUTFILE are left open (as bareword handles) for the
# line-processing loop that follows.
#
# Dies with a message if the argument is missing, or if any open, read or
# write fails.
#
# NOTE(review): the line count reads "$filename" while the input handle reads
# "$filename.xml"; these are different paths unless both files exist.  The
# original paths are kept as written -- confirm which one is intended.
my($filename) = @ARGV;
defined $filename
    or die "Usage: $0 <filename>\n";

my $linesNum = 0;
my $buffer;    # was undeclared, which is a compile-time error under strict

# Count newlines in fixed-size chunks so the whole file is never held in
# memory at once.
open(my $count_fh, '<', $filename) or die "Can't open `$filename': $!";
while (1) {
    my $bytes_read = sysread $count_fh, $buffer, 4096;
    # sysread returns undef on error and 0 at end of file; tell them apart
    # instead of silently treating an error as EOF.
    defined $bytes_read or die "Can't read `$filename': $!";
    last if $bytes_read == 0;
    $linesNum += ($buffer =~ tr/\n//);
}
close $count_fh;

# Three-argument open keeps characters in the user-supplied name from being
# interpreted as an open mode.  Open the input first so that a missing input
# does not leave a header-only output file behind.
my $input_name = $filename . ".xml";
my $output_name = $filename . "CLEAN.xml";
open(MYINPUTFILE, '<', $input_name) or die "Can't open `$input_name': $!";
open(MYOUTFILE, '>', $output_name) or die "Can't open `$output_name': $!";

# Write the single XML declaration and DOCTYPE header; the handle stays open
# so later output is appended after it.
print MYOUTFILE "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE us-patent-grant SYSTEM \"us-patent-grant-v45-2014-04-03.dtd\" [ ]>\n"
    or die "Can't write to `$output_name': $!";
while(<MYINPUTFILE>) {
my($line) = $_;
chomp($line);
if ($line =~ /^\<\?xml/) {
print MYOUTFILE $line;
}
}