Moving
This commit is contained in:
parent
5fc68ec7a2
commit
7e48cbb971
6 changed files with 9753 additions and 7 deletions
24
README.md
24
README.md
|
@ -1,2 +1,26 @@
|
|||
# PatentSlurp
|
||||
Patent slurper for Dr Lars Hass, LUMS
|
||||
|
||||
# TODO
|
||||
|
||||
1) Add a stripper for redundant XML tags
|
||||
2) Harvest the data below from the Google dumps (2001-2015):
|
||||
-------------------------------------------------------------------------------
|
||||
storage display value
|
||||
variable name type format label variable label
|
||||
-------------------------------------------------------------------------------
|
||||
sta str2 %2s assg/state
|
||||
cnt str3 %3s assg/country
|
||||
assgnum byte %8.0g assg/assignee seq. number (imc)
|
||||
cty str72 %72s assg/city
|
||||
pdpass long %12.0g Unique assignee number
|
||||
ptype str1 %9s patent type
|
||||
patnum long %12.0g patent number
|
||||
-------------------------------------------------------------------------------
|
||||
3) Compare data with NBER data (http://eml.berkeley.edu/~bhhall/NBER06.html)
|
||||
4) ...
|
||||
|
||||
# Useful Tools
|
||||
|
||||
http://codebeautify.org/xmlviewer
|
||||
http://www.regexr.com/
|
||||
|
|
4841
firstfew.xml
Normal file
4841
firstfew.xml
Normal file
File diff suppressed because it is too large
Load diff
0
firstfew.xml~
Normal file
0
firstfew.xml~
Normal file
4841
firstfewbackup.xml
Normal file
4841
firstfewbackup.xml
Normal file
File diff suppressed because it is too large
Load diff
|
@ -6,12 +6,25 @@ use warnings;
|
|||
use XML::Parser;
|
||||
use XML::SimpleObject;
|
||||
|
||||
my $file = 'ipg150106.xml';
|
||||
my( $filename ) = @ARGV;
|
||||
|
||||
my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
|
||||
my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
|
||||
open IN, '<', $filename or die;
|
||||
my @contents = <IN>;
|
||||
close IN;
|
||||
|
||||
foreach my $patent ($xso->child('us-patent-grant')) {
|
||||
print $patent->child('invention-title')->{VALUE};
|
||||
print "\n";
|
||||
}
|
||||
@contents = grep !/^\<\?xml/igm, @contents;
|
||||
|
||||
open OUT, '>', $filename or die;
|
||||
print OUT @contents;
|
||||
close OUT;
|
||||
|
||||
#
|
||||
#my $file = 'ipg150106.xml';
|
||||
|
||||
#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
|
||||
#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
|
||||
|
||||
#foreach my $patent ($xso->child('us-patent-grant')) {
|
||||
#print $patent->child('invention-title')->{VALUE};
|
||||
# print "\n";
|
||||
#}
|
27
step1.pl
Normal file
27
step1.pl
Normal file
|
@ -0,0 +1,27 @@
|
|||
#!/usr/bin/perl

# step1.pl - strip repeated XML declarations from a patent dump.
#
# Usage: step1.pl BASENAME
#
# Reads BASENAME.xml and writes BASENAMECLEAN.xml. The output begins with a
# single XML declaration and DOCTYPE line; after that every input line is
# copied through unchanged, except lines that start with "<?xml", which are
# dropped.
#
# NOTE(review): repeated "<!DOCTYPE ...>" lines in the input are still copied
# through; confirm whether they should be stripped as well.

use 5.010;
use strict;
use warnings;

my ($filename) = @ARGV;

# Fail early with a usage hint instead of silently working on ".xml".
die "Usage: $0 BASENAME  (reads BASENAME.xml, writes BASENAMECLEAN.xml)\n"
    unless defined $filename && length $filename;

my $input_path  = $filename . ".xml";
my $output_path = $filename . "CLEAN.xml";

# Lexical handles and three-argument open: the mode can never be injected
# through the file name, and every failure is reported with its cause.
open(my $in,  '<', $input_path)  or die "Can't open `$input_path': $!";
open(my $out, '>', $output_path) or die "Can't open `$output_path': $!";

# Single header for the whole cleaned document.
print {$out} "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE us-patent-grant SYSTEM \"us-patent-grant-v45-2014-04-03.dtd\" [ ]>\n"
    or die "Can't write to `$output_path': $!";

while (my $line = <$in>) {
    # Drop the per-record XML declarations; the header above replaces them.
    next if $line =~ /^\<\?xml/;

    # Lines keep their original line endings so the document structure
    # is preserved.
    print {$out} $line or die "Can't write to `$output_path': $!";
}

close($in) or die "Can't close `$input_path': $!";

# Buffered write errors only surface at close, so check it.
close($out) or die "Can't close `$output_path': $!";
|
Reference in a new issue