Moving
This commit is contained in:
parent
5fc68ec7a2
commit
7e48cbb971
6 changed files with 9753 additions and 7 deletions
24
README.md
24
README.md
|
@ -1,2 +1,26 @@
|
||||||
# PatentSlurp
|
# PatentSlurp
|
||||||
Patent slurper for Dr Lars Hass, LUMS
|
Patent slurper for Dr Lars Hass, LUMS
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
|
||||||
|
1) Add a stripper for redundant XML tags
|
||||||
|
2) Harvest the data below from the Google dumps, 2001-2015:
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
storage display value
|
||||||
|
variable name type format label variable label
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
sta str2 %2s assg/state
|
||||||
|
cnt str3 %3s assg/country
|
||||||
|
assgnum byte %8.0g assg/assignee seq. number (imc)
|
||||||
|
cty str72 %72s assg/city
|
||||||
|
pdpass long %12.0g Unique assignee number
|
||||||
|
ptype str1 %9s patent type
|
||||||
|
patnum long %12.0g patent number
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
3) Compare data with NBER data (http://eml.berkeley.edu/~bhhall/NBER06.html)
|
||||||
|
4) ...
|
||||||
|
|
||||||
|
# Useful Tools
|
||||||
|
|
||||||
|
http://codebeautify.org/xmlviewer
|
||||||
|
http://www.regexr.com/
|
||||||
|
|
4841
firstfew.xml
Normal file
4841
firstfew.xml
Normal file
File diff suppressed because it is too large
Load diff
0
firstfew.xml~
Normal file
0
firstfew.xml~
Normal file
4841
firstfewbackup.xml
Normal file
4841
firstfewbackup.xml
Normal file
File diff suppressed because it is too large
Load diff
|
@ -6,12 +6,25 @@ use warnings;
|
||||||
use XML::Parser;
|
use XML::Parser;
|
||||||
use XML::SimpleObject;
|
use XML::SimpleObject;
|
||||||
|
|
||||||
my $file = 'ipg150106.xml';
|
my( $filename ) = @ARGV;
|
||||||
|
|
||||||
my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
|
open IN, '<', $filename or die;
|
||||||
my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
|
my @contents = <IN>;
|
||||||
|
close IN;
|
||||||
|
|
||||||
foreach my $patent ($xso->child('us-patent-grant')) {
|
@contents = grep !/^\<\?xml/igm, @contents;
|
||||||
print $patent->child('invention-title')->{VALUE};
|
|
||||||
print "\n";
|
open OUT, '>', $filename or die;
|
||||||
}
|
print OUT @contents;
|
||||||
|
close OUT;
|
||||||
|
|
||||||
|
#
|
||||||
|
#my $file = 'ipg150106.xml';
|
||||||
|
|
||||||
|
#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
|
||||||
|
#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
|
||||||
|
|
||||||
|
#foreach my $patent ($xso->child('us-patent-grant')) {
|
||||||
|
#print $patent->child('invention-title')->{VALUE};
|
||||||
|
# print "\n";
|
||||||
|
#}
|
27
step1.pl
Normal file
27
step1.pl
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
#!/usr/bin/perl

# step1.pl - copy an XML file while dropping its "<?xml ...?>" declaration lines.
#
# Usage: step1.pl NAME
#
# Reads NAME.xml and writes NAMECLEAN.xml. The output starts with exactly one
# XML declaration and one us-patent-grant DOCTYPE line, followed by every
# input line that does not itself begin with "<?xml".
#
# The argument may be given with or without a trailing ".xml"; both
# "ipg150106" and "ipg150106.xml" read ipg150106.xml and write
# ipg150106CLEAN.xml.
#
# Dies with a message naming the file if the input cannot be opened or the
# output cannot be created or written.

use 5.010;
use strict;
use warnings;

my ($filename) = @ARGV;
die "Usage: $0 NAME  (reads NAME.xml, writes NAMECLEAN.xml)\n"
    unless defined $filename && length $filename;

# Derive one base name so the input and output paths always agree,
# regardless of whether the caller included the extension.
(my $base = $filename) =~ s/\.xml\z//i;
my $in_path  = "$base.xml";
my $out_path = "${base}CLEAN.xml";

open(my $in, '<', $in_path) or die "Can't open `$in_path': $!";
open(my $out, '>', $out_path) or die "Can't create `$out_path': $!";

# Single header for the cleaned file.
print {$out} "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE us-patent-grant SYSTEM \"us-patent-grant-v45-2014-04-03.dtd\" [ ]>\n"
    or die "Can't write to `$out_path': $!";

# Stream line by line so large inputs are never held in memory.
# NOTE(review): "<!DOCTYPE ...>" lines in the input are passed through
# unchanged; confirm whether they should be dropped as well.
while (my $line = <$in>) {
    # Skip XML declarations; the header above already supplies one.
    next if $line =~ /^\<\?xml/;

    # Lines are written with their original line endings intact.
    print {$out} $line or die "Can't write to `$out_path': $!";
}

close($in) or die "Can't close `$in_path': $!";

# Buffered write errors only surface at close, so check it.
close($out) or die "Can't finish writing `$out_path': $!";
|
Reference in a new issue