Started SeekingAlphaScraper
This commit is contained in:
parent
113951ba39
commit
f0998ecd98
9 changed files with 258 additions and 11280 deletions
27
patent-slurper/README.md
Normal file
27
patent-slurper/README.md
Normal file
|
@ -0,0 +1,27 @@
|
|||
# PatentSlurp
|
||||
Patent slurper for Dr Lars Hass, LUMS
|
||||
|
||||
# TODO
|
||||
|
||||
|
||||
1) Add stripper for redundant xml tags
|
||||
2) Harvest below data from Google dumps 2001-2015:
|
||||
```
-------------------------------------------------------------------------------
              storage  display     value
variable name   type   format      label      variable label
-------------------------------------------------------------------------------
sta             str2   %2s                    assg/state
cnt             str3   %3s                    assg/country
assgnum         byte   %8.0g                  assg/assignee seq. number (imc)
cty             str72  %72s                   assg/city
pdpass          long   %12.0g                 Unique assignee number
ptype           str1   %9s                    patent type
patnum          long   %12.0g                 patent number
-------------------------------------------------------------------------------
```
|
||||
3) Compare data with NBER data (http://eml.berkeley.edu/~bhhall/NBER06.html)
|
||||
4) ...
|
||||
|
||||
# Useful Tools
|
||||
|
||||
http://codebeautify.org/xmlviewer
|
||||
http://www.regexr.com/
|
183
patent-slurper/patent-slurper.pl
Normal file
183
patent-slurper/patent-slurper.pl
Normal file
|
@ -0,0 +1,183 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
use 5.010;
|
||||
#use strict;
|
||||
use warnings;
|
||||
use XML::Simple;
|
||||
use XML::Twig;
|
||||
use Time::Piece;
|
||||
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
PATENT_SLURPER - A program to process Google/USPTO data dumps
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
0.8
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This program takes L<Google/USPTO data dumps|https://www.google.com/googlebooks/uspto-patents-grants-text.html>
|
||||
and produces SQL commands to generate a database from them with a
|
||||
number of selected fields.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
L<Ben Goldsworthy (rumperuu)|mailto:b.goldsworthy96@gmail.com>
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
=cut
|
||||
|
||||
# The single command-line argument names the dump to process: the base name
# of a file under data/, given with or without its ".xml" extension
# (e.g. "pg020101" or "pg020101.xml"). The subs below append ".xml"
# themselves, so the extension is trimmed here.
my $filename = $ARGV[0];
die "Usage: $0 <dump-name>\n" unless defined $filename;
chomp $filename;
$filename =~ s/\.xml\z//;
die "Usage: $0 <dump-name>\n" unless length $filename;

# Twig that keeps only the four per-patent detail fields and renames each
# kept <PDAT> element to the column name used later by generateSQL().
#
# NOTE(review): in the USPTO ST.32 tag set, B110/B140 appear to be the
# document (publication) number/date and B210/B220 the application
# number/filing date, i.e. the reverse of the names assigned below.
# Confirm against the DTD before relying on the column names.
my $patentTwig = XML::Twig->new(
    TwigRoots => {
        'SDOBI/B100/B140/DATE/PDAT' => 1,
        'SDOBI/B100/B110/DNUM/PDAT' => 2,
        'SDOBI/B200/B220/DATE/PDAT' => 3,
        'SDOBI/B200/B210/DNUM/PDAT' => 4
    },
    TwigHandlers => {
        'SDOBI/B100/B140/DATE/PDAT' => sub { $_->set_tag( 'appdate') },
        'SDOBI/B100/B110/DNUM/PDAT' => sub { $_->set_tag( 'appnum') },
        'SDOBI/B200/B220/DATE/PDAT' => sub { $_->set_tag( 'pubdate') },
        'SDOBI/B200/B210/DNUM/PDAT' => sub { $_->set_tag( 'pubnum') }
    },
    pretty_print => 'indented');

# Twig that keeps the patent's own number plus the number of every document
# listed under B561/PCIT, renamed to <pubnum> and <citing> respectively.
my $citationTwig = XML::Twig->new(
    TwigRoots => {
        'SDOBI/B200/B210/DNUM/PDAT' => 1,
        'SDOBI/B500/B560/B561/PCIT/DOC/DNUM/PDAT' => 2
    },
    TwigHandlers => {
        'SDOBI/B200/B210/DNUM/PDAT' => sub { $_->set_tag( 'pubnum') },
        'SDOBI/B500/B560/B561/PCIT/DOC/DNUM/PDAT' => sub { $_->set_tag('citing') }
    },
    pretty_print => 'indented');
|
||||
|
||||
# Main driver: count the lines (only so that progress can be reported),
# extract the selected fields into details/ and citations/, then ask the
# user whether to turn those extracts into an SQL dump.
my $numLines = countFile($filename);
print "Processing $numLines lines...\n";
processFile($filename, $numLines);
print "File processing finished - press '1' to generate SQL statements, or '0' to quit.\n";

# Read answers until a valid one arrives. The loop also ends on end-of-input
# (closed or redirected STDIN) instead of re-prompting forever.
while (defined(my $choice = <STDIN>)) {
    $choice =~ s/\A\s+|\s+\z//g;    # drop the newline and stray blanks

    if ($choice eq '1') {
        generateSQL($filename);
        print "SQL generation finished.\n";
        exit 0;
    } elsif ($choice eq '0') {
        exit 0;
    } else {
        print "Press '1' to generate SQL statements, or '0' to quit.\n";
    }
}
|
||||
|
||||
# Counts the lines of data/<name>.xml.
#
# Arguments:
#   $_[0] - base name of the dump (no directory, no ".xml" extension)
# Returns:
#   the number of lines in the file (0 for an empty file)
# Dies if the file cannot be opened.
sub countFile {
    my ($name) = @_;
    my $lineNum = 0;

    open(my $in, '<', "data/".$name.".xml") or die "Can't open ".$name.": $!";

    # Read one line at a time: iterating a list built from the whole file
    # would pull the entire dump into memory first.
    while (defined(my $line = <$in>)) {
        ++$lineNum;
    }
    close($in);

    return $lineNum;
}
|
||||
|
||||
# Splits data/<name>.xml into its individual patent documents and extracts
# the selected fields from each one.
#
# A dump is many XML documents concatenated into one file, each starting
# with its own <?xml ...?> line and <!DOCTYPE ...> block. The file is read
# line by line (hence XML::Twig rather than XML::Simple) because the dumps
# are far too big to load whole. Lines are gathered into a buffer, leaving
# out the <?xml> line and the <!...> / "]>" DOCTYPE lines; each time the
# next <?xml> line (or the end of the file) is reached, the buffer is run
# through $patentTwig and $citationTwig and the results are appended to
# details/<name>.xml and citations/<name>.xml, both wrapped in one
# <patents> root element.
#
# Arguments:
#   $_[0] - base name of the dump (no directory, no ".xml" extension)
#   $_[1] - total number of lines, used only for the progress messages
# Dies if a file cannot be opened or an output file cannot be closed.
sub processFile {
    my ($name, $totalLines) = @_;
    my $buffer      = "";
    my $currentLine = 1;

    open(my $in, '<', "data/".$name.".xml") or die "Can't open ".$name.": $!";
    # '>' truncates any previous extract, so no separate unlink is needed.
    open(my $detailsFile, '>', "details/".$name.".xml") or die "Can't output ".$name.": $!";
    open(my $citationsFile, '>', "citations/".$name.".xml") or die "Can't output ".$name.": $!";

    # Root node for XML::Simple in generateSQL(). Written up front so that
    # even an empty dump yields a well-formed file.
    print $detailsFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>";
    print $citationsFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>\n";

    # Runs the buffered document through both twigs and empties the buffer.
    # Does nothing when no content has been collected, e.g. at the very
    # first <?xml> line of the file.
    my $flush = sub {
        if ($buffer =~ /\S/) {
            $patentTwig->parse($buffer);
            $patentTwig->print($detailsFile);
            $citationTwig->parse($buffer);
            $citationTwig->print($citationsFile);
        }
        $buffer = "";
    };

    while (defined(my $line = <$in>)) {
        print "Processing line ".($currentLine++)."/".$totalLines."...\n";

        if ($line =~ /^\<\?xml/) {
            # Start of the next document: the previous one is complete.
            $flush->();
        } elsif ($line !~ /^\<\!/ && $line !~ /^\]\>/) {
            # The DTD has been stripped, so named entities can no longer be
            # resolved; replace them with a placeholder to keep the parser
            # from rejecting the document.
            $line =~ s/\&[a-zA-Z0-9]*;/zonk/g;
            $buffer = $buffer.$line;
        }
    }

    # The last document has no following <?xml> line to trigger its flush.
    $flush->();

    print $detailsFile "</patents>";
    print $citationsFile "</patents>";

    close($detailsFile) or die "Can't close details output for ".$name.": $!";
    close($citationsFile) or die "Can't close citations output for ".$name.": $!";
    close($in);
}
|
||||
|
||||
# Generates sql/<name>.sql, an SQL dump built from the extracts that
# processFile() wrote to details/<name>.xml and citations/<name>.xml.
#
# The dump creates the `patent` and `patent_cite` tables if needed and then
# holds one multi-row INSERT per table. An INSERT is omitted entirely when
# there are no rows for it. Missing field values become NULL in `patent`;
# citation rows lacking either number are skipped because both columns are
# NOT NULL.
#
# Arguments:
#   $_[0] - base name of the dump (no directory, no ".xml" extension)
# Dies if the SQL file cannot be opened or closed; XML::Simple raises its
# own error if an extract cannot be parsed.
sub generateSQL {
    my ($name) = @_;

    # ForceArray keeps PATDOC and citing as array refs even when only one
    # such element is present; otherwise single entries would not be lists.
    my $xml = XML::Simple->new(KeyAttr => [], ForceArray => ['PATDOC', 'citing']);

    my $details = $xml->XMLin("details/".$name.".xml");

    # '>' truncates any previous dump, so no separate unlink is needed.
    open(my $sqlFile, '>', "sql/".$name.".sql") or die "Can't output SQL ".$name.": $!";

    print $sqlFile "CREATE TABLE IF NOT EXISTS `patent`\n(\n\t`pid` INT NOT NULL AUTO_INCREMENT,\n\t`pubNum` VARCHAR(32),\n\t`pubDate` DATETIME,\n\t`appNum` VARCHAR(32),\n\t`appDate` DATETIME,\n\tPRIMARY KEY(`pid`)\n);\n\nCREATE TABLE IF NOT EXISTS `patent_cite`\n(\n\t`citing_id` VARCHAR(32) NOT NULL,\n\t`cited_id` VARCHAR(32) NOT NULL,\n\tPRIMARY KEY (`citing_id`, `cited_id`)\n);\n\n";

    # Each row is preceded by either the INSERT header (first row) or a
    # comma (later rows), so the statement can be streamed without a dummy
    # trailing tuple.
    my $rows = 0;
    foreach my $e (_patdocs($details)) {
        my $lead = $rows++ ? "," : "INSERT INTO `patent` (`pubNum`, `pubDate`, `appNum`, `appDate`) VALUES";
        my @values = map { _sqlLiteral($e->{$_}) } qw(pubnum pubdate appnum appdate);
        print $sqlFile $lead."\n\t(".join(", ", @values).")";
    }
    print $sqlFile ";\n\n" if $rows;

    undef $details;    # release the first extract before loading the second
    my $citations = $xml->XMLin("citations/".$name.".xml");

    $rows = 0;
    foreach my $f (_patdocs($citations)) {
        my $pubNum = $f->{'pubnum'};
        next if !defined $pubNum || ref $pubNum;

        foreach my $cited (@{ $f->{'citing'} || [] }) {
            next if !defined $cited || ref $cited;
            my $lead = $rows++ ? "," : "INSERT INTO `patent_cite` (`citing_id`, `cited_id`) VALUES";
            print $sqlFile $lead."\n\t(".  _sqlLiteral($pubNum).", "._sqlLiteral($cited).")";
        }
    }
    print $sqlFile ";\n" if $rows;

    close($sqlFile) or die "Can't close SQL output for ".$name.": $!";
}

# Returns the list of PATDOC entries of a parsed extract, or an empty list
# when the extract holds none.
sub _patdocs {
    my ($parsed) = @_;
    return () unless ref $parsed eq 'HASH' && ref $parsed->{'PATDOC'} eq 'ARRAY';
    return @{ $parsed->{'PATDOC'} };
}

# Renders a value as an SQL string literal, escaping backslashes and single
# quotes. Undefined values and non-scalars (which is how empty elements come
# back from the parser) are rendered as NULL.
sub _sqlLiteral {
    my ($value) = @_;
    return 'NULL' if !defined $value || ref $value;
    $value =~ s/\\/\\\\/g;
    $value =~ s/'/''/g;
    return "'".$value."'";
}
|
Reference in a new issue