Started SeekingAlphaScraper

2016-12-06 08:18:48 +00:00 · 2016-12-06 08:18:48 +00:00 · f0998ecd98
commit f0998ecd98
parent 113951ba39
9 changed files with 258 additions and 11280 deletions
--- a/patent-slurper/README.md
+++ b/patent-slurper/README.md
@ -0,0 +1,27 @@
+# PatentSlurp
+Patent slurper for Dr Lars Hass, LUMS
+
+# TODO
+
+
+1) Add a stripper for redundant XML tags
+2) Harvest the data below from the Google dumps (2001-2015):
+-------------------------------------------------------------------------------
+              storage  display     value
+variable name   type   format      label      variable label
+-------------------------------------------------------------------------------
+sta             str2   %2s                    assg/state
+cnt             str3   %3s                    assg/country
+assgnum         byte   %8.0g                  assg/assignee seq. number (imc)
+cty             str72  %72s                   assg/city
+pdpass          long   %12.0g                 Unique assignee number
+ptype           str1   %9s                    patent type
+patnum          long   %12.0g                 patent number
+-------------------------------------------------------------------------------
+3) Compare data with NBER data (http://eml.berkeley.edu/~bhhall/NBER06.html)
+4) ...
+
+# Useful Tools
+
+http://codebeautify.org/xmlviewer
+http://www.regexr.com/
--- a/patent-slurper/patent-slurper.pl
+++ b/patent-slurper/patent-slurper.pl
@ -0,0 +1,183 @@
+#!/usr/bin/env perl
+
+use 5.010;
+#use strict;
+use warnings;
+use XML::Simple;
+use XML::Twig;
+use Time::Piece;
+
+=pod
+
+=head1 NAME
+
+PATENT_SLURPER - A program to process Google/USPTO data dumps
+
+=head1 VERSION
+
+0.8
+
+=head1 DESCRIPTION
+
+This program takes L<Google/USPTO data dumps|https://www.google.com/googlebooks/uspto-patents-grants-text.html>
+and produces SQL commands to generate a database from them with a 
+number of selected fields.
+
+=head1 AUTHOR
+
+L<Ben Goldsworthy (rumperuu)|mailto:b.goldsworthy96@gmail.com>
+
+=head1 LICENSE
+
+=cut
+
+# Reads the name of the dump to process from the first command-line
+# argument. The name is used as-is (only a trailing newline is removed), so
+# it must be given without a directory and without the ".xml" extension.
+my $filename = $ARGV[0];
+die "Usage: $0 <dump-name-without-extension>\n"
+   unless defined $filename && length $filename;
+chomp $filename;
+
+# Twig that extracts the four date/number fields of a patent record.
+# TwigRoots restricts the tree that is built to the listed paths, and each
+# handler renames the matched <PDAT> element to the field name used later.
+# NOTE(review): in the USPTO ST.32 grant DTD, B100 appears to hold the
+# document (publication) identification and B200 the domestic filing
+# (application) data, which would make the app*/pub* labels below look
+# swapped - confirm against the DTD before relying on the column names.
+my $patentTwig = XML::Twig->new(
+   TwigRoots => {
+      'SDOBI/B100/B140/DATE/PDAT' => 1,
+      'SDOBI/B100/B110/DNUM/PDAT' => 2,
+      'SDOBI/B200/B220/DATE/PDAT' => 3,
+      'SDOBI/B200/B210/DNUM/PDAT' => 4,
+   },
+   TwigHandlers => {
+      'SDOBI/B100/B140/DATE/PDAT' => sub { $_->set_tag('appdate') },
+      'SDOBI/B100/B110/DNUM/PDAT' => sub { $_->set_tag('appnum') },
+      'SDOBI/B200/B220/DATE/PDAT' => sub { $_->set_tag('pubdate') },
+      'SDOBI/B200/B210/DNUM/PDAT' => sub { $_->set_tag('pubnum') },
+   },
+   pretty_print => 'indented',
+);
+
+# Twig that extracts a record's own number plus the number of every patent
+# listed under B561/PCIT, renamed to 'pubnum' and 'citing' respectively.
+my $citationTwig = XML::Twig->new(
+   TwigRoots => {
+      'SDOBI/B200/B210/DNUM/PDAT'               => 1,
+      'SDOBI/B500/B560/B561/PCIT/DOC/DNUM/PDAT' => 2,
+   },
+   TwigHandlers => {
+      'SDOBI/B200/B210/DNUM/PDAT'               => sub { $_->set_tag('pubnum') },
+      'SDOBI/B500/B560/B561/PCIT/DOC/DNUM/PDAT' => sub { $_->set_tag('citing') },
+   },
+   pretty_print => 'indented',
+);
+
+my $numLines = countFile($filename);
+print "Processing $numLines lines...\n";
+processFile($filename, $numLines);
+print "File processing finished - press '1' to generate SQL statements, or '0' to quit.\n";
+
+# Plain string comparison is used rather than given/when: smartmatch is
+# deprecated, and its numeric comparison treated every non-numeric answer
+# as 0, quitting instead of asking again.
+while (defined(my $answer = <STDIN>)) {
+   $answer =~ s/^\s+|\s+$//g;
+   if ($answer eq '1') {
+      generateSQL($filename);
+      print "SQL generation finished.\n";
+      exit 0;
+   } elsif ($answer eq '0') {
+      exit 0;
+   } else {
+      print "Press '1' to generate SQL statements, or '0' to quit.\n";
+   }
+}
+
+# STDIN was closed without a choice being made; stop instead of prompting
+# forever.
+exit 0;
+
+
+# Counts the lines of data/<name>.xml so that progress can be reported while
+# the file is processed. The file is read one line at a time rather than
+# being pulled into a list, because the dumps are too large to hold in
+# memory.
+#
+# Params:  $_[0] - name of the dump, without directory or ".xml" extension
+# Returns: the number of lines in the file (0 for an empty file)
+# Dies:    if the file cannot be opened
+sub countFile {
+   my ($name) = @_;
+   my $lineNum = 0;
+   open(my $in, '<', "data/".$name.".xml") or die "Can't open ".$name.": $!";
+   while (defined(my $line = <$in>)) {
+      ++$lineNum;
+   }
+   close($in);
+   return $lineNum;
+}
+
+# Splits data/<name>.xml into its individual patent documents and writes the
+# fields selected by $patentTwig and $citationTwig to details/<name>.xml and
+# citations/<name>.xml, each wrapped in a single <patents> root element.
+#
+# A dump is a concatenation of many XML documents, each starting with its
+# own <?xml> declaration. The input is therefore streamed line by line and
+# parsed one document at a time with XML::Twig (rather than loaded whole
+# with XML::Simple), because the files are far too big to fit in memory.
+#
+# Params:  $_[0] - name of the dump, without directory or ".xml" extension
+#          $_[1] - total number of lines, used only for progress output
+# Dies:    if the input or either output file cannot be opened or written,
+#          or if XML::Twig rejects a buffered document
+sub processFile {
+   my ($name, $numLines) = @_;
+   my $buffer = "";
+   my $currentLine = 1;
+
+   open(my $in, '<', "data/".$name.".xml") or die "Can't open ".$name.": $!";
+   # '>' truncates whatever a previous run left behind.
+   open(my $detailsFile, '>', "details/".$name.".xml") or die "Can't output ".$name.": $!";
+   open(my $citationsFile, '>', "citations/".$name.".xml") or die "Can't output ".$name.": $!";
+
+   # Open the shared root element up front so the output is well-formed even
+   # when the input contains no documents at all.
+   print $detailsFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>";
+   print $citationsFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>\n";
+
+   # Runs the buffered document through both twigs, appends the results to
+   # the output files and empties the buffer. Does nothing when the buffer
+   # holds no content, so an empty string is never handed to the parser.
+   my $flush = sub {
+      return unless $buffer =~ /\S/;
+      $patentTwig->parse($buffer);
+      $patentTwig->print($detailsFile);
+      $citationTwig->parse($buffer);
+      $citationTwig->print($citationsFile);
+      $buffer = "";
+   };
+
+   while (defined(my $line = <$in>)) {
+      print "Processing line ".($currentLine++)."/".$numLines."...\n";
+
+      if ($line =~ /^\<\?xml/) {
+         # A new <?xml> declaration means the document buffered so far is
+         # complete. The declaration itself is not kept.
+         $flush->();
+      } elsif ($line !~ /^\<\!/ && $line !~ /^\]\>/) {
+         # Lines starting with "<!" (DOCTYPE/ENTITY) or "]>" are dropped.
+         # Named entities are replaced with a placeholder - presumably
+         # because their declarations are dropped along with the DOCTYPE and
+         # the parser would otherwise reject them.
+         (my $string = $line) =~ s/\&[a-zA-Z0-9]*;/zonk/g;
+         $buffer .= $string;
+      }
+   }
+
+   # The last document has no following <?xml> line to trigger a flush, so
+   # it has to be written out explicitly.
+   $flush->();
+
+   print $detailsFile "</patents>";
+   print $citationsFile "</patents>";
+
+   close($detailsFile) or die "Can't output ".$name.": $!";
+   close($citationsFile) or die "Can't output ".$name.": $!";
+   close($in);
+}
+
+# Generates sql/<name>.sql from details/<name>.xml and citations/<name>.xml:
+# the CREATE TABLE statements for `patent` and `patent_cite`, followed by one
+# multi-row INSERT per table. An INSERT is omitted when it would have no
+# rows.
+#
+# Both XML files are loaded whole with XML::Simple, so they must fit in
+# memory.
+#
+# Params:  $_[0] - name of the dump, without directory or ".xml" extension
+# Dies:    if the SQL file cannot be opened or written; XML::Simple reports
+#          its own errors for unreadable or malformed input
+sub generateSQL {
+   my ($name) = @_;
+
+   # ForceArray keeps PATDOC and citing as array references even when only
+   # one such element is present, so the loops below always have a list.
+   my $xml = XML::Simple->new(KeyAttr => [], ForceArray => ['PATDOC', 'citing']);
+
+   # Turns a value into an SQL literal: NULL for a missing or non-scalar
+   # value, otherwise a quoted string with backslashes and quotes escaped.
+   my $quote = sub {
+      my ($value) = @_;
+      return 'NULL' if !defined $value || ref $value;
+      $value =~ s/\\/\\\\/g;
+      $value =~ s/'/''/g;
+      return "'".$value."'";
+   };
+
+   # Writes one INSERT statement: $header followed by the comma-separated
+   # tuples built from @rows (array references of raw values). Writing the
+   # separators here removes the need for a dummy trailing tuple.
+   my $sqlFile;
+   my $writeInsert = sub {
+      my ($header, @rows) = @_;
+      return unless @rows;
+      my @tuples;
+      foreach my $row (@rows) {
+         push @tuples, "\n\t(".join(", ", map { $quote->($_) } @$row).")";
+      }
+      print $sqlFile $header.join(",", @tuples).";\n\n";
+   };
+
+   # Returns the list of PATDOC records in a parsed file, or an empty list
+   # when the file held none.
+   my $records = sub {
+      my ($parsed) = @_;
+      return () unless ref $parsed eq 'HASH' && ref $parsed->{'PATDOC'} eq 'ARRAY';
+      return @{$parsed->{'PATDOC'}};
+   };
+
+   my $details = $xml->XMLin("details/".$name.".xml");
+
+   # '>' truncates whatever a previous run left behind.
+   open($sqlFile, '>', "sql/".$name.".sql") or die "Can't output SQL ".$name.": $!";
+
+   print $sqlFile "CREATE TABLE IF NOT EXISTS `patent`\n(\n\t`pid` INT NOT NULL AUTO_INCREMENT,\n\t`pubNum` VARCHAR(32),\n\t`pubDate` DATETIME,\n\t`appNum` VARCHAR(32),\n\t`appDate` DATETIME,\n\tPRIMARY KEY(`pid`)\n);\n\nCREATE TABLE IF NOT EXISTS `patent_cite`\n(\n\t`citing_id` VARCHAR(32) NOT NULL,\n\t`cited_id` VARCHAR(32) NOT NULL,\n\tPRIMARY KEY (`citing_id`, `cited_id`)\n);\n\n";
+
+   my @patentRows;
+   foreach my $e ($records->($details)) {
+      push @patentRows, [ @{$e}{qw(pubnum pubdate appnum appdate)} ];
+   }
+   $writeInsert->("INSERT INTO `patent` (`pubNum`, `pubDate`, `appNum`, `appDate`) VALUES", @patentRows);
+
+   my $citations = $xml->XMLin("citations/".$name.".xml");
+
+   # One (record number, listed patent number) pair per 'citing' entry.
+   # Pairs with a missing half are skipped because both columns are
+   # NOT NULL.
+   my @citeRows;
+   foreach my $f ($records->($citations)) {
+      my $pubNum = $f->{'pubnum'};
+      next if !defined $pubNum || ref $pubNum;
+      foreach my $cited (@{ $f->{'citing'} || [] }) {
+         next if !defined $cited || ref $cited;
+         push @citeRows, [ $pubNum, $cited ];
+      }
+   }
+   $writeInsert->("INSERT INTO `patent_cite` (`citing_id`, `cited_id`) VALUES", @citeRows);
+
+   close($sqlFile) or die "Can't output SQL ".$name.": $!";
+}