From 113951ba39007e5d241fc1b4fed35f56ff37aa54 Mon Sep 17 00:00:00 2001
From: Rumperuu
Date: Thu, 2 Jun 2016 10:36:02 +0100
Subject: [PATCH] new

---
Review notes (this section is not part of the commit message):

 * processFile now flushes the final record; previously a record was only
   parsed when the next "<?xml" line was seen, so the last one was lost.
 * Input is read line by line through lexical handles opened with 3-arg
   open, instead of foreach over a bareword handle (which loads the file).
 * Each output file is opened once; '>' already truncates.
 * The menu uses if/elsif instead of given/when and stops at end of input.
 * generateSQL passes ForceArray for 'us-patent-grant' so an input with a
   single record works, and quotes every value (NULL when missing).
 * The flattened copy of this patch had lost every "<...>" token. The
   handle reads were restored from the only handle open at each point.
   The wrapper written around the records is "<patents>" here; the
   original root name could not be recovered (XML::Simple drops the root
   name by default, so generateSQL does not depend on it). Indentation of
   removed lines and the first print in step1.pl are shown as recovered
   and may need adjusting to the real pre-image. The post-image blob id on
   the index line is the one from the original revision.

 firstfew.xml~     |   0
 patent_slurper.pl | 162 +++++++++++++++++++++++++++++++++++++++++-----
 step1.pl          |  27 --------
 3 files changed, 144 insertions(+), 45 deletions(-)
 delete mode 100644 firstfew.xml~
 delete mode 100644 step1.pl

diff --git a/firstfew.xml~ b/firstfew.xml~
deleted file mode 100644
index e69de29..0000000
diff --git a/patent_slurper.pl b/patent_slurper.pl
index 2ce5a1b..1281825 100644
--- a/patent_slurper.pl
+++ b/patent_slurper.pl
@@ -3,28 +3,154 @@
 use 5.010;
 use strict;
 use warnings;
-use XML::Parser;
-use XML::SimpleObject;
+use XML::Simple;
+use XML::Twig;
+use Time::Piece;
 
-my( $filename ) = @ARGV;
+# Base name of the input: "NAME.xml" is read; "NAMEFINAL.xml" and
+# "NAMESQL.xml" are written.
+my $filename = $ARGV[0];
+die "Usage: $0 NAME   (reads NAME.xml)\n"
+    unless defined $filename && length $filename;
 
-open IN, '<', $filename or die;
-my @contents = <IN>;
-close IN;
+# Elements kept from each record, keyed by their path, with the tag name each
+# one is given in the reduced output:
+# - application date
+# - application number
+# - publication date
+# - publication number
+my %tagFor = (
+    'application-reference/document-id/date'       => 'application-date',
+    'application-reference/document-id/doc-number' => 'application-number',
+    'publication-reference/document-id/date'       => 'publication-date',
+    'publication-reference/document-id/doc-number' => 'publication-number',
+);
+
+# One rename handler per kept element; each closure captures its own $tag.
+my %handlerFor;
+foreach my $path (keys %tagFor) {
+    my $tag = $tagFor{$path};
+    $handlerFor{$path} = sub { $_->set_tag($tag) };
+}
+
+my $twig = XML::Twig->new(TwigRoots    => { map { $_ => 1 } keys %tagFor },
+                          TwigHandlers => \%handlerFor,
+                          pretty_print => 'indented');
 
-@contents = grep !/^\<\?xml/igm, @contents;
+my $numLines = countFile($filename);
+print "Processing $numLines lines...\n";
+processFile($filename, $numLines);
+print "File processing finished - press '1' to generate SQL statements, or '0' to quit.\n";
+# Plain if/elsif instead of given/when (experimental, deprecated in newer
+# perls). The loop ends at end of input instead of prompting forever.
+while (defined(my $choice = <STDIN>)) {
+    $choice =~ s/^\s+|\s+$//g;
+    if ($choice eq '1') {
+        generateSQL($filename);
+        print "SQL generation finished.\n";
+        exit 0;
+    } elsif ($choice eq '0') {
+        exit 0;
+    } else {
+        print "Press '1' to generate SQL statements, or '0' to quit.\n";
+    }
+}
 
-open OUT, '>', $filename or die;
-print OUT @contents;
-close OUT;
+# Returns the number of lines in "NAME.xml".
+#   $_[0] - input base name (without the ".xml" extension)
+# Dies if the file cannot be opened.
+sub countFile {
+    my ($name) = @_;
+    my $lineNum = 0;
+    open(my $in, '<', $name.".xml") or die "Can't open ".$name.": $!";
+    # Read one line at a time; foreach over the handle would slurp the file.
+    while (defined(my $line = <$in>)) {
+        ++$lineNum;
+    }
+    close($in);
+    return $lineNum;
+}
 
-#
-#my $file = 'ipg150106.xml';
+# Reduces "NAME.xml" - a concatenation of XML documents, each starting with
+# its own "<?xml" line - to "NAMEFINAL.xml": one document whose root wraps the
+# reduced form of every record.
+#   $_[0] - input base name (without the ".xml" extension)
+#   $_[1] - total number of input lines, used only in progress messages
+# Dies if either file cannot be opened or the output cannot be closed.
+sub processFile {
+    my ($name, $totalLines) = @_;
+    my $buffer = "";
+    my $firstItem = 1;
+    my $currentLine = 1;
+    open(my $in, '<', $name.".xml") or die "Can't open ".$name.": $!";
+    # '>' truncates, so no separate clear-then-append pair of opens is needed.
+    open(my $finalFile, '>', $name."FINAL.xml") or die "Can't output FINAL ".$name.": $!";
+    print $finalFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>\n";
+
+    # Parses the buffered record, if any, and appends its reduced form.
+    my $flush = sub {
+        if ($buffer =~ /\S/) {
+            $twig->parse($buffer);
+            $twig->print($finalFile);
+        }
+        $buffer = "";
+    };
+
+    while (defined(my $line = <$in>)) {
+        print "Processing line ".$currentLine."/".$totalLines."...\n";
+        ++$currentLine;
+        if ($firstItem == 1) {
+            # The first line only opens the first record; nothing to flush.
+            $firstItem = 0;
+        } elsif ($line =~ /^\<\?xml/) {
+            # Start of the next record: the buffer holds a complete one.
+            $flush->();
+        } elsif ($line !~ /^\<\!DOCTYPE/) {
+            $buffer = $buffer.$line;
+        }
+    }
+    # The last record has no following "<?xml" line, so flush it here.
+    $flush->();
+    print $finalFile "</patents>\n";
+    close($finalFile) or die "Can't close FINAL ".$name.": $!";
+    close($in);
+}
 
-#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
-#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
+# Writes "NAMESQL.xml": a CREATE TABLE statement followed by one INSERT per
+# record found in "NAMEFINAL.xml".
+#   $_[0] - input base name (without the ".xml" extension)
+# Dies if the output file cannot be opened or closed.
+sub generateSQL {
+    my ($name) = @_;
+    # ForceArray keeps a lone <us-patent-grant> as a one-element list;
+    # otherwise XML::Simple folds it to a hash and the loop below would die.
+    my $xml = XML::Simple->new(KeyAttr => [], ForceArray => ['us-patent-grant']);
+
+    # read XML file
+    my $data = $xml->XMLin($name."FINAL.xml");
+    my $grants = [];
+    if (ref($data) eq 'HASH' && ref($data->{'us-patent-grant'}) eq 'ARRAY') {
+        $grants = $data->{'us-patent-grant'};
+    }
+
+    open(my $sqlFile, '>', $name."SQL.xml") or die "Can't output SQL ".$name.": $!";
+    print $sqlFile "CREATE TABLE IF NOT EXISTS `patents`\n(\n\t`pid` INT NOT NULL AUTO_INCREMENT,\n\tPRIMARY KEY(`pid`),\n\t`pubNum` VARCHAR(32),\n\t`pubDate` DATETIME,\n\t`appNum` VARCHAR(32),\n\t`appDate` DATETIME\n);\n\n";
+    foreach my $e (@{$grants}) {
+        next unless ref($e) eq 'HASH';
+        my @values = map { sqlQuote($e->{$_}) }
+            qw(publication-number publication-date application-number application-date);
+        print $sqlFile "INSERT INTO `patents`(`pubNum`, `pubDate`, `appNum`, `appDate`) VALUES (".join(", ", @values).");\n";
+    }
+    close($sqlFile) or die "Can't close SQL ".$name.": $!";
+}
 
-#foreach my $patent ($xso->child('us-patent-grant')) {
-    #print $patent->child('invention-title')->{VALUE};
-    # print "\n";
-#}
\ No newline at end of file
+# Returns its argument as a single-quoted SQL string literal with backslashes
+# and single quotes escaped, or NULL when the value is missing or not plain
+# text (XML::Simple represents an empty element as a hash reference).
+sub sqlQuote {
+    my ($value) = @_;
+    return 'NULL' if !defined $value || ref $value;
+    $value =~ s/\\/\\\\/g;
+    $value =~ s/'/''/g;
+    return "'".$value."'";
+}
diff --git a/step1.pl b/step1.pl
deleted file mode 100644
index 992d978..0000000
--- a/step1.pl
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/perl
-
-use 5.010;
-use strict;
-use warnings;
-
-my($filename) = @ARGV;
-
-my $linesNum = 0;
-open(FILE, $filename) or die "Can't open `$filename': $!";
-while (sysread FILE, $buffer, 4096) {
-    $linesNum += ($buffer =~ tr/\n//);
-}
-close FILE;
-open(MYOUTFILE, ">".$filename."CLEAN.xml");
-print MYOUTFILE "\n\n";
-close(MYOUTFILE);
-
-open(MYINPUTFILE, "<".$filename.".xml");
-open(MYOUTFILE, ">>".$filename."CLEAN.xml");
-while(<MYINPUTFILE>) {
-    my($line) = $_;
-    chomp($line);
-    if ($line =~ /^\<\?xml/) {
-        print MYOUTFILE $line;
-    }
-}
\ No newline at end of file