This commit is contained in:
Rumperuu 2016-06-02 10:36:02 +01:00
parent e4d86d9cdf
commit 113951ba39
3 changed files with 92 additions and 45 deletions

View File

View File

@@ -3,28 +3,102 @@
use 5.010;
use strict;
use warnings;
use XML::Parser;
use XML::SimpleObject;
use XML::Simple;
use XML::Twig;
use Time::Piece;
# The input file name is the first command-line argument.
my ($filename) = @ARGV;
defined $filename or die "Usage: $0 <filename>\n";

# Read the whole input into memory, one array element per line.
# NOTE(review): this opens $filename verbatim, whereas countFile and
# processFile append ".xml" to the same value - confirm which naming is
# intended.
open my $in, '<', $filename or die "Can't open $filename: $!";
my @contents = <$in>;
close $in;
# Reprints only the following values from each entry in the .xml file:
# - application date
# - application number
# - publication date
# - publication number
#
# TwigRoots restricts the tree that XML::Twig builds to these four element
# paths; the TwigHandlers then rename each kept element to a flat tag name
# (application-date, application-number, publication-date,
# publication-number). Inside a handler $_ is the element being handled.
# NOTE(review): the TwigRoots values 1-4 are kept as they were; XML::Twig
# presumably only needs a true value here - confirm before relying on the
# numbering.
my $twig = XML::Twig->new(
    TwigRoots => {
        'application-reference/document-id/date'       => 1,
        'application-reference/document-id/doc-number' => 2,
        'publication-reference/document-id/date'       => 3,
        'publication-reference/document-id/doc-number' => 4,
    },
    TwigHandlers => {
        'application-reference/document-id/date'       => sub { $_->set_tag( 'application-date') },
        'application-reference/document-id/doc-number' => sub { $_->set_tag( 'application-number') },
        'publication-reference/document-id/date'       => sub { $_->set_tag( 'publication-date') },
        'publication-reference/document-id/doc-number' => sub { $_->set_tag( 'publication-number') },
    },
    pretty_print => 'indented',
);
# Drop every XML declaration line from the in-memory copy of the input.
@contents = grep { !/^<\?xml/i } @contents;

# Extract the tracked fields of every patent into "<filename>FINAL.xml".
my $numLines = countFile($filename);
print "Processing $numLines lines...\n";
processFile($filename, $numLines);

# Ask whether SQL should be generated from the extracted data.
print "File processing finished - press '1' to generate SQL statements, or '0' to quit.\n";
while (1) {
    my $answer = <STDIN>;

    # End of input (closed pipe, Ctrl-D): nothing more can ever be read, so
    # quit instead of repeating the prompt forever.
    exit 0 if !defined $answer;

    # Compare as text so that arbitrary non-numeric input is not mistaken
    # for '0'.
    $answer =~ s/^\s+|\s+$//g;
    if ($answer eq '1') {
        generateSQL($filename);
        print "SQL generation finished.\n";
        exit 0;
    }
    elsif ($answer eq '0') {
        exit 0;
    }
    else {
        print "Press '1' to generate SQL statements, or '0' to quit.\n";
    }
}

# NOTE(review): every path out of the loop above calls exit, so the write-back
# below is never reached. It would overwrite the input file with the filtered
# lines - confirm whether it should be removed or moved before the prompt.
open my $out, '>', $filename or die "Can't write $filename: $!";
print {$out} @contents;
close $out or die "Can't close $filename: $!";
# Counts the lines of "<base>.xml".
#
#   $_[0] - path of the input file without its ".xml" extension
#
# Returns the number of lines; a final line without a trailing newline still
# counts as a line. Dies if the file cannot be opened.
sub countFile {
    my ($baseName) = @_;
    my $lineNum = 0;

    open(my $in, '<', $baseName.".xml") or die "Can't open ".$baseName.": $!";

    # Read one line at a time so the whole file is never held in memory.
    while (my $line = <$in>) {
        ++$lineNum;
    }
    close($in);

    return $lineNum;
}
#
#my $file = 'ipg150106.xml';
# Extracts the tracked fields of every document in "<base>.xml" and writes
# them, wrapped in a single <patents> root element, to "<base>FINAL.xml".
#
# The input is treated as a concatenation of XML documents, each introduced by
# its own "<?xml" declaration line. Declaration and "<!DOCTYPE" lines are
# dropped; all other lines are buffered until the next declaration (or the end
# of the file) and then parsed with the file-level $twig, whose output is
# appended to the result file.
#
#   $_[0] - path of the input file without its ".xml" extension
#   $_[1] - total number of input lines, used only in the progress messages
#
# Dies if either file cannot be opened or the output cannot be closed.
sub processFile {
    my ($baseName, $totalLines) = @_;
    my $buffer = "";
    my $currentLine = 1;

    open(my $in, '<', $baseName.".xml") or die "Can't open ".$baseName.": $!";
    # Opening with '>' truncates any previous result, so no separate "clear"
    # step is needed.
    open(my $finalFile, '>', $baseName."FINAL.xml") or die "Can't output FINAL ".$baseName.": $!";
    print $finalFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>\n";

    # Parses the buffered document, if there is one, and appends the reduced
    # twig to the result file. A buffer with no content is skipped so that the
    # parser is never handed an empty document.
    my $flush = sub {
        if ($buffer =~ /\S/) {
            $twig->parse($buffer);
            $twig->print($finalFile);
        }
        $buffer = "";
    };

    while (my $line = <$in>) {
        print "Processing line ".$currentLine."/".$totalLines."...\n";
        ++$currentLine;

        if ($line =~ /^\<\?xml/) {
            # A new declaration marks the end of the previous document.
            $flush->();
        }
        elsif ($line !~ /^\<\!DOCTYPE/) {
            $buffer .= $line;
        }
    }

    # The last document is not followed by another declaration, so it has to
    # be flushed explicitly; otherwise it would be lost.
    $flush->();

    print $finalFile "</patents>";
    close($finalFile) or die "Can't close FINAL ".$baseName.": $!";
    close($in);
}
#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
# Reads "<base>FINAL.xml" and writes a CREATE TABLE statement followed by one
# INSERT statement per <us-patent-grant> element to "<base>SQL.xml".
#
#   $_[0] - path prefix shared by the FINAL input and the SQL output file
#
# Dies if the output file cannot be opened or closed.
sub generateSQL {
    my ($baseName) = @_;

    # ForceArray keeps 'us-patent-grant' an array reference even when the file
    # holds exactly one patent; without it a single entry is folded into a
    # plain hash and the loop below would die.
    my $xml = XML::Simple->new(KeyAttr => [], ForceArray => ['us-patent-grant']);

    # read XML file
    my $data = $xml->XMLin($baseName."FINAL.xml");
    my $patents = (ref $data eq 'HASH' && $data->{'us-patent-grant'}) || [];

    # Opening with '>' truncates any previous output.
    open(my $sqlFile, '>', $baseName."SQL.xml") or die "Can't output SQL ".$baseName.": $!";
    print $sqlFile "CREATE TABLE IF NOT EXISTS `patents`\n(\n\t`pid` INT NOT NULL AUTO_INCREMENT,\n\tPRIMARY KEY(`pid`),\n\t`pubNum` VARCHAR(32),\n\t`pubDate` DATETIME,\n\t`appNum` VARCHAR(32),\n\t`appDate` DATETIME\n);\n\n";

    foreach my $e (@{$patents}) {
        # Column order must match the column list of the INSERT statement.
        my @values = map { _sqlLiteral($e->{$_}) }
            qw(publication-number publication-date application-number application-date);
        print $sqlFile "INSERT INTO `patents`(`pubNum`, `pubDate`, `appNum`, `appDate`) VALUES (".join(', ', @values).");\n";
    }

    close($sqlFile) or die "Can't close SQL ".$baseName.": $!";
}

# Renders one extracted value as an SQL literal: NULL when the value is
# missing or is not plain text (XML::Simple hands back a reference for an
# empty or repeated element), otherwise a single-quoted string with
# backslashes and single quotes escaped.
sub _sqlLiteral {
    my ($value) = @_;
    return 'NULL' if !defined $value || ref $value;
    $value =~ s/\\/\\\\/g;
    $value =~ s/'/''/g;
    return "'".$value."'";
}

View File

@@ -1,27 +0,0 @@
#!/usr/bin/perl
use 5.010;
use strict;
use warnings;
# The input file name is the first command-line argument.
my($filename) = @ARGV;
defined $filename or die "Usage: $0 <filename>\n";

# Count the newline characters of the input by reading it in 4096-byte
# chunks, so the file is never held in memory as a whole.
# NOTE(review): this opens $filename verbatim, while the code further down
# reads $filename.".xml" - confirm which naming is intended.
my $linesNum = 0;
open(my $countFh, '<', $filename) or die "Can't open `$filename': $!";
my $buffer;
while (1) {
    my $bytesRead = sysread $countFh, $buffer, 4096;
    defined $bytesRead or die "Can't read `$filename': $!";
    last if $bytesRead == 0;
    # tr/// with an empty replacement list only counts the matches.
    $linesNum += ($buffer =~ tr/\n//);
}
close $countFh;
# Create "<filename>CLEAN.xml", starting with a single XML declaration and
# DOCTYPE. Opening with '>' truncates any previous output, and the handle
# simply stays open for the lines written below.
open(my $cleanFh, '>', $filename."CLEAN.xml") or die "Can't write ".$filename."CLEAN.xml: $!";
print {$cleanFh} "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE us-patent-grant SYSTEM \"us-patent-grant-v45-2014-04-03.dtd\" [ ]>\n";

open(my $inputFh, '<', $filename.".xml") or die "Can't open ".$filename.".xml: $!";
while (my $line = <$inputFh>) {
    chomp($line);
    # NOTE(review): only lines that START with "<?xml" are copied, and without
    # their newline. Given the output name this looks inverted (presumably the
    # intent was to drop those lines and keep the rest) - confirm before use.
    if ($line =~ /^\<\?xml/) {
        print {$cleanFh} $line;
    }
}
close($inputFh);
close($cleanFh) or die "Can't close ".$filename."CLEAN.xml: $!";