new
This commit is contained in:
parent
e4d86d9cdf
commit
113951ba39
3 changed files with 92 additions and 45 deletions
|
@ -3,28 +3,102 @@
|
||||||
use 5.010;
|
use 5.010;
|
||||||
use strict;
|
use strict;
|
||||||
use warnings;
|
use warnings;
|
||||||
use XML::Parser;
|
use XML::Simple;
|
||||||
use XML::SimpleObject;
|
use XML::Twig;
|
||||||
|
use Time::Piece;
|
||||||
|
|
||||||
my( $filename ) = @ARGV;
|
my $filename = $ARGV[0];
|
||||||
|
|
||||||
open IN, '<', $filename or die;
|
# Reprints only the following values from each entry in the .xml file:
|
||||||
my @contents = <IN>;
|
# - application date
|
||||||
close IN;
|
# - application number
|
||||||
|
# - publication date
|
||||||
|
# - publication number
|
||||||
|
my $twig = new XML::Twig(TwigRoots => {'application-reference/document-id/date' => 1,
|
||||||
|
'application-reference/document-id/doc-number' => 2,
|
||||||
|
'publication-reference/document-id/date' => 3,
|
||||||
|
'publication-reference/document-id/doc-number' => 4},
|
||||||
|
TwigHandlers => {'application-reference/document-id/date' => sub { $_->set_tag( 'application-date') },
|
||||||
|
'application-reference/document-id/doc-number' => sub { $_->set_tag( 'application-number') },
|
||||||
|
'publication-reference/document-id/date' => sub { $_->set_tag( 'publication-date') },
|
||||||
|
'publication-reference/document-id/doc-number' => sub { $_->set_tag( 'publication-number') }},
|
||||||
|
pretty_print => 'indented');
|
||||||
|
|
||||||
@contents = grep !/^\<\?xml/igm, @contents;
|
|
||||||
|
|
||||||
open OUT, '>', $filename or die;
|
my $numLines = countFile($filename);
|
||||||
print OUT @contents;
|
print "Processing $numLines lines...\n";
|
||||||
close OUT;
|
processFile($filename, $numLines);
|
||||||
|
print "File processing finished - press '1' to generate SQL statements, or '0' to quit.\n";
|
||||||
|
while (1) {
|
||||||
|
given (<STDIN>) {
|
||||||
|
when(1) {
|
||||||
|
generateSQL($filename);
|
||||||
|
print "SQL generation finished.\n";
|
||||||
|
exit 0;
|
||||||
|
}
|
||||||
|
when(0) { exit 0; }
|
||||||
|
default { print "Press '1' to generate SQL statements, or '0' to quit.\n"; }
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
#
|
sub countFile {
|
||||||
#my $file = 'ipg150106.xml';
|
my $lineNum = 0;
|
||||||
|
open(INFILE, $_[0].".xml") or die "Can't open ".$_[0].": $!";
|
||||||
|
foreach(<INFILE>) {
|
||||||
|
++$lineNum;
|
||||||
|
}
|
||||||
|
close(INFILE);
|
||||||
|
return $lineNum;
|
||||||
|
}
|
||||||
|
|
||||||
#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
|
sub processFile {
|
||||||
#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
|
my $buffer = "";
|
||||||
|
my $firstItem = 1;
|
||||||
|
my $currentLine = 1;
|
||||||
|
open(INFILE, $_[0].".xml") or die "Can't open ".$_[0].": $!";
|
||||||
|
open(my $finalFile, ">".$_[0]."FINAL.xml") or die "Can't clear FINAL ".$_[0].": $!";
|
||||||
|
print $finalFile "";
|
||||||
|
close($finalFile);
|
||||||
|
open(my $finalFile, ">>".$_[0]."FINAL.xml") or die "Can't output FINAL ".$_[0].": $!";
|
||||||
|
print$finalFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>\n";
|
||||||
|
foreach(<INFILE>) {
|
||||||
|
print "Processing line ".$currentLine."/".$_[1]."...\n";
|
||||||
|
++$currentLine;
|
||||||
|
my($line) = $_;
|
||||||
|
if ($line !~ /^\<\?xml/ && $firstItem == 0) {
|
||||||
|
if ($line !~ /^\<\!DOCTYPE/) {
|
||||||
|
$buffer = $buffer.$line;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if ($firstItem == 1) {
|
||||||
|
$firstItem = 0;
|
||||||
|
} else {
|
||||||
|
$twig->parse($buffer);
|
||||||
|
$twig->print($finalFile);
|
||||||
|
|
||||||
#foreach my $patent ($xso->child('us-patent-grant')) {
|
$buffer = "";
|
||||||
#print $patent->child('invention-title')->{VALUE};
|
}
|
||||||
# print "\n";
|
}
|
||||||
#}
|
}
|
||||||
|
print $finalFile "</patents>";
|
||||||
|
close($finalFile);
|
||||||
|
close(INFILE);
|
||||||
|
}
|
||||||
|
|
||||||
|
sub generateSQL {
|
||||||
|
my $xml = new XML::Simple (KeyAttr=>[]);
|
||||||
|
|
||||||
|
# read XML file
|
||||||
|
my $data = $xml->XMLin($_[0]."FINAL.xml");
|
||||||
|
|
||||||
|
open(my $sqlFile, ">".$_[0]."SQL.xml") or die "Can't output SQL ".$_[0].": $!";
|
||||||
|
print $sqlFile "";
|
||||||
|
close($sqlFile);
|
||||||
|
|
||||||
|
open(my $sqlFile, ">>".$_[0]."SQL.xml") or die "Can't output SQL ".$_[0].": $!";
|
||||||
|
print $sqlFile "CREATE TABLE IF NOT EXISTS `patents`\n(\n\t`pid` INT NOT NULL AUTO_INCREMENT,\n\tPRIMARY KEY(`pid`),\n\t`pubNum` VARCHAR(32),\n\t`pubDate` DATETIME,\n\t`appNum` VARCHAR(32),\n\t`appDate` DATETIME\n);\n\n";
|
||||||
|
foreach my $e (@{$data->{'us-patent-grant'}}) {
|
||||||
|
print $sqlFile "INSERT INTO `patents`(`pubNum`, `pubDate`, `appNum`, `appDate`) VALUES ('".$e->{'publication-number'}."', '".$e->{'publication-date'}."', '".$e->{'application-number'}."', '".$e->{'application-date'}."');\n";
|
||||||
|
}
|
||||||
|
close($sqlFile);
|
||||||
|
}
|
||||||
|
|
27
step1.pl
27
step1.pl
|
@ -1,27 +0,0 @@
|
||||||
#!/usr/bin/perl
|
|
||||||
|
|
||||||
use 5.010;
|
|
||||||
use strict;
|
|
||||||
use warnings;
|
|
||||||
|
|
||||||
my($filename) = @ARGV;
|
|
||||||
|
|
||||||
my $linesNum = 0;
|
|
||||||
open(FILE, $filename) or die "Can't open `$filename': $!";
|
|
||||||
while (sysread FILE, $buffer, 4096) {
|
|
||||||
$linesNum += ($buffer =~ tr/\n//);
|
|
||||||
}
|
|
||||||
close FILE;
|
|
||||||
open(MYOUTFILE, ">".$filename."CLEAN.xml");
|
|
||||||
print MYOUTFILE "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE us-patent-grant SYSTEM \"us-patent-grant-v45-2014-04-03.dtd\" [ ]>\n";
|
|
||||||
close(MYOUTFILE);
|
|
||||||
|
|
||||||
open(MYINPUTFILE, "<".$filename.".xml");
|
|
||||||
open(MYOUTFILE, ">>".$filename."CLEAN.xml");
|
|
||||||
while(<MYINPUTFILE>) {
|
|
||||||
my($line) = $_;
|
|
||||||
chomp($line);
|
|
||||||
if ($line =~ /^\<\?xml/) {
|
|
||||||
print MYOUTFILE $line;
|
|
||||||
}
|
|
||||||
}
|
|
Reference in a new issue