new
This commit is contained in:
parent
e4d86d9cdf
commit
113951ba39
3 changed files with 92 additions and 45 deletions
|
@ -3,28 +3,102 @@
|
|||
use 5.010;
|
||||
use strict;
|
||||
use warnings;
|
||||
use XML::Parser;
|
||||
use XML::SimpleObject;
|
||||
use XML::Simple;
|
||||
use XML::Twig;
|
||||
use Time::Piece;
|
||||
|
||||
my( $filename ) = @ARGV;
|
||||
my $filename = $ARGV[0];
|
||||
|
||||
open IN, '<', $filename or die;
|
||||
my @contents = <IN>;
|
||||
close IN;
|
||||
# Reprints only the following values from each entry in the .xml file:
|
||||
# - application date
|
||||
# - application number
|
||||
# - publication date
|
||||
# - publication number
|
||||
my $twig = new XML::Twig(TwigRoots => {'application-reference/document-id/date' => 1,
|
||||
'application-reference/document-id/doc-number' => 2,
|
||||
'publication-reference/document-id/date' => 3,
|
||||
'publication-reference/document-id/doc-number' => 4},
|
||||
TwigHandlers => {'application-reference/document-id/date' => sub { $_->set_tag( 'application-date') },
|
||||
'application-reference/document-id/doc-number' => sub { $_->set_tag( 'application-number') },
|
||||
'publication-reference/document-id/date' => sub { $_->set_tag( 'publication-date') },
|
||||
'publication-reference/document-id/doc-number' => sub { $_->set_tag( 'publication-number') }},
|
||||
pretty_print => 'indented');
|
||||
|
||||
|
||||
@contents = grep !/^\<\?xml/igm, @contents;
|
||||
my $numLines = countFile($filename);
|
||||
print "Processing $numLines lines...\n";
|
||||
processFile($filename, $numLines);
|
||||
print "File processing finished - press '1' to generate SQL statements, or '0' to quit.\n";
|
||||
while (1) {
|
||||
given (<STDIN>) {
|
||||
when(1) {
|
||||
generateSQL($filename);
|
||||
print "SQL generation finished.\n";
|
||||
exit 0;
|
||||
}
|
||||
when(0) { exit 0; }
|
||||
default { print "Press '1' to generate SQL statements, or '0' to quit.\n"; }
|
||||
};
|
||||
}
|
||||
|
||||
open OUT, '>', $filename or die;
|
||||
print OUT @contents;
|
||||
close OUT;
|
||||
sub countFile {
|
||||
my $lineNum = 0;
|
||||
open(INFILE, $_[0].".xml") or die "Can't open ".$_[0].": $!";
|
||||
foreach(<INFILE>) {
|
||||
++$lineNum;
|
||||
}
|
||||
close(INFILE);
|
||||
return $lineNum;
|
||||
}
|
||||
|
||||
#
|
||||
#my $file = 'ipg150106.xml';
|
||||
sub processFile {
|
||||
my $buffer = "";
|
||||
my $firstItem = 1;
|
||||
my $currentLine = 1;
|
||||
open(INFILE, $_[0].".xml") or die "Can't open ".$_[0].": $!";
|
||||
open(my $finalFile, ">".$_[0]."FINAL.xml") or die "Can't clear FINAL ".$_[0].": $!";
|
||||
print $finalFile "";
|
||||
close($finalFile);
|
||||
open(my $finalFile, ">>".$_[0]."FINAL.xml") or die "Can't output FINAL ".$_[0].": $!";
|
||||
print$finalFile "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<patents>\n";
|
||||
foreach(<INFILE>) {
|
||||
print "Processing line ".$currentLine."/".$_[1]."...\n";
|
||||
++$currentLine;
|
||||
my($line) = $_;
|
||||
if ($line !~ /^\<\?xml/ && $firstItem == 0) {
|
||||
if ($line !~ /^\<\!DOCTYPE/) {
|
||||
$buffer = $buffer.$line;
|
||||
}
|
||||
} else {
|
||||
if ($firstItem == 1) {
|
||||
$firstItem = 0;
|
||||
} else {
|
||||
$twig->parse($buffer);
|
||||
$twig->print($finalFile);
|
||||
|
||||
$buffer = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
print $finalFile "</patents>";
|
||||
close($finalFile);
|
||||
close(INFILE);
|
||||
}
|
||||
|
||||
#my $parser = XML::Parser->new(ErrorContext => 2, Style => "Tree");
|
||||
#my $xso = XML::SimpleObject->new( $parser->parsefile($file) );
|
||||
sub generateSQL {
|
||||
my $xml = new XML::Simple (KeyAttr=>[]);
|
||||
|
||||
#foreach my $patent ($xso->child('us-patent-grant')) {
|
||||
#print $patent->child('invention-title')->{VALUE};
|
||||
# print "\n";
|
||||
#}
|
||||
# read XML file
|
||||
my $data = $xml->XMLin($_[0]."FINAL.xml");
|
||||
|
||||
open(my $sqlFile, ">".$_[0]."SQL.xml") or die "Can't output SQL ".$_[0].": $!";
|
||||
print $sqlFile "";
|
||||
close($sqlFile);
|
||||
|
||||
open(my $sqlFile, ">>".$_[0]."SQL.xml") or die "Can't output SQL ".$_[0].": $!";
|
||||
print $sqlFile "CREATE TABLE IF NOT EXISTS `patents`\n(\n\t`pid` INT NOT NULL AUTO_INCREMENT,\n\tPRIMARY KEY(`pid`),\n\t`pubNum` VARCHAR(32),\n\t`pubDate` DATETIME,\n\t`appNum` VARCHAR(32),\n\t`appDate` DATETIME\n);\n\n";
|
||||
foreach my $e (@{$data->{'us-patent-grant'}}) {
|
||||
print $sqlFile "INSERT INTO `patents`(`pubNum`, `pubDate`, `appNum`, `appDate`) VALUES ('".$e->{'publication-number'}."', '".$e->{'publication-date'}."', '".$e->{'application-number'}."', '".$e->{'application-date'}."');\n";
|
||||
}
|
||||
close($sqlFile);
|
||||
}
|
||||
|
|
27
step1.pl
27
step1.pl
|
@ -1,27 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use 5.010;
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
my($filename) = @ARGV;
|
||||
|
||||
my $linesNum = 0;
|
||||
open(FILE, $filename) or die "Can't open `$filename': $!";
|
||||
while (sysread FILE, $buffer, 4096) {
|
||||
$linesNum += ($buffer =~ tr/\n//);
|
||||
}
|
||||
close FILE;
|
||||
open(MYOUTFILE, ">".$filename."CLEAN.xml");
|
||||
print MYOUTFILE "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE us-patent-grant SYSTEM \"us-patent-grant-v45-2014-04-03.dtd\" [ ]>\n";
|
||||
close(MYOUTFILE);
|
||||
|
||||
open(MYINPUTFILE, "<".$filename.".xml");
|
||||
open(MYOUTFILE, ">>".$filename."CLEAN.xml");
|
||||
while(<MYINPUTFILE>) {
|
||||
my($line) = $_;
|
||||
chomp($line);
|
||||
if ($line =~ /^\<\?xml/) {
|
||||
print MYOUTFILE $line;
|
||||
}
|
||||
}
|
Reference in a new issue