-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Moving non tei2html tools to pptools
- Loading branch information
1 parent
12bca16
commit cece34d
Showing
34 changed files
with
4,440 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/usr/bin/perl -w | ||
|
||
# | ||
# Test image files (JPEG, PNG and GIF) in directories. | ||
# | ||
|
||
use strict; | ||
use warnings; | ||
use File::Basename; | ||
use Image::Magick; | ||
|
||
sub list_recursively($);

# Walk a directory tree and pass every regular file in it to handle_file().
# Sub-directories are descended into recursively.
#
# Parameters:
#   $directory - path of the directory to scan.
#
# Exits the script with status 1 when a directory cannot be opened.
sub list_recursively($) {
    my ($directory) = @_;

    # A lexical handle belongs to this call only; a bareword handle such as
    # DIRECTORY is a package global shared by every level of the recursion.
    opendir(my $dh, $directory) or do {
        print STDERR "Cannot open directory $directory: $!\n";
        exit 1;
    };

    # Read the directory, ignoring special entries "." and ".."
    my @entries = grep { !/^\.\.?$/ } readdir($dh);

    closedir($dh);

    foreach my $entry (@entries) {
        # Join with "/": Perl accepts it on Windows as well as on Unix,
        # whereas a backslash is an ordinary file name character outside
        # Windows and made both file tests below fail there.
        my $path = "$directory/$entry";

        if (-f $path) {
            handle_file($path);
        }
        elsif (-d $path) {
            list_recursively($path);
        }
    }
}
|
||
|
||
# Try to read one image file with Image::Magick and report the outcome.
#
# Files whose name does not end in a recognised image extension are skipped
# without output. For every other file one line is printed to STDOUT:
# "OK <file>" when Read() reported no error, "BROKEN <file>" otherwise.
#
# Parameters:
#   $file - path of the file to test.
sub handle_file($) {
    my ($file) = @_;

    # Compare the extension case-insensitively and accept ".jpeg" as well,
    # so that names like "SCAN.JPG" or "cover.jpeg" are not left untested.
    return unless $file =~ m/^.*\.(?:jpe?g|png|gif)$/i;

    # Plain method call instead of the indirect-object form
    # "new Image::Magick", which the parser can misread.
    my $image = Image::Magick->new;

    # A true return value from Read() is treated as a failure to load.
    my $error = $image->Read($file);
    if ($error) {
        print "BROKEN $file\n";
    }
    else {
        print "OK $file\n";
    }
    return;
}
|
||
|
||
# Entry point. The first command line argument names what to check; when it
# is absent the current directory is used. A directory is scanned
# recursively, anything else is handed to handle_file() directly.
sub main() {
    my $target = defined $ARGV[0] ? $ARGV[0] : ".";

    if (!-d $target) {
        handle_file($target);
    }
    else {
        list_recursively($target);
    }
}

main();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/perl -w | ||
|
||
# | ||
# Check the consistency of archive files. | ||
# | ||
|
||
use strict; | ||
use warnings; | ||
use File::Basename; | ||
use File::Temp; | ||
|
||
# External archiver commands used to test the archives.
my $zip = "zip";
my $sevenZip = "\"C:\\Program Files\\7-Zip\\7z\"";
# $sevenZip = "7z";

# File to which the output of the archiver commands is appended.
my $logFile = "checkArchives.log";

# Declare the prototype before main() is compiled, so the call inside it is
# not flagged as "called too early to check prototype".
sub list_recursively($);

main();

# Entry point. Scans the directory named by the first command line argument,
# or the current directory when no argument is given (without the default an
# undefined value would be passed on to opendir).
sub main {
    my $directory = defined $ARGV[0] ? $ARGV[0] : ".";
    list_recursively($directory);
}
|
||
sub list_recursively($);

# Walk a directory tree and pass every regular file in it to handle_file().
# Sub-directories are descended into recursively.
#
# Parameters:
#   $directory - path of the directory to scan.
#
# Exits the script with status 1 when a directory cannot be opened.
sub list_recursively($) {
    my ($directory) = @_;

    # Report the failure directly on STDERR: this script defines no
    # logError() routine, so calling one would abort with
    # "Undefined subroutine" instead of printing the message.
    # The lexical handle replaces the package-global bareword DIRECTORY.
    opendir(my $dh, $directory) or do {
        print STDERR "Cannot open directory $directory: $!\n";
        exit 1;
    };

    # Read the directory, ignoring special entries "." and ".."
    my @entries = grep { !/^\.\.?$/ } readdir($dh);

    closedir($dh);

    foreach my $entry (@entries) {
        my $path = "$directory/$entry";

        if (-f $path) {
            handle_file($path);
        }
        elsif (-d $path) {
            list_recursively($path);
        }
    }
}
|
||
|
||
# Test the integrity of one archive with the matching external tool.
#
# Files ending in ".7z" are tested with the command in $sevenZip, files
# ending in ".zip" with the command in $zip; all other files are ignored.
# The output of the tool is appended to $logFile, and a "FAILED <file>" line
# is printed to STDERR when the tool does not exit successfully.
#
# Parameters:
#   $file - path of the file to test.
sub handle_file($) {
    my ($file) = @_;

    my $command;
    if ($file =~ m/^.*\.7z$/) {
        $command = "$sevenZip t \"$file\"";
    }
    elsif ($file =~ m/^.*\.zip$/) {
        $command = "$zip -T \"$file\"";
    }
    else {
        return;    # not an archive type this script tests
    }

    # Send stderr to the log as well, so diagnostics of the tool are kept,
    # and look at the exit status instead of discarding it.
    my $status = system("$command >>$logFile 2>&1");
    if ($status != 0) {
        print STDERR "FAILED $file\n";
    }
    return;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
|
||
use strict; | ||
use warnings; | ||
|
||
# Filter: copy the input (files named on the command line, or STDIN) to
# STDOUT line by line, removing XML-style leftovers along the way.
while (my $line = <>) {

    # Remove the end tags of link, meta, img and hr elements, one element
    # type after the other.
    foreach my $element (qw(link meta img hr)) {
        $line =~ s/<\/$element>//g;
    }

    # Turn self-closing line breaks into plain ones.
    $line =~ s/<br\/>/<br>/g;

    # Drop style elements that have no content.
    $line =~ s/<style><\/style>//g;

    print $line;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# divn2div.pl -- change from numbered to unnumbered TEI divs. | ||
|
||
use strict; | ||
use warnings; | ||
|
||
# The file to convert is given as the first command line argument; the
# converted text is written to STDOUT.
my $inputFile = $ARGV[0];
defined $inputFile or die("Usage: divn2div.pl <inputfile>\n");

# Three-argument open with a lexical handle: the file name can no longer be
# interpreted as an open mode.
open(my $input, '<', $inputFile) || die("Could not open $inputFile: $!");

print STDERR "Handling $inputFile\n";

# Level of the most recently opened numbered div; 0 means none is open.
my $previousLevel = 0;

while (my $remainder = <$input>) {

    # A new body, front or back section starts without any open div.
    if ($remainder =~ m/<(body|front|back)(.*?)>/i) {
        $previousLevel = 0;
    }

    while ($remainder =~ m/<div([0-9])(.*?)>/i) {
        my $level = $1;
        my $attrs = $2;

        # Split the line around the matched tag using the match offsets.
        my $before = substr($remainder, 0, $-[0]);
        $remainder = substr($remainder, $+[0]);

        # The text in front of the tag is emitted exactly once. Printing it
        # inside the closing loop lost it whenever no div had to be closed
        # and duplicated it whenever more than one had to be closed.
        print $before;

        # Close the previous div of the same level and every deeper one.
        my $closeCount = $previousLevel - $level + 1;
        print "</div>" x $closeCount if $closeCount > 0;
        $previousLevel = $level;

        print "<div$attrs>";
    }
    print $remainder;
}

close($input);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# extract-page.pl | ||
|
||
use strict; | ||
use warnings; | ||
use FindBin qw($Bin); | ||
|
||
# Command that runs the Saxon jar shipped in the lib directory next to this
# script. Paths are quoted so that an installation directory containing
# spaces does not split them into several shell words.
my $saxon = "java -jar \"$Bin/lib/saxon9he.jar\"";
my $xsldir = $Bin . "/.."; # location of xsl stylesheets

# Arguments: the file to process and, optionally, a page value that is
# passed to the stylesheet as parameter "n".
my $filename = $ARGV[0];
my $page = $ARGV[1];

defined $filename or die "Usage: extract-page.pl <file> [<page>]\n";

# Test for a missing argument before comparing: "eq" on an undefined value
# raises an "uninitialized value" warning under "use warnings".
if (!defined $page || $page eq '') {
    system ("$saxon \"$filename\" \"$xsldir/extract-page.xsl\"");
} else {
    system ("$saxon \"$filename\" \"$xsldir/extract-page.xsl\" n=\"$page\"");
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/perl -w | ||
|
||
# | ||
# Extract images from PDF files in directories. | ||
# | ||
|
||
use strict; | ||
use warnings; | ||
use File::Basename; | ||
use Getopt::Long; | ||
|
||
# External extraction tools.
# See http://www.foolabs.com/xpdf/download.html -> http://www.xpdfreader.com/download.html
my $pdfimages = "pdfimages.exe";
my $pdftopng = "pdftopng.exe";

# Number handed to the tool as its last argument; incremented for every PDF
# file before the tool is run.
my $pdfcount = 1000;

# Settings that the command line options below can change.
my $resolutionDpi = 300;
my $extractActualImages = 0;
my $showHelp = 0;

GetOptions(
    'j'    => \$extractActualImages,
    'r=i'  => \$resolutionDpi,
    'q'    => \$showHelp,
    'help' => \$showHelp
);

# Print the usage summary and stop when help was requested.
if ($showHelp == 1) {
    print
        "extractPdf.pl -- Wrapper around pdfimages.exe to extract images from a PDF file.\n\n",
        "Usage: extractPdf.pl [-jq] [-r=i] <file>\n\n",
        "Options:\n",
        " j Extract actual embedded images.\n",
        " r=i Output resolution in DPI.\n",
        " q Print this help and exit.\n";

    exit(0);
}
|
||
|
||
sub list_recursively($);

# Walk a directory tree and pass every regular file in it to handle_file().
# Sub-directories are descended into recursively.
#
# Parameters:
#   $directory - path of the directory to scan.
#
# Exits the script with status 1 when a directory cannot be opened.
sub list_recursively($) {
    my ($directory) = @_;

    # A lexical handle belongs to this call only; a bareword handle such as
    # DIRECTORY is a package global shared by every level of the recursion.
    opendir(my $dh, $directory) or do {
        print STDERR "Cannot open directory $directory: $!\n";
        exit 1;
    };

    # Read the directory, ignoring special entries "." and ".."
    my @entries = grep { !/^\.\.?$/ } readdir($dh);

    closedir($dh);

    foreach my $entry (@entries) {
        # Join with "/": Perl accepts it on Windows as well as on Unix,
        # whereas a backslash is an ordinary file name character outside
        # Windows and made both file tests below fail there.
        my $path = "$directory/$entry";

        if (-f $path) {
            handle_file($path);
        }
        elsif (-d $path) {
            list_recursively($path);
        }
    }
}
|
||
|
||
# Run the configured extraction tool on one PDF file.
#
# Files whose name does not end in ".pdf" are ignored. For a PDF file the
# counter $pdfcount is incremented and passed to the tool as its last
# argument. When $extractActualImages is 0 the command in $pdftopng is run
# with the resolution from $resolutionDpi, otherwise the command in
# $pdfimages is run.
#
# Parameters:
#   $file - path of the file to process.
sub handle_file($) {
    my ($file) = @_;

    # The pattern already guarantees the extension, so no second comparison
    # of the captured extension is needed.
    return unless $file =~ m/^.*\.pdf$/;

    print "Extracting images from PDF: $file\n";

    $pdfcount++;
    if ($extractActualImages == 0) {
        # Quote the file name here as well, as the pdfimages call below
        # does; otherwise a path containing a space is split into several
        # arguments by the shell.
        system ("$pdftopng -r $resolutionDpi \"$file\" $pdfcount");
    }
    else {
        system ("$pdfimages -j -list \"$file\" $pdfcount");
    }
    return;
}
|
||
|
||
# Entry point. The first remaining command line argument names what to
# process; when it is absent the current directory is used. A directory is
# scanned recursively, anything else is handed to handle_file() directly.
sub main() {
    my $target = defined $ARGV[0] ? $ARGV[0] : ".";

    if (!-d $target) {
        handle_file($target);
    }
    else {
        list_recursively($target);
    }
}

main();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# fb2tei.pl -- convert FictionBook to TEI. | ||
|
||
use strict; | ||
use warnings; | ||
use MIME::Base64; | ||
use FindBin qw($Bin); | ||
|
||
my $toolsdir = $Bin; # location of tools
my $xsldir = $toolsdir . "/.."; # location of xsl stylesheets

# Paths are quoted so that directories or file names containing spaces are
# not split into several shell words.
my $saxon = "java -jar \"$toolsdir/lib/saxon9he.jar\"";

# Declare the prototype before the call below is compiled.
sub convertFile($);

# The input file must be given and must end in ".fb2"; the part in front of
# the extension names the generated XML file. Checking the match avoids
# continuing with an undefined base name.
my $filename = $ARGV[0];
my ($basename) = defined $filename ? $filename =~ /^(.*)\.fb2$/ : ();
defined $basename or die "Usage: fb2tei.pl <file>.fb2\n";

system ("$saxon \"$filename\" \"$xsldir/fb2tei.xsl\" > \"$basename.xml\"");

# convert extracted .hex files to binary
my @files = <*.hex>;
foreach my $file (@files) {
    convertFile($file);
}

system ("perl -S tei2html.pl -h \"$basename.xml\"");
|
||
# Decode a base64 encoded ".hex" file into a binary file.
#
# The output file has the same name as the input file without the trailing
# ".hex" and is written in raw mode.
#
# Parameters:
#   $filename - path of the base64 encoded file; must end in ".hex".
#
# Dies when the name does not end in ".hex" or when a file cannot be opened
# or written.
sub convertFile($) {
    my $filename = shift;

    # Derive the output name first. Without this check a name that does not
    # match would leave the output name undefined.
    my ($outputFilename) = $filename =~ /^(.*)\.hex$/;
    defined $outputFilename
        or die "Not a .hex file: $filename";

    # Three-argument open with lexical handles: the file name cannot be
    # mistaken for an open mode and the handles are not package globals.
    open(my $in, '<', $filename) or die "Unable to open file: $filename: $!";
    my $string = join("", <$in>);
    close($in);
    my $binary = decode_base64($string);

    open(my $out, '>:raw', $outputFilename) or die "Unable to open: $outputFilename: $!";
    print $out $binary;

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Unable to write: $outputFilename: $!";
    return;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# | ||
# fixAnchors.pl -- provide anchors with sequential numbers per page in TEI tagged files | ||
# | ||
|
||
use strict; | ||
use warnings; | ||
|
||
use Roman; # Roman.pm version 1.1 by OZAWA Sakuro <ozawa@aisoft.co.jp> | ||
use SgmlSupport qw/getAttrVal/; | ||
|
||
# State shared with handleAnchors(): the number of the page being processed,
# the sequence number of the next anchor on that page and the prefix that
# starts every generated anchor id.
my $inputFile = $ARGV[0];
my $currentPage = 0;
my $currentAnchor = 1;
my $prefix = "a";

# Declare the prototype before the calls below are compiled.
sub handleAnchors($);

defined $inputFile or die("Usage: fixAnchors.pl <inputfile>\n");

# Three-argument open with a lexical handle: the file name can no longer be
# interpreted as an open mode.
open(my $input, '<', $inputFile) || die("Could not open $inputFile: $!");

print STDERR "Fixing anchors in $inputFile\n";

while (my $remainder = <$input>) {
    while ($remainder =~ m/<pb(.*?)>/) {
        my $attrs = $1;

        # Split the line around the page break using the match offsets.
        my $before = substr($remainder, 0, $-[0]);
        $remainder = substr($remainder, $+[0]);

        # Anchors in front of the page break still belong to the old page.
        handleAnchors($before);

        # Print the saved attributes rather than $1, so the output does not
        # depend on the capture surviving the call above.
        print "<pb$attrs>";

        # A new page starts: restart the anchor count and take the page
        # number from the "n" attribute, converting Roman numerals.
        $currentAnchor = 1;
        $currentPage = getAttrVal('n', $attrs);
        $currentPage = isroman($currentPage) ? arabic($currentPage) : $currentPage;
    }
    handleAnchors($remainder);
}

close($input);
|
||
|
||
# Print a piece of text, replacing every <anchor ...> tag in it by an anchor
# whose id is built from $prefix, the current page and a running number.
# The original attributes of the tag are not copied to the output.
#
# When an <ab type=lineNum> element with a numeric value occurs anywhere in
# the text after an anchor, that value is compared with the running number
# and a warning is written to STDERR if the two differ.
#
# Parameters:
#   the text to process (a line or a part of a line).
sub handleAnchors($) {
    my $text = shift;

    while ($text =~ m/<anchor(.*?)>/) {
        # Text in front of the tag is copied unchanged; processing then
        # continues with whatever follows the tag.
        my $leading = substr($text, 0, $-[0]);
        $text = substr($text, $+[0]);

        print $leading;
        print "<anchor id=$prefix$currentPage.$currentAnchor>";

        if ($text =~ m/<ab type=lineNum>([0-9]+)<\/ab>/ && $1 != $currentAnchor) {
            my $lineNum = $1;
            print STDERR "WARNING: anchor at $currentPage.$currentAnchor doesn't match lineNum: $lineNum\n";
        }

        $currentAnchor++;
    }

    print $text;
}
Oops, something went wrong.