Skip to content

Commit

Permalink
Moving non tei2html tools to pptools
Browse files Browse the repository at this point in the history
  • Loading branch information
jhellingman committed Jan 6, 2025
1 parent 12bca16 commit cece34d
Show file tree
Hide file tree
Showing 34 changed files with 4,440 additions and 0 deletions.
67 changes: 67 additions & 0 deletions pptools/broken-images.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/perl -w

#
# Test image files (JPEG, PNG, GIF) in directories.
#

use strict;
use warnings;
use File::Basename;
use Image::Magick;

sub list_recursively($);

# Walk $directory depth-first: every regular file found is passed to
# handle_file(), every sub-directory is descended into recursively.
#
# Parameters:
#   $directory - path of the directory to scan.
#
# Terminates the script with exit status 1 (after reporting on STDERR) when
# a directory cannot be opened.
sub list_recursively($) {
    my ($directory) = @_;

    # A lexical handle is private to this invocation, so nested calls can
    # never interfere with each other's directory handle.
    my $handle;
    unless (opendir($handle, $directory)) {
        print STDERR "Cannot open directory $directory!\n";
        exit 1;
    }

    # Read the directory, ignoring special entries "." and ".."
    my @files = grep { !/^\.\.?$/ } readdir($handle);

    closedir($handle);

    foreach my $file (@files) {
        # "/" is accepted as path separator by Perl on Windows as well as
        # on Unix, whereas a hard-coded "\\" only works on Windows.
        my $path = "$directory/$file";
        if (-f $path) {
            handle_file($path);
        } elsif (-d $path) {
            list_recursively($path);
        }
    }
}


# Try to load one image with Image::Magick and report the outcome on STDOUT.
#
# Parameters:
#   $file - path of the file to test. Files whose name does not end in a
#           recognised image extension are silently skipped.
#
# Prints "BROKEN <file>" when Read() returns a true value (PerlMagick
# methods return a status message when something went wrong) and
# "OK <file>" otherwise.
sub handle_file($) {
    my ($file) = @_;

    # Match the extension case-insensitively and accept ".jpeg" too, so
    # that names such as "SCAN.JPG" or "page.jpeg" are tested as well.
    return unless $file =~ m/\.(?:jpe?g|png|gif)$/i;

    my $image = Image::Magick->new;
    my $error = $image->Read($file);
    if ($error) {
        print "BROKEN $file\n";
    } else {
        print "OK $file\n";
    }
}


# Entry point: test the file or directory tree named by the first command
# line argument; without an argument the current directory is scanned.
sub main() {
    my $target = defined $ARGV[0] ? $ARGV[0] : ".";

    # Anything that is not a directory is handed over as a single file.
    unless (-d $target) {
        handle_file($target);
        return;
    }

    list_recursively($target);
}


main();
61 changes: 61 additions & 0 deletions pptools/checkArchives.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/perl -w

#
# Check the consistency of archive files.
#

use strict;
use warnings;
use File::Basename;
use File::Temp;

# File that collects the standard output of the archive test commands.
my $logFile = 'checkArchives.log';

# External tools used for testing. The 7-Zip location carries its own double
# quotes because it contains a space and is interpolated into a shell
# command line.
my $sevenZip = '"C:\Program Files\7-Zip\7z"';
# $sevenZip = "7z";
my $zip = 'zip';

main();

# Entry point: check every archive below the directory named by the first
# command line argument. Without an argument the current directory is used,
# instead of handing an undefined value to opendir().
sub main {
    my $directory = defined $ARGV[0] ? $ARGV[0] : ".";
    list_recursively($directory);
}

sub list_recursively($);

# Walk $directory depth-first: every regular file found is passed to
# handle_file(), every sub-directory is descended into recursively.
#
# Parameters:
#   $directory - path of the directory to scan.
#
# Terminates the script with exit status 1 (after reporting on STDERR) when
# a directory cannot be opened.
sub list_recursively($) {
    my ($directory) = @_;

    # A lexical handle is private to this invocation, so nested calls can
    # never interfere with each other's directory handle.
    my $handle;
    unless (opendir($handle, $directory)) {
        # Report directly on STDERR: this script does not define a
        # logError() routine, so calling it would abort with
        # "Undefined subroutine" instead of showing the message.
        print STDERR "Cannot open directory $directory!\n";
        exit 1;
    }

    # Read the directory, ignoring special entries "." and ".."
    my @files = grep { !/^\.\.?$/ } readdir($handle);

    closedir($handle);

    foreach my $file (@files) {
        my $path = "$directory/$file";
        if (-f $path) {
            handle_file($path);
        } elsif (-d $path) {
            list_recursively($path);
        }
    }
}


# Run the integrity test that belongs to the archive type of $file and
# append the tool's standard output to the log file.
#
# Parameters:
#   $file - path of the file to check; files that are neither .7z nor .zip
#           are ignored.
sub handle_file($) {
    my ($file) = @_;

    # Pick the shell command matching the file extension, if any.
    my $command;
    if ($file =~ m/^(.*)\.(7z)$/) {
        $command = "$sevenZip t \"$file\" 1>>$logFile";
    } elsif ($file =~ m/^(.*)\.(zip)$/) {
        $command = "$zip -T \"$file\" 1>>$logFile";
    }

    system ($command) if defined $command;
}
19 changes: 19 additions & 0 deletions pptools/cleanHtml.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

use strict;
use warnings;

# Filter: copy the input to STDOUT line by line, stripping XML-isms.
while (defined(my $line = <>)) {

    # Remove the explicit end tags of these elements, one tag name at a time.
    foreach my $tag (qw(link meta img hr)) {
        $line =~ s/<\/$tag>//g;
    }

    # Replace the self-closing line break by its plain form.
    $line =~ s/<br\/>/<br>/g;

    # Drop style elements without any content.
    $line =~ s/<style><\/style>//g;

    print $line;
}
35 changes: 35 additions & 0 deletions pptools/divn2div.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# divn2div.pl -- change from numbered to unnumbered TEI divs.

use strict;
use warnings;

# Usage: divn2div.pl <file>
#
# Copies <file> to STDOUT, replacing every numbered <divN ...> start tag by
# an unnumbered <div ...> start tag, preceded by the </div> end tags needed
# to close the divisions at the same or a deeper level that were opened
# before it.
my $inputFile = $ARGV[0];
die("Usage: divn2div.pl <file>\n") unless defined $inputFile;

# Three-argument open with a lexical handle: the file name is never
# interpreted as an open mode or a command.
open(my $input, '<', $inputFile) || die("Could not open $inputFile");

print STDERR "Handling $inputFile\n";

# Level of the most recently opened division; 0 means none opened yet in
# the current body, front or back.
my $previousLevel = 0;

while (my $remainder = <$input>) {

    # A new top-level section restarts the level bookkeeping.
    if ($remainder =~ m/<(body|front|back)(.*?)>/i) {
        $previousLevel = 0;
    }

    while ($remainder =~ m/<div([0-9])(.*?)>/i) {
        my $before = $`;
        my $level = $1;
        my $attrs = $2;
        $remainder = $';

        # Emit the text in front of the tag exactly once. Printing it inside
        # the close loop lost it when no division had to be closed and
        # repeated it when several had to be closed.
        print $before;

        # Close the previous division when it is at the same level, plus one
        # more for every level the previous division was deeper.
        my $close = $previousLevel - $level;
        $previousLevel = $level;
        print "</div>" x ($close + 1) if $close >= 0;

        print "<div$attrs>";
    }
    print $remainder;
}

close($input);
17 changes: 17 additions & 0 deletions pptools/extract-page.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# extract-page.pl

use strict;
use warnings;
use FindBin qw($Bin);

# Usage: extract-page.pl <file> [<page>]
#
# Runs the extract-page.xsl stylesheet on <file> with Saxon; when <page> is
# given it is passed to the stylesheet as parameter "n".
my $saxonJar = $Bin . "/lib/saxon9he.jar";
my $xsldir = $Bin . "/.."; # location of xsl stylesheets

my $filename = $ARGV[0];
my $page = $ARGV[1];

die "Usage: extract-page.pl <file> [<page>]\n" unless defined $filename;

# The command is run with the list form of system(), which hands each
# argument to the program as is. Paths containing spaces or shell
# metacharacters therefore need no quoting.
my @command = ("java", "-jar", $saxonJar, $filename, "$xsldir/extract-page.xsl");

# Only pass the page parameter when a page was actually requested; checking
# for definedness first avoids an "uninitialized value" warning.
if (defined $page && $page ne '') {
    push @command, "n=$page";
}

system(@command);
97 changes: 97 additions & 0 deletions pptools/extractPdf.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/perl -w

#
# Extract images from PDF files in directories.
#

use strict;
use warnings;
use File::Basename;
use Getopt::Long;

# External command line tools that do the actual work.
# See http://www.foolabs.com/xpdf/download.html -> http://www.xpdfreader.com/download.html
my $pdftopng = "pdftopng.exe";
my $pdfimages = "pdfimages.exe";

# Running number handed to the tools as their last argument; it is
# incremented before each PDF file is processed.
my $pdfcount = 1000;

# Settings that can be changed through the command line options.
my $showHelp = 0;
my $extractActualImages = 0;
my $resolutionDpi = 300;

GetOptions(
    'j'    => \$extractActualImages,
    'r=i'  => \$resolutionDpi,
    'q'    => \$showHelp,
    'help' => \$showHelp
);


# Show the usage summary and stop when help was requested.
if ($showHelp == 1) {
    print
        "extractPdf.pl -- Wrapper around pdfimages.exe to extract images from a PDF file.\n\n",
        "Usage: extractPdf.pl [-jq] [-r=i] <file>\n\n",
        "Options:\n",
        " j Extract actual embedded images.\n",
        " r=i Output resolution in DPI.\n",
        " q Print this help and exit.\n";

    exit(0);
}


sub list_recursively($);

# Walk $directory depth-first: every regular file found is passed to
# handle_file(), every sub-directory is descended into recursively.
#
# Parameters:
#   $directory - path of the directory to scan.
#
# Terminates the script with exit status 1 (after reporting on STDERR) when
# a directory cannot be opened.
sub list_recursively($) {
    my ($directory) = @_;

    # A lexical handle is private to this invocation, so nested calls can
    # never interfere with each other's directory handle.
    my $handle;
    unless (opendir($handle, $directory)) {
        print STDERR "Cannot open directory $directory!\n";
        exit 1;
    }

    # Read the directory, ignoring special entries "." and ".."
    my @files = grep { !/^\.\.?$/ } readdir($handle);

    closedir($handle);

    foreach my $file (@files) {
        # "/" is accepted as path separator by Perl on Windows as well as
        # on Unix, whereas a hard-coded "\\" only works on Windows.
        my $path = "$directory/$file";
        if (-f $path) {
            handle_file($path);
        } elsif (-d $path) {
            list_recursively($path);
        }
    }
}


# Extract the images of one PDF file with the external xpdf tools.
#
# Parameters:
#   $file - path of the file to process; files without a .pdf extension
#           (in any letter case) are silently skipped.
#
# Depending on $extractActualImages either pdftopng (render the pages at
# $resolutionDpi) or pdfimages (extract the embedded images) is run. The
# running number $pdfcount is incremented for every PDF and passed to the
# tool as its last argument.
sub handle_file($) {
    my ($file) = @_;

    return unless $file =~ m/\.pdf$/i;

    print "Extracting images from PDF: $file\n";
    $pdfcount++;

    # The list form of system() hands each argument to the tool as is, so a
    # file name containing spaces or shell metacharacters is handled
    # correctly by both tools (the pdftopng command line used to interpolate
    # the file name unquoted).
    my @command = $extractActualImages == 0
        ? ($pdftopng, "-r", $resolutionDpi, $file, $pdfcount)
        : ($pdfimages, "-j", "-list", $file, $pdfcount);

    system(@command);
}


# Entry point: process the file or directory tree named by the first
# command line argument left after option parsing; without one the current
# directory is scanned.
sub main() {
    my $target = $ARGV[0];
    $target = "." unless defined $target;

    # Directories are walked recursively, anything else is treated as a file.
    return list_recursively($target) if -d $target;

    handle_file($target);
}


main();
40 changes: 40 additions & 0 deletions pptools/fb2tei.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# fb2tei.pl -- convert FictionBook to TEI.

use strict;
use warnings;
use MIME::Base64;
use FindBin qw($Bin);

# Usage: fb2tei.pl <file>.fb2
#
# Transforms <file>.fb2 to <file>.xml with fb2tei.xsl, decodes all *.hex
# files found in the current directory, and finally runs tei2html.pl on
# the resulting XML file.
my $toolsdir = $Bin; # location of tools
my $xsldir = $toolsdir . "/.."; # location of xsl stylesheets

# The jar path is quoted so that a tools directory containing spaces does
# not break the shell command line.
my $saxon = "java -jar \"$toolsdir/lib/saxon9he.jar\"";

my $filename = $ARGV[0];
die "Usage: fb2tei.pl <file>.fb2\n" unless defined $filename;

# Derive the output name from the input name; refuse input without the
# expected extension instead of continuing with an undefined base name.
my ($basename) = $filename =~ /^(.*)\.fb2$/;
die "Not a .fb2 file: $filename\n" unless defined $basename;

# The shell is needed here for the output redirection, so every path is
# wrapped in double quotes.
system ("$saxon \"$filename\" \"$xsldir/fb2tei.xsl\" > \"$basename.xml\"");

# convert extracted .hex files to binary
my @files = glob("*.hex");
foreach my $file (@files) {
    convertFile($file);
}

system ("perl -S tei2html.pl -h \"$basename.xml\"");

# Decode a base64-encoded file into its binary form.
#
# Parameters:
#   $filename - path of the encoded file; it must end in ".hex". The decoded
#               data is written to the same path without that extension.
#
# Dies when the name does not end in ".hex", when either file cannot be
# opened, or when the output cannot be written completely.
sub convertFile($) {
    my $filename = shift;

    # Determine the output name first, so that nothing is read or written
    # when the input name does not have the expected extension.
    my ($outputFilename) = $filename =~ /^(.*)\.hex$/;
    die "Not a .hex file: $filename" unless defined $outputFilename;

    # Three-argument open with lexical handles: the file name is never
    # interpreted as an open mode or a command.
    open(my $in, '<', $filename) or die "Unable to open file: $filename";
    my $string = join("", <$in>);
    close($in);

    my $binary = decode_base64($string);

    open(my $out, '>:raw', $outputFilename) or die "Unable to open: $outputFilename";
    print {$out} $binary;

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Unable to write: $outputFilename";
}
55 changes: 55 additions & 0 deletions pptools/fixAnchors.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# fixAnchors.pl -- provide anchors with sequential numbers per page in TEI tagged files
#

use strict;
use warnings;

use Roman; # Roman.pm version 1.1 by OZAWA Sakuro <ozawa@aisoft.co.jp>
use SgmlSupport qw/getAttrVal/;

# Usage: fixAnchors.pl <file>
#
# Copies <file> to STDOUT, renumbering the <anchor> tags sequentially per
# page. The variables below are shared with handleAnchors().
my $inputFile = $ARGV[0];
my $currentPage = 0;    # value of the n attribute of the last <pb> seen
my $currentAnchor = 1;  # number the next anchor on this page will receive
my $prefix = "a";       # prefix of the generated anchor ids

die("Usage: fixAnchors.pl <file>\n") unless defined $inputFile;

# Three-argument open with a lexical handle: the file name is never
# interpreted as an open mode or a command.
open(my $input, '<', $inputFile) || die("Could not open $inputFile");

print STDERR "Fixing anchors in $inputFile\n";

while (my $remainder = <$input>) {
    while ($remainder =~ m/<pb(.*?)>/) {
        my $before = $`;
        my $attrs = $1;
        $remainder = $';

        # Text in front of the page break still belongs to the previous page.
        handleAnchors($before);

        # Use the saved copy of the attributes rather than $1, so the output
        # does not depend on the capture state after the call above.
        print "<pb$attrs>";

        # A new page starts: restart the anchor numbering and remember the
        # page number, converting Roman numerals to Arabic ones.
        $currentAnchor = 1;
        $currentPage = getAttrVal('n', $attrs);
        $currentPage = isroman($currentPage) ? arabic($currentPage) : $currentPage;
    }
    handleAnchors($remainder);
}

close($input);


# Print a piece of text, replacing every <anchor ...> tag in it by one that
# carries a generated id of the form <prefix><page>.<sequence number>.
#
# Parameters:
#   (unnamed) - the text to process.
#
# Reads $prefix and $currentPage and increments $currentAnchor for every
# anchor written. When the text following an anchor contains a line number
# element whose value differs from the anchor's sequence number, a warning
# is written to STDERR.
sub handleAnchors($) {
    my $text = shift;

    while ($text =~ m/<anchor(.*?)>/) {
        # Split the text around the matched tag using the match offsets.
        my $leading = substr($text, 0, $-[0]);
        $text = substr($text, $+[0]);

        print $leading, "<anchor id=$prefix$currentPage.$currentAnchor>";

        # Cross-check against a line number found in the rest of the text.
        my ($lineNum) = $text =~ m/<ab type=lineNum>([0-9]+)<\/ab>/;
        if (defined $lineNum && $lineNum != $currentAnchor) {
            print STDERR "WARNING: anchor at $currentPage.$currentAnchor doesn't match lineNum: $lineNum\n";
        }

        $currentAnchor++;
    }

    print $text;
}
Loading

0 comments on commit cece34d

Please sign in to comment.