-
Notifications
You must be signed in to change notification settings - Fork 1
/
PDFCreatePlugin.php
102 lines (84 loc) · 3.38 KB
/
PDFCreatePlugin.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
<?php
/**
* PDFCreate Plugin - creates OCR'd PDFs from TIFFs
*
* @author Joe Corall <jcorall@kent.edu>
*
* @todo integrate with Google’s Cloud Vision API to offer an alternative to the tesseract requirement
* @todo consider deleting individual OCR'd PDFs after aggregated PDF is created - maybe make it an admin UI setting?
* @todo add link to PDF in Files
* @todo possibly check whether the TIFF's OCR output contains any natural language before adding the OCR text to the metadata
* @todo allow admin to specify criteria on what TIFFs to create PDFs from based on a metadata search
*/
// Root directory for per-file OCR output; the plugin keeps one sub-directory per item ID beneath it.
define('PDF_CREATE_OCR_DIR', implode(DIRECTORY_SEPARATOR, array(FILES_DIR, 'ocr')));
// Directory holding the aggregated PDFs, one "<item id>.pdf" per item.
define('PDF_CREATE_PDF_DIR', implode(DIRECTORY_SEPARATOR, array(FILES_DIR, 'pdfs')));
/**
 * Omeka plugin that queues OCR/PDF generation when an item is saved and
 * cleans up the generated files when a file or an item is deleted.
 *
 * Generated artefacts live in two places:
 *  - PDF_CREATE_OCR_DIR/<item id>/   per-file OCR output (*.pdf, *.txt)
 *  - PDF_CREATE_PDF_DIR/<item id>.pdf   the aggregated PDF for the item
 */
class PDFCreatePlugin extends Omeka_Plugin_AbstractPlugin
{
    protected $_hooks = array(
        'after_delete_item',
        'after_delete_file',
        'after_save_item',
        'install',
    );

    /**
     * Create the directories the plugin stores its PDFs in.
     *
     * Directories that already exist (e.g. on re-install) are left alone.
     *
     * @throws RuntimeException if a directory is missing and cannot be created
     */
    public function hookInstall()
    {
        foreach (array(PDF_CREATE_OCR_DIR, PDF_CREATE_PDF_DIR) as $dir) {
            // The trailing is_dir() covers the case where another process
            // created the directory between our check and our mkdir().
            if (!is_dir($dir) && !mkdir($dir) && !is_dir($dir)) {
                throw new RuntimeException("PDFCreate could not create directory: $dir");
            }
        }
    }

    /**
     * Queue a long-running background job that generates any PDFs needed
     * for the item that was just saved.
     *
     * Exceptions raised by the job dispatcher propagate to the caller.
     *
     * @param array $args hook arguments; $args['record'] is the saved item
     */
    public function hookAfterSaveItem($args)
    {
        $item = $args['record'];

        $jobDispatcher = Zend_Registry::get('job_dispatcher');
        $jobDispatcher->setQueueNameLongRunning('pdfcreate_ocr');

        // Pass only the item ID: the job runs separately and loads the item
        // itself rather than relying on an item object being handed over.
        $jobDispatcher->sendLongRunning('PDFCreate_OCR', array(
            'item_id' => $item->id,
        ));
    }

    /**
     * Remove the OCR'd PDF belonging to a deleted file and, if one was
     * removed, the item's aggregated PDF as well.
     *
     * @param array $args hook arguments; $args['record'] is the deleted file
     */
    public function hookAfterDeleteFile($args)
    {
        $file = $args['record'];

        $ocrFile = $this->get_ocr_path($file);
        if (!is_file($ocrFile)) {
            // No OCR output for this file, so nothing of ours to clean up.
            return;
        }
        unlink($ocrFile);

        // The aggregated PDF included the page we just removed; drop it too.
        $this->_removeAggregatedPdf($file->item_id);
    }

    /**
     * Remove everything the plugin generated for a deleted item: the item's
     * OCR directory (its *.pdf and *.txt files) and the aggregated PDF.
     *
     * @param array $args hook arguments; $args['record'] is the deleted item
     */
    public function hookAfterDeleteItem($args)
    {
        $item = $args['record'];

        // Directory where the OCR'd PDFs for this item's files may be stored.
        $ocrDir = PDF_CREATE_OCR_DIR . DIRECTORY_SEPARATOR . $item->id;
        if (is_dir($ocrDir)) {
            foreach (array('pdf', 'txt') as $extension) {
                $matches = glob("$ocrDir/*.$extension");
                // glob() returns false on error; only iterate real results.
                if ($matches) {
                    array_map('unlink', $matches);
                }
            }
            rmdir($ocrDir);
        }

        // Remove the aggregated PDF even when the OCR directory is already
        // gone, so it is never left orphaned.
        $this->_removeAggregatedPdf($item->id);
    }

    /**
     * Build the path of the OCR'd PDF that corresponds to a file.
     *
     * The PDF lives in a directory named after the file's item ID and is
     * named after the part of the original filename before its first ".".
     *
     * NOTE(review): because only the text before the FIRST dot is kept,
     * "scan.1.tiff" and "scan.2.tiff" both map to "scan.pdf". Left unchanged
     * here because code outside this file may rely on the same naming.
     *
     * @param object $file record exposing item_id and original_filename
     * @return string path to the OCR'd PDF for $file
     */
    public function get_ocr_path($file)
    {
        $ocrDir = PDF_CREATE_OCR_DIR . DIRECTORY_SEPARATOR . $file->item_id;

        $nameParts = explode('.', $file->original_filename);
        $baseName = array_shift($nameParts);

        return $ocrDir . DIRECTORY_SEPARATOR . $baseName . '.pdf';
    }

    /**
     * Delete the aggregated PDF for an item if it exists.
     *
     * @param int|string $itemId ID of the item whose aggregated PDF to remove
     */
    private function _removeAggregatedPdf($itemId)
    {
        $pdf = PDF_CREATE_PDF_DIR . DIRECTORY_SEPARATOR . $itemId . '.pdf';
        if (is_file($pdf)) {
            unlink($pdf);
        }
    }
}