Skip to content

Commit

Permalink
Use the MiniOCR format to store fulltext for the ocr-highlighting plugin.
Browse files Browse the repository at this point in the history
This supports only ALTO 2.x.

A solution still needs to be found to support different namespaces. See #488
  • Loading branch information
Alexander Bigga committed Sep 21, 2021
1 parent fdad413 commit 6c92af8
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 4 deletions.
6 changes: 3 additions & 3 deletions Classes/Common/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -702,13 +702,13 @@ class_exists($class)
) {
// Load XML from file.
$rawTextXml = simplexml_load_string($fileContent);
$rawText = $obj->getRawText($rawTextXml);
$this->rawTextArray[$id] = $rawText;
$textMiniOcr = $obj->getTextAsMiniOcr($rawTextXml);
$this->rawTextArray[$id] = $textMiniOcr;
} else {
$this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"');
}
}
$fullText = $rawText;
$fullText = $textMiniOcr;
} else {
$this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
}
Expand Down
13 changes: 12 additions & 1 deletion Classes/Common/FulltextInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* @access public
* @abstract
*/
//TODO: check if this is still needed when actually full text xml is indexed

interface FulltextInterface
{
/**
Expand All @@ -34,4 +34,15 @@ interface FulltextInterface
* @return string The raw unformatted fulltext
*/
public function getRawText(\SimpleXMLElement $xml);

/**
* This extracts the fulltext data from ALTO XML and returns it in MiniOCR format
*
* @access public
*
* @param \SimpleXMLElement $xml: The XML to extract the raw text from
*
* @return string The unformatted fulltext in MiniOCR format
*/
public function getTextAsMiniOcr(\SimpleXMLElement $xml);
}
49 changes: 49 additions & 0 deletions Classes/Format/Alto.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@

namespace Kitodo\Dlf\Format;

use Kitodo\Dlf\Common\Solr;

/**
* Fulltext ALTO format class for the 'dlf' extension
*
* ** This currently supports only ALTO 2.x **
*
* @author Sebastian Meyer <sebastian.meyer@slub-dresden.de>
* @package TYPO3
* @subpackage dlf
Expand Down Expand Up @@ -42,4 +46,49 @@ public function getRawText(\SimpleXMLElement $xml)
}
return $rawText;
}



/**
 * This extracts the fulltext data from ALTO 2.x XML and returns it in MiniOCR format
 *
 * Every alto:TextBlock below the page's PrintSpace becomes a <b> element, every
 * alto:TextLine directly below a block an <l> element and every alto:String directly
 * below a line a <w> element. The text of a <w> is the CONTENT attribute passed through
 * Solr::escapeQuery(), followed by a single space; its "x" attribute holds the
 * space-separated HPOS, VPOS, WIDTH and HEIGHT attributes of the alto:String.
 *
 * @access public
 *
 * @param \SimpleXMLElement $xml: The ALTO XML to extract the text from
 *
 * @return string The fulltext as serialized MiniOCR XML or an empty string if no text block was found
 */
public function getTextAsMiniOcr(\SimpleXMLElement $xml)
{
    // Only the ALTO 2.x namespace is supported here.
    $altoNamespace = 'http://www.loc.gov/standards/alto/ns-v2#';
    $xml->registerXPathNamespace('alto', $altoNamespace);

    // get all text blocks
    $blocks = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock');

    if (empty($blocks)) {
        return '';
    }

    $miniOcr = new \SimpleXMLElement('<ocr></ocr>');

    foreach ($blocks as $block) {
        $newBlock = $miniOcr->addChild('b');
        // Select the children by namespace URI (like the XPath above does), so that
        // documents using a prefix such as <alto:TextLine> are handled as well.
        foreach ($block->children($altoNamespace) as $lineName => $line) {
            if ($lineName !== 'TextLine') {
                continue;
            }
            $newLine = $newBlock->addChild('l');
            foreach ($line->children($altoNamespace) as $wordName => $word) {
                if ($wordName !== 'String') {
                    continue;
                }
                $attributes = $word->attributes();
                $content = Solr::escapeQuery((string) $attributes['CONTENT']);
                // addChild() does not escape "&" in the given value, so the text is escaped beforehand.
                $newWord = $newLine->addChild('w', htmlspecialchars($content) . ' ');
                $newWord->addAttribute(
                    'x',
                    (string) $attributes['HPOS'] . ' ' . (string) $attributes['VPOS'] . ' '
                    . (string) $attributes['WIDTH'] . ' ' . (string) $attributes['HEIGHT']
                );
            }
        }
    }

    // asXML() returns false on failure; keep the documented string return type.
    $miniOcrXml = $miniOcr->asXML();

    return $miniOcrXml === false ? '' : $miniOcrXml;
}
}

0 comments on commit 6c92af8

Please sign in to comment.