Skip to content

Commit

Permalink
#19 Identify links to tables and figures
Browse files Browse the repository at this point in the history
  • Loading branch information
Vitaliy-1 committed Nov 9, 2020
1 parent 19e86aa commit 4abfb61
Show file tree
Hide file tree
Showing 8 changed files with 241 additions and 12 deletions.
12 changes: 9 additions & 3 deletions src/docx2jats/jats/Figure.php
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
<?php namespace docx2jats\jats;

/**
* @file src/docx2jats/jats/Table.php
* @file src/docx2jats/jats/Figure.php
*
* Copyright (c) 2018-2019 Vitalii Bezsheiko
* Copyright (c) 2018-2020 Vitalii Bezsheiko
* Distributed under the GNU GPL v3.
*
* @brief represent JATS XML image
Expand All @@ -13,6 +13,8 @@
use docx2jats\objectModel\body\Image as FigureObject;

class Figure extends Element {
const JATS_FIGURE_ID_PREFIX = 'fig';

/* @var $dataObject FigureObject */
var $figureObject;

Expand All @@ -23,7 +25,11 @@ public function __construct(DataObject $dataObject) {
}

function setContent() {
$dataObject = $this->getDataObject(); /* @var $dataObject \docx2jats\objectModel\body\Table */
$dataObject = $this->getDataObject(); /* @var $dataObject \docx2jats\objectModel\body\Image */

if ($dataObject->getId()) {
$this->setAttribute('id', self::JATS_FIGURE_ID_PREFIX . $dataObject->getId());
}

if ($dataObject->getLabel()) {
$this->appendChild($this->ownerDocument->createElement('label', $dataObject->getLabel()));
Expand Down
18 changes: 17 additions & 1 deletion src/docx2jats/jats/Par.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
/**
* @file src/docx2jats/jats/Par.php
*
* Copyright (c) 2018-2019 Vitalii Bezsheiko
* Copyright (c) 2018-2020 Vitalii Bezsheiko
* Distributed under the GNU GPL v3.
*
* @brief represent JATS XML paragraph; can't be nested. To be included into body, sections, lists and table cells.
Expand All @@ -25,6 +25,7 @@ public function setContent() {

foreach ($this->getDataObject()->getContent() as $content) {
if (get_class($content) === 'docx2jats\objectModel\body\Field') {
// Write links to references
if ($content->getType() === Field::DOCX_FIELD_CSL) {
$lastKey = array_key_last($content->getRefIds());
foreach ($content->getRefIds() as $key => $id) {
Expand All @@ -38,6 +39,21 @@ public function setContent() {
}
}
}
// Write links to table and figures
elseif ($content->getType() === Field::DOCX_FIELD_BOOKMARK_REF) {
$refEl = $this->ownerDocument->createElement('xref');
$this->appendChild($refEl);
foreach ($content->getContent() as $text) { /* @var $text \docx2jats\objectModel\body\Text */
JatsText::extractText($text, $refEl);
}
if ($tableIdRef = $content->tableIdRef) {
$refEl->setAttribute('ref-type', 'table');
$refEl->setAttribute('rid', Table::JATS_TABLE_ID_PREFIX . $tableIdRef);
} elseif ($figureIdRef = $content->figureIdRef) {
$refEl->setAttribute('ref-type', 'fig');
$refEl->setAttribute('rid', Figure::JATS_FIGURE_ID_PREFIX . $figureIdRef);
}
}
} else {
JatsText::extractText($content, $this);
}
Expand Down
8 changes: 7 additions & 1 deletion src/docx2jats/jats/Table.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
/**
* @file src/docx2jats/jats/Table.php
*
* Copyright (c) 2018-2019 Vitalii Bezsheiko
* Copyright (c) 2018-2020 Vitalii Bezsheiko
* Distributed under the GNU GPL v3.
*
* @brief represent JATS XML table
Expand All @@ -14,13 +14,19 @@

class Table extends Element {

const JATS_TABLE_ID_PREFIX = 'tbl';

/**
 * @brief Create the JATS table element; construction is fully delegated to the parent Element class
 * @param DataObject $dataObject forwarded unchanged to the parent constructor
 */
public function __construct(DataObject $dataObject) {
parent::__construct($dataObject);
}

public function setContent() {
$dataObject = $this->getDataObject(); /* @var $dataObject \docx2jats\objectModel\body\Table */

if ($dataObject->getId()) {
$this->setAttribute('id', self::JATS_TABLE_ID_PREFIX . $dataObject->getId());
}

if ($dataObject->getLabel()) {
$this->appendChild($this->ownerDocument->createElement('label', $dataObject->getLabel()));
}
Expand Down
83 changes: 82 additions & 1 deletion src/docx2jats/objectModel/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
/**
* @file src/docx2jats/objectModel/Document.php
*
* Copyright (c) 2018-2019 Vitalii Bezsheiko
* Copyright (c) 2018-2020 Vitalii Bezsheiko
* Distributed under the GNU GPL v3.
*
* @brief representation of an article; extracts all main elements from DOCX document.xml
*/

use docx2jats\jats\Figure;
use docx2jats\objectModel\DataObject;
use docx2jats\objectModel\body\Par;
use docx2jats\objectModel\body\Table;
Expand Down Expand Up @@ -43,6 +44,19 @@ class Document {
private $references = array();
private $refCount = 0;

// Set unique IDs for tables and figures in order of appearance
private $currentFigureId = 1;
private $currentTableId = 1;

/**
* @var $elsHaveBookmarks array
* @brief Keys (positions in the content array) of the paragraphs that contain bookmark references;
* used to speed up the search for internal links
*/
private $elsHaveBookmarks = array();
private $elsAreTables = array();
private $elsAreFigures = array();

public function __construct(array $params) {
if (array_key_exists("relationships", $params)) {
$this->relationships = $params["relationships"];
Expand All @@ -66,6 +80,7 @@ public function __construct(array $params) {
$content = array();
$unUsedCaption = null;
foreach ($childNodes as $key => $childNode) {
// Assign block elements, i.e., Figures, Tables, Paragraphs, depending on the context
switch ($childNode->nodeName) {
case "w:p":
// There can be multiple drawings inside a run and multiple elements inside a drawing
Expand All @@ -77,6 +92,10 @@ public function __construct(array $params) {
foreach ($imageNodes as $imageNode) {
$figure = new Image($imageNode, $this);
$content[] = $figure;
$this->elsAreFigures[] = count($content)-1;

// Set unique ID
$figure->setFigureId($this->currentFigureId++);

// Set caption if exists
if ($unUsedCaption) {
Expand All @@ -103,12 +122,19 @@ public function __construct(array $params) {
} else {
$content[] = $par;
}

if ($par->hasBookmarks) {
$this->elsHaveBookmarks[] = count($content)-1;
}
}
break;
case "w:tbl":
$table = new Table($childNode, $this);
$content[] = $table;
$this->elsAreTables[] = count($content)-1;

// Set unique ID
$table->setTableId($this->currentTableId++);
// Set caption if exists
if ($unUsedCaption) {
$table->setCaption($unUsedCaption);
Expand All @@ -120,6 +146,7 @@ public function __construct(array $params) {

$this->content = $this->addSectionMarks($content);
self::$minimalHeadingLevel = $this->minimalHeadingLevel();
$this->setInternalRefs();
}

/**
Expand Down Expand Up @@ -286,4 +313,58 @@ public function getLastReference() : ?Reference {
$lastId = array_key_last($this->references);
return $this->references[$lastId];
}

/**
 * @brief Iterate through the content and establish internal links between elements.
 * For every bookmark reference field inside a paragraph, check whether the referenced bookmark
 * is declared in the caption of a table or a figure and, if so, store the ID of that table/figure
 * on the field (tableIdRef / figureIdRef).
 * elsHaveBookmarks holds the position in the content array of each paragraph that includes
 * a bookmark reference; it's slightly faster than looping over the whole content.
 */
private function setInternalRefs(): void {
	if (empty($this->elsHaveBookmarks)) return;

	// Find and map tables' and figures' bookmarks: element ID => bookmark names found in its caption
	$refTableMap = $this->getBookmarkCaptionMapping($this->elsAreTables);
	$refFigureMap = $this->getBookmarkCaptionMapping($this->elsAreFigures);

	// NOTE(review): the stored keys were recorded while the content array was being built;
	// confirm that they still address the same elements in the array returned by getContent().
	$content = $this->getContent();

	// Find bookmark refs
	foreach ($this->elsHaveBookmarks as $parKeyWithBookmark) {
		$par = $content[$parKeyWithBookmark]; /* @var $par Par */
		$parContent = $par->getContent();

		foreach ($par->bookmarkPos as $fieldKeyWithBookmark) {
			$field = $parContent[$fieldKeyWithBookmark]; /* @var $field \docx2jats\objectModel\body\Field */

			// Fields that don't point to a bookmark can't be linked to anything
			$bookmarkId = $field->getBookmarkId();
			if ($bookmarkId === null) continue;

			// Set links to tables
			foreach ($refTableMap as $tableId => $tableRefs) {
				if (in_array($bookmarkId, $tableRefs, true)) {
					$field->tableIdRef = $tableId;
				}
			}

			// Set links to figures; the figure's own ID must be used here, not the variable
			// left over from the table loop above
			foreach ($refFigureMap as $figureId => $figureRefs) {
				if (in_array($bookmarkId, $figureRefs, true)) {
					$field->figureIdRef = $figureId;
				}
			}
		}
	}
}

/**
 * @brief Map the IDs of tables/figures to the OOXML bookmark names found in their captions.
 * In OOXML those bookmarks are stored inside captions; the mapping is used to set the right link
 * to these objects from the text. Elements without bookmarks are left out of the result.
 * Keep in mind that bookmarks also may be stored in an external file, e.g., Mendeley plugin
 * for LibreOffice Writer stores links to references this way.
 * @param array $keysInContent positions inside the content array of the tables or figures to inspect
 * @return array element ID => bookmark IDs of that element
 */
function getBookmarkCaptionMapping(array $keysInContent): array {
	$bookmarksByElementId = [];

	foreach ($keysInContent as $contentKey) {
		$element = $this->content[$contentKey]; /* @var $element Table|Image */
		if (!empty($element->getBookmarkIds())) {
			$bookmarksByElementId[$element->getId()] = $element->getBookmarkIds();
		}
	}

	return $bookmarksByElementId;
}
}
47 changes: 46 additions & 1 deletion src/docx2jats/objectModel/body/Field.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

class Field extends DataObject {

const DOCX_FIELD_CSL = 1;
const DOCX_FIELD_CSL = 1; // Zotero/Mendeley JSON-CSL reference
const DOCX_FIELD_BOOKMARK_REF = 2; // Internal OOXML reference to the bookmark
/** @var $type int DOCX_FIELD... const */
private $type = 0;
private $isZoteroCSL = false;
Expand All @@ -25,6 +26,20 @@ class Field extends DataObject {
private $rawRuns = array();
/** @var array contains instructions to be processed as strings, e.g., CSL citations as a JSON string */
private $instructions = array();
private $bookmarkId;

/**
* @var $tableIdRef int
* @brief the reference to table that the field contains
* TODO check if may include several references at once
*/
public $tableIdRef = 0;

/**
* @var $figureIdRef int
* @brief the reference to figure that the field contains
*/
public $figureIdRef = 0;

public function __construct(\DOMElement $domElement, Document $ownerDocument, Par $parent) {
parent::__construct($domElement, $ownerDocument, $parent);
Expand Down Expand Up @@ -77,6 +92,7 @@ private function processRuns() {
if ($instructionNode) {
$instructionString = $instructionNode->nodeValue;
$this->instructions[] = $instructionString;
// Check if Zotero/Mendeley Citation
if (strpos($instructionString, 'CSL_CITATION') !== false) {
$this->type = self::DOCX_FIELD_CSL;
$rawCSL = $this->extractRawCSL($instructionString);
Expand All @@ -92,6 +108,12 @@ private function processRuns() {
}
}
}
// Check if Link to the Bookmark (only Tables and Figures are supported)
elseif (strpos($instructionString, 'REF') !== false) {
$this->getParent()->hasBookmarks = true;
$this->type = self::DOCX_FIELD_BOOKMARK_REF;
$this->bookmarkId = $this->extractRefID($instructionString);
}
}
} else {
$this->content[] = new Text($run);
Expand Down Expand Up @@ -119,6 +141,18 @@ private function extractRawCSL(string $instruction) {
return $rawCSL;
}

/**
 * @brief Extract the name of the bookmark that a REF field instruction points to,
 * i.e. the word that directly follows the standalone keyword "REF".
 * @param string $instruction raw field instruction text, e.g. ' REF _Ref55821580 \h '
 * @return string|null the bookmark name; null if there is no standalone "REF" keyword
 * or nothing follows it
 */
private function extractRefID(string $instruction): ?string {
	// Split on any run of whitespace: a plain explode(' ') produces empty tokens when the
	// keyword and the bookmark name are separated by more than one space (or by a tab),
	// which would lead to an empty string being returned instead of the bookmark name.
	$words = preg_split('/\s+/', $instruction, -1, PREG_SPLIT_NO_EMPTY);
	if ($words === false) return null;

	$refPos = array_search('REF', $words, true);
	if ($refPos === false || !isset($words[$refPos + 1])) return null;

	return $words[$refPos + 1];
}

public function getPlainCit() {
return $this->plainCit;
}
Expand All @@ -133,4 +167,15 @@ public function getRefIds() {
public function isZoteroCSL(): bool {
return $this->isZoteroCSL;
}

/**
 * @brief Get the bookmark ID this field refers to
 * @return mixed the stored bookmark ID if the field is an internal bookmark reference
 * (DOCX_FIELD_BOOKMARK_REF); null for any other field type
 */
public function getBookmarkId() {
	return $this->type === self::DOCX_FIELD_BOOKMARK_REF ? $this->bookmarkId : null;
}
}
40 changes: 37 additions & 3 deletions src/docx2jats/objectModel/body/Image.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
/**
* @file src/docx2jats/objectModel/body/Image.php
*
* Copyright (c) 2018-2019 Vitalii Bezsheiko
* Copyright (c) 2018-2020 Vitalii Bezsheiko
* Distributed under the GNU GPL v3.
*
* @brief parses data from OOXML drawings; supports only pictures
Expand All @@ -12,12 +12,16 @@
use docx2jats\objectModel\DataObject;
use docx2jats\objectModel\Document;

// TODO create a common parent class for Image and Table
class Image extends DataObject {

/* @var $link string */
private $link;
private ?string $label = null;
private ?string $title = null;
private $label = null;
private $title = null;
private $figureId = 0;
private $bookmarkIds = array();
private $bookmarkText = ''; // TODO Check if there are situation where bookmark text is needed for JATS

public function __construct(\DOMElement $domElement, $ownerDocument) {
parent::__construct($domElement, $ownerDocument);
Expand Down Expand Up @@ -81,6 +85,15 @@ public function setCaption(\DOMElement $el): void {
if (!empty($title)) {
$this->title = trim($title);
}

// The caption may contain bookmarks that are referenced from outside the figure; retrieve their IDs (w:name values);
// TODO Check if other bookmark types may be inserted in captions
$bookmarkStartEls = Document::$xpath->query('w:bookmarkStart', $el);
foreach ($bookmarkStartEls as $bookmarkStartEl) { /* @var $bookmarkStartEl \DOMElement */
if ($bookmarkStartEl->hasAttribute('w:name')) {
$this->bookmarkIds[] = $bookmarkStartEl->getAttribute('w:name');
}
}
}

/**
Expand All @@ -96,4 +109,25 @@ public function getLabel(): ?string {
public function getTitle(): ?string {
return $this->title;
}

/**
 * @brief Store the ID of this figure
 * @param int $currentFigureId the ID to store; it is returned later by getId()
 */
public function setFigureId(int $currentFigureId): void {
$this->figureId = $currentFigureId;
}

/**
 * @brief Get the ID of this figure
 * @return int the ID stored through setFigureId(); 0 if none has been set
 */
public function getId(): int {
return $this->figureId;
}

/**
 * @brief Get the bookmark IDs collected from the caption of this figure
 * @return array w:name values of the w:bookmarkStart elements found by setCaption();
 * empty if no caption was set or the caption has no named bookmarks
 */
public function getBookmarkIds(): array {
return $this->bookmarkIds;
}
}
Loading

0 comments on commit 4abfb61

Please sign in to comment.