diff --git a/src/docx2jats/DOCXArchive.php b/src/docx2jats/DOCXArchive.php
index 851b6df..c289c70 100644
--- a/src/docx2jats/DOCXArchive.php
+++ b/src/docx2jats/DOCXArchive.php
@@ -34,6 +34,7 @@ public function __construct(string $filepath) {
 		$styles = $this->transformToXml("word/styles.xml");
 		$this->mediaFiles = $this->extractMediaFiles();
 		$numbering = $this->transformToXml("word/numbering.xml");
+		$docPropsCustom = $this->transformToXml("docProps/custom.xml");
 		$this->close();
 
 		// construct as an array
@@ -42,10 +43,9 @@ public function __construct(string $filepath) {
 		$params["ooxmlDocument"] = $this->ooxmlDocument;
 
 		if ($relationships) $params["relationships"] = $relationships;
-
 		if ($styles) $params["styles"] = $styles;
-
 		if ($numbering) $params["numbering"] = $numbering;
+		if ($docPropsCustom) $params["docPropsCustom"] = $docPropsCustom;
 
 		$document = new Document($params);
 
diff --git a/src/docx2jats/jats/Par.php b/src/docx2jats/jats/Par.php
index ab7324c..e95353d 100644
--- a/src/docx2jats/jats/Par.php
+++ b/src/docx2jats/jats/Par.php
@@ -31,7 +31,7 @@ public function setContent() {
 					foreach ($content->getRefIds() as $key => $id) {
 						$refEl = $this->ownerDocument->createElement('xref', $id);
 						$refEl->setAttribute('ref-type', 'bibr');
-						$refEl->setAttribute('rid', 'bib' . $id);
+						$refEl->setAttribute('rid', Reference::JATS_REF_ID_PREFIX . $id);
 						$this->appendChild($refEl);
 						if ($key !== $lastKey) {
 							$refEl = $this->ownerDocument->createTextNode(' ');
diff --git a/src/docx2jats/jats/Table.php b/src/docx2jats/jats/Table.php
index cfac98a..5ae9f9a 100644
--- a/src/docx2jats/jats/Table.php
+++ b/src/docx2jats/jats/Table.php
@@ -6,7 +6,7 @@
  * Copyright (c) 2018-2020 Vitalii Bezsheiko
  * Distributed under the GNU GPL v3.
  *
- * @brief represent JATS XML table
+ * @brief represents a JATS XML table
  */
 
 use docx2jats\objectModel\DataObject;
diff --git a/src/docx2jats/objectModel/Document.php b/src/docx2jats/objectModel/Document.php
index e208eb5..be0fe2a 100644
--- a/src/docx2jats/objectModel/Document.php
+++ b/src/docx2jats/objectModel/Document.php
@@ -41,6 +41,13 @@ class Document {
 	private $numbering;
 	static $numberingXpath;
 
+	/**
+	 * @var $docPropsCustom \DOMDocument represents custom properties of the document,
+	 * e.g., Mendeley plugin for LibreOffice Writer exports CSL in this file
+	 */
+	private $docPropsCustom;
+	static $docPropsCustomXpath;
+
 	private $references = array();
 	private $refCount = 0;
 
@@ -53,10 +60,15 @@ class Document {
 	 * @brief Key numbers of paragraphs that contain bookmarks inside the content
 	 * is used to speed up a search
 	 */
-	private $elsHaveBookmarks = array();
+	private $elsHavefldCharRefs = array();
 	private $elsAreTables = array();
 	private $elsAreFigures = array();
 
+	/**
+	 * @var array bookmark id => name
+	 */
+	public $bookMarks = array();
+
 	public function __construct(array $params) {
 		if (array_key_exists("relationships", $params)) {
 			$this->relationships = $params["relationships"];
@@ -73,7 +85,13 @@ public function __construct(array $params) {
 			self::$numberingXpath = new \DOMXPath($this->numbering);
 		}
 
+		if (array_key_exists("docPropsCustom", $params)) {
+			$this->docPropsCustom = $params["docPropsCustom"];
+			self::$docPropsCustomXpath = new \DOMXPath($this->docPropsCustom);
+		}
+
 		self::$xpath = new \DOMXPath($params["ooxmlDocument"]);
+		$this->findBookmarks();
 
 		$childNodes = self::$xpath->query("//w:body/child::node()");
 
@@ -141,7 +159,7 @@ public function __construct(array $params) {
 						}
 
 						if ($par->hasBookmarks) {
-							$this->elsHaveBookmarks[] = count($content)-1;
+							$this->elsHavefldCharRefs[] = count($content)-1;
 						}
 					}
 					break;
@@ -337,29 +355,29 @@ public function getLastReference() : ?Reference {
 	 * it's slightly faster than looping over the whole content
 	 */
 	private function setInternalRefs(): void {
-		if (empty($this->elsHaveBookmarks)) return;
+		if (empty($this->elsHavefldCharRefs)) return;
 
 		// Find and map tables' and figures' bookmarks
 		$refTableMap = $this->getBookmarkCaptionMapping($this->elsAreTables);
 		$refFigureMap = $this->getBookmarkCaptionMapping($this->elsAreFigures);
 
 		// Find bookmark refs
-		foreach ($this->elsHaveBookmarks as $parKeyWithBookmark) {
+		foreach ($this->elsHavefldCharRefs as $parKeyWithBookmark) {
 			$par = $this->getContent()[$parKeyWithBookmark]; /* @var $par Par */
-			foreach ($par->bookmarkPos as $fieldKeyWithBookmark) {
+			foreach ($par->fldCharRefPos as $fieldKeyWithBookmark) {
 				$field = $par->getContent()[$fieldKeyWithBookmark]; /* @var $field \docx2jats\objectModel\body\Field */
 
 				// Set links to tables
 				foreach ($refTableMap as $tableId => $tableRefs) {
-					if (in_array($field->getBookmarkId(), $tableRefs)) {
+					if (in_array($field->getFldCharRefId(), $tableRefs)) {
 						$field->tableIdRef = $tableId;
 					}
 				}
 
 				// Set links to Figures
 				foreach ($refFigureMap as $figureId => $figureRefs) {
-					if (in_array($field->getBookmarkId(), $figureRefs)) {
-						$field->figureIdRef = $tableId;
+					if (in_array($field->getFldCharRefId(), $figureRefs)) {
+						$field->figureIdRef = $figureId;
 					}
 				}
 			}
@@ -368,7 +386,7 @@ private function setInternalRefs(): void {
 
 	/**
 	 * @return array
-	 * @brief (or not so brief) Map OOXML bookmark refs inside table and figures with correspondent table/figure IDs.
+	 * @brief (or not so brief) Map OOXML bookmark refs inside tables and figures with correspondent table/figure IDs.
 	 * In OOXML those bookmarks are stored inside captions
 	 * This is used to set right link to these objects from the text
 	 * Keep in mind that bookmarks also may be stored in an external file, e.g., Mendeley plugin for LibreOffice Writer
@@ -384,4 +402,21 @@ function getBookmarkCaptionMapping(array $keysInContent): array {
 
 		return $refMap;
 	}
+
+	/**
+	 * Find and retrieve id and name from all bookmarks in the main document part
+	 */
+	private function findBookmarks(): void {
+		$bookmarkEls = self::$xpath->query('//w:bookmarkStart');
+		foreach ($bookmarkEls as $bookmarkEl) {
+			$this->bookMarks[$bookmarkEl->getAttribute('w:id')] = $bookmarkEl->getAttribute('w:name');
+		}
+	}
+
+	/**
+	 * @return \DOMDocument|null parsed docProps/custom.xml, or null when the constructor received no "docPropsCustom" param
+	 */
+	public function docPropsCustom(): ?\DOMDocument {
+		return $this->docPropsCustom;
+	}
 }
diff --git a/src/docx2jats/objectModel/body/Field.php b/src/docx2jats/objectModel/body/Field.php
index 682ace1..4fe9279 100644
--- a/src/docx2jats/objectModel/body/Field.php
+++ b/src/docx2jats/objectModel/body/Field.php
@@ -26,7 +26,7 @@ class Field extends DataObject {
 	private $rawRuns = array();
 	/** @var array contains instructions to be processed as strings, e.g., CSL citations as a JSON string */
 	private $instructions = array();
-	private $bookmarkId;
+	private $fldCharRefId;
 
 	/**
 	 * @var $tableIdRef int
@@ -112,11 +112,11 @@ private function processRuns() {
 					elseif (strpos($instructionString, 'REF') !== false) {
 						$this->getParent()->hasBookmarks = true;
 						$this->type = self::DOCX_FIELD_BOOKMARK_REF;
-						$this->bookmarkId = $this->extractRefID($instructionString);
+						$this->fldCharRefId = $this->extractRefID($instructionString);
 					}
 				}
 			} else {
-				$this->content[] = new Text($run);
+				$this->content[] = new Text($run, $this->getOwnerDocument());
 			}
 		}
 	}
@@ -129,15 +129,16 @@ public function getType(): int
 		return $this->type;
 	}
 
-	private function extractRawCSL(string $instruction) {
-		$instruction = trim($instruction);
-		$pos = strpos($instruction, '{');
-		$instructionsRawPart = substr($instruction, 0, $pos);
+	/**
+	 * @param string $instruction
+	 * @return string containing raw CSL
+	 * @brief extract CSL as a string and determine its type (Zotero or Mendeley)
+	 */
+	private function extractRawCSL(string $instruction): string {
+		list($instructionsRawPart, $rawCSL) = Reference::extractRawCSL($instruction);
 		if (strpos($instructionsRawPart, 'ZOTERO_ITEM') !== false) {
 			$this->isZoteroCSL = true;
 		}
-
-		$rawCSL = substr($instruction, $pos);
 		return $rawCSL;
 	}
 
@@ -171,9 +172,9 @@ public function isZoteroCSL(): bool {
 	/**
 	 * @return mixed
 	 */
-	public function getBookmarkId() {
+	public function getFldCharRefId() {
 		if ($this->type === self::DOCX_FIELD_BOOKMARK_REF) {
-			return $this->bookmarkId;
+			return $this->fldCharRefId;
 		}
 
 		return null;
diff --git a/src/docx2jats/objectModel/body/Par.php b/src/docx2jats/objectModel/body/Par.php
index 1e91ee7..4f1cda4 100644
--- a/src/docx2jats/objectModel/body/Par.php
+++ b/src/docx2jats/objectModel/body/Par.php
@@ -54,7 +54,7 @@ class Par extends DataObject {
 	static $numberingUnorderedMarkers = array("bullet", "none", "");
 
 	public $hasBookmarks = false;
-	public $bookmarkPos = array(); // position of bookmarks in the content
+	public $fldCharRefPos = array(); // position of bookmarks in the content
 
 	public function __construct(\DOMElement $domElement, Document $ownerDocument, DataObject $parent = null) {
 		parent::__construct($domElement, $ownerDocument, $parent);
@@ -112,16 +112,16 @@ protected function setContent(string $xpathExpression) {
 						$field->addContent($contentNode);
 
 						// record a position of field with a bookmark in an array
-						if ($field->getBookmarkId()) $this->bookmarkPos[] = count($content)-1;
+						if ($field->getFldCharRefId()) $this->fldCharRefPos[] = count($content)-1;
 						$field = null;
 					} else {
-						$text = new Text($contentNode);
+						$text = new Text($contentNode, $this->getOwnerDocument());
 						$content[] = $text;
 					}
 				} elseif ($contentNode->nodeName === "w:hyperlink") {
 					$children = $this->getXpath()->query('child::node()', $contentNode);
 					foreach ($children as $child) {
-						$href = new Text($child);
+						$href = new Text($child, $this->getOwnerDocument());
 						$href->addType($href::DOCX_TEXT_EXTLINK);
 						$href->setLink();
 						$content[] = $href;
diff --git a/src/docx2jats/objectModel/body/Reference.php b/src/docx2jats/objectModel/body/Reference.php
index 24c5316..a82407b 100644
--- a/src/docx2jats/objectModel/body/Reference.php
+++ b/src/docx2jats/objectModel/body/Reference.php
@@ -33,6 +33,20 @@ public function getCslId(): string
 		return $this->cslId;
 	}
 
+	/**
+	 * @param string $instruction
+	 * @return array [instructions command, CSL as a string]; the CSL part is '' when the instruction has no '{'
+	 */
+	public static function extractRawCSL(string $instruction): array {
+		$instruction = trim($instruction);
+		$pos = strpos($instruction, '{');
+		// strpos() yields false when there is no JSON payload; substr() would then hand back the whole instruction as "CSL"
+		if ($pos === false) return array($instruction, '');
+		$instructionsRawPart = substr($instruction, 0, $pos);
+		$rawCSL = substr($instruction, $pos);
+		return array($instructionsRawPart, $rawCSL);
+	}
+
 	public static function findRefsCSL(string $rawCSL) : array {
 		$citations = [];
 		$json = json_decode($rawCSL);
@@ -71,6 +85,14 @@ public static function findPlainCit(string $rawCSL): ?string {
 		// Mendeley
 		if (property_exists($json, 'mendeley')) {
 			$mendeley = $json->{'mendeley'};
+			if (property_exists($mendeley, 'formattedCitation')) {
+				return $mendeley->{'formattedCitation'};
+			}
+
+			if (property_exists($mendeley, 'plainTextFormattedCitation')) {
+				return $mendeley->{'plainTextFormattedCitation'};
+			}
+
 			if (property_exists($mendeley, 'previouslyFormattedCitation')) {
 				return $mendeley->{'previouslyFormattedCitation'};
 			}
diff --git a/src/docx2jats/objectModel/body/Text.php b/src/docx2jats/objectModel/body/Text.php
index ebe8f2e..a40eb9e 100644
--- a/src/docx2jats/objectModel/body/Text.php
+++ b/src/docx2jats/objectModel/body/Text.php
@@ -25,11 +25,25 @@ class Text extends DataObject {
 	private $type = array();
 	private $link;
 
-	public function __construct(\DOMElement $domElement) {
-		parent::__construct($domElement);
+	/**
+	 * Mendeley plugin for LibreOffice Writer allows to export citation data to the OOXML
+	 * the link to the ref is between w:bookmarkStart and w:bookmarkEnd elements, the children of the w:p
+	 * Mendeley includes prefix Mendeley_Bookmark_... as an attribute value of w:name attribute of w:bookmarkStart element
+	 * TODO implement bookmarks that span on several paragraphs
+	 * @var $bookmarked bool whether Text is inside a bookmark
+	 * @var $bookmarkData array of bookmarks
+	 */
+	public $bookmarked = false;
+	private $bookmarkData = array();
+	public $hasCSLRefs = false;
+	public $refIds = array();
+
+	public function __construct(\DOMElement $domElement, Document $ownerDocument) {
+		parent::__construct($domElement, $ownerDocument);
 		$this->properties = $this->setProperties('w:rPr/child::node()');
 		$this->text = $this->setText('w:t');
 		$this->type = $this->setType();
+		$this->setBookmarks();
 	}
 
 	/**
@@ -141,4 +155,116 @@ function setLink(): void {
 	public function getLink(): ?string {
 		return $this->link;
 	}
+
+	public function getBookmarkData(): array {
+		return $this->bookmarkData;
+	}
+
+	/**
+	 * Determine whether this element is inside a bookmark
+	 * Check previous siblings, find w:bookmarkStart or w:bookmarkEnd
+	 */
+	private function setBookmarks(): void {
+		$prevSibling = $this->getDomElement()->previousSibling;
+		$bookMarks = array_fill_keys(['started', 'ended'], []);
+		$bookMarks = $this->prevBookmarks($prevSibling, $bookMarks);
+		$bookMarksActive = array_diff($bookMarks['started'], $bookMarks['ended']);
+		if (empty($bookMarksActive)) return;
+
+		$this->bookmarked = true;
+		$allBookmarks = $this->getOwnerDocument()->bookMarks;
+		foreach ($bookMarksActive as $bookMarkActive) {
+			$name = $allBookmarks[$bookMarkActive];
+			$this->bookmarkData[$bookMarkActive]['name'] = $name;
+			$content = $this->searchBookmarkContentByName($name);
+			if ($content === null || trim($content) === '' || strpos($content, 'CSL_CITATION') === false) continue;
+
+			$this->bookmarkData[$bookMarkActive]['content'] = $content;
+			list($instructions, $rawCSL) = Reference::extractRawCSL($content);
+			if (empty($rawCSL)) continue;
+
+			$citations = Reference::findRefsCSL($rawCSL);
+			if (empty($citations)) continue;
+			$this->hasCSLRefs = true;
+
+			foreach ($citations as $citation) {
+				if (!$ref = Reference::cslExists($citation, $this->getOwnerDocument())) {
+					$this->getOwnerDocument()->addReference($citation);
+					$this->refIds[] = $citation->getId();
+				} else {
+					$this->refIds[] = $ref->getId();
+				}
+			}
+		}
+	}
+
+	/**
+	 * @param \DOMNode|null $prevSibling any node type; only element nodes are inspected
+	 * @param array $bookMarks
+	 * @return array
+	 * @brief recursively find started and ended bookmarks
+	 */
+	private function prevBookmarks(?\DOMNode $prevSibling, array $bookMarks): array {
+		if (is_null($prevSibling)) return $bookMarks; // reached end
+		if ($prevSibling->nodeType !== XML_ELEMENT_NODE) return $this->prevBookmarks($prevSibling->previousSibling, $bookMarks);
+
+		if ($prevSibling->nodeName === 'w:bookmarkStart') {
+			$bookMarks['started'][] = $prevSibling->getAttribute('w:id');
+		}
+
+		if ($prevSibling->nodeName === 'w:bookmarkEnd') {
+			$bookMarks['ended'][] = $prevSibling->getAttribute('w:id');
+		}
+
+		return $this->prevBookmarks($prevSibling->previousSibling, $bookMarks);
+	}
+
+	/**
+	 * @param string $bookmarkName value of the w:name attribute of the w:bookmarkStart element
+	 * @return string|null concatenated property values, null if custom properties are absent or nothing matches
+	 * @brief search for the bookmarks content in the docProps/custom.xml by bookmark name
+	 * particularly this is for searching of CSL Mendeley references
+	 */
+	public function searchBookmarkContentByName(string $bookmarkName): ?string {
+		$docPropsCustom = $this->getOwnerDocument()->docPropsCustom();
+		$xpath = Document::$docPropsCustomXpath;
+		// Document sets both only when a "docPropsCustom" param was passed to it
+		if (!$docPropsCustom || !$xpath) return null;
+
+		$propertyEls = $docPropsCustom->getElementsByTagName('property');
+		$contentEls = [];
+		$nameLen = strlen($bookmarkName);
+		foreach ($propertyEls as $propertyEl) { /* @var $propertyEl \DOMElement */
+			if ($propertyEl->hasAttribute('name')) {
+				$attrValue = $propertyEl->getAttribute('name');
+				// attribute value consists of a name and unique ending
+				if (substr($attrValue, 0, $nameLen) === $bookmarkName) {
+					$contentEls[] = $propertyEl;
+				}
+			}
+		}
+
+		if (empty($contentEls)) return null;
+
+		/**
+		 * Needs to be sorted as property elements can be present inside the parent in any order;
+		 * Consider 2 values of name attr: Mendeley_Bookmark_3XXM13m2wL_14 Mendeley_Bookmark_3XXM13m2wL_2,
+		 * Sorting is done by comparing trailing numbers that appear after last underscore
+		 */
+		usort($contentEls, function ($a, $b) use ($nameLen) {
+			$aInt = filter_var(substr($a->getAttribute('name'), $nameLen), FILTER_SANITIZE_NUMBER_INT);
+			$bInt = filter_var(substr($b->getAttribute('name'), $nameLen), FILTER_SANITIZE_NUMBER_INT);
+			if ($aInt === $bInt) return 0;
+			return ($aInt < $bInt) ? -1 : 1;
+		});
+
+		$resultString = '';
+		foreach ($contentEls as $contentEl) {
+			$lpwstrList = $xpath->query('./vt:lpwstr[1]', $contentEl);
+			if ($lpwstrList === false || $lpwstrList->length === 0) continue; // property without a string value
+			$resultString .= $lpwstrList->item(0)->nodeValue;
+		}
+
+		return $resultString;
+	}
 }