Skip to content

Commit

Permalink
#20 extract Mendeley references from LibreOffice's OOXML
Browse files Browse the repository at this point in the history
  • Loading branch information
Vitaliy-1 committed Nov 12, 2020
1 parent 7273695 commit 11df6f2
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 30 deletions.
4 changes: 2 additions & 2 deletions src/docx2jats/DOCXArchive.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public function __construct(string $filepath) {
$styles = $this->transformToXml("word/styles.xml");
$this->mediaFiles = $this->extractMediaFiles();
$numbering = $this->transformToXml("word/numbering.xml");
$docPropsCustom = $this->transformToXml("docProps/custom.xml");
$this->close();

// construct as an array
Expand All @@ -42,10 +43,9 @@ public function __construct(string $filepath) {
$params["ooxmlDocument"] = $this->ooxmlDocument;

if ($relationships) $params["relationships"] = $relationships;

if ($styles) $params["styles"] = $styles;

if ($numbering) $params["numbering"] = $numbering;
if ($docPropsCustom) $params["docPropsCustom"] = $docPropsCustom;

$document = new Document($params);

Expand Down
2 changes: 1 addition & 1 deletion src/docx2jats/jats/Par.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public function setContent() {
foreach ($content->getRefIds() as $key => $id) {
$refEl = $this->ownerDocument->createElement('xref', $id);
$refEl->setAttribute('ref-type', 'bibr');
$refEl->setAttribute('rid', 'bib' . $id);
$refEl->setAttribute('rid', Reference::JATS_REF_ID_PREFIX . $id);
$this->appendChild($refEl);
if ($key !== $lastKey) {
$refEl = $this->ownerDocument->createTextNode(' ');
Expand Down
2 changes: 1 addition & 1 deletion src/docx2jats/jats/Table.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* Copyright (c) 2018-2020 Vitalii Bezsheiko
* Distributed under the GNU GPL v3.
*
* @brief represent JATS XML table
* @brief represents a JATS XML table
*/

use docx2jats\objectModel\DataObject;
Expand Down
50 changes: 41 additions & 9 deletions src/docx2jats/objectModel/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ class Document {
private $numbering;
static $numberingXpath;

/**
* @var $docPropsCustom \DOMDocument represents custom properties of the document,
* e.g., Mendeley plugin for LibreOffice Writer exports CSL in this file
*/
private $docPropsCustom;
static $docPropsCustomXpath;

private $references = array();
private $refCount = 0;

Expand All @@ -53,10 +60,15 @@ class Document {
* @brief Keys (positions in the content) of paragraphs that contain bookmarks;
* they are used to speed up the search
*/
private $elsHaveBookmarks = array();
private $elsHavefldCharRefs = array();
private $elsAreTables = array();
private $elsAreFigures = array();

/**
* @var array bookmark id => name
*/
public $bookMarks = array();

public function __construct(array $params) {
if (array_key_exists("relationships", $params)) {
$this->relationships = $params["relationships"];
Expand All @@ -73,7 +85,13 @@ public function __construct(array $params) {
self::$numberingXpath = new \DOMXPath($this->numbering);
}

if (array_key_exists("docPropsCustom", $params)) {
$this->docPropsCustom = $params["docPropsCustom"];
self::$docPropsCustomXpath = new \DOMXPath($this->docPropsCustom);
}

self::$xpath = new \DOMXPath($params["ooxmlDocument"]);
$this->findBookmarks();

$childNodes = self::$xpath->query("//w:body/child::node()");

Expand Down Expand Up @@ -141,7 +159,7 @@ public function __construct(array $params) {
}

if ($par->hasBookmarks) {
$this->elsHaveBookmarks[] = count($content)-1;
$this->elsHavefldCharRefs[] = count($content)-1;
}
}
break;
Expand Down Expand Up @@ -337,29 +355,29 @@ public function getLastReference() : ?Reference {
* it's slightly faster than looping over the whole content
*/
private function setInternalRefs(): void {
if (empty($this->elsHaveBookmarks)) return;
if (empty($this->elsHavefldCharRefs)) return;

// Find and map tables' and figures' bookmarks
$refTableMap = $this->getBookmarkCaptionMapping($this->elsAreTables);
$refFigureMap = $this->getBookmarkCaptionMapping($this->elsAreFigures);

// Find bookmark refs
foreach ($this->elsHaveBookmarks as $parKeyWithBookmark) {
foreach ($this->elsHavefldCharRefs as $parKeyWithBookmark) {
$par = $this->getContent()[$parKeyWithBookmark]; /* @var $par Par */
foreach ($par->bookmarkPos as $fieldKeyWithBookmark) {
foreach ($par->fldCharRefPos as $fieldKeyWithBookmark) {
$field = $par->getContent()[$fieldKeyWithBookmark]; /* @var $field \docx2jats\objectModel\body\Field */

// Set links to tables
foreach ($refTableMap as $tableId => $tableRefs) {
if (in_array($field->getBookmarkId(), $tableRefs)) {
if (in_array($field->getFldCharRefId(), $tableRefs)) {
$field->tableIdRef = $tableId;
}
}

// Set links to Figures
foreach ($refFigureMap as $figureId => $figureRefs) {
if (in_array($field->getBookmarkId(), $figureRefs)) {
$field->figureIdRef = $tableId;
if (in_array($field->getFldCharRefId(), $figureRefs)) {
$field->figureIdRef = $figureId;
}
}
}
Expand All @@ -368,7 +386,7 @@ private function setInternalRefs(): void {

/**
* @return array
* @brief (or not so brief) Map OOXML bookmark refs inside table and figures with correspondent table/figure IDs.
* @brief (or not so brief) Map OOXML bookmark refs inside tables and figures to the corresponding table/figure IDs.
* In OOXML those bookmarks are stored inside captions
* This is used to set right link to these objects from the text
* Keep in mind that bookmarks also may be stored in an external file, e.g., Mendeley plugin for LibreOffice Writer
Expand All @@ -384,4 +402,18 @@ function getBookmarkCaptionMapping(array $keysInContent): array {

return $refMap;
}

/**
 * Collect every w:bookmarkStart element of the main document part and
 * remember it in $this->bookMarks as bookmark id => bookmark name.
 */
private function findBookmarks(): void {
    $bookmarkStarts = self::$xpath->query('//w:bookmarkStart');
    foreach ($bookmarkStarts as $bookmarkStart) {
        $id = $bookmarkStart->getAttribute('w:id');
        $name = $bookmarkStart->getAttribute('w:name');
        $this->bookMarks[$id] = $name;
    }
}

/**
 * Accessor for the custom properties part of the document.
 *
 * @return \DOMDocument|null the DOM passed to the constructor under the "docPropsCustom" key,
 * or null when the constructor received no such key
 */
public function docPropsCustom(): ?\DOMDocument {
    return $this->docPropsCustom;
}
}
23 changes: 12 additions & 11 deletions src/docx2jats/objectModel/body/Field.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Field extends DataObject {
private $rawRuns = array();
/** @var array contains instructions to be processed as strings, e.g., CSL citations as a JSON string */
private $instructions = array();
private $bookmarkId;
private $fldCharRefId;

/**
* @var $tableIdRef int
Expand Down Expand Up @@ -112,11 +112,11 @@ private function processRuns() {
elseif (strpos($instructionString, 'REF') !== false) {
$this->getParent()->hasBookmarks = true;
$this->type = self::DOCX_FIELD_BOOKMARK_REF;
$this->bookmarkId = $this->extractRefID($instructionString);
$this->fldCharRefId = $this->extractRefID($instructionString);
}
}
} else {
$this->content[] = new Text($run);
$this->content[] = new Text($run, $this->getOwnerDocument());
}
}
}
Expand All @@ -129,15 +129,16 @@ public function getType(): int
return $this->type;
}

private function extractRawCSL(string $instruction) {
$instruction = trim($instruction);
$pos = strpos($instruction, '{');
$instructionsRawPart = substr($instruction, 0, $pos);
/**
* @param string $instruction
* @return string containing raw CSL
* @brief extract CSL as a string and determine its type (Zotero or Mendeley)
*/
private function extractRawCSL(string $instruction): string {
list($instructionsRawPart, $rawCSL) = Reference::extractRawCSL($instruction);
if (strpos($instructionsRawPart, 'ZOTERO_ITEM') !== false) {
$this->isZoteroCSL = true;
}

$rawCSL = substr($instruction, $pos);
return $rawCSL;
}

Expand Down Expand Up @@ -171,9 +172,9 @@ public function isZoteroCSL(): bool {
/**
* @return mixed
*/
public function getBookmarkId() {
public function getFldCharRefId() {
if ($this->type === self::DOCX_FIELD_BOOKMARK_REF) {
return $this->bookmarkId;
return $this->fldCharRefId;
}

return null;
Expand Down
8 changes: 4 additions & 4 deletions src/docx2jats/objectModel/body/Par.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class Par extends DataObject {
static $numberingUnorderedMarkers = array("bullet", "none", "");

public $hasBookmarks = false;
public $bookmarkPos = array(); // position of bookmarks in the content
public $fldCharRefPos = array(); // position of bookmarks in the content

public function __construct(\DOMElement $domElement, Document $ownerDocument, DataObject $parent = null) {
parent::__construct($domElement, $ownerDocument, $parent);
Expand Down Expand Up @@ -112,16 +112,16 @@ protected function setContent(string $xpathExpression) {
$field->addContent($contentNode);

// record a position of field with a bookmark in an array
if ($field->getBookmarkId()) $this->bookmarkPos[] = count($content)-1;
if ($field->getFldCharRefId()) $this->fldCharRefPos[] = count($content)-1;
$field = null;
} else {
$text = new Text($contentNode);
$text = new Text($contentNode, $this->getOwnerDocument());
$content[] = $text;
}
} elseif ($contentNode->nodeName === "w:hyperlink") {
$children = $this->getXpath()->query('child::node()', $contentNode);
foreach ($children as $child) {
$href = new Text($child);
$href = new Text($child, $this->getOwnerDocument());
$href->addType($href::DOCX_TEXT_EXTLINK);
$href->setLink();
$content[] = $href;
Expand Down
20 changes: 20 additions & 0 deletions src/docx2jats/objectModel/body/Reference.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ public function getCslId(): string
return $this->cslId;
}

/**
 * Split a field instruction into its command part and the raw CSL that follows it.
 *
 * The instruction is trimmed first. Everything before the first '{' is the command part
 * (whitespace directly in front of the brace is kept); everything from that brace onwards
 * is the CSL string. When the instruction holds no '{' at all there is no CSL payload:
 * the whole trimmed instruction is returned as the command and the CSL string is empty.
 *
 * @param string $instruction
 * @return array [instructions command, CSL as a string]
 */
public static function extractRawCSL(string $instruction): array {
    $instruction = trim($instruction);
    $pos = strpos($instruction, '{');

    // strpos() yields false when no brace is present; handing false to substr() would be
    // treated as 0 and return the whole instruction as if it were CSL
    if ($pos === false) {
        return array($instruction, '');
    }

    return array(substr($instruction, 0, $pos), substr($instruction, $pos));
}

public static function findRefsCSL(string $rawCSL) : array {
$citations = [];
$json = json_decode($rawCSL);
Expand Down Expand Up @@ -71,6 +83,14 @@ public static function findPlainCit(string $rawCSL): ?string {
// Mendeley
if (property_exists($json, 'mendeley')) {
$mendeley = $json->{'mendeley'};
// NOTE(review): $props is not assigned anywhere in the visible part of this method;
// confirm it is always defined before this Mendeley branch is reached
if ($props && property_exists($mendeley, 'formattedCitation')) {
    return $mendeley->{'formattedCitation'};
}

// The guard has to test the very property that is returned; testing
// 'previouslyFormattedCitation' here read a possibly missing property and
// made the dedicated 'previouslyFormattedCitation' fallback below unreachable
if (property_exists($mendeley, 'plainTextFormattedCitation')) {
    return $mendeley->{'plainTextFormattedCitation'};
}

if (property_exists($mendeley, 'previouslyFormattedCitation')) {
return $mendeley->{'previouslyFormattedCitation'};
}
Expand Down
Loading

0 comments on commit 11df6f2

Please sign in to comment.