Skip to content

Commit

Permalink
#18 More reliable way to handle DOCX archive's content
Browse files Browse the repository at this point in the history
  • Loading branch information
Vitaliy-1 committed Nov 19, 2020
1 parent 7490817 commit ce0fe50
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 9 deletions.
100 changes: 93 additions & 7 deletions src/docx2jats/DOCXArchive.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@

class DOCXArchive extends \ZipArchive {

public const CONTENT_TYPES_PATH = '[Content_Types].xml';
public const CONTENT_TYPE_DOCUMENT_MAIN = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml';
public const CONTENT_TYPE_STYLES = 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml';
public const CONTENT_TYPE_SETTINGS = 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml';
public const CONTENT_TYPE_NUMBERING = 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml';
public const CONTENT_TYPE_CUSTOM_PROP = 'application/vnd.openxmlformats-officedocument.custom-properties+xml';
public const CONTENT_TYPE_RELATIONSHIPS = 'application/vnd.openxmlformats-package.relationships+xml';

/* @var $contentType \DOMDocument */
private $contentType;

static $contentTypeXpath;

/* @var $filePath string */
private $filePath;

Expand All @@ -29,20 +42,39 @@ public function __construct(string $filepath) {
$this->filePath = $filepath;

if ($this->open($filepath)) {
$this->ooxmlDocument = $this->transformToXml("word/document.xml");
$relationships = $this->transformToXml("word/_rels/document.xml.rels");
$styles = $this->transformToXml("word/styles.xml");
$this->contentType = $this->transformToXml(self::CONTENT_TYPES_PATH);
self::$contentTypeXpath = new \DOMXPath($this->contentType);

// Set the Main Document Part
$ooxmlDocumentPath = $this->getRealFileDocumentPath('word/document.xml', self::CONTENT_TYPE_DOCUMENT_MAIN);
$this->ooxmlDocument = $this->transformToXml($ooxmlDocumentPath);

// Relationships of the Main Document Part
$partRelationshipsPath = $this->getRealFileDocumentPath('word/_rels/document.xml.rels', self::CONTENT_TYPE_RELATIONSHIPS, $ooxmlDocumentPath);
$partRelationships = $this->transformToXml($partRelationshipsPath);

// Style names used in the document, styles should be checked recursively, see docx2jats\objectModel\Document::getBuiltinStyle
$stylePath = $this->getRealFileDocumentPath('word/styles.xml', self::CONTENT_TYPE_STYLES);
$styles = $this->transformToXml($stylePath);

// Media files, e.g., images
$this->mediaFiles = $this->extractMediaFiles();
$numbering = $this->transformToXml("word/numbering.xml");
$docPropsCustom = $this->transformToXml("docProps/custom.xml");

// Description of all numbered content, e.g., lists
$numberingPath = $this->getRealFileDocumentPath('word/numbering.xml', self::CONTENT_TYPE_NUMBERING);
$numbering = $this->transformToXml($numberingPath);

// Custom Document properties, this is used by Mendeley plugin export from LibreOffice Writer
$docPropsCustom = $this->getRealFileDocumentPath('docProps/custom.xml', self::CONTENT_TYPE_CUSTOM_PROP);
$docPropsCustom = $this->transformToXml($docPropsCustom);
$this->close();

// construct as an array
$params = array();

$params["ooxmlDocument"] = $this->ooxmlDocument;

if ($relationships) $params["relationships"] = $relationships;
if ($partRelationships) $params["partRelationships"] = $partRelationships;
if ($styles) $params["styles"] = $styles;
if ($numbering) $params["numbering"] = $numbering;
if ($docPropsCustom) $params["docPropsCustom"] = $docPropsCustom;
Expand All @@ -63,7 +95,7 @@ public function getDocument(): Document {

private function transformToXml(string $path): ?\DOMDocument {
$index = $this->locateName($path);
if (!$index) return null;
if ($index === false) return null;
$data = $this->getFromIndex($index);
$xml = new \DOMDocument();
$xml->loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
Expand Down Expand Up @@ -137,4 +169,58 @@ public function getMediaFilesContent(): array {

return $filesContent;
}

/**
 * Resolve the path, inside the archive, of the part that should be loaded.
 *
 * The Override elements of the parsed [Content_Types].xml ($this->contentType) are searched
 * for the given content type. For relationships the Override must additionally carry the file
 * name "<parent file name>.rels"; when no such Override exists the path is derived from the
 * parent part as "<parent directory>/_rels/<parent file name>.rels". If nothing can be
 * resolved, $defaultPath is used.
 *
 * The resolved path is finally checked with findDocumentByPath(); a failure is reported
 * through trigger_error() with E_USER_ERROR.
 *
 * @param string $defaultPath path used when no content type is given or nothing matches
 * @param string|null $contentType content type to look for among the Override elements
 * @param string|null $parentPath path of the part owning the relationships; only read when
 *   $contentType is self::CONTENT_TYPE_RELATIONSHIPS
 * @return string the resolved path without leading slashes
 */
private function getRealFileDocumentPath(string $defaultPath, ?string $contentType = null, ?string $parentPath = null): string {
    $path = null;
    $isRelationships = $contentType === self::CONTENT_TYPE_RELATIONSHIPS;

    // File name the relationships part must have; stays null when the parent part is unknown,
    // so that an empty name is never used for matching or guessing
    $relsBasename = null;
    if ($isRelationships && !is_null($parentPath) && $parentPath !== '') {
        $relsBasename = pathinfo($parentPath, PATHINFO_BASENAME) . '.rels';
    }

    // Without a parsed [Content_Types].xml there is nothing to search through
    if (!is_null($contentType) && !is_null($this->contentType)) {
        foreach ($this->contentType->getElementsByTagName('Override') as $override) {
            if (!$override->hasAttribute('PartName') ||
                !$override->hasAttribute('ContentType') ||
                $override->getAttribute('ContentType') !== $contentType) {
                continue;
            }

            $partName = $override->getAttribute('PartName');
            if (!$isRelationships) {
                $path = $partName;
                break;
            }

            // Several relationships files may be declared; take the one named after the parent.
            // The whole file name is compared, a substring match would also accept
            // e.g. "subdocument.xml.rels" for the parent "document.xml"
            if (!is_null($relsBasename) && pathinfo($partName, PATHINFO_BASENAME) === $relsBasename) {
                $path = $partName;
                break;
            }
        }
    }

    // MS Word may not declare the relationships file in [Content_Types].xml;
    // derive its path from the parent part: <dir>/_rels/<name>.rels
    if ($isRelationships && is_null($path) && !is_null($relsBasename)) {
        $parentDir = pathinfo(ltrim($parentPath, '/'), PATHINFO_DIRNAME);
        // pathinfo() reports "." for a part located in the root of the archive
        $prefix = ($parentDir === '.' || $parentDir === '') ? '' : $parentDir . '/';
        $path = $prefix . '_rels/' . $relsBasename;
    }

    if (is_null($path)) {
        $path = $defaultPath;
    }

    // Part names in [Content_Types].xml may start with a slash, the paths used for lookup do not
    $path = ltrim($path, '/');

    try {
        $this->findDocumentByPath($path);
    } catch (\Exception $e) {
        // NOTE(review): passing E_USER_ERROR to trigger_error() is deprecated as of PHP 8.4;
        // kept here to preserve the existing failure behaviour - consider throwing instead
        trigger_error($e->getMessage(), E_USER_ERROR);
    }

    return $path;
}

/**
 * Verify that a document can be loaded from the archive by the given path.
 *
 * Nothing is returned; the method only signals failure through the exception.
 *
 * @param string $path path of the document inside the archive
 * @return void
 * @throws \Exception if transformToXml() yields no document for the path
 */
private function findDocumentByPath(string $path): void {
    // transformToXml() is declared to return ?\DOMDocument, so null is the only "not found" value
    if ($this->transformToXml($path) === null) {
        throw new \Exception('Cannot find document inside the archive by the path ' . $path);
    }
}
}
4 changes: 2 additions & 2 deletions src/docx2jats/objectModel/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ class Document {
public $bookMarks = array();

public function __construct(array $params) {
if (array_key_exists("relationships", $params)) {
$this->relationships = $params["relationships"];
if (array_key_exists("partRelationships", $params)) {
$this->relationships = $params["partRelationships"];
self::$relationshipsXpath = new \DOMXPath($this->relationships);
}

Expand Down

0 comments on commit ce0fe50

Please sign in to comment.