From a74c3d4ff6eb30cb16855dc7cac581fc15485628 Mon Sep 17 00:00:00 2001 From: Levente Hunyadi Date: Fri, 20 Sep 2024 01:12:26 +0200 Subject: [PATCH] Reorganize code --- md2conf/application.py | 85 +++++++++++++++++++++--------------------- md2conf/converter.py | 32 +++++++++++----- md2conf/processor.py | 54 +++++++++++++++++---------- 3 files changed, 99 insertions(+), 72 deletions(-) diff --git a/md2conf/application.py b/md2conf/application.py index a4a27ae..adcebf6 100644 --- a/md2conf/application.py +++ b/md2conf/application.py @@ -8,8 +8,10 @@ ConfluenceDocument, ConfluenceDocumentOptions, ConfluencePageMetadata, + ConfluenceQualifiedID, attachment_name, extract_qualified_id, + read_qualified_id, ) LOGGER = logging.getLogger(__name__) @@ -42,25 +44,50 @@ def synchronize_page(self, page_path: Path) -> None: self._synchronize_page(page_path, {}) - def _get_qualified_id(self, absolute_path: Path) -> Optional[str]: - with open(absolute_path, "r", encoding="utf-8") as f: - document = f.read() + def synchronize_directory(self, local_dir: Path) -> None: + "Synchronizes a directory of Markdown pages with Confluence." - qualified_id, _ = extract_qualified_id(document) - if qualified_id is not None: - return qualified_id.page_id + LOGGER.info(f"Synchronizing directory: {local_dir}") + + # Step 1: build index of all page metadata + page_metadata: Dict[Path, ConfluencePageMetadata] = {} + root_id = ( + ConfluenceQualifiedID(self.options.root_page_id, self.api.space_key) + if self.options.root_page_id + else None + ) + self._index_directory(local_dir, root_id, page_metadata) + LOGGER.info(f"indexed {len(page_metadata)} page(s)") + + # Step 2: convert each page + for page_path in page_metadata.keys(): + self._synchronize_page(page_path, page_metadata) + + def _synchronize_page( + self, + page_path: Path, + page_metadata: Dict[Path, ConfluencePageMetadata], + ) -> None: + base_path = page_path.parent + + LOGGER.info(f"Synchronizing page: {page_path}") + document = ConfluenceDocument(page_path, self.options, page_metadata) + + if document.id.space_key: + with self.api.switch_space(document.id.space_key): + self._update_document(document, base_path) else: - return None + self._update_document(document, base_path) def _index_directory( self, local_dir: Path, - root_id: Optional[str], + root_id: Optional[ConfluenceQualifiedID], page_metadata: Dict[Path, ConfluencePageMetadata], ) -> None: "Indexes Markdown files in a directory recursively." - LOGGER.info(f"Synchronizing directory: {local_dir}") + LOGGER.info(f"Indexing directory: {local_dir}") files: List[Path] = [] directories: List[Path] = [] @@ -74,11 +101,11 @@ def _index_directory( directories.append((Path(local_dir) / entry.name).absolute()) # make page act as parent node in Confluence - parent_id: Optional[str] = None + parent_id: Optional[ConfluenceQualifiedID] = None if "index.md" in files: - parent_id = self._get_qualified_id(Path(local_dir) / "index.md") + parent_id = read_qualified_id(Path(local_dir) / "index.md") elif "README.md" in files: - parent_id = self._get_qualified_id(Path(local_dir) / "README.md") + parent_id = read_qualified_id(Path(local_dir) / "README.md") if parent_id is None: parent_id = root_id @@ -91,38 +118,10 @@ def _index_directory( for directory in directories: self._index_directory(Path(local_dir) / directory, parent_id, page_metadata) - def synchronize_directory(self, local_dir: Path) -> None: - "Synchronizes a directory of Markdown pages with Confluence." - - # Step 1: build index of all page metadata - page_metadata: Dict[Path, ConfluencePageMetadata] = {} - self._index_directory(local_dir, self.options.root_page_id, page_metadata) - LOGGER.info(f"indexed {len(page_metadata)} page(s)") - - # Step 2: convert each page - for page_path in page_metadata.keys(): - self._synchronize_page(page_path, page_metadata) - - def _synchronize_page( - self, - page_path: Path, - page_metadata: Dict[Path, ConfluencePageMetadata], - ) -> None: - base_path = page_path.parent - - LOGGER.info(f"Synchronizing page: {page_path}") - document = ConfluenceDocument(page_path, self.options, page_metadata) - - if document.id.space_key: - with self.api.switch_space(document.id.space_key): - self._update_document(document, base_path) - else: - self._update_document(document, base_path) - def _get_or_create_page( self, absolute_path: Path, - parent_id: Optional[str], + parent_id: Optional[ConfluenceQualifiedID], *, title: Optional[str] = None, ) -> ConfluencePageMetadata: @@ -149,7 +148,9 @@ def _get_or_create_page( if title is None: title = absolute_path.stem - confluence_page = self.api.get_or_create_page(title, parent_id) + confluence_page = self.api.get_or_create_page( + title, parent_id.page_id, space_key=parent_id.space_key + ) self._update_markdown( absolute_path, document, diff --git a/md2conf/converter.py b/md2conf/converter.py index 6b03c0f..cbf1076 100644 --- a/md2conf/converter.py +++ b/md2conf/converter.py @@ -4,11 +4,11 @@ import importlib.resources as resources import logging import os.path -import pathlib import re import sys import uuid from dataclasses import dataclass +from pathlib import Path from typing import Dict, List, Literal, Optional, Tuple from urllib.parse import ParseResult, urlparse, urlunparse @@ -61,7 +61,7 @@ def markdown_to_html(content: str) -> str: ) -def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element: +def _elements_from_strings(dtd_path: Path, items: List[str]) -> ET._Element: """ Creates a fragment of several XML nodes from their string representation wrapped in a root element. @@ -252,18 +252,18 @@ class ConfluenceStorageFormatConverter(NodeVisitor): "Transforms a plain HTML tree into the Confluence storage format." options: ConfluenceConverterOptions - path: pathlib.Path - base_path: pathlib.Path + path: Path + base_path: Path links: List[str] images: List[str] embedded_images: Dict[str, bytes] - page_metadata: Dict[pathlib.Path, ConfluencePageMetadata] + page_metadata: Dict[Path, ConfluencePageMetadata] def __init__( self, options: ConfluenceConverterOptions, - path: pathlib.Path, - page_metadata: Dict[pathlib.Path, ConfluencePageMetadata], + path: Path, + page_metadata: Dict[Path, ConfluencePageMetadata], ) -> None: super().__init__() self.options = options @@ -365,7 +365,7 @@ def _transform_image(self, image: ET._Element) -> ET._Element: # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated if path and is_relative_url(path): - relative_path = pathlib.Path(path) + relative_path = Path(path) if ( relative_path.suffix == ".svg" and (self.base_path / relative_path.with_suffix(".png")).exists() @@ -728,6 +728,8 @@ class ConfluenceQualifiedID: def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]: + "Extracts the Confluence page ID and space key from a Markdown document." + page_id, string = extract_value(r"", string) if page_id is None: @@ -741,6 +743,16 @@ def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], return ConfluenceQualifiedID(page_id, space_key), string +def read_qualified_id(absolute_path: Path) -> Optional[ConfluenceQualifiedID]: + "Reads the Confluence page ID and space key from a Markdown document." + + with open(absolute_path, "r", encoding="utf-8") as f: + document = f.read() + + qualified_id, _ = extract_qualified_id(document) + return qualified_id + + @dataclass class ConfluenceDocumentOptions: """ @@ -774,9 +786,9 @@ class ConfluenceDocument: def __init__( self, - path: pathlib.Path, + path: Path, options: ConfluenceDocumentOptions, - page_metadata: Dict[pathlib.Path, ConfluencePageMetadata], + page_metadata: Dict[Path, ConfluencePageMetadata], ) -> None: self.options = options path = path.absolute() diff --git a/md2conf/processor.py b/md2conf/processor.py index fcb1232..a83eb56 100644 --- a/md2conf/processor.py +++ b/md2conf/processor.py @@ -1,7 +1,7 @@ import logging import os from pathlib import Path -from typing import Dict +from typing import Dict, List from .converter import ( ConfluenceDocument, @@ -37,28 +37,14 @@ def process(self, path: Path) -> None: def process_directory(self, local_dir: Path) -> None: "Recursively scans a directory hierarchy for Markdown files." - page_metadata: Dict[Path, ConfluencePageMetadata] = {} LOGGER.info(f"Synchronizing directory: {local_dir}") # Step 1: build index of all page metadata - # NOTE: Pathlib.walk() is implemented only in Python 3.12+ - # so sticking for old os.walk - for root, directories, files in os.walk(local_dir): - for file_name in files: - # Reconstitute Path object back - docfile = (Path(root) / file_name).absolute() - - # Skip non-markdown files - if docfile.suffix.lower() != ".md": - continue - - metadata = self._get_page(docfile) - LOGGER.debug(f"indexed {docfile} with metadata: {metadata}") - page_metadata[docfile] = metadata - - LOGGER.info(f"indexed {len(page_metadata)} pages") + page_metadata: Dict[Path, ConfluencePageMetadata] = {} + self._index_directory(local_dir, page_metadata) + LOGGER.info(f"indexed {len(page_metadata)} page(s)") - # Step 2: Convert each page + # Step 2: convert each page for page_path in page_metadata.keys(): self.process_page(page_path, page_metadata) @@ -72,13 +58,41 @@ def process_page( with open(path.with_suffix(".csf"), "w", encoding="utf-8") as f: f.write(content) + def _index_directory( + self, + local_dir: Path, + page_metadata: Dict[Path, ConfluencePageMetadata], + ) -> None: + "Indexes Markdown files in a directory recursively." + + LOGGER.info(f"Indexing directory: {local_dir}") + + files: List[Path] = [] + directories: List[Path] = [] + for entry in os.scandir(local_dir): + if entry.is_file(): + if entry.name.endswith(".md"): + # skip non-markdown files + files.append((Path(local_dir) / entry.name).absolute()) + elif entry.is_dir(): + if not entry.name.startswith("."): + directories.append((Path(local_dir) / entry.name).absolute()) + + for doc in files: + metadata = self._get_page(doc) + LOGGER.debug(f"indexed {doc} with metadata: {metadata}") + page_metadata[doc] = metadata + + for directory in directories: + self._index_directory(Path(local_dir) / directory, page_metadata) + def _get_page(self, absolute_path: Path) -> ConfluencePageMetadata: "Extracts metadata from a Markdown file." with open(absolute_path, "r", encoding="utf-8") as f: document = f.read() - qualified_id, document = extract_qualified_id(document) + qualified_id, _ = extract_qualified_id(document) if qualified_id is None: raise ValueError("required: page ID for local output")