Skip to content

Commit

Permalink
Reorganize code
Browse files Browse the repository at this point in the history
  • Loading branch information
hunyadi committed Sep 19, 2024
1 parent bee899e commit a74c3d4
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 72 deletions.
85 changes: 43 additions & 42 deletions md2conf/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
ConfluenceDocument,
ConfluenceDocumentOptions,
ConfluencePageMetadata,
ConfluenceQualifiedID,
attachment_name,
extract_qualified_id,
read_qualified_id,
)

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -42,25 +44,50 @@ def synchronize_page(self, page_path: Path) -> None:

self._synchronize_page(page_path, {})

def _get_qualified_id(self, absolute_path: Path) -> Optional[str]:
with open(absolute_path, "r", encoding="utf-8") as f:
document = f.read()
def synchronize_directory(self, local_dir: Path) -> None:
"Synchronizes a directory of Markdown pages with Confluence."

qualified_id, _ = extract_qualified_id(document)
if qualified_id is not None:
return qualified_id.page_id
LOGGER.info(f"Synchronizing directory: {local_dir}")

# Step 1: build index of all page metadata
page_metadata: Dict[Path, ConfluencePageMetadata] = {}
root_id = (
ConfluenceQualifiedID(self.options.root_page_id, self.api.space_key)
if self.options.root_page_id
else None
)
self._index_directory(local_dir, root_id, page_metadata)
LOGGER.info(f"indexed {len(page_metadata)} page(s)")

# Step 2: convert each page
for page_path in page_metadata.keys():
self._synchronize_page(page_path, page_metadata)

def _synchronize_page(
self,
page_path: Path,
page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
base_path = page_path.parent

LOGGER.info(f"Synchronizing page: {page_path}")
document = ConfluenceDocument(page_path, self.options, page_metadata)

if document.id.space_key:
with self.api.switch_space(document.id.space_key):
self._update_document(document, base_path)
else:
return None
self._update_document(document, base_path)

def _index_directory(
self,
local_dir: Path,
root_id: Optional[str],
root_id: Optional[ConfluenceQualifiedID],
page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
"Indexes Markdown files in a directory recursively."

LOGGER.info(f"Synchronizing directory: {local_dir}")
LOGGER.info(f"Indexing directory: {local_dir}")

files: List[Path] = []
directories: List[Path] = []
Expand All @@ -74,11 +101,11 @@ def _index_directory(
directories.append((Path(local_dir) / entry.name).absolute())

# make page act as parent node in Confluence
parent_id: Optional[str] = None
parent_id: Optional[ConfluenceQualifiedID] = None
if "index.md" in files:
parent_id = self._get_qualified_id(Path(local_dir) / "index.md")
parent_id = read_qualified_id(Path(local_dir) / "index.md")
elif "README.md" in files:
parent_id = self._get_qualified_id(Path(local_dir) / "README.md")
parent_id = read_qualified_id(Path(local_dir) / "README.md")

if parent_id is None:
parent_id = root_id
Expand All @@ -91,38 +118,10 @@ def _index_directory(
for directory in directories:
self._index_directory(Path(local_dir) / directory, parent_id, page_metadata)

def synchronize_directory(self, local_dir: Path) -> None:
"Synchronizes a directory of Markdown pages with Confluence."

# Step 1: build index of all page metadata
page_metadata: Dict[Path, ConfluencePageMetadata] = {}
self._index_directory(local_dir, self.options.root_page_id, page_metadata)
LOGGER.info(f"indexed {len(page_metadata)} page(s)")

# Step 2: convert each page
for page_path in page_metadata.keys():
self._synchronize_page(page_path, page_metadata)

def _synchronize_page(
self,
page_path: Path,
page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
base_path = page_path.parent

LOGGER.info(f"Synchronizing page: {page_path}")
document = ConfluenceDocument(page_path, self.options, page_metadata)

if document.id.space_key:
with self.api.switch_space(document.id.space_key):
self._update_document(document, base_path)
else:
self._update_document(document, base_path)

def _get_or_create_page(
self,
absolute_path: Path,
parent_id: Optional[str],
parent_id: Optional[ConfluenceQualifiedID],
*,
title: Optional[str] = None,
) -> ConfluencePageMetadata:
Expand All @@ -149,7 +148,9 @@ def _get_or_create_page(
if title is None:
title = absolute_path.stem

confluence_page = self.api.get_or_create_page(title, parent_id)
confluence_page = self.api.get_or_create_page(
title, parent_id.page_id, space_key=parent_id.space_key
)
self._update_markdown(
absolute_path,
document,
Expand Down
32 changes: 22 additions & 10 deletions md2conf/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import importlib.resources as resources
import logging
import os.path
import pathlib
import re
import sys
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple
from urllib.parse import ParseResult, urlparse, urlunparse

Expand Down Expand Up @@ -61,7 +61,7 @@ def markdown_to_html(content: str) -> str:
)


def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
def _elements_from_strings(dtd_path: Path, items: List[str]) -> ET._Element:
"""
Creates a fragment of several XML nodes from their string representation wrapped in a root element.
Expand Down Expand Up @@ -252,18 +252,18 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
"Transforms a plain HTML tree into the Confluence storage format."

options: ConfluenceConverterOptions
path: pathlib.Path
base_path: pathlib.Path
path: Path
base_path: Path
links: List[str]
images: List[str]
embedded_images: Dict[str, bytes]
page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
page_metadata: Dict[Path, ConfluencePageMetadata]

def __init__(
self,
options: ConfluenceConverterOptions,
path: pathlib.Path,
page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
path: Path,
page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
super().__init__()
self.options = options
Expand Down Expand Up @@ -365,7 +365,7 @@ def _transform_image(self, image: ET._Element) -> ET._Element:

# prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
if path and is_relative_url(path):
relative_path = pathlib.Path(path)
relative_path = Path(path)
if (
relative_path.suffix == ".svg"
and (self.base_path / relative_path.with_suffix(".png")).exists()
Expand Down Expand Up @@ -728,6 +728,8 @@ class ConfluenceQualifiedID:


def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
"Extracts the Confluence page ID and space key from a Markdown document."

page_id, string = extract_value(r"<!--\s+confluence-page-id:\s*(\d+)\s+-->", string)

if page_id is None:
Expand All @@ -741,6 +743,16 @@ def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID],
return ConfluenceQualifiedID(page_id, space_key), string


def read_qualified_id(absolute_path: Path) -> Optional[ConfluenceQualifiedID]:
    """
    Reads the Confluence page ID and space key from a Markdown document.

    :param absolute_path: Location of the Markdown file; it is opened as UTF-8 text.
    :returns: The first element of the tuple produced by `extract_qualified_id` for the file's text.
    """

    with open(absolute_path, "r", encoding="utf-8") as markdown_file:
        text = markdown_file.read()

    # only the identifier is of interest here; the second tuple element is discarded
    return extract_qualified_id(text)[0]


@dataclass
class ConfluenceDocumentOptions:
"""
Expand Down Expand Up @@ -774,9 +786,9 @@ class ConfluenceDocument:

def __init__(
self,
path: pathlib.Path,
path: Path,
options: ConfluenceDocumentOptions,
page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
self.options = options
path = path.absolute()
Expand Down
54 changes: 34 additions & 20 deletions md2conf/processor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os
from pathlib import Path
from typing import Dict
from typing import Dict, List

from .converter import (
ConfluenceDocument,
Expand Down Expand Up @@ -37,28 +37,14 @@ def process(self, path: Path) -> None:
def process_directory(self, local_dir: Path) -> None:
"Recursively scans a directory hierarchy for Markdown files."

page_metadata: Dict[Path, ConfluencePageMetadata] = {}
LOGGER.info(f"Synchronizing directory: {local_dir}")

# Step 1: build index of all page metadata
# NOTE: Pathlib.walk() is implemented only in Python 3.12+
# so sticking for old os.walk
for root, directories, files in os.walk(local_dir):
for file_name in files:
# Reconstitute Path object back
docfile = (Path(root) / file_name).absolute()

# Skip non-markdown files
if docfile.suffix.lower() != ".md":
continue

metadata = self._get_page(docfile)
LOGGER.debug(f"indexed {docfile} with metadata: {metadata}")
page_metadata[docfile] = metadata

LOGGER.info(f"indexed {len(page_metadata)} pages")
page_metadata: Dict[Path, ConfluencePageMetadata] = {}
self._index_directory(local_dir, page_metadata)
LOGGER.info(f"indexed {len(page_metadata)} page(s)")

# Step 2: Convert each page
# Step 2: convert each page
for page_path in page_metadata.keys():
self.process_page(page_path, page_metadata)

Expand All @@ -72,13 +58,41 @@ def process_page(
with open(path.with_suffix(".csf"), "w", encoding="utf-8") as f:
f.write(content)

def _index_directory(
    self,
    local_dir: Path,
    page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
    """
    Indexes Markdown files in a directory recursively.

    :param local_dir: Directory to scan.
    :param page_metadata: Mapping updated in place; receives one entry per Markdown file found,
        keyed by the file's absolute path.
    """

    LOGGER.info(f"Indexing directory: {local_dir}")

    files: List[Path] = []
    directories: List[Path] = []

    # the context manager closes the scandir iterator even if inspecting an entry raises
    with os.scandir(local_dir) as entries:
        for entry in entries:
            if entry.is_file():
                # keep Markdown files only; anything else is ignored
                if entry.name.endswith(".md"):
                    files.append((Path(local_dir) / entry.name).absolute())
            elif entry.is_dir():
                # directories whose name starts with a dot are not descended into
                if not entry.name.startswith("."):
                    directories.append((Path(local_dir) / entry.name).absolute())

    for doc in files:
        metadata = self._get_page(doc)
        LOGGER.debug(f"indexed {doc} with metadata: {metadata}")
        page_metadata[doc] = metadata

    # collected directory paths are already absolute, so they are passed on unchanged
    for directory in directories:
        self._index_directory(directory, page_metadata)

def _get_page(self, absolute_path: Path) -> ConfluencePageMetadata:
"Extracts metadata from a Markdown file."

with open(absolute_path, "r", encoding="utf-8") as f:
document = f.read()

qualified_id, document = extract_qualified_id(document)
qualified_id, _ = extract_qualified_id(document)
if qualified_id is None:
raise ValueError("required: page ID for local output")

Expand Down

0 comments on commit a74c3d4

Please sign in to comment.