Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parameter setup in v3 API #14

Merged
merged 33 commits into from
Aug 25, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d777527
Processor.parameter: only validate when set…
bertsky Aug 22, 2024
7998aae
get_processor: ensure passing non-empty parameter, rely on `_setup` t…
bertsky Aug 22, 2024
cc8592b
test_processor: adapt, check required parameters
bertsky Aug 22, 2024
45e556d
improve _setup docstring
bertsky Aug 22, 2024
d4c802b
Processor._setup: raise with full ParameterValidator report
bertsky Aug 22, 2024
b28fefb
get_processor: parameter only as kwarg
bertsky Aug 22, 2024
642938b
tests: adapt for get_processor parameter only as kwarg
bertsky Aug 22, 2024
f5e5c54
Processor.parameter: make the bound dict read-only
bertsky Aug 22, 2024
f2d53a6
Processor.parameter: move ParameterValidator back to setter, convert …
bertsky Aug 22, 2024
7297ca2
Processor.parameter: frozendict instead of mappingproxy, add test
bertsky Aug 22, 2024
6cd4a34
introduce Processor.shutdown to be overridden (called at deinit or pa…
bertsky Aug 22, 2024
407bff8
Processor: introduce `max_instances` class attribute
bertsky Aug 23, 2024
c9fbb2c
get_cached_processor: set lru_cache maxsize from min(cfg,class) at ru…
bertsky Aug 23, 2024
9c212a9
test get_processor instance_caching w/ max_instances
bertsky Aug 23, 2024
a413f04
test get_processor instance_caching w/ clear_cache
bertsky Aug 23, 2024
870523c
:package: v3.0.0a2
kba Aug 22, 2024
20bb6d1
remove make *-workaround, we will not do that for v3+
kba Aug 22, 2024
faa59a8
Processor.metadata_location property to specify where in the package …
kba Aug 23, 2024
5819c81
Processor.verify: always check cardinality (as we now have the defaul…
bertsky Aug 23, 2024
4f88f1d
fix --log-filename (6fc606027a): apply in ocrd_cli_wrap_processor
bertsky Aug 24, 2024
d621f36
fix exception
bertsky Aug 24, 2024
4868fb1
adapt to PIL.Image moved constants
bertsky Aug 24, 2024
da72c0a
ocrd_utils: add parse_json_file_with_comments
bertsky Aug 24, 2024
ca78b94
cli.workspace: pass fileGrp as well, improve description
bertsky Aug 24, 2024
cf41745
OcrdMets.add_agent: does not have positional args
bertsky Aug 24, 2024
cadc6e6
remove misplaced kwargs from run_processor
bertsky Aug 24, 2024
7966057
Processor.metadata: refactor…
bertsky Aug 24, 2024
bba142d
bashlib input-files: adapt, allow passing ocrd-tool.json path and exe…
bertsky Aug 24, 2024
32cdc5a
add to pylint karma
bertsky Aug 24, 2024
a95f269
update pylintrc
bertsky Aug 24, 2024
50c088e
processor.metadata_location: use self.__module__ not __package__
kba Aug 24, 2024
ad8c76e
Merge pull request #17 from OCR-D/new-processor-api-parameter-setup
bertsky Aug 24, 2024
8211237
pylint: try ignoring generateds (again)
bertsky Aug 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
import inspect
import tarfile
import io
import weakref
from warnings import warn
from frozendict import frozendict
from deprecated import deprecated
from requests import HTTPError

Expand Down Expand Up @@ -139,6 +141,26 @@ def ocrd_tool(self) -> dict:
self._ocrd_tool = self.metadata['tools'][self.executable]
return self._ocrd_tool

@property
def parameter(self) -> Optional[dict]:
"""the runtime parameter dict to be used by this processor"""
if hasattr(self, '_parameter'):
return self._parameter
return None

@parameter.setter
def parameter(self, parameter : dict) -> None:
if self.parameter is not None:
self.shutdown()
parameterValidator = ParameterValidator(self.ocrd_tool)
report = parameterValidator.validate(parameter)
if not report.is_valid:
raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
# make parameter dict read-only
self._parameter = frozendict(parameter)
# (re-)run setup to load models etc
self.setup()

def __init__(
self,
# FIXME: deprecate in favor of process_workspace(workspace)
Expand Down Expand Up @@ -204,19 +226,14 @@ def __init__(
"is deprecated - pass as argument to process_workspace instead")
self.page_id = page_id or None
self.download = download_files
if parameter is None:
parameter = {}
parameterValidator = ParameterValidator(self.ocrd_tool)

report = parameterValidator.validate(parameter)
if not report.is_valid:
raise ValueError("Invalid parameters %s" % report.errors)
self.parameter = parameter
# NOTE: this is the logger to be used by processor implementations,
# `processor.base` default implementations should use
# :py:attr:`self._base_logger`
#: The logger to be used by processor implementations.
# `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
self._base_logger = getLogger('ocrd.processor.base')
if parameter is not None:
self.parameter = parameter
# ensure that shutdown gets called at destruction
self._finalizer = weakref.finalize(self, self.shutdown)
# workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
setattr(self, 'process',
deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
Expand Down Expand Up @@ -299,6 +316,16 @@ def setup(self) -> None:
"""
pass

def shutdown(self) -> None:
"""
Bring down the processor after data processing,
after to changing back from the workspace directory but
before exiting (or setting up with different parameters).

(Override this to unload models from memory etc.)
"""
pass

@deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
def process(self) -> None:
"""
Expand Down
15 changes: 8 additions & 7 deletions src/ocrd/processor/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import json
import inspect
from subprocess import run
from typing import List
from typing import List, Optional

from click import wrap_text
from ocrd.workspace import Workspace
Expand Down Expand Up @@ -84,7 +84,7 @@ def run_processor(
log.debug("Running processor %s", processorClass)

processor = get_processor(
processor_class=processorClass,
processorClass,
parameter=parameter,
workspace=None,
page_id=page_id,
Expand Down Expand Up @@ -374,28 +374,29 @@ def get_cached_processor(parameter: dict, processor_class):
Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned.
"""
if processor_class:
dict_params = dict(parameter) if parameter else None
processor = processor_class(None, parameter=dict_params)
processor.setup()
processor = processor_class(None, parameter=dict(parameter))
return processor
return None


def get_processor(
processor_class,
parameter: dict,
parameter: Optional[dict] = None,
workspace: Workspace = None,
page_id: str = None,
input_file_grp: List[str] = None,
output_file_grp: List[str] = None,
instance_caching: bool = False,
):
if processor_class:
if parameter is None:
parameter = {}
if instance_caching:
processor = get_cached_processor(parameter, processor_class)
else:
# avoid passing workspace already (deprecated chdir behaviour)
processor = processor_class(None, parameter=parameter)
processor.setup()
# set current processing parameters
processor.workspace = workspace
processor.page_id = page_id
processor.input_file_grp = input_file_grp
Expand Down
59 changes: 56 additions & 3 deletions tests/processor/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
from ocrd.resolver import Resolver
from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile
from ocrd.processor.helpers import get_processor

from unittest import mock
import pytest
Expand Down Expand Up @@ -73,14 +74,53 @@ def test_parameter(self):
with open(jsonpath, 'w') as f:
f.write('{"baz": "quux"}')
with open(jsonpath, 'r') as f:
parameter = json.load(f)
processor = run_processor(
DummyProcessor,
parameter=json.load(f),
parameter=parameter,
input_file_grp="OCR-D-IMG",
resolver=self.resolver,
workspace=self.workspace
)
self.assertEqual(processor.parameter['baz'], 'quux')
processor = get_processor(
DummyProcessor,
parameter=parameter)
with self.assertRaises(TypeError):
processor.parameter['baz'] = 'xuuq'
processor.parameter = { **parameter, 'baz': 'xuuq' }
self.assertEqual(processor.parameter['baz'], 'xuuq')

def test_instance_caching(self):
class DyingDummyProcessor(DummyProcessor):
def shutdown(self):
print(self.parameter['baz'])
self.capture_out_err()
# well above OCRD_MAX_PROCESSOR_CACHE=128
firstp = None
for i in range(200):
p = get_processor(
DyingDummyProcessor,
parameter={'baz': str(i)},
instance_caching=True
)
if i == 0:
firstp = p
lastp = p
p = get_processor(DyingDummyProcessor,
parameter={'baz': '0'},
instance_caching=True
)
# should not be cached anymore
self.assertNotEqual(firstp, p)
p = get_processor(DyingDummyProcessor,
parameter={'baz': '199'},
instance_caching=True
)
# should still be cached
self.assertEqual(lastp, p)
out, err = self.capture_out_err()
#assert '0' in out.split('\n')

def test_verify(self):
proc = DummyProcessor(None)
Expand All @@ -95,8 +135,18 @@ def test_json(self):
DummyProcessor(None).dump_json()

def test_params_missing_required(self):
with self.assertRaisesRegex(Exception, 'is a required property'):
DummyProcessorWithRequiredParameters(None)
proc = DummyProcessorWithRequiredParameters(None)
assert proc.parameter is None
with self.assertRaisesRegex(ValueError, 'is a required property'):
proc.parameter = {}
with self.assertRaisesRegex(ValueError, 'is a required property'):
get_processor(DummyProcessorWithRequiredParameters)
with self.assertRaisesRegex(ValueError, 'is a required property'):
get_processor(DummyProcessorWithRequiredParameters, parameter={})
with self.assertRaisesRegex(ValueError, 'is a required property'):
run_processor(DummyProcessorWithRequiredParameters,
workspace=self.workspace, input_file_grp="OCR-D-IMG")
proc.parameter = {'i-am-required': 'foo'}

def test_params_preset_resolve(self):
with pushd_popd(tempdir=True) as tempdir:
Expand Down Expand Up @@ -127,6 +177,9 @@ class ParamTestProcessor(Processor):
def ocrd_tool(self):
return {}
proc = ParamTestProcessor(None)
self.assertEqual(proc.parameter, None)
# get_processor will set to non-none and validate
proc = get_processor(ParamTestProcessor)
self.assertEqual(proc.parameter, {})

def test_run_agent(self):
Expand Down