Skip to content

Commit cae9428

Browse files
authored
Merge pull request #1 from anyparser/release/1.0.2
# Release anyparser-core@1.0.2
2 parents bf35c80 + 22bbe25 commit cae9428

17 files changed

+91
-73
lines changed

README.md

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ import os
106106
import asyncio
107107
import sys
108108

109-
from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OCRPreset
109+
from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OcrPreset
110110

111111
single_file = "docs/document.png"
112112

@@ -116,7 +116,7 @@ options = AnyparserOption(
116116
model="ocr",
117117
format="markdown",
118118
ocr_language=[OcrLanguage.JAPANESE],
119-
ocr_preset=OCRPreset.SCAN,
119+
ocr_preset=OcrPreset.SCAN,
120120
)
121121

122122
parser = Anyparser(options)
@@ -226,7 +226,7 @@ The `Anyparser` class utilizes the `AnyparserOption` dataclass for flexible conf
226226
from dataclasses import dataclass
227227
from typing import List, Literal, Optional, Union
228228

229-
from anyparser_core import OcrLanguage, OCRPreset
229+
from anyparser_core import OcrLanguage, OcrPreset
230230

231231
@dataclass
232232
class AnyparserOption:
@@ -255,7 +255,7 @@ class AnyparserOption:
255255

256256
# OCR Configuration
257257
ocr_language: Optional[List[OcrLanguage]] = None # Languages for OCR processing
258-
ocr_preset: Optional[OCRPreset] = None # Preset configuration for OCR
258+
ocr_preset: Optional[OcrPreset] = None # Preset configuration for OCR
259259

260260
# Crawler Configuration
261261
max_depth: Optional[int] = None # Maximum crawl depth
@@ -278,7 +278,7 @@ class AnyparserOption:
278278
| `files` | `Optional[Union[str, List[str]]]` | `None` | Input files to process |
279279
| `url` | `Optional[str]` | `None` | URL for crawler model |
280280
| `ocr_language` | `Optional[List[OcrLanguage]]` | `None` | Languages for OCR processing |
281-
| `ocr_preset` | `Optional[OCRPreset]` | `None` | Preset configuration for OCR |
281+
| `ocr_preset` | `Optional[OcrPreset]` | `None` | Preset configuration for OCR |
282282
| `max_depth` | `Optional[int]` | `None` | Maximum crawl depth for crawler model |
283283
| `max_executions` | `Optional[int]` | `None` | Maximum number of pages to crawl |
284284
| `strategy` | `Optional[str]` | `None` | Crawling strategy: `"LIFO"` or `"FIFO"` |
@@ -288,19 +288,19 @@ class AnyparserOption:
288288

289289
The following OCR presets are available for optimized document processing:
290290

291-
- `OCRPreset.DOCUMENT` - General document processing
292-
- `OCRPreset.HANDWRITING` - Handwritten text recognition
293-
- `OCRPreset.SCAN` - Scanned document processing
294-
- `OCRPreset.RECEIPT` - Receipt processing
295-
- `OCRPreset.MAGAZINE` - Magazine/article processing
296-
- `OCRPreset.INVOICE` - Invoice processing
297-
- `OCRPreset.BUSINESS_CARD` - Business card processing
298-
- `OCRPreset.PASSPORT` - Passport document processing
299-
- `OCRPreset.DRIVER_LICENSE` - Driver's license processing
300-
- `OCRPreset.IDENTITY_CARD` - ID card processing
301-
- `OCRPreset.LICENSE_PLATE` - License plate recognition
302-
- `OCRPreset.MEDICAL_REPORT` - Medical document processing
303-
- `OCRPreset.BANK_STATEMENT` - Bank statement processing
291+
- `OcrPreset.DOCUMENT` - General document processing
292+
- `OcrPreset.HANDWRITING` - Handwritten text recognition
293+
- `OcrPreset.SCAN` - Scanned document processing
294+
- `OcrPreset.RECEIPT` - Receipt processing
295+
- `OcrPreset.MAGAZINE` - Magazine/article processing
296+
- `OcrPreset.INVOICE` - Invoice processing
297+
- `OcrPreset.BUSINESS_CARD` - Business card processing
298+
- `OcrPreset.PASSPORT` - Passport document processing
299+
- `OcrPreset.DRIVER_LICENSE` - Driver's license processing
300+
- `OcrPreset.IDENTITY_CARD` - ID card processing
301+
- `OcrPreset.LICENSE_PLATE` - License plate recognition
302+
- `OcrPreset.MEDICAL_REPORT` - Medical document processing
303+
- `OcrPreset.BANK_STATEMENT` - Bank statement processing
304304

305305
**Model Types for AI Data Pipelines:**
306306

anyparser_core/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .config.hardcoded import OcrLanguage, OCRPreset
1+
from .config.hardcoded import OcrLanguage, OcrPreset
22
from .form import build_form
33
from .options import AnyparserOption, AnyparserParsedOption, UploadedFile
44
from .parser import (
@@ -15,8 +15,8 @@
1515
AnyparserUrl,
1616
)
1717
from .validator import validate_and_parse, validate_option, validate_path
18+
from .version import __version__
1819

19-
__version__ = "1.0.1"
2020
__all__ = [
2121
"Anyparser",
2222
"AnyparserCrawlDirective",
@@ -35,6 +35,6 @@
3535
"validate_option",
3636
"build_form",
3737
"Anyparser",
38-
"OCRPreset",
38+
"OcrPreset",
3939
"OcrLanguage",
4040
]

anyparser_core/config/hardcoded.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
]
2222

2323

24-
class OCRPreset(Enum):
24+
class OcrPreset(Enum):
2525
"""Enumeration of supported OCR presets for document processing."""
2626

2727
DOCUMENT = "document"

anyparser_core/form.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,18 @@ def add_field(name: str, value: Any) -> None:
5454
if parsed.model == "ocr":
5555
if parsed.ocr_language:
5656
add_field(
57-
"ocrLanguage", ",".join([lang.value for lang in parsed.ocr_language])
57+
"ocr_language", ",".join([lang.value for lang in parsed.ocr_language])
5858
)
5959

6060
if parsed.ocr_preset:
61-
add_field("ocrPreset", parsed.ocr_preset.value)
61+
add_field("ocr_preset", parsed.ocr_preset.value)
6262

6363
if parsed.model == "crawler":
6464
add_field("url", parsed.url)
65-
add_field("maxDepth", parsed.max_depth)
66-
add_field("maxExecutions", parsed.max_executions)
65+
add_field("max_depth", parsed.max_depth)
66+
add_field("max_executions", parsed.max_executions)
6767
add_field("strategy", parsed.strategy)
68-
add_field("traversalScope", parsed.traversal_scope)
68+
add_field("traversal_scope", parsed.traversal_scope)
6969
else:
7070
# Add files to the form
7171
for file in parsed.files:

anyparser_core/options.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from dataclasses import dataclass, field
66
from typing import List, Literal, Optional, TypedDict, Union
77

8-
from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset
8+
from anyparser_core.config.hardcoded import OcrLanguage, OcrPreset
99

1010
# Type aliases for better readability
1111
AnyparserFormatType = Literal["json", "markdown", "html"]
@@ -26,7 +26,7 @@ class AnyparserOption:
2626
table: Optional[bool] = None
2727
files: Optional[Union[str, List[str]]] = None
2828
ocr_language: Optional[List[OcrLanguage]] = None
29-
ocr_preset: Optional[OCRPreset] = None
29+
ocr_preset: Optional[OcrPreset] = None
3030
url: Optional[str] = None
3131
max_depth: Optional[int] = None
3232
max_executions: Optional[int] = None
@@ -54,7 +54,7 @@ class AnyparserParsedOption:
5454
image: Optional[bool] = None
5555
table: Optional[bool] = None
5656
ocr_language: Optional[List[OcrLanguage]] = None
57-
ocr_preset: Optional[OCRPreset] = None
57+
ocr_preset: Optional[OcrPreset] = None
5858
url: Optional[str] = None
5959
max_depth: Optional[int] = None
6060
max_executions: Optional[int] = None
@@ -72,7 +72,7 @@ class DefaultOptions(TypedDict):
7272
image: Optional[bool]
7373
table: Optional[bool]
7474
ocr_language: Optional[List[OcrLanguage]]
75-
ocr_preset: Optional[OCRPreset]
75+
ocr_preset: Optional[OcrPreset]
7676
url: Optional[str]
7777
max_depth: Optional[int]
7878
max_executions: Optional[int]

anyparser_core/parser.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from .options import AnyparserOption
1111
from .request import async_request
1212
from .validator import validate_and_parse
13+
from .version import __version__
1314

1415

1516
@dataclass
@@ -81,6 +82,7 @@ class AnyparserUrl:
8182
images: List[AnyparserImageReference] = field(default_factory=list)
8283
text: Optional[str] = field(default=None)
8384

85+
8486
@dataclass
8587
class AnyparserPdfPage:
8688
"""Represents a parsed PDF page with extracted content."""
@@ -152,7 +154,8 @@ async def parse(
152154

153155
# Set up the headers, using the same boundary
154156
headers: Dict[str, str] = {
155-
"Content-Type": f"multipart/form-data; boundary={boundary}"
157+
"Content-Type": f"multipart/form-data; boundary={boundary}",
158+
"User-Agent": f"anyparser_core@{__version__}",
156159
}
157160

158161
if parsed.api_key:

anyparser_core/validator/path.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,7 @@ async def validate_path(file_paths: Union[str, List[str]]) -> PathValidationResu
1717
Validates file paths exist and are accessible
1818
"""
1919
if not file_paths or (isinstance(file_paths, str) and not file_paths.strip()):
20-
return InvalidPathValidationResult(
21-
error=FileNotFoundError("No files provided")
22-
)
23-
20+
return InvalidPathValidationResult(error=FileNotFoundError("No files provided"))
2421

2522
if isinstance(file_paths, (str, Path)):
2623
files = [file_paths]

anyparser_core/version.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__version__ = "1.0.2"

changelogs/v1.0.2-changelog.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Release anyparser-core@1.0.2
2+
3+
## Changes
4+
5+
**User Agent**
6+
7+
- Added a `User-Agent` header (`anyparser_core@<version>`) to outgoing parse requests.
8+
- Moved the `__version__` literal into its own module (`anyparser_core/version.py`) so that `parser.py` can import it without creating a circular import through `anyparser_core/__init__.py`.
9+
10+
**Rename "OCRPreset" to "OcrPreset"**
11+
12+
This pull request renames the `OCRPreset` class to `OcrPreset` across the codebase so that it follows the same naming convention as `OcrLanguage`.
13+
14+
- Renamed `OCRPreset` to `OcrPreset` in files such as `README.md`, `anyparser_core/__init__.py`, and the examples.
15+
- Updated variable names and documentation to reflect the new class name.
16+
- Modified test files to use the updated class.
17+
18+
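This change is purely a rename aimed at consistency and readability: the behavior of the class is unchanged, but because the public name changes, code that imports `OCRPreset` must be updated (see Breaking Changes below).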
19+
20+
## Breaking Changes
21+
22+
The class `OCRPreset` has been renamed to `OcrPreset` to maintain consistency in naming conventions.
23+
24+
## Migration Guide
25+
26+
Search and replace all instances of `OCRPreset` with `OcrPreset` in your codebase.

examples/03_one_liner.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,8 @@
44

55
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
66

7-
from anyparser_core import Anyparser
8-
9-
multiple_files = ["docs/sample.docx", "docs/sample.pdf"]
7+
# ------------------------------------------------------------------------------
108

11-
result = asyncio.run(Anyparser().parse(multiple_files))
12-
13-
for item in result:
14-
print("-" * 100)
15-
print("File:", item.original_filename)
16-
print("Checksum:", item.checksum)
17-
print("Total characters:", item.total_characters)
18-
print("Markdown:", item.markdown)
9+
from anyparser_core import Anyparser
1910

20-
print("-" * 100)
11+
print(asyncio.run(Anyparser().parse(["docs/sample.docx", "docs/sample.pdf"])))

0 commit comments

Comments
 (0)