-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathhighlighted_text_element.py
102 lines (84 loc) · 2.6 KB
/
highlighted_text_element.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from __future__ import annotations
from dataclasses import asdict, dataclass
from typing import TYPE_CHECKING, Any
from sec_parser.semantic_elements.abstract_semantic_element import (
AbstractSemanticElement,
)
if TYPE_CHECKING: # pragma: no cover
from sec_parser.processing_engine.html_tag import HtmlTag
class HighlightedTextElement(AbstractSemanticElement):
    """
    A semantic element that carries a highlighting ``TextStyle``.

    Among other uses, this class serves as an intermediate step when
    identifying title elements: elements with specific styles (such as
    bold or italic text) are first classified as HighlightedTextElements,
    and are later examined to decide whether they should be considered
    TitleElements.
    """

    def __init__(
        self,
        html_tag: HtmlTag,
        *,
        style: TextStyle | None = None,
    ) -> None:
        """
        Initialize the element from ``html_tag`` and a mandatory ``style``.

        ``style`` is keyword-only and defaults to ``None`` in the signature,
        but ``None`` is rejected: a ``ValueError`` is raised after the base
        class initializer has run.
        """
        super().__init__(html_tag)
        if style is None:
            error_message = "styles must be specified for HighlightedElement"
            raise ValueError(error_message)
        self.style = style

    @classmethod
    def create_from_element(
        cls,
        source: AbstractSemanticElement,
        *,
        style: TextStyle | None = None,
    ) -> HighlightedTextElement:
        """
        Build a new instance that reuses the ``html_tag`` of ``source``.

        ``style`` is forwarded unchanged to the constructor, so passing
        ``None`` (or omitting it) raises ``ValueError``.
        """
        tag = source.html_tag
        return cls(tag, style=style)

    def to_dict(self) -> dict[str, Any]:
        """
        Return the base class dictionary extended with a ``"text_style"``
        entry holding the style's fields as a plain dict.
        """
        base_fields = super().to_dict()
        style_fields = asdict(self.style)
        return {**base_fields, "text_style": style_fields}
@dataclass(frozen=True)
class TextStyle:
    """
    Immutable summary of the highlighting styles detected for some text.

    Instances are usually built with ``from_style_string``. An instance is
    truthy when at least one of its style flags is set.
    """

    # The two constants below carry no annotation on purpose: that keeps
    # them plain class attributes rather than dataclass fields.

    # Minimum metric value (compared with ``>=``) a style entry must reach
    # to be taken into account by ``from_style_string``.
    # NOTE(review): presumably a percentage in the 0-100 range; confirm
    # against the code that produces the metrics.
    PERCENTAGE_THRESHOLD = 80
    # Smallest numeric ``font-weight`` value that counts as bold.
    BOLD_THRESHOLD = 600

    # Set when a qualifying ``font-weight`` entry is "bold" or an integer
    # greater than or equal to BOLD_THRESHOLD.
    bold_with_font_weight: bool
    # Set when a qualifying ``font-style`` entry is "italic".
    italic: bool
    # TODO: consider tracking underline and all-caps text as well.

    def __bool__(self) -> bool:
        """Return True if any style flag of this instance is set."""
        return any(asdict(self).values())

    @classmethod
    def from_style_string(
        cls,
        style_string: dict[tuple[str, str], float],
    ) -> TextStyle:
        """
        Build a TextStyle from a mapping of style entries to metric values.

        Args:
            style_string: Despite its name, a dict keyed by
                ``(property, value)`` pairs (e.g. ``("font-weight", "700")``)
                whose values are compared against ``PERCENTAGE_THRESHOLD``.

        Returns:
            A TextStyle whose flags reflect the entries that reach the
            threshold. An empty mapping yields a falsy instance.
        """
        qualifying = [
            pair
            for pair, metric in style_string.items()
            if metric >= cls.PERCENTAGE_THRESHOLD
        ]
        return cls(
            bold_with_font_weight=any(
                cls._is_bold_with_font_weight(key, value)
                for key, value in qualifying
            ),
            italic=any(cls._is_italic(key, value) for key, value in qualifying),
        )

    @classmethod
    def _is_bold_with_font_weight(cls, key: str, value: str) -> bool:
        """
        Tell whether a ``(key, value)`` style entry denotes bold text.

        The entry is bold when the property is ``font-weight`` and the value
        is either the keyword ``bold`` or an integer that is at least
        ``BOLD_THRESHOLD``. Anything else (including relative keywords such
        as ``bolder``) is reported as not bold.
        """
        # CSS property names and keyword values are ASCII case-insensitive,
        # so compare on a trimmed, lower-cased form.
        if key.strip().lower() != "font-weight":
            return False
        weight = value.strip().lower()
        if weight == "bold":
            return True
        try:
            return int(weight) >= cls.BOLD_THRESHOLD
        except ValueError:
            # Non-numeric weight that is not "bold".
            return False

    @classmethod
    def _is_italic(cls, key: str, value: str) -> bool:
        """Tell whether a ``(key, value)`` style entry denotes italic text."""
        # Same case/whitespace normalization as for the bold check.
        return (
            key.strip().lower() == "font-style"
            and value.strip().lower() == "italic"
        )