diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py index 2e1aac83a..614a95d3f 100644 --- a/juriscraper/pacer/appellate_docket.py +++ b/juriscraper/pacer/appellate_docket.py @@ -43,7 +43,6 @@ class AppellateDocketReport(BaseDocketReport, BaseReport): docket_number_dist_regex = re.compile( r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})" ) - docket_number_lookup_regex = "Docket #|Case Number" CACHE_ATTRS = ["metadata", "docket_entries"] @@ -358,9 +357,7 @@ def metadata(self): data = { "court_id": self.court_id, - "docket_number": self._get_tail_by_regex( - self.docket_number_lookup_regex - ), + "docket_number": self._parse_docket_number(), "case_name": self._get_case_name(), "panel": self._get_panel(), "nature_of_suit": self._get_tail_by_regex("Nature of Suit"), @@ -480,6 +477,32 @@ def _parse_attorney(self, tree): attorney["contact"] = "\n".join(contacts) return attorney + def _parse_docket_number(self) -> str: + """Parse the docket_number from the appellate report. + :return: The docket_number. + """ + + docket_number_regex = "Docket #|Case Number" + # Try to parse docket_number first using _get_tail_by_regex. + if docket_number := self._get_tail_by_regex(docket_number_regex): + return docket_number + + # If that doesn't work, fall back to parsing the docket_number wrapped + # in a "tel:" href. + nodes = self.tree.re_xpath( + f'//*[re:match(text(), "{docket_number_regex}")]' + ) + if not nodes: + return "" + + a_node = nodes[0].xpath( + "following-sibling::a[starts-with(@href, 'tel:')][1]" + ) + if a_node and a_node[0].text: + return clean_string(a_node[0].text) + + return "" + @property def parties(self): """Return the party table as HTML. @@ -920,16 +943,7 @@ def _get_tail_by_regex(self, regex, cast_to_date=False, node=None): node = node if node is not None else self.tree nodes = node.re_xpath(f'//*[re:match(text(), "{regex}")]') try: - tail_text = nodes[0].tail - tail_text = tail_text.strip() if tail_text is not None else "" - # If there's no tail text and we're parsing the docket_number, - # fall back to parsing the docket_number wrapped in a "tel:" href. - if not tail_text and regex == self.docket_number_lookup_regex: - a_nodes = nodes[0].xpath( - "following-sibling::a[starts-with(@href, 'tel:')][1]" - ) - tail_text = a_nodes[0].text or "" if a_nodes else "" - tail = clean_string(tail_text) + tail = clean_string(nodes[0].tail.strip()) except (IndexError, AttributeError): if cast_to_date: return None diff --git a/tests/examples/pacer/dockets/appellate/ca5_212643.html b/tests/examples/pacer/dockets/appellate/ca5_212643.html index 182cfd546..3c104c836 100644 --- a/tests/examples/pacer/dockets/appellate/ca5_212643.html +++ b/tests/examples/pacer/dockets/appellate/ca5_212643.html @@ -71,7 +71,7 @@