From 05851151035a38112e563da858686cf87eb503b1 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 24 Jan 2025 13:34:27 -0400 Subject: [PATCH 1/2] feat(appellate): Handle Attachment Pages in download_pdf This commit improves the download_pdf method to correctly handle cases where attachment pages are returned instead of the expected PDF documents. --- juriscraper/pacer/appellate_docket.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py index aaf379c75..0141588d9 100644 --- a/juriscraper/pacer/appellate_docket.py +++ b/juriscraper/pacer/appellate_docket.py @@ -329,14 +329,24 @@ def download_pdf( ) r = self.session.get(self.url, params=query_params) r.raise_for_status() - if is_pdf(r): - logger.info( - "Got PDF binary data for document #%s in court %s", - pacer_doc_id, - self.court_id, + + if b"Documents are attached to this filing" in r.content: + error_message = ( + "This PACER document is part of an attachment page. " + "Our system currently lacks the metadata for this attachment. " + "Please purchase the attachment page and try again." ) - return r, "" - return None, "Unable to download PDF." + return None, error_message + + if not is_pdf(r): + return None, "Unable to download PDF." + + logger.info( + "Got PDF binary data for document #%s in court %s", + pacer_doc_id, + self.court_id, + ) + return r, "" @property def metadata(self): From ec9386e89c365ebe8ce48a177f5a3063b008ee29 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 24 Jan 2025 19:30:52 -0400 Subject: [PATCH 2/2] feat(appellate): Refines error message for attachments --- juriscraper/pacer/appellate_docket.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py index 0141588d9..02b5b1ac4 100644 --- a/juriscraper/pacer/appellate_docket.py +++ b/juriscraper/pacer/appellate_docket.py @@ -332,9 +332,8 @@ def download_pdf( if b"Documents are attached to this filing" in r.content: error_message = ( - "This PACER document is part of an attachment page. " - "Our system currently lacks the metadata for this attachment. " - "Please purchase the attachment page and try again." + "Unable to download PDF. " + "An attachment page was returned instead." ) return None, error_message