Merge branch 'main' into fix_sd_extract_from_text
grossir committed Jan 28, 2025
2 parents b560953 + ca39567 commit 43d1dd1
Showing 9 changed files with 18,740 additions and 628 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pypi.yml
@@ -9,6 +9,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- run: python -m pip install -U packaging
- uses: casperdcl/deploy-pypi@v2
with:
password: ${{ secrets.pypi_token }}
23 changes: 22 additions & 1 deletion CHANGES.md
@@ -16,6 +16,28 @@ Releases are also tagged in git, if that's helpful.

## Current

**2.6.54 - 2025-01-24**

- Fixes:
- `ca6` oral argument scraper is no longer failing
- updated the pypi.yml GitHub Actions workflow to work around a bug in the
interaction between twine and the packaging package; the workflow now forces
an upgrade of packaging
- because of that bug, version 2.6.53 was discarded

## Past

**2.6.52 - 2025-01-20**

- Fixes:
- `AppellateDocketReport.download_pdf` now returns a two-tuple of the response
object (or None) and a string. This aligns with the changes introduced
in v2.5.1.

**2.6.51 - 2025-01-14**

- Fixes:
- `extract_from_text` now returns plain citation strings, instead of parsed dicts

**2.6.50 - 2025-01-10**

- Fixes:
@@ -56,7 +78,6 @@ Releases are also tagged in git, if that's helpful.
- Features
- added `extract_from_text` to `sc`

## Past

**2.6.46 - 2024-12-10**

123 changes: 41 additions & 82 deletions juriscraper/oral_args/united_states/federal_appellate/ca6.py
@@ -7,108 +7,67 @@
History:
2014-11-06: Started by Brian W. Carver and wrapped up by mlr.
2016-06-30: Updated by mlr.
2025-01-21: Updated to OralArgumentSiteLinear by grossir
"""

import re
from datetime import datetime
from urllib.parse import parse_qs, urljoin, urlparse
from datetime import date, datetime

from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OralArgumentSite import OralArgumentSite
from juriscraper.AbstractSite import logger
from juriscraper.OralArgumentSiteLinear import OralArgumentSiteLinear


class Site(OralArgumentSite):
class Site(OralArgumentSiteLinear):
days_interval = 10000 # force a single interval
first_opinion_date = datetime(2012, 12, 1)
# check only the first 100 records; otherwise, it would try to download
# more than 1000 every time
limit = 100

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = (
"http://www.opn.ca6.uscourts.gov/internet/court_audio/aud1.php"
"https://www.opn.ca6.uscourts.gov/internet/court_audio/aud1.php"
)
self.xpath_root = '//table[@class="views-table cols-3"]'
self.regex = re.compile(r"((?:\d{2}[- ]\d{4}\s*)+)(.*)")
self.back_scrape_iterable = [
"nothing"
] # Just a placeholder for this court
self.backscrape = False
self.make_backscrape_iterable(kwargs)

def _get_download_urls(self):
"""Two options are currently provided by the site. The first is a link
to "save" the file, which gives you a zip containing the file. The
second is a link to "play" the file, which takes you to a flash player.
def _process_html(self) -> None:
"""All parsable fields are contained in the URL
The good news is that the link to "play" it contains a real link to
actually download it inside the 'link' param.
Parsing the URL helps simplify the backscraper, which has a different
HTML structure than the regular page
"""
if self.backscrape:
path_to_flash_page = '//tr/td[3]/a/@href[contains(., "?link=")]'
else:
path_to_flash_page = '//tr/td[2]/a/@href[contains(., "?link=")]'
links_to_flash = list(self.html.xpath(path_to_flash_page))
urls = []
for url in links_to_flash:
path = parse_qs(urlparse(url).query)["link"][0]
# Remove newlines and line returns from urls.
path = path.replace("\n", "").replace("\r", "")

if "www.opn" not in url:
# Update the URL if it's not the one we want.
url = url.replace("www", "www.opn")
urls.append(urljoin(url, path))
return urls

def _get_case_names(self):
if self.backscrape:
path = f"{self.xpath_root}//td[2]/text()"
else:
path = f"{self.xpath_root}/tr/td[1]/text()"
case_names = []
for s in self.html.xpath(path):
case_names.append(self.regex.search(s).group(2))
return case_names
for link in self.html.xpath("//a[text()='Play']/@href")[: self.limit]:
*_, date_str, case = link.split("/")
docket_match = re.search(r"(\d{2}-\d{4}\s?)+", case)
if not docket_match:
logger.warning("Skipping row %s", link)
continue

def _get_case_dates(self):
dates = []
if self.backscrape:
date_strs = self.html.xpath("//table//td[1]//text()")
return [convert_date_string(s) for s in date_strs]
else:
# Multiple items are listed under a single date.
date_path = ".//th[1]"
# For every table full of OA's...
for table in self.html.xpath(self.xpath_root):
# Find the date str, e.g. "10-10-2014 - Friday"
date_str = table.xpath(date_path)[0].text_content()
d = datetime.strptime(date_str[:10], "%m-%d-%Y").date()
docket = docket_match.group(0).strip()
name = case[docket_match.end() : case.find(".mp3")].strip()
self.cases.append(
{
"docket": docket,
"name": name,
"url": link,
"date": date_str.rsplit("-", 1)[0].strip(),
}
)

# The count of OAs on a date is the number of rows minus the
# header row.
total_rows = len(table.xpath(".//tr")) - 1
dates.extend([d] * total_rows)
return dates

def _get_docket_numbers(self):
if self.backscrape:
path = f"{self.xpath_root}//td[2]/text()"
else:
path = f"{self.xpath_root}/tr/td[1]/text()"
return [
self.regex.search(s).group(1).strip().replace(" ", "-")
for s in self.html.xpath(path)
]

def _download_backwards(self, _):
"""You can get everything with a single POST, thus we just ignore the
back_scrape_iterable.
"""
self.backscrape = True
def _download_backwards(self, dates: tuple[date]) -> None:
"""Downloads and parses older records according to input dates"""
logger.info("Backscraping for range %s", *dates)
self.limit = 10000 # disable limit
self.method = "POST"
self.xpath_root = "//table"
self.url = "http://www.opn.ca6.uscourts.gov/internet/court_audio/audSearchRes.php"
self.url = "https://www.opn.ca6.uscourts.gov/internet/court_audio/audSearchRes.php"
self.parameters = {
"caseNumber": "",
"shortTitle": "",
"dateFrom": "01/01/2013",
"dateTo": "01/01/2015",
"dateFrom": dates[0].strftime("%m/%d/%y"),
"dateTo": dates[1].strftime("%m/%d/%y"),
"Submit": "Submit+Query",
}
self.html = self._download()
self._process_html()
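Read alongside the `_process_html` docstring above, here is a minimal, self-contained sketch of how one "Play" link breaks down into docket numbers, case name, and argument date. The sample URL is invented for illustration (the real path layout may differ); only the parsing steps mirror the loop above. For the backscraper, `_download_backwards` then receives a `(start, end)` date tuple and POSTs it as `dateFrom`/`dateTo`, after which the same parsing applies.

```python
import re

# Hypothetical "Play" link, shaped like the ones _process_html iterates over:
# .../court_audio/<argument date>/<docket number(s)> <case name>.mp3
# The " - Wednesday" suffix on the date segment is an assumption, suggested by
# the rsplit("-", 1) in the scraper.
link = (
    "https://www.opn.ca6.uscourts.gov/internet/court_audio/"
    "01-15-2025 - Wednesday/23-1234 24-5678 United States v. Example.mp3"
)

# Same splitting strategy as _process_html: the last two path segments hold
# the date string and the "<dockets> <name>.mp3" portion.
*_, date_str, case = link.split("/")

docket_match = re.search(r"(\d{2}-\d{4}\s?)+", case)
if docket_match:
    docket = docket_match.group(0).strip()                        # "23-1234 24-5678"
    name = case[docket_match.end(): case.find(".mp3")].strip()    # "United States v. Example"
    date = date_str.rsplit("-", 1)[0].strip()                     # "01-15-2025"
    print(docket, "|", name, "|", date)
```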
33 changes: 23 additions & 10 deletions juriscraper/pacer/appellate_docket.py
@@ -6,6 +6,7 @@

from lxml import html
from lxml.etree import _ElementUnicodeResult
from requests import Response

from ..lib.judge_parsers import normalize_judge_string
from ..lib.log_tools import make_default_logger
@@ -260,13 +261,16 @@ def parse(self):
self._clear_caches()
super().parse()

def download_pdf(self, pacer_doc_id, pacer_case_id=None):
def download_pdf(
self, pacer_doc_id, pacer_case_id=None
) -> tuple[Optional[Response], str]:
"""Download a PDF from an appellate court.
:param pacer_case_id: The case ID for the docket
:param pacer_doc_id: The document ID for the item.
:return: request.Response object containing the PDF, if one can be
found, else returns None.
:return: A tuple of the requests.Response object containing the PDF, if
one can be found (i.e. it is not sealed, gone, etc.), or None, and a string
with the error message if there is one, or an empty string otherwise.
This is a functional curl command to get a PDF (though the cookies have
been changed to protect the innocent):
@@ -325,14 +329,23 @@ def download_pdf(self, pacer_doc_id, pacer_case_id=None):
)
r = self.session.get(self.url, params=query_params)
r.raise_for_status()
if is_pdf(r):
logger.info(
"Got PDF binary data for document #%s in court %s",
pacer_doc_id,
self.court_id,

if b"Documents are attached to this filing" in r.content:
error_message = (
"Unable to download PDF. "
"An attachment page was returned instead."
)
return r
return None
return None, error_message

if not is_pdf(r):
return None, "Unable to download PDF."

logger.info(
"Got PDF binary data for document #%s in court %s",
pacer_doc_id,
self.court_id,
)
return r, ""

@property
def metadata(self):
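To make the changed contract concrete, here is a hedged sketch of how a caller might consume the new two-tuple. The import paths follow juriscraper's pacer package, but the credentials, court id, and `pacer_doc_id` are placeholders, and the snippet is illustrative rather than taken from the repository.

```python
# Sketch only: adapting a caller to the new (response, error) return value of
# AppellateDocketReport.download_pdf. Credentials, court id, and the doc id
# below are placeholders, not real values.
from juriscraper.pacer import AppellateDocketReport
from juriscraper.pacer.http import PacerSession

session = PacerSession(username="pacer_user", password="pacer_password")  # placeholders
session.login()

report = AppellateDocketReport("ca6", session)
response, error = report.download_pdf("00107706939")  # made-up pacer_doc_id

if response is None:
    # `error` explains the failure, e.g. the new "attachment page" message
    # or the generic "Unable to download PDF."
    print(f"Could not fetch the PDF: {error}")
else:
    with open("document.pdf", "wb") as f:
        f.write(response.content)
```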
5 changes: 3 additions & 2 deletions juriscraper/pacer/hidden_api.py
@@ -227,8 +227,9 @@ def query(self, pacer_case_id, document_number, attachment_number=""):
)
)
logger.info(f"Querying the show_doc_url endpoint with URL: {url}")
# Only do a head request, else we get text content we don't need.
self.response = self.session.head(url, allow_redirects=True)
# We use a GET request because the nysd court disabled all HEAD requests
# and bans the requesting IP when a HEAD request is made
self.response = self.session.get(url, allow_redirects=True)
self.parse()

def _parse_text(self, text):
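The new comment explains the design choice: nysd rejects HEAD requests and bans the caller's IP, so the endpoint is now queried with GET. If the larger response body were ever a concern, a streamed GET is one way to resolve the redirect without downloading content; the sketch below uses a placeholder URL and is not what the module itself does.

```python
# Sketch only: follow redirects with GET (as the module now does) while
# avoiding a full body download by streaming. The URL is a placeholder.
import requests

url = "https://ecf.nysd.uscourts.gov/cgi-bin/show_doc?caseid=0&doc_num=1"  # placeholder
with requests.Session() as session:
    response = session.get(url, allow_redirects=True, stream=True)
    final_url = response.url  # where the redirect chain ended up
    response.close()          # release the connection without reading the body

print(final_url)
```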
2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@
from setuptools import find_packages, setup
from setuptools.command.install import install

VERSION = "2.6.50"
VERSION = "2.6.54"
AUTHOR = "Free Law Project"
EMAIL = "info@free.law"
HERE = os.path.abspath(os.path.dirname(__file__))
803 changes: 802 additions & 1 deletion tests/examples/oral_args/united_states/ca6_example.compare.json

Large diffs are not rendered by default.

