Merge branch 'main' into fix_sd_extract_from_text
grossir committed Jan 28, 2025
2 parents b560953 + ca39567 commit 43d1dd1
Showing 9 changed files with 18,740 additions and 628 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pypi.yml
@@ -9,6 +9,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- run: python -m pip install -U packaging
- uses: casperdcl/deploy-pypi@v2
with:
password: ${{ secrets.pypi_token }}
23 changes: 22 additions & 1 deletion CHANGES.md
@@ -16,6 +16,28 @@ Releases are also tagged in git, if that's helpful.

## Current

**2.6.54 - 2025-01-24**

- Fixes:
- `ca6` oral argument scraper is no longer failing
- updated the pypi.yml GitHub Actions workflow to work around a bug in the
interaction between twine and the packaging package; the workflow now forces
an upgrade of packaging
- because of that bug, version 2.6.53 was discarded

## Past

**2.6.52 - 2025-01-20**

- Fixes:
- `AppellateDocketReport.download_pdf` now returns a two-tuple of the response
object (or None) and a string. This aligns with the changes introduced
in v2.5.1.

**2.6.51 - 2025-01-14**

- Fixes:
- `extract_from_text` now returns plain citation strings, instead of parsed dicts

**2.6.50 - 2025-01-10**

- Fixes:
@@ -56,7 +78,6 @@ Releases are also tagged in git, if that's helpful.
- Features
- added `extract_from_text` to `sc`

## Past

**2.6.46 - 2024-12-10**

123 changes: 41 additions & 82 deletions juriscraper/oral_args/united_states/federal_appellate/ca6.py
@@ -7,108 +7,67 @@
History:
2014-11-06: Started by Brian W. Carver and wrapped up by mlr.
2016-06-30: Updated by mlr.
2025-01-21: Updated to OralArgumentSiteLinear by grossir
"""

import re
from datetime import datetime
from urllib.parse import parse_qs, urljoin, urlparse
from datetime import date, datetime

from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OralArgumentSite import OralArgumentSite
from juriscraper.AbstractSite import logger
from juriscraper.OralArgumentSiteLinear import OralArgumentSiteLinear


class Site(OralArgumentSite):
class Site(OralArgumentSiteLinear):
days_interval = 10000 # force a single interval
first_opinion_date = datetime(2012, 12, 1)
# check only the first 100 records; otherwise, it would try to download
# more than 1000 every time
limit = 100

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = (
"http://www.opn.ca6.uscourts.gov/internet/court_audio/aud1.php"
"https://www.opn.ca6.uscourts.gov/internet/court_audio/aud1.php"
)
self.xpath_root = '//table[@class="views-table cols-3"]'
self.regex = re.compile(r"((?:\d{2}[- ]\d{4}\s*)+)(.*)")
self.back_scrape_iterable = [
"nothing"
] # Just a placeholder for this court
self.backscrape = False
self.make_backscrape_iterable(kwargs)

def _get_download_urls(self):
"""Two options are currently provided by the site. The first is a link
to "save" the file, which gives you a zip containing the file. The
second is a link to "play" the file, which takes you to a flash player.
def _process_html(self) -> None:
"""All parsable fields are contained in the URL
The good news is that the link to "play" it contains a real link to
actually download it inside the 'link' param.
Parsing the URL helps simplify the backscraper, which has a different
HTML structure than the regular page
"""
if self.backscrape:
path_to_flash_page = '//tr/td[3]/a/@href[contains(., "?link=")]'
else:
path_to_flash_page = '//tr/td[2]/a/@href[contains(., "?link=")]'
links_to_flash = list(self.html.xpath(path_to_flash_page))
urls = []
for url in links_to_flash:
path = parse_qs(urlparse(url).query)["link"][0]
# Remove newlines and line returns from urls.
path = path.replace("\n", "").replace("\r", "")

if "www.opn" not in url:
# Update the URL if it's not the one we want.
url = url.replace("www", "www.opn")
urls.append(urljoin(url, path))
return urls

def _get_case_names(self):
if self.backscrape:
path = f"{self.xpath_root}//td[2]/text()"
else:
path = f"{self.xpath_root}/tr/td[1]/text()"
case_names = []
for s in self.html.xpath(path):
case_names.append(self.regex.search(s).group(2))
return case_names
for link in self.html.xpath("//a[text()='Play']/@href")[: self.limit]:
*_, date_str, case = link.split("/")
docket_match = re.search(r"(\d{2}-\d{4}\s?)+", case)
if not docket_match:
logger.warning("Skipping row %s", link)
continue

def _get_case_dates(self):
dates = []
if self.backscrape:
date_strs = self.html.xpath("//table//td[1]//text()")
return [convert_date_string(s) for s in date_strs]
else:
# Multiple items are listed under a single date.
date_path = ".//th[1]"
# For every table full of OA's...
for table in self.html.xpath(self.xpath_root):
# Find the date str, e.g. "10-10-2014 - Friday"
date_str = table.xpath(date_path)[0].text_content()
d = datetime.strptime(date_str[:10], "%m-%d-%Y").date()
docket = docket_match.group(0).strip()
name = case[docket_match.end() : case.find(".mp3")].strip()
self.cases.append(
{
"docket": docket,
"name": name,
"url": link,
"date": date_str.rsplit("-", 1)[0].strip(),
}
)

# The count of OAs on a date is the number of rows minus the
# header row.
total_rows = len(table.xpath(".//tr")) - 1
dates.extend([d] * total_rows)
return dates

def _get_docket_numbers(self):
if self.backscrape:
path = f"{self.xpath_root}//td[2]/text()"
else:
path = f"{self.xpath_root}/tr/td[1]/text()"
return [
self.regex.search(s).group(1).strip().replace(" ", "-")
for s in self.html.xpath(path)
]

def _download_backwards(self, _):
"""You can get everything with a single POST, thus we just ignore the
back_scrape_iterable.
"""
self.backscrape = True
def _download_backwards(self, dates: tuple[date]) -> None:
"""Downloads and parses older records according to input dates"""
logger.info("Backscraping for range %s", *dates)
self.limit = 10000 # disable limit
self.method = "POST"
self.xpath_root = "//table"
self.url = "http://www.opn.ca6.uscourts.gov/internet/court_audio/audSearchRes.php"
self.url = "https://www.opn.ca6.uscourts.gov/internet/court_audio/audSearchRes.php"
self.parameters = {
"caseNumber": "",
"shortTitle": "",
"dateFrom": "01/01/2013",
"dateTo": "01/01/2015",
"dateFrom": dates[0].strftime("%m/%d/%y"),
"dateTo": dates[1].strftime("%m/%d/%y"),
"Submit": "Submit+Query",
}
self.html = self._download()
self._process_html()
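Read alongside the `_process_html` docstring above, here is a minimal, self-contained sketch of how one "Play" link breaks down into docket numbers, case name, and argument date. The sample URL is invented for illustration (the real path layout may differ); only the parsing steps mirror the loop above. For the backscraper, `_download_backwards` then receives a `(start, end)` date tuple and POSTs it as `dateFrom`/`dateTo`, after which the same parsing applies.

```python
import re

# Hypothetical "Play" link, shaped like the ones _process_html iterates over:
# .../court_audio/<argument date>/<docket number(s)> <case name>.mp3
# The " - Wednesday" suffix on the date segment is an assumption, suggested by
# the rsplit("-", 1) in the scraper.
link = (
    "https://www.opn.ca6.uscourts.gov/internet/court_audio/"
    "01-15-2025 - Wednesday/23-1234 24-5678 United States v. Example.mp3"
)

# Same splitting strategy as _process_html: the last two path segments hold
# the date string and the "<dockets> <name>.mp3" portion.
*_, date_str, case = link.split("/")

docket_match = re.search(r"(\d{2}-\d{4}\s?)+", case)
if docket_match:
    docket = docket_match.group(0).strip()                        # "23-1234 24-5678"
    name = case[docket_match.end(): case.find(".mp3")].strip()    # "United States v. Example"
    date = date_str.rsplit("-", 1)[0].strip()                     # "01-15-2025"
    print(docket, "|", name, "|", date)
```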
33 changes: 23 additions & 10 deletions juriscraper/pacer/appellate_docket.py
@@ -6,6 +6,7 @@

from lxml import html
from lxml.etree import _ElementUnicodeResult
from requests import Response

from ..lib.judge_parsers import normalize_judge_string
from ..lib.log_tools import make_default_logger
@@ -260,13 +261,16 @@ def parse(self):
self._clear_caches()
super().parse()

def download_pdf(self, pacer_doc_id, pacer_case_id=None):
def download_pdf(
self, pacer_doc_id, pacer_case_id=None
) -> tuple[Optional[Response], str]:
"""Download a PDF from an appellate court.
:param pacer_case_id: The case ID for the docket
:param pacer_doc_id: The document ID for the item.
:return: request.Response object containing the PDF, if one can be
found, else returns None.
:return: A tuple of the requests.Response object containing the PDF, if
one can be found (i.e. it is not sealed, gone, etc.), or None, and a string
with the error message if there is one, or an empty string otherwise.
This is a functional curl command to get a PDF (though the cookies have
been changed to protect the innocent):
@@ -325,14 +329,23 @@ def download_pdf(self, pacer_doc_id, pacer_case_id=None):
)
r = self.session.get(self.url, params=query_params)
r.raise_for_status()
if is_pdf(r):
logger.info(
"Got PDF binary data for document #%s in court %s",
pacer_doc_id,
self.court_id,

if b"Documents are attached to this filing" in r.content:
error_message = (
"Unable to download PDF. "
"An attachment page was returned instead."
)
return r
return None
return None, error_message

if not is_pdf(r):
return None, "Unable to download PDF."

logger.info(
"Got PDF binary data for document #%s in court %s",
pacer_doc_id,
self.court_id,
)
return r, ""

@property
def metadata(self):
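To make the changed contract concrete, here is a hedged sketch of how a caller might consume the new two-tuple. The import paths follow juriscraper's pacer package, but the credentials, court id, and `pacer_doc_id` are placeholders, and the snippet is illustrative rather than taken from the repository.

```python
# Sketch only: adapting a caller to the new (response, error) return value of
# AppellateDocketReport.download_pdf. Credentials, court id, and the doc id
# below are placeholders, not real values.
from juriscraper.pacer import AppellateDocketReport
from juriscraper.pacer.http import PacerSession

session = PacerSession(username="pacer_user", password="pacer_password")  # placeholders
session.login()

report = AppellateDocketReport("ca6", session)
response, error = report.download_pdf("00107706939")  # made-up pacer_doc_id

if response is None:
    # `error` explains the failure, e.g. the new "attachment page" message
    # or the generic "Unable to download PDF."
    print(f"Could not fetch the PDF: {error}")
else:
    with open("document.pdf", "wb") as f:
        f.write(response.content)
```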
5 changes: 3 additions & 2 deletions juriscraper/pacer/hidden_api.py
@@ -227,8 +227,9 @@ def query(self, pacer_case_id, document_number, attachment_number=""):
)
)
logger.info(f"Querying the show_doc_url endpoint with URL: {url}")
# Only do a head request, else we get text content we don't need.
self.response = self.session.head(url, allow_redirects=True)
# We use a GET request because the nysd court disabled all HEAD requests
# and bans the requesting IP when a HEAD request is made
self.response = self.session.get(url, allow_redirects=True)
self.parse()

def _parse_text(self, text):
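The new comment explains the design choice: nysd rejects HEAD requests and bans the caller's IP, so the endpoint is now queried with GET. If the larger response body were ever a concern, a streamed GET is one way to resolve the redirect without downloading content; the sketch below uses a placeholder URL and is not what the module itself does.

```python
# Sketch only: follow redirects with GET (as the module now does) while
# avoiding a full body download by streaming. The URL is a placeholder.
import requests

url = "https://ecf.nysd.uscourts.gov/cgi-bin/show_doc?caseid=0&doc_num=1"  # placeholder
with requests.Session() as session:
    response = session.get(url, allow_redirects=True, stream=True)
    final_url = response.url  # where the redirect chain ended up
    response.close()          # release the connection without reading the body

print(final_url)
```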
2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@
from setuptools import find_packages, setup
from setuptools.command.install import install

VERSION = "2.6.50"
VERSION = "2.6.54"
AUTHOR = "Free Law Project"
EMAIL = "info@free.law"
HERE = os.path.abspath(os.path.dirname(__file__))
803 changes: 802 additions & 1 deletion tests/examples/oral_args/united_states/ca6_example.compare.json

Large diffs are not rendered by default.

