Skip to content

Commit

Permalink
feat(citations): add UnmatchedCitation model and logic
Browse files Browse the repository at this point in the history
Solves #4920

- Add new model UnmatchedCitation to the citations app
- Refactor cl.search.models.Citation to create a BaseCitation abstract model that is reused by the UnmatchedCitation model
- Update cl.citations.tasks.store_opinion_citations_and_update_parentheticals to handle storing and updating unmatched citations
- Update cl.search.signals to update the UnmatchedCitation status when a new Citation is saved
- Add tests
- Add update_unmatched_citations command to trigger an update for found citations
  • Loading branch information
grossir committed Jan 21, 2025
1 parent 4e18f5a commit e363c90
Show file tree
Hide file tree
Showing 9 changed files with 513 additions and 17 deletions.
4 changes: 4 additions & 0 deletions cl/citations/management/commands/find_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from django.core.management import CommandError
from django.core.management.base import CommandParser

from cl.citations.models import UnmatchedCitation
from cl.citations.tasks import (
find_citations_and_parentheticals_for_opinion_by_pks,
)
Expand Down Expand Up @@ -112,6 +113,9 @@ def handle(self, *args: List[str], **options: OptionsType) -> None:
query = query.filter(date_modified__gte=options["modified_after"])
if options.get("all"):
query = Opinion.objects.all()
sys.stdout.write("Deleting all UnmatchedCitation rows")
UnmatchedCitation.objects.all().delete()

self.count = query.count()
self.average_per_s = 0.0
self.timings: List[float] = []
Expand Down
50 changes: 50 additions & 0 deletions cl/citations/management/commands/update_unmatched_citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from cl.citations.management.commands import find_citations
from cl.citations.models import UnmatchedCitation
from cl.lib.command_utils import VerboseCommand


class Command(find_citations.Command):
    """Re-run find_citations_and_parentheticals_for_opinion_by_pks for
    opinions where unmatched citations have been found

    Only argument parsing and opinion selection are defined here; the
    enqueueing and logging are re-used from
    ``find_citations.Command.update_documents``.
    """

    help = "Try to resolve unmatched citations"
    # variables to use find_citations.Command.update_documents
    # These are class-level defaults only; `handle` rebinds the per-run state
    # on the instance so the mutable `timings` list is never shared.
    count = 0
    average_per_s = 0.0
    timings: list[float] = []

    def add_arguments(self, parser):
        """Register the command line options of this command.

        :param parser: the argument parser handed over by Django
        """
        # VerboseCommand.add_arguments is called directly instead of super(),
        # so arguments declared by find_citations.Command are not registered
        # NOTE(review): presumably intentional, since this command selects
        # its opinions from UnmatchedCitation rows - confirm
        VerboseCommand.add_arguments(self, parser)
        parser.add_argument(
            "--resolve-failures",
            action="store_true",
            default=False,
            help="Include citations with FAILED and FAILED_AMBIGUOUS status",
        )
        parser.add_argument(
            "--queue",
            default="batch1",
            help="The celery queue where the tasks should be processed.",
        )

    def handle(self, *args, **options):
        """Re-uses find_citations.Command enqueuer and logging

        Collects the distinct citing opinions that have UnmatchedCitation
        rows in FOUND status (plus FAILED and FAILED_AMBIGUOUS when
        --resolve-failures is passed) and hands their ids to
        find_citations.Command.update_documents.

        :param args: positional arguments forwarded to VerboseCommand.handle
        :param options: parsed options; reads "resolve_failures" and "queue"
        """
        VerboseCommand.handle(self, *args, **options)

        # Start every run from clean progress state, as
        # find_citations.Command.handle does. Without this the class-level
        # `timings` list would be shared by every Command instance.
        self.average_per_s = 0.0
        self.timings = []

        status = [UnmatchedCitation.FOUND]
        if options["resolve_failures"]:
            status.extend(
                [UnmatchedCitation.FAILED, UnmatchedCitation.FAILED_AMBIGUOUS]
            )

        # distinct() on Django only works when the same field is on .order_by()
        opinion_ids = (
            UnmatchedCitation.objects.filter(status__in=status)
            .order_by("citing_opinion_id")
            .distinct("citing_opinion_id")
        )
        self.count = opinion_ids.count()
        opinion_pks = opinion_ids.values_list("citing_opinion_id", flat=True)
        find_citations.Command.update_documents(
            self, opinion_pks, options["queue"]
        )
153 changes: 153 additions & 0 deletions cl/citations/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Generated by Django 5.1.4 on 2025-01-21 03:45

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Initial migration of the citations app: creates UnmatchedCitation.

    Auto-generated by Django; kept as generated. The choices and help texts
    below are a frozen copy of the model state at generation time.
    """

    initial = True

    # The model has a foreign key to search.opinion, so the search app schema
    # must be in place first
    dependencies = [
        ("search", "0037_alter_citation_type_noop"),
    ]

    operations = [
        migrations.CreateModel(
            name="UnmatchedCitation",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # volume, reporter, page and type are not declared on
                # UnmatchedCitation in cl/citations/models.py; presumably
                # inherited from the abstract BaseCitation model
                (
                    "volume",
                    models.SmallIntegerField(
                        help_text="The volume of the reporter"
                    ),
                ),
                (
                    "reporter",
                    models.TextField(
                        db_index=True,
                        help_text="The abbreviation for the reporter",
                    ),
                ),
                (
                    "page",
                    models.TextField(
                        help_text="The 'page' of the citation in the reporter. Unfortunately, this is not an integer, but is a string-type because several jurisdictions do funny things with the so-called 'page'. For example, we have seen Roman numerals in Nebraska, 13301-M in Connecticut, and 144M in Montana."
                    ),
                ),
                (
                    "type",
                    models.SmallIntegerField(
                        choices=[
                            (1, "A federal reporter citation (e.g. 5 F. 55)"),
                            (
                                2,
                                "A citation in a state-based reporter (e.g. Alabama Reports)",
                            ),
                            (
                                3,
                                "A citation in a regional reporter (e.g. Atlantic Reporter)",
                            ),
                            (
                                4,
                                "A citation in a specialty reporter (e.g. Lawyers' Edition)",
                            ),
                            (
                                5,
                                "A citation in an early SCOTUS reporter (e.g. 5 Black. 55)",
                            ),
                            (
                                6,
                                "A citation in the Lexis system (e.g. 5 LEXIS 55)",
                            ),
                            (
                                7,
                                "A citation in the WestLaw system (e.g. 5 WL 55)",
                            ),
                            (8, "A vendor neutral citation (e.g. 2013 FL 1)"),
                            (
                                9,
                                "A law journal citation within a scholarly or professional legal periodical (e.g. 95 Yale L.J. 5; 72 Soc.Sec.Rep.Serv. 318)",
                            ),
                        ],
                        help_text="The type of citation that this is.",
                    ),
                ),
                # Status values 1-5 are UNMATCHED, FOUND, RESOLVED,
                # FAILED_AMBIGUOUS and FAILED on the UnmatchedCitation model
                (
                    "status",
                    models.SmallIntegerField(
                        choices=[
                            (
                                1,
                                "The citation does not exist in the search_citation table. We couldn't match the citation to a cluster on the previous citation extractor run",
                            ),
                            (
                                2,
                                "The citation exists on the search_citation table. We haven't updated the citing Opinion.html_with_citations yet",
                            ),
                            (
                                3,
                                "The citing Opinion.html_with_citations was updated successfully",
                            ),
                            (
                                4,
                                "The citing Opinion.html_with_citations update failed because the citation is ambiguous",
                            ),
                            (
                                5,
                                "The citing Opinion.html_with_citations update failed",
                            ),
                        ],
                        help_text="Status of resolution of the initially unmatched citation",
                    ),
                ),
                (
                    "citation_string",
                    models.TextField(
                        help_text="The unparsed citation string in case it doesn't match the regular citation model in BaseCitation"
                    ),
                ),
                # A plain text column, not a foreign key to a court table
                (
                    "court_id",
                    models.TextField(
                        help_text="A court_id as identified by eyecite from the opinion's context. May be useful to know where to find missing citations"
                    ),
                ),
                # Stored as text rather than as an integer
                (
                    "year",
                    models.TextField(
                        help_text="A year identified by eyecite from the opinion's context"
                    ),
                ),
                # Deleting the citing opinion deletes its unmatched citations
                (
                    "citing_opinion",
                    models.ForeignKey(
                        help_text="The opinion citing this citation",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="eyecite_citations",
                        to="search.opinion",
                    ),
                ),
            ],
            options={
                # Composite index over the three parts of a citation
                "indexes": [
                    models.Index(
                        fields=["volume", "reporter", "page"],
                        name="citations_u_volume_da4d25_idx",
                    )
                ],
                # At most one row per citation and citing opinion
                "unique_together": {
                    ("citing_opinion", "volume", "reporter", "page")
                },
            },
        ),
    ]
Empty file.
96 changes: 96 additions & 0 deletions cl/citations/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from django.db import models
from eyecite.models import FullCaseCitation

from cl.citations.utils import map_reporter_db_cite_type
from cl.search.models import BaseCitation, Opinion


class UnmatchedCitation(BaseCitation):
    """Keep track of citations that could not be resolved to a cluster on the
    batch citator run

    Each row belongs to the opinion that cites it and carries a ``status``
    describing how far the resolution of the citation has progressed.
    """

    UNMATCHED = 1
    FOUND = 2
    RESOLVED = 3
    FAILED_AMBIGUOUS = 4
    FAILED = 5
    # The labels are built by implicit string concatenation: every fragment
    # ends with exactly one space and none starts with one, so the joined
    # text has no doubled spaces and matches the labels recorded in
    # cl/citations/migrations/0001_initial.py
    STATUS = (
        (
            UNMATCHED,
            "The citation does not exist in the search_citation table. "
            "We couldn't match the citation to a cluster on the "
            "previous citation extractor run",
        ),
        (
            FOUND,
            "The citation exists on the search_citation table. "
            "We haven't updated the citing Opinion.html_with_citations yet",
        ),
        (
            RESOLVED,
            "The citing Opinion.html_with_citations was updated successfully",
        ),
        (
            FAILED_AMBIGUOUS,
            "The citing Opinion.html_with_citations update "
            "failed because the citation is ambiguous",
        ),
        (FAILED, "The citing Opinion.html_with_citations update failed"),
    )
    citing_opinion: models.ForeignKey = models.ForeignKey(
        Opinion,
        help_text="The opinion citing this citation",
        on_delete=models.CASCADE,
        related_name="eyecite_citations",
    )
    status: models.SmallIntegerField = models.SmallIntegerField(
        help_text="Status of resolution of the initially unmatched citation",
        choices=STATUS,
    )
    citation_string: models.TextField = models.TextField(
        help_text="The unparsed citation string in case it doesn't match the "
        "regular citation model in BaseCitation"
    )
    court_id: models.TextField = models.TextField(
        help_text="A court_id as identified by eyecite from the opinion's "
        "context. May be useful to know where to find missing citations"
    )
    year: models.TextField = models.TextField(
        help_text="A year identified by eyecite from the opinion's context"
    )

    class Meta:
        indexes = [
            models.Index(
                fields=["volume", "reporter", "page"],
            )
        ]
        # At most one row per (citing opinion, volume, reporter, page)
        unique_together = (("citing_opinion", "volume", "reporter", "page"),)

    @classmethod
    def create_from_eyecite(
        cls, eyecite_citation: FullCaseCitation, citing_opinion: Opinion
    ) -> "UnmatchedCitation":
        """
        Create an UnmatchedCitation instance using an eyecite FullCaseCitation
        Saving is left to the caller

        :param eyecite_citation: a FullCaseCitation as returned by
            eyecite.get_citations
        :param citing_opinion: the opinion which uses the citation
        :return: an unsaved UnmatchedCitation with UNMATCHED status
        """
        # The citation type is taken from the first edition that eyecite
        # lists for this citation
        cite_type_str = eyecite_citation.all_editions[0].reporter.cite_type
        return cls(
            citing_opinion=citing_opinion,
            status=cls.UNMATCHED,
            citation_string=eyecite_citation.matched_text(),
            # court and year are stored as empty strings when eyecite
            # returned a falsy value for them
            court_id=eyecite_citation.metadata.court or "",
            year=eyecite_citation.metadata.year or "",
            volume=eyecite_citation.groups["volume"],
            reporter=eyecite_citation.corrected_reporter(),
            page=eyecite_citation.groups["page"],
            type=map_reporter_db_cite_type(cite_type_str),
        )
Loading

0 comments on commit e363c90

Please sign in to comment.