Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add direct book providers #6585

Merged
merged 23 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ec1909d
Experiment with direct book providers
cdrini May 25, 2022
d65f95a
Add download links for pressbooks direct providers
cdrini May 25, 2022
4731816
Only show read button if open-access book
cdrini Jun 3, 2022
e8756ae
Make direct book providers support OPDS samples
cdrini Jun 3, 2022
02f9ce0
Make solr-updater take into account book providers for ebook_access
cdrini Oct 17, 2022
4980134
Add optional providers/description to search resp
cdrini Oct 18, 2022
feb72ee
Search results page shows read when providers
cdrini Oct 18, 2022
cc1dbc5
Make EbookProvider handle html access enum
cdrini Oct 19, 2022
eb4234c
Generate providers sections for TBP
cdrini Oct 19, 2022
77b1896
TMP: Hide weird /request links from testing!
cdrini Oct 19, 2022
b8083fe
Fix mypy
cdrini Nov 19, 2022
4e4b870
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 12, 2024
2268e53
Linting: type hints + exception for too many branches in worksearch
scottbarnes Jun 12, 2024
54b5f91
Basic /borrow -> webbook functionality
scottbarnes Jun 19, 2024
5a2a980
Revert "Add LCP download links for IA"
scottbarnes Jul 9, 2024
7418b02
Linting: placate ruff
scottbarnes Jul 9, 2024
1d8fd6c
Fix: use conditions in place of assert for edition existence
scottbarnes Jul 9, 2024
c1891d5
Feature: ensure action==read also succeeds
scottbarnes Jul 9, 2024
abaf398
Refactor: rename provider -> acquisition
scottbarnes Jul 9, 2024
2f9d60e
Fix: omit None from solr fields
scottbarnes Jul 9, 2024
6298877
Feature: add default generic toast for direct providers
scottbarnes Jul 10, 2024
8961a53
Implement code review feedback
scottbarnes Jul 10, 2024
84bdb7d
Implement changes from code review
scottbarnes Jul 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 257 additions & 3 deletions openlibrary/book_providers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import TypedDict, Literal, cast, TypeVar, Generic
from dataclasses import dataclass
import logging
from collections.abc import Callable, Iterator
from typing import TypedDict, Literal, cast, TypeVar, Generic
from urllib import parse

import web
from web import uniq
Expand All @@ -10,6 +13,13 @@
from openlibrary.utils import OrderedEnum, multisort_best


# Module-level logger; its name is given explicitly as a dotted string.
logger = logging.getLogger("openlibrary.book_providers")

# The closed set of access values an acquisition may declare.
AcquisitionAccessLiteral = Literal[
    'sample',
    'buy',
    'open-access',
    'borrow',
    'subscribe',
]


class EbookAccess(OrderedEnum):
# Keep in sync with solr/conf/enumsConfig.xml !
NO_EBOOK = 0
Expand All @@ -21,6 +31,103 @@ class EbookAccess(OrderedEnum):
def to_solr_str(self):
    """Return this member's name in lower case."""
    member_name = self.name
    return member_name.lower()

@staticmethod
def from_acquisition_access(literal: AcquisitionAccessLiteral) -> 'EbookAccess':
if literal == 'sample':
# We need to update solr to handle these! Requires full reindex
return EbookAccess.PRINTDISABLED
elif literal == 'buy':
return EbookAccess.NO_EBOOK
elif literal == 'open-access':
return EbookAccess.PUBLIC
elif literal == 'borrow':
return EbookAccess.BORROWABLE
elif literal == 'subscribe':
return EbookAccess.NO_EBOOK
else:
raise ValueError(f'Unknown access literal: {literal}')


@dataclass
class Acquisition:
    """
    Acquisition represents a book resource found on another website, such as
    Standard Ebooks.

    Wording inspired by OPDS; see https://specs.opds.io/opds-1.2#23-acquisition-feeds
    """

    # How the resource can be obtained (e.g. 'open-access', 'borrow', 'buy').
    access: AcquisitionAccessLiteral
    # Kind of resource the URL points at.
    format: Literal['web', 'pdf', 'epub', 'audio']
    # Price text, or None when absent; OPDS links yield "<value> <currency>".
    price: str | None
    # Location of the resource.
    url: str
    # Optional provider name carried over from the source record.
    provider_name: str | None = None

    @property
    def ebook_access(self) -> EbookAccess:
        """The EbookAccess member corresponding to ``self.access``."""
        return EbookAccess.from_acquisition_access(self.access)

    @staticmethod
    def from_json(json: dict) -> 'Acquisition':
        """
        Build an Acquisition from either of the two supported record shapes.

        A record with an ``href`` key is handled as an OPDS-style link (see
        ``from_opds_json``). A record with a ``url`` key is handled as a
        Pressbooks/OL-style record, where ``access`` defaults to
        'open-access' and ``format`` defaults to 'web'.

        :raises ValueError: If the record has neither ``href`` nor ``url``.
        """
        if 'href' in json:
            # OPDS-style provider
            return Acquisition.from_opds_json(json)
        if 'url' not in json:
            raise ValueError(f'Unknown ebook acquisition format: {json}')

        # We have an inconsistency in our API: these HTML-style access names
        # are translated to the acquisition literals; any other value is
        # passed through unchanged.
        html_access: dict[str, AcquisitionAccessLiteral] = {
            'read': 'open-access',
            'listen': 'open-access',
            'buy': 'buy',
            'borrow': 'borrow',
            'preview': 'sample',
        }
        raw_access = json.get('access', 'open-access')

        # Pressbooks/OL-style
        return Acquisition(
            access=html_access.get(raw_access, raw_access),
            format=json.get('format', 'web'),
            price=json.get('price'),
            url=json['url'],
            provider_name=json.get('provider_name'),
        )

    @staticmethod
    def from_opds_json(json: dict) -> 'Acquisition':
        """
        Build an Acquisition from an OPDS-style link object.

        The format is derived from a MIME type: the ``type`` of the first
        ``properties.indirectAcquisition`` entry when there is one, otherwise
        the link's own ``type``. Unrecognised MIME types are logged and
        treated as 'web'. The access value is the last '/'-separated segment
        of ``rel``.

        :raises KeyError: If ``rel`` or ``href`` is missing, or if ``type`` is
            missing when no indirect acquisition is given.
        """
        # ``properties`` is optional; treat a missing or null value as empty
        # so a record with ``"properties": null`` does not raise.
        properties = json.get('properties') or {}

        indirect = properties.get('indirectAcquisition')
        mimetype = indirect[0]['type'] if indirect else json['type']

        exact_formats: dict[str, Literal['web', 'pdf', 'epub', 'audio']] = {
            'application/pdf': 'pdf',
            'application/epub+zip': 'epub',
            'text/html': 'web',
        }
        fmt: Literal['web', 'pdf', 'epub', 'audio'] = 'web'
        if mimetype.startswith('audio/'):
            fmt = 'audio'
        elif mimetype in exact_formats:
            fmt = exact_formats[mimetype]
        else:
            # Fall back to 'web', but leave a trace of the unexpected type.
            logger.warning('Unknown mimetype: %s', mimetype)

        price_info = properties.get('price')
        price = (
            f"{price_info['value']} {price_info['currency']}" if price_info else None
        )

        return Acquisition(
            access=json['rel'].split('/')[-1],
            format=fmt,
            price=price,
            url=json['href'],
            provider_name=json.get('name'),
        )


class IALiteMetadata(TypedDict):
boxid: set[str]
Expand All @@ -38,7 +145,7 @@ class AbstractBookProvider(Generic[TProviderMetadata]):
The key in the identifiers field on editions;
see https://openlibrary.org/config/edition
"""
identifier_key: str
identifier_key: str | None

def get_olids(self, identifier):
return web.ctx.site.things(
Expand Down Expand Up @@ -112,6 +219,15 @@ def get_access(
# Most providers are for public-only ebooks right now
return EbookAccess.PUBLIC

def get_acquisitions(
self,
edition: Edition,
) -> list[Acquisition]:
if edition.providers:
return [Acquisition.from_json(dict(p)) for p in edition.providers]
else:
Comment on lines +226 to +227
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're a little torn on this wording; there are a few spots in the code where providers refers to a list of AbstractBookProvider, but here it instead refers to effectively a list of Acquisition. Do we want to change our data schema to refer to this as acquisitions as well?

return []


class InternetArchiveProvider(AbstractBookProvider[IALiteMetadata]):
short_name = 'ia'
Expand Down Expand Up @@ -195,6 +311,20 @@ def render_download_options(self, edition: Edition, extra_args: list | None = No
def is_own_ocaid(self, ocaid: str) -> bool:
    """Return True when ``ocaid`` contains the substring 'librivox'."""
    marker = 'librivox'
    return marker in ocaid

def get_acquisitions(
    self,
    edition: Edition,
) -> list[Acquisition]:
    """
    Return a one-element list holding the open-access audio acquisition for
    ``edition``, linking to librivox.org under the edition's best identifier.
    """
    librivox_url = f'https://librivox.org/{self.get_best_identifier(edition)}'
    audiobook = Acquisition(
        access='open-access',
        format='audio',
        price=None,
        url=librivox_url,
        provider_name=self.short_name,
    )
    return [audiobook]


class ProjectGutenbergProvider(AbstractBookProvider):
short_name = 'gutenberg'
Expand All @@ -203,6 +333,20 @@ class ProjectGutenbergProvider(AbstractBookProvider):
def is_own_ocaid(self, ocaid: str) -> bool:
    """Return True when ``ocaid`` ends with the suffix 'gut'."""
    suffix = 'gut'
    return ocaid.endswith(suffix)

def get_acquisitions(
    self,
    edition: Edition,
) -> list[Acquisition]:
    """
    Return a one-element list holding the open-access web acquisition for
    ``edition``, linking to its gutenberg.org ebook page.
    """
    ebook_url = f'https://www.gutenberg.org/ebooks/{self.get_best_identifier(edition)}'
    web_book = Acquisition(
        access='open-access',
        format='web',
        price=None,
        url=ebook_url,
        provider_name=self.short_name,
    )
    return [web_book]


class StandardEbooksProvider(AbstractBookProvider):
short_name = 'standard_ebooks'
Expand All @@ -212,6 +356,30 @@ def is_own_ocaid(self, ocaid: str) -> bool:
# Standard ebooks isn't archived on IA
return False

def get_acquisitions(
    self,
    edition: Edition,
) -> list[Acquisition]:
    """
    Return two open-access acquisitions for ``edition`` on standardebooks.org:
    the single-page web text first, then the EPUB download.
    """
    ebook_id = self.get_best_identifier(edition)
    ebook_root = 'https://standardebooks.org/ebooks/' + ebook_id
    # The download file name is the identifier with '/' replaced by '_'.
    epub_stem = ebook_id.replace('/', '_')

    read_online = Acquisition(
        access='open-access',
        format='web',
        price=None,
        url=f'{ebook_root}/text/single-page',
        provider_name=self.short_name,
    )
    epub_download = Acquisition(
        access='open-access',
        format='epub',
        price=None,
        url=f'{ebook_root}/downloads/{epub_stem}.epub',
        provider_name=self.short_name,
    )
    return [read_online, epub_download]


class OpenStaxProvider(AbstractBookProvider):
short_name = 'openstax'
Expand All @@ -220,6 +388,20 @@ class OpenStaxProvider(AbstractBookProvider):
def is_own_ocaid(self, ocaid: str) -> bool:
    """Always return False: no ``ocaid`` is treated as belonging to this provider."""
    owned = False
    return owned

def get_acquisitions(
    self,
    edition: Edition,
) -> list[Acquisition]:
    """
    Return a one-element list holding the open-access web acquisition for
    ``edition``, linking to its openstax.org book details page.
    """
    details_url = (
        f'https://openstax.org/details/books/{self.get_best_identifier(edition)}'
    )
    web_book = Acquisition(
        access='open-access',
        format='web',
        price=None,
        url=details_url,
        provider_name=self.short_name,
    )
    return [web_book]


class CitaPressProvider(AbstractBookProvider):
short_name = 'cita_press'
Expand All @@ -229,9 +411,81 @@ def is_own_ocaid(self, ocaid: str) -> bool:
return False


class DirectProvider(AbstractBookProvider):
short_name = 'direct'
identifier_key = None

@property
def db_selector(self):
    """Return the dotted selector ``providers.url`` used for this provider."""
    selector = "providers.url"
    return selector

@property
def solr_key(self):
    """Return None: this provider has no solr key."""
    # TODO: Not implemented yet
    key = None
    return key

def get_identifiers(self, ed_or_solr: Edition | dict) -> list[str]:
    """
    Return the URLs of the record's ``providers`` entries whose access level
    is at least EbookAccess.PRINTDISABLED, in their original order.

    A record with no (or empty) ``providers`` yields an empty list.
    """
    if not ed_or_solr.get('providers'):
        # TODO: Not implemented for search/solr yet
        return []

    # It's an edition
    usable_urls = []
    for raw_provider in ed_or_solr['providers']:
        acquisition = Acquisition.from_json(raw_provider)
        if acquisition.ebook_access >= EbookAccess.PRINTDISABLED:
            usable_urls.append(acquisition.url)
    return usable_urls
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Follow-up for a future issue/pr: make this work in carousels. E.g. when a Web Book appears in a carousel, it should also display the "read" button


def render_read_button(
    self, ed_or_solr: Edition | dict, analytics_attr: Callable[[str], str]
):
    """
    Render the read button for the record's best acquisition.

    Only ``providers`` entries whose access level is at least
    EbookAccess.PRINTDISABLED are considered. The one with the highest
    access level wins; on ties, the earliest entry wins.

    :param ed_or_solr: Record whose ``providers`` entries are examined.
    :param analytics_attr: Not used by this implementation.
    :return: The rendered 'read_button' template, given the chosen
        acquisition and its domain, or '' when no entry qualifies.
    """
    # ``or []`` also covers a ``providers`` key that is present but None.
    candidates = (
        p
        for p in map(Acquisition.from_json, ed_or_solr.get('providers') or [])
        if p.ebook_access >= EbookAccess.PRINTDISABLED
    )
    # max() returns the first maximal item, which is the same element a
    # stable descending sort would put first.
    acquisition = max(candidates, key=lambda p: p.ebook_access, default=None)
    if acquisition is None:
        return ''

    # Pre-process the URL so ParseResult.netloc is always the domain; only
    # netloc is used. Check for a full scheme prefix: a bare "http" test
    # would wrongly skip scheme-less hosts such as "httpbin.org/x", for
    # which urlparse reports an empty netloc.
    url = acquisition.url
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url
    domain = parse.urlparse(url).netloc

    return render_template(
        self.get_template_path('read_button'), acquisition, domain
    )

def render_download_options(self, edition: Edition, extra_args: list | None = None):
    """Render no download options; both arguments are ignored."""
    # Return an empty string until #9581 is addressed.
    nothing_to_render = ""
    return nothing_to_render

def get_access(
    self,
    edition: dict,
    metadata: TProviderMetadata | None = None,
) -> EbookAccess:
    """
    Return the access level of the edition.

    The result is the highest EbookAccess among all of the edition's
    ``providers`` entries rather than that of the first entry only, so it
    agrees with the read button, which also picks the highest-access entry.

    :param edition: Record whose ``providers`` entries are examined.
    :param metadata: Not used by this implementation.
    :return: The highest access level found, or EbookAccess.NO_EBOOK when
        the edition has no ``providers`` entries.
    """
    # ``or []`` covers both a missing key and an explicit None.
    acquisitions = map(Acquisition.from_json, edition.get('providers') or [])
    return max(
        (acquisition.ebook_access for acquisition in acquisitions),
        default=EbookAccess.NO_EBOOK,
    )


PROVIDER_ORDER: list[AbstractBookProvider] = [
# These providers act essentially as their own publishers, so link to the first when
# we're on an edition page
DirectProvider(),
LibriVoxProvider(),
ProjectGutenbergProvider(),
StandardEbooksProvider(),
Expand Down Expand Up @@ -389,7 +643,7 @@ def get_best_edition(


def get_solr_keys():
    """
    Return the solr key of every provider in PROVIDER_ORDER that has one.

    Providers whose ``solr_key`` is falsy (e.g. None) are skipped, so the
    result contains no empty entries.
    """
    # Bind the key once so the property is evaluated a single time per provider.
    return [key for provider in PROVIDER_ORDER if (key := provider.solr_key)]


# Expose the result of get_book_provider_by_name('ia') as the `ia` attribute
# of get_book_provider.
# NOTE(review): setattr (with the B010 lint suppressed) is presumably used
# instead of plain attribute assignment to keep the type checker quiet — confirm.
setattr(get_book_provider, 'ia', get_book_provider_by_name('ia')) # noqa: B010
Loading
Loading