From 41b734bfdd1cd34cf9ae09dd794371f0811baa1c Mon Sep 17 00:00:00 2001 From: tristanlatr <19967168+tristanlatr@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:04:15 -0400 Subject: [PATCH] Implement Canonical URLs (#821) * Fix #608 --- README.rst | 2 ++ docs/source/conf.py | 6 ++++ docs/source/publish-github-action.rst | 1 + docs/source/quickstart.rst | 1 + docs/tests/test.py | 8 +++-- pydoctor/options.py | 10 +++++++ pydoctor/templatewriter/pages/__init__.py | 24 +++++++++++++-- pydoctor/test/test_commandline.py | 17 +++++++++++ pydoctor/test/test_templatewriter.py | 36 +++++++++++++++++++++++ pydoctor/themes/base/head.html | 3 +- pydoctor/themes/classic/head.html | 3 +- pydoctor/themes/readthedocs/head.html | 3 +- 12 files changed, 107 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 64b30e004..6aa2842df 100644 --- a/README.rst +++ b/README.rst @@ -74,6 +74,8 @@ in development ^^^^^^^^^^^^^^ * Drop Python 3.7 and support Python 3.13. +* Implement canonical HTML element (``<link rel="canonical" href="...">``) to help search engines reduce outdated content. + Enable this feature by passing the base URL of the API documentation with option ``--html-base-url``. * Improve collection of objects: - Document objects declared in the ``else`` block of 'if' statements (previously they were ignored). - Document objects declared in ``finalbody`` and ``else`` block of 'try' statements (previously they were ignored). diff --git a/docs/source/conf.py b/docs/source/conf.py index 5043188c9..eeba964ff 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -99,6 +99,7 @@ pydoctor_args = { 'main': [ '--html-output={outdir}/api/', # Make sure to have a trailing delimiter for better usage coverage. 
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/api', '--project-name=pydoctor', f'--project-version={version}', '--docformat=epytext', @@ -108,6 +109,7 @@ ] + _common_args, 'custom_template_demo': [ '--html-output={outdir}/custom_template_demo/', + '--html-base-url=https://pydoctor.readthedocs.io/en/latest/custom_template_demo', f'--project-version={version}', f'--template-dir={_pydoctor_root}/docs/sample_template', f'{_pydoctor_root}/pydoctor', @@ -116,6 +118,7 @@ '-qqq' ], # we don't want to hear any warnings from this custom template demo. 'epydoc_demo': [ '--html-output={outdir}/docformat/epytext_demo', + '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/epytext_demo', '--project-name=pydoctor-epytext-demo', '--project-version=1.3.0', '--docformat=epytext', @@ -126,6 +129,7 @@ ] + _common_args, 'restructuredtext_demo': [ '--html-output={outdir}/docformat/restructuredtext_demo', + '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/restructuredtext_demo', '--project-name=pydoctor-restructuredtext-demo', '--project-version=1.0.0', '--docformat=restructuredtext', @@ -136,6 +140,7 @@ ] + _common_args, 'numpy_demo': [ # no need to pass --docformat here, we use __docformat__ '--html-output={outdir}/docformat/numpy_demo', + '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/numpy_demo', '--project-name=pydoctor-numpy-style-demo', '--project-version=1.0.0', '--project-url=../google-numpy.html', @@ -145,6 +150,7 @@ ] + _common_args, 'google_demo': [ '--html-output={outdir}/docformat/google_demo', + '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/google_demo', '--project-name=pydoctor-google-style-demo', '--project-version=1.0.0', '--docformat=google', diff --git a/docs/source/publish-github-action.rst b/docs/source/publish-github-action.rst index 3e30ce751..ec7b7aef6 100644 --- a/docs/source/publish-github-action.rst +++ b/docs/source/publish-github-action.rst @@ -40,6 +40,7 @@ with 
the appropriate information. --project-name=(projectname) \ --project-url=https://github.com/$GITHUB_REPOSITORY \ --html-viewsource-base=https://github.com/$GITHUB_REPOSITORY/tree/$GITHUB_SHA \ + --html-base-url=https://$GITHUB_REPOSITORY_OWNER.github.io/${GITHUB_REPOSITORY#*/} \ --html-output=./apidocs \ --docformat=restructuredtext \ --intersphinx=https://docs.python.org/3/objects.inv \ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 92d3e9b6d..a0d0b4024 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -29,6 +29,7 @@ The result looks like `this `_. --project-version=1.2.0 \ --project-url=https://github.com/twisted/pydoctor/ \ --html-viewsource-base=https://github.com/twisted/pydoctor/tree/20.7.2 \ + --html-base-url=https://pydoctor.readthedocs.io/en/latest/api \ --html-output=docs/api \ --docformat=epytext \ --intersphinx=https://docs.python.org/3/objects.inv \ diff --git a/docs/tests/test.py b/docs/tests/test.py index 398e14912..be6134047 100644 --- a/docs/tests/test.py +++ b/docs/tests/test.py @@ -93,6 +93,7 @@ def test_page_contains_infos(): - nav and links to modules, classes, names - js script source - pydoctor github link in the footer + - canonical link """ infos = (f'', - 'pydoctor',) + 'pydoctor', + 'pydoctor', 'Twisted', - '',) + '', + ' ArgumentParser: "The default behaviour auto detects most common providers like Github, Bitbucket, GitLab or SourceForge. " "But in some cases you might have to override the template string, for instance to make it work with git-web, use: " '--html-viewsource-template="{mod_source_href}#n{lineno}"'), metavar='SOURCETEMPLATE', default=Options.HTML_SOURCE_TEMPLATE_DEFAULT) + parser.add_argument( + '--html-base-url', dest='htmlbaseurl', + help=("A base URL used to include a canonical link in every html page. " + "This help search engine to link to the preferred version of " + "a web page to prevent duplicated or oudated content. 
"), default=None, metavar='BASEURL', ) parser.add_argument( '--buildtime', dest='buildtime', help=("Use the specified build time over the current time. " @@ -297,6 +302,10 @@ def _convert_htmlwriter(s: str) -> Type['IWriter']: error(str(e)) def _convert_privacy(l: List[str]) -> List[Tuple['model.PrivacyClass', str]]: return list(map(functools.partial(parse_privacy_tuple, opt='--privacy'), l)) +def _convert_htmlbaseurl(url:str | None) -> str | None: + if url and not url.endswith('/'): + url += '/' + return url _RECOGNIZED_SOURCE_HREF = { # Sourceforge @@ -361,6 +370,7 @@ class Options: htmlwriter: Type['IWriter'] = attr.ib(converter=_convert_htmlwriter) htmlsourcebase: Optional[str] = attr.ib() htmlsourcetemplate: str = attr.ib() + htmlbaseurl: str | None = attr.ib(converter=_convert_htmlbaseurl) buildtime: Optional[str] = attr.ib() warnings_as_errors: bool = attr.ib() verbosity: int = attr.ib() diff --git a/pydoctor/templatewriter/pages/__init__.py b/pydoctor/templatewriter/pages/__init__.py index 68e013ec0..22dabe5f0 100644 --- a/pydoctor/templatewriter/pages/__init__.py +++ b/pydoctor/templatewriter/pages/__init__.py @@ -7,6 +7,7 @@ ) import ast import abc +from urllib.parse import urljoin from twisted.web.iweb import IRenderable, ITemplateLoader, IRequest from twisted.web.template import Element, Tag, renderer, tags @@ -146,9 +147,19 @@ class Head(TemplateElement): filename = 'head.html' - def __init__(self, title: str, loader: ITemplateLoader) -> None: + def __init__(self, title: str, baseurl: str | None, pageurl: str, + loader: ITemplateLoader) -> None: super().__init__(loader) self._title = title + self._baseurl = baseurl + self._pageurl = pageurl + + @renderer + def canonicalurl(self, request: IRequest, tag: Tag) -> Flattenable: + if not self._baseurl: + return '' + canonical_link = urljoin(self._baseurl, self._pageurl) + return tags.link(rel='canonical', href=canonical_link) @renderer def title(self, request: IRequest, tag: Tag) -> str: @@ -171,6 +182,14 @@ 
def __init__(self, system: model.System, if not loader: loader = self.lookup_loader(template_lookup) super().__init__(loader) + + @property + def page_url(self) -> str: + # This MUST be overridden in CommonPage + """ + The relative page url + """ + return self.filename def render(self, request: Optional[IRequest]) -> Tag: return tags.transparent(super().render(request)).fillSlots(**self.slot_map) @@ -197,7 +216,8 @@ def title(self) -> str: @renderer def head(self, request: IRequest, tag: Tag) -> IRenderable: - return Head(self.title(), Head.lookup_loader(self.template_lookup)) + return Head(self.title(), self.system.options.htmlbaseurl, self.page_url, + loader=Head.lookup_loader(self.template_lookup)) @renderer def nav(self, request: IRequest, tag: Tag) -> IRenderable: diff --git a/pydoctor/test/test_commandline.py b/pydoctor/test/test_commandline.py index dc6623c1b..01e06c97e 100644 --- a/pydoctor/test/test_commandline.py +++ b/pydoctor/test/test_commandline.py @@ -303,3 +303,20 @@ def test_index_hardlink(tmp_path: Path) -> None: assert (tmp_path / 'basic.html').exists() assert not (tmp_path / 'basic.html').is_symlink() assert (tmp_path / 'basic.html').is_file() + +def test_htmlbaseurl_option_all_pages(tmp_path: Path) -> None: + """ + Check that the canonical link is included in all html pages, including summary pages. 
+ """ + exit_code = driver.main(args=[ + '--html-base-url=https://example.com.abcde', + '--html-output', str(tmp_path), 'pydoctor/test/testpackages/basic/']) + assert exit_code == 0 + for t in tmp_path.iterdir(): + if not t.name.endswith('.html'): + continue + filename = t.name + if t.stem == 'basic': + filename = 'index.html' # since we have only one module it's linked as index.html + assert f' None: + src = ''' + var = True + class Cls: + foo = False + ''' + mod = fromText(src, modname='t', system=model.System(model.Options.from_args( + ['--html-base-url=https://example.org/t/docs'] + ))) + html1 = getHTMLOf(mod) + html2 = getHTMLOf(mod.contents['Cls']) + + assert ' None: + src = ''' + var = True + class Cls: + foo = False + ''' + mod = fromText(src, modname='t', system=model.System(model.Options.from_args( + ['--html-base-url=https://example.org/t/docs'] + ))) + mod2 = fromText(src, modname='t2', system=mod.system) + html1 = getHTMLOf(mod) + html2 = getHTMLOf(mod.contents['Cls']) + + assert ' - + <t:transparent t:render="title"> The title of Something </t:transparent> @@ -10,4 +10,5 @@ + Canonical URL diff --git a/pydoctor/themes/classic/head.html b/pydoctor/themes/classic/head.html index 37b2af2a6..7615872c8 100644 --- a/pydoctor/themes/classic/head.html +++ b/pydoctor/themes/classic/head.html @@ -1,5 +1,5 @@ - + <t:transparent t:render="title"> The title of Something </t:transparent> @@ -11,4 +11,5 @@ + Canonical URL diff --git a/pydoctor/themes/readthedocs/head.html b/pydoctor/themes/readthedocs/head.html index 1f75b1379..ae39aa647 100644 --- a/pydoctor/themes/readthedocs/head.html +++ b/pydoctor/themes/readthedocs/head.html @@ -1,5 +1,5 @@ - + <t:transparent t:render="title"> The title of Something </t:transparent> @@ -11,4 +11,5 @@ + Canonical URL