From 41b734bfdd1cd34cf9ae09dd794371f0811baa1c Mon Sep 17 00:00:00 2001
From: tristanlatr <19967168+tristanlatr@users.noreply.github.com>
Date: Tue, 15 Oct 2024 13:04:15 -0400
Subject: [PATCH] Implement Canonical URLs (#821)
* Fix #608
---
README.rst | 2 ++
docs/source/conf.py | 6 ++++
docs/source/publish-github-action.rst | 1 +
docs/source/quickstart.rst | 1 +
docs/tests/test.py | 8 +++--
pydoctor/options.py | 10 +++++++
pydoctor/templatewriter/pages/__init__.py | 24 +++++++++++++--
pydoctor/test/test_commandline.py | 17 +++++++++++
pydoctor/test/test_templatewriter.py | 36 +++++++++++++++++++++++
pydoctor/themes/base/head.html | 3 +-
pydoctor/themes/classic/head.html | 3 +-
pydoctor/themes/readthedocs/head.html | 3 +-
12 files changed, 107 insertions(+), 7 deletions(-)
diff --git a/README.rst b/README.rst
index 64b30e004..6aa2842df 100644
--- a/README.rst
+++ b/README.rst
@@ -74,6 +74,8 @@ in development
^^^^^^^^^^^^^^
* Drop Python 3.7 and support Python 3.13.
+* Implement canonical HTML element (``<link rel="canonical" href="...">``) to help search engines reduce outdated content.
+ Enable this feature by passing the base URL of the API documentation with option ``--html-base-url``.
* Improve collection of objects:
- Document objects declared in the ``else`` block of 'if' statements (previously they were ignored).
- Document objects declared in ``finalbody`` and ``else`` block of 'try' statements (previously they were ignored).
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 5043188c9..eeba964ff 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -99,6 +99,7 @@
pydoctor_args = {
'main': [
'--html-output={outdir}/api/', # Make sure to have a trailing delimiter for better usage coverage.
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/api',
'--project-name=pydoctor',
f'--project-version={version}',
'--docformat=epytext',
@@ -108,6 +109,7 @@
] + _common_args,
'custom_template_demo': [
'--html-output={outdir}/custom_template_demo/',
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/custom_template_demo',
f'--project-version={version}',
f'--template-dir={_pydoctor_root}/docs/sample_template',
f'{_pydoctor_root}/pydoctor',
@@ -116,6 +118,7 @@
'-qqq' ], # we don't want to hear any warnings from this custom template demo.
'epydoc_demo': [
'--html-output={outdir}/docformat/epytext_demo',
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/epytext_demo',
'--project-name=pydoctor-epytext-demo',
'--project-version=1.3.0',
'--docformat=epytext',
@@ -126,6 +129,7 @@
] + _common_args,
'restructuredtext_demo': [
'--html-output={outdir}/docformat/restructuredtext_demo',
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/restructuredtext_demo',
'--project-name=pydoctor-restructuredtext-demo',
'--project-version=1.0.0',
'--docformat=restructuredtext',
@@ -136,6 +140,7 @@
] + _common_args,
'numpy_demo': [ # no need to pass --docformat here, we use __docformat__
'--html-output={outdir}/docformat/numpy_demo',
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/numpy_demo',
'--project-name=pydoctor-numpy-style-demo',
'--project-version=1.0.0',
'--project-url=../google-numpy.html',
@@ -145,6 +150,7 @@
] + _common_args,
'google_demo': [
'--html-output={outdir}/docformat/google_demo',
+ '--html-base-url=https://pydoctor.readthedocs.io/en/latest/docformat/google_demo',
'--project-name=pydoctor-google-style-demo',
'--project-version=1.0.0',
'--docformat=google',
diff --git a/docs/source/publish-github-action.rst b/docs/source/publish-github-action.rst
index 3e30ce751..ec7b7aef6 100644
--- a/docs/source/publish-github-action.rst
+++ b/docs/source/publish-github-action.rst
@@ -40,6 +40,7 @@ with the appropriate information.
--project-name=(projectname) \
--project-url=https://github.com/$GITHUB_REPOSITORY \
--html-viewsource-base=https://github.com/$GITHUB_REPOSITORY/tree/$GITHUB_SHA \
+ --html-base-url=https://$GITHUB_REPOSITORY_OWNER.github.io/${GITHUB_REPOSITORY#*/} \
--html-output=./apidocs \
--docformat=restructuredtext \
--intersphinx=https://docs.python.org/3/objects.inv \
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index 92d3e9b6d..a0d0b4024 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -29,6 +29,7 @@ The result looks like `this `_.
--project-version=1.2.0 \
--project-url=https://github.com/twisted/pydoctor/ \
--html-viewsource-base=https://github.com/twisted/pydoctor/tree/20.7.2 \
+ --html-base-url=https://pydoctor.readthedocs.io/en/latest/api \
--html-output=docs/api \
--docformat=epytext \
--intersphinx=https://docs.python.org/3/objects.inv \
diff --git a/docs/tests/test.py b/docs/tests/test.py
index 398e14912..be6134047 100644
--- a/docs/tests/test.py
+++ b/docs/tests/test.py
@@ -93,6 +93,7 @@ def test_page_contains_infos():
- nav and links to modules, classes, names
- js script source
- pydoctor github link in the footer
+ - canonical link
"""
infos = (f'',
- 'pydoctor',)
+ 'pydoctor',
+ 'pydoctor',
'',
- '',)
+ '',
+ ' ArgumentParser:
"The default behaviour auto detects most common providers like Github, Bitbucket, GitLab or SourceForge. "
"But in some cases you might have to override the template string, for instance to make it work with git-web, use: "
'--html-viewsource-template="{mod_source_href}#n{lineno}"'), metavar='SOURCETEMPLATE', default=Options.HTML_SOURCE_TEMPLATE_DEFAULT)
+    parser.add_argument(  # base URL used to build the canonical link of each page
+        '--html-base-url', dest='htmlbaseurl',
+        help=("A base URL used to include a canonical link in every html page. "
+            "This helps search engines to link to the preferred version of "
+            "a web page to prevent duplicated or outdated content. "), default=None, metavar='BASEURL', )
parser.add_argument(
'--buildtime', dest='buildtime',
help=("Use the specified build time over the current time. "
@@ -297,6 +302,10 @@ def _convert_htmlwriter(s: str) -> Type['IWriter']:
error(str(e))
def _convert_privacy(l: List[str]) -> List[Tuple['model.PrivacyClass', str]]:
return list(map(functools.partial(parse_privacy_tuple, opt='--privacy'), l))
+def _convert_htmlbaseurl(url:str | None) -> str | None:
+    if not url or url.endswith('/'):
+        return url  # None, empty, or already slash-terminated: returned unchanged
+    return url + '/'  # otherwise append the missing trailing slash
_RECOGNIZED_SOURCE_HREF = {
# Sourceforge
@@ -361,6 +370,7 @@ class Options:
htmlwriter: Type['IWriter'] = attr.ib(converter=_convert_htmlwriter)
htmlsourcebase: Optional[str] = attr.ib()
htmlsourcetemplate: str = attr.ib()
+ htmlbaseurl: str | None = attr.ib(converter=_convert_htmlbaseurl)
buildtime: Optional[str] = attr.ib()
warnings_as_errors: bool = attr.ib()
verbosity: int = attr.ib()
diff --git a/pydoctor/templatewriter/pages/__init__.py b/pydoctor/templatewriter/pages/__init__.py
index 68e013ec0..22dabe5f0 100644
--- a/pydoctor/templatewriter/pages/__init__.py
+++ b/pydoctor/templatewriter/pages/__init__.py
@@ -7,6 +7,7 @@
)
import ast
import abc
+from urllib.parse import urljoin
from twisted.web.iweb import IRenderable, ITemplateLoader, IRequest
from twisted.web.template import Element, Tag, renderer, tags
@@ -146,9 +147,19 @@ class Head(TemplateElement):
filename = 'head.html'
- def __init__(self, title: str, loader: ITemplateLoader) -> None:
+ def __init__(self, title: str, baseurl: str | None, pageurl: str,
+ loader: ITemplateLoader) -> None:
super().__init__(loader)
self._title = title
+ self._baseurl = baseurl # base URL for the canonical link; when falsy, canonicalurl() renders ''
+ self._pageurl = pageurl # page URL that canonicalurl() joins onto the base URL
+
+    @renderer
+    def canonicalurl(self, request: IRequest, tag: Tag) -> Flattenable:
+        if self._baseurl:  # a base URL is configured: emit the canonical link
+            href = urljoin(self._baseurl, self._pageurl)
+            return tags.link(rel='canonical', href=href)
+        return ''  # no base URL: render an empty string instead
@renderer
def title(self, request: IRequest, tag: Tag) -> str:
@@ -171,6 +182,14 @@ def __init__(self, system: model.System,
if not loader:
loader = self.lookup_loader(template_lookup)
super().__init__(loader)
+
+ @property
+ def page_url(self) -> str:
+ # This MUST be overridden in CommonPage
+ """
+ The page URL, relative to the documentation base URL. Defaults to C{self.filename}.
+ """
+ return self.filename
def render(self, request: Optional[IRequest]) -> Tag:
return tags.transparent(super().render(request)).fillSlots(**self.slot_map)
@@ -197,7 +216,8 @@ def title(self) -> str:
@renderer
def head(self, request: IRequest, tag: Tag) -> IRenderable:
- return Head(self.title(), Head.lookup_loader(self.template_lookup))
+ return Head(self.title(), self.system.options.htmlbaseurl, self.page_url,
+ loader=Head.lookup_loader(self.template_lookup))
@renderer
def nav(self, request: IRequest, tag: Tag) -> IRenderable:
diff --git a/pydoctor/test/test_commandline.py b/pydoctor/test/test_commandline.py
index dc6623c1b..01e06c97e 100644
--- a/pydoctor/test/test_commandline.py
+++ b/pydoctor/test/test_commandline.py
@@ -303,3 +303,20 @@ def test_index_hardlink(tmp_path: Path) -> None:
assert (tmp_path / 'basic.html').exists()
assert not (tmp_path / 'basic.html').is_symlink()
assert (tmp_path / 'basic.html').is_file()
+
+def test_htmlbaseurl_option_all_pages(tmp_path: Path) -> None:
+ """
+ Check that the canonical link is included in all html pages, including summary pages.
+ """
+ exit_code = driver.main(args=[
+ '--html-base-url=https://example.com.abcde',
+ '--html-output', str(tmp_path), 'pydoctor/test/testpackages/basic/'])
+ assert exit_code == 0
+ for t in tmp_path.iterdir():
+ if not t.name.endswith('.html'):
+ continue
+ filename = t.name
+ if t.stem == 'basic':
+ filename = 'index.html' # since we have only one module it's linked as index.html
+ assert f' None:
+ src = '''
+ var = True
+ class Cls:
+ foo = False
+ '''
+ mod = fromText(src, modname='t', system=model.System(model.Options.from_args(
+ ['--html-base-url=https://example.org/t/docs']
+ )))
+ html1 = getHTMLOf(mod)
+ html2 = getHTMLOf(mod.contents['Cls'])
+
+ assert ' None:
+ src = '''
+ var = True
+ class Cls:
+ foo = False
+ '''
+ mod = fromText(src, modname='t', system=model.System(model.Options.from_args(
+ ['--html-base-url=https://example.org/t/docs']
+ )))
+ mod2 = fromText(src, modname='t2', system=mod.system)
+ html1 = getHTMLOf(mod)
+ html2 = getHTMLOf(mod.contents['Cls'])
+
+ assert '
-
+
The title of Something
@@ -10,4 +10,5 @@
+ Canonical URL
diff --git a/pydoctor/themes/classic/head.html b/pydoctor/themes/classic/head.html
index 37b2af2a6..7615872c8 100644
--- a/pydoctor/themes/classic/head.html
+++ b/pydoctor/themes/classic/head.html
@@ -1,5 +1,5 @@
-
+
The title of Something
@@ -11,4 +11,5 @@
+ Canonical URL
diff --git a/pydoctor/themes/readthedocs/head.html b/pydoctor/themes/readthedocs/head.html
index 1f75b1379..ae39aa647 100644
--- a/pydoctor/themes/readthedocs/head.html
+++ b/pydoctor/themes/readthedocs/head.html
@@ -1,5 +1,5 @@
-
+
The title of Something
@@ -11,4 +11,5 @@
+ Canonical URL