Skip to content

Commit

Permalink
Fix naive handling of relative links when proofing (#75)
Browse files Browse the repository at this point in the history
This commit fixes naive handling of relative links when proofing. The
problem here was the assumption that a relative link of foo.html that
was coming from a file at bar/fubar.md in the filesystem would always
appear at bar/foo.html.

That is not always true. If directory_urls was on, then it wasn't true.
Thus, #46.

It was also not true when using relative links between blog posts when
using the mkdocs material blog plugin.

This commit changes the relative url lookup logic to be more robust and
in the process make the proofer work with both the mkdocs material blog
plugin and with directory_urls on.

It does this by storing additional information in the lookup files that
we use to find information about pages. We can now look up a file by its
src_uri entry, for example blog/posts/index.md and then use that to get
the dest_uri like blog/12/10/some-post/ and then use that dest_uri value
to robustly handle figuring out where a relative link is pointing.

Closes #46
Closes #70
  • Loading branch information
SeanTAllen authored Feb 27, 2024
1 parent 14c97cc commit 5776fea
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 68 deletions.
12 changes: 3 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,6 @@ plugins:
- htmlproofer
```
To enable cross-page anchor validation, you must set `use_directory_urls = False` in `mkdocs.yml`:

```yaml
use_directory_urls: False
```

## Configuring
### `enabled`
Expand Down Expand Up @@ -147,11 +141,11 @@ plugins:

## Compatibility with `attr_list` extension

If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown.
If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown.
This can be useful for multilingual documentation to keep anchors as language neutral permalinks in all languages.

* A sample for a heading `# Grüße {#greetings}` (the slugified generated anchor `Gre` is overwritten with `greetings`).
* This also works for images `this is a nice image [](foo-bar.png){#nice-image}`
* This also works for images `this is a nice image [](foo-bar.png){#nice-image}`
* And generall for paragraphs:
```markdown
Listing: This is noteworthy.
Expand All @@ -164,4 +158,4 @@ More information about plugins in the [MkDocs documentation](http://www.mkdocs.o

## Acknowledgement

This work is based on the [mkdocs-markdownextradata-plugin](https://github.com/rosscdh/mkdocs-markdownextradata-plugin) project and the [Finding and Fixing Website Link Rot with Python, BeautifulSoup and Requests](https://www.twilio.com/en-us/blog/find-fix-website-link-rot-python-beautifulsoup-requests-html) article.
This work is based on the [mkdocs-markdownextradata-plugin](https://github.com/rosscdh/mkdocs-markdownextradata-plugin) project and the [Finding and Fixing Website Link Rot with Python, BeautifulSoup and Requests](https://www.twilio.com/en-us/blog/find-fix-website-link-rot-python-beautifulsoup-requests-html) article.
26 changes: 12 additions & 14 deletions htmlproofer/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,13 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None:
if not self.config['enabled']:
return

use_directory_urls = config.data["use_directory_urls"]

# Optimization: At this point, we have all the files, so we can create
# a dictionary for faster lookups. Prior to this point, files are
# still being updated so creating a dictionary before now would result
# in incorrect values appearing as the key.
opt_files = {}
opt_files.update({os.path.normpath(file.url): file for file in self.files})
opt_files.update({os.path.normpath(file.src_uri): file for file in self.files})

# Optimization: only parse links and headings
# li, sup are used for footnotes
Expand All @@ -123,7 +122,7 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None:
if self.config['warn_on_ignored_urls']:
log_warning(f"ignoring URL {url} from {page.file.src_path}")
else:
url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files, use_directory_urls)
url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files)
if self.bad_url(url_status) and self.is_error(self.config, url, url_status):
self.report_invalid_url(url, url_status, page.file.src_path)

Expand Down Expand Up @@ -161,8 +160,7 @@ def get_url_status(
url: str,
src_path: str,
all_element_ids: Set[str],
files: Dict[str, File],
use_directory_urls: bool
files: Dict[str, File]
) -> int:
if any(pat.match(url) for pat in LOCAL_PATTERNS):
return 0
Expand All @@ -174,18 +172,13 @@ def get_url_status(
return 0
if fragment and not path:
return 0 if url[1:] in all_element_ids else 404
elif not use_directory_urls:
# use_directory_urls = True injects too many challenges for locating the correct target
# Markdown file, so disable target anchor validation in this case. Examples include:
# ../..#BAD_ANCHOR style links to index.html and extra ../ inserted into relative
# links.
else:
is_valid = self.is_url_target_valid(url, src_path, files)
url_status = 404
if not is_valid and self.is_error(self.config, url, url_status):
log_warning(f"Unable to locate source file for: {url}")
return url_status
return 0
return 0

@staticmethod
def is_url_target_valid(url: str, src_path: str, files: Dict[str, File]) -> bool:
Expand Down Expand Up @@ -225,9 +218,14 @@ def find_source_file(url: str, src_path: str, files: Dict[str, File]) -> Optiona
# Convert root/site paths
search_path = os.path.normpath(url[1:])
else:
# Handle relative links by concatenating the source dir with the destination path
src_dir = urllib.parse.quote(str(pathlib.Path(src_path).parent), safe='/\\')
search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url)))
# Handle relative links by looking up the destination url for the
# src_path and getting the parent directory.
try:
dest_uri = files[src_path].dest_uri
src_dir = urllib.parse.quote(str(pathlib.Path(dest_uri).parent), safe='/\\')
search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url)))
except KeyError:
return None

try:
return files[search_path]
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/docs/nested/page1.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ you can either link to a local file without or with an anchor.
* [Main Page](../index.md)
* [Sub-Page](./page2.md)
* <figure markdown>
<a href="../assets/hello-world.drawio.svg">
<a href="/assets/hello-world.drawio.svg">
![Image](../assets/hello-world.drawio.svg)
</a>
</figure>
Expand All @@ -27,6 +27,6 @@ But allows valid anchors such as

## Image Link absolute/relative

<a href="../assets/hello-world.drawio.svg">![test](../assets/hello-world.drawio.svg)</a>
<a href="/assets/hello-world.drawio.svg">![test](../assets/hello-world.drawio.svg)</a>

<a href="/assets/hello-world.drawio.svg">![test](/assets/hello-world.drawio.svg)</a>
130 changes: 87 additions & 43 deletions tests/unit/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_on_post_page__plugin_disabled():
),
)
def test_get_url_status__ignore_local_servers(plugin, empty_files, url):
assert plugin.get_url_status(url, 'src/path.md', set(), empty_files, False) == 0
assert plugin.get_url_status(url, 'src/path.md', set(), empty_files) == 0


@pytest.mark.parametrize(
Expand All @@ -126,7 +126,7 @@ def test_get_url_status(validate_external: bool):
plugin.load_config({'validate_external_urls': validate_external})

def get_url():
return plugin.get_url_status('https://google.com', 'src/path.md', set(), empty_files, False)
return plugin.get_url_status('https://google.com', 'src/path.md', set(), empty_files)

if validate_external:
with pytest.raises(Exception):
Expand Down Expand Up @@ -172,9 +172,9 @@ def test_contains_anchor(plugin, markdown, anchor, expected):


def test_get_url_status__same_page_anchor(plugin, empty_files):
assert plugin.get_url_status('#ref', 'src/path.md', {'ref'}, empty_files, False) == 0
assert plugin.get_url_status('##ref', 'src/path.md', {'ref'}, empty_files, False) == 404
assert plugin.get_url_status('#ref', 'src/path.md', set(), empty_files, False) == 404
assert plugin.get_url_status('#ref', 'src/path.md', {'ref'}, empty_files) == 0
assert plugin.get_url_status('##ref', 'src/path.md', {'ref'}, empty_files) == 404
assert plugin.get_url_status('#ref', 'src/path.md', set(), empty_files) == 404


@pytest.mark.parametrize(
Expand All @@ -195,7 +195,7 @@ def test_get_url_status__external(plugin, empty_files, url):

with patch.object(HtmlProoferPlugin, "get_external_url") as mock_get_ext_url:
mock_get_ext_url.return_value = expected_status
status = plugin.get_url_status(url, src_path, set(), empty_files, False)
status = plugin.get_url_status(url, src_path, set(), empty_files)

mock_get_ext_url.assert_called_once_with(url, scheme, src_path)
assert status == expected_status
Expand Down Expand Up @@ -241,39 +241,64 @@ def test_get_url_status__local_page(plugin):
index_page = Mock(spec=Page, markdown='# Heading\nContent')
page1_page = Mock(spec=Page, markdown='# Page One\n## Sub Heading\nContent')
special_char_page = Mock(spec=Page, markdown='# Heading éèà\n## Sub Heading éèà\nContent')
files = {os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page),
Mock(spec=File, src_path='page1.md', dest_path='page1.html', url='page1.html', page=page1_page),
mock_files = Files([
Mock(spec=File, src_path='index.md', dest_path='index.html',
dest_uri='index.html', url='index.html', src_uri='index.md',
page=index_page),
Mock(spec=File, src_path='page1.md', dest_path='page1.html',
dest_uri='page1.html', url='page1.html', src_uri='page1.md',
page=page1_page),
Mock(spec=File, src_path='Dir éèà/éèà.md', dest_path='Dir éèà/éèà.html',
url='Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html', page=special_char_page),
])}
dest_uri='Dir éèà/éèà.html',
url='Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html',
src_uri='Dir éèà/éèà.md', page=special_char_page),
Mock(spec=File, src_path='Dir éèà/page1.md', dest_path='Dir éèà/page1.html',
dest_uri='Dir éèà/page1.html',
url='Dir%20%C3%A9%C3%A8%C3%A0/page1.html',
src_uri='Dir%20%C3%A9%C3%A8%C3%A0/page1.md',
page=special_char_page),
])
files = {}
files.update({os.path.normpath(file.url): file for file in mock_files})
files.update({file.src_uri: file for file in mock_files})

assert plugin.get_url_status('index.html', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('index.html#heading', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('index.html#bad-heading', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('index.html', 'page1.md', set(), files) == 0
assert plugin.get_url_status('index.html#heading', 'page1.md', set(), files) == 0
assert plugin.get_url_status('index.html#bad-heading', 'page1.md', set(), files) == 404

assert plugin.get_url_status('page1.html', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('page1.html#sub-heading', 'page1.md', set(), files, False) == 0
assert plugin.get_url_status('page1.html#heading', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('page1.html', 'page1.md', set(), files) == 0
assert plugin.get_url_status('page1.html#sub-heading', 'page1.md', set(), files) == 0
assert plugin.get_url_status('page1.html#heading', 'page1.md', set(), files) == 404

assert plugin.get_url_status('page2.html', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('page2.html#heading', 'page1.md', set(), files, False) == 404
assert plugin.get_url_status('page2.html', 'page1.md', set(), files) == 404
assert plugin.get_url_status('page2.html#heading', 'page1.md', set(), files) == 404

assert plugin.get_url_status('Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html#sub-heading-eea',
'page1.md', set(), files, False) == 0
assert plugin.get_url_status('%C3%A9%C3%A8%C3%A0.html#sub-heading-eea', 'Dir éèà/page3.md', set(), files, False) == 0
assert plugin.get_url_status(
'Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html#sub-heading-eea',
'page1.md', set(), files) == 0
assert plugin.get_url_status(
'%C3%A9%C3%A8%C3%A0.html#sub-heading-eea',
'Dir%20%C3%A9%C3%A8%C3%A0/page1.md',
set(), files) == 0


def test_get_url_status__non_markdown_page(plugin):
index_page = Mock(spec=Page, markdown='# Heading\nContent')
files = {os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page),
Mock(spec=File, src_path='drawing.svg', dest_path='drawing.svg', url='drawing.svg', page=None),
])}
mock_files = Files([
Mock(spec=File, src_path='index.md', dest_path='index.html',
dest_uri='index.html', url='index.html', src_uri='index.md',
page=index_page),
Mock(spec=File, src_path='drawing.svg', dest_path='drawing.svg',
dest_uri='index.html', url='drawing.svg', src_uri='drawing.svg',
page=None),
])
files = {}
files.update({os.path.normpath(file.url): file for file in mock_files})
files.update({file.src_uri: file for file in mock_files})

assert plugin.get_url_status('drawing.svg', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('/drawing.svg', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('not-existing.svg', 'index.md', set(), files, False) == 404
assert plugin.get_url_status('drawing.svg', 'index.md', set(), files) == 0
assert plugin.get_url_status('/drawing.svg', 'index.md', set(), files) == 0
assert plugin.get_url_status('not-existing.svg', 'index.md', set(), files) == 404


def test_get_url_status__local_page_nested(plugin):
Expand All @@ -282,48 +307,67 @@ def test_get_url_status__local_page_nested(plugin):
nested1_sibling_page = Mock(spec=Page, markdown='# Nested Sibling')
nested2_page = Mock(spec=Page, markdown='# Nested\n## Nested Two\nContent')
nested2_sibling_page = Mock(spec=Page, markdown='# Nested Sibling')
files = {os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page),
mock_files = Files([
Mock(
spec=File,
src_path='index.md',
dest_path='index.html',
dest_uri='index.html',
url='index.html',
src_uri='index.md',
page=index_page),
Mock(
spec=File,
src_path='foo/bar/nested.md',
dest_path='foo/bar/nested.html',
dest_uri='foo/bar/nested.html',
url='foo/bar/nested.html',
src_uri='foo/bar/nested.md',
page=nested1_page
),
Mock(
spec=File,
src_path='foo/bar/sibling.md',
dest_path='foo/bar/sibling.html',
dest_uri='foo/bar/sibling.html',
url='foo/bar/sibling.html',
src_uri='foo/bar/sibling.md',
page=nested1_sibling_page
),
Mock(
spec=File,
src_path='foo/baz/nested.md',
dest_path='foo/baz/nested.html',
dest_uri='foo/baz/nested.html',
url='foo/baz/nested.html',
src_uri='foo/baz/nested.md',
page=nested2_page
),
Mock(
spec=File,
src_path='foo/baz/sibling.md',
dest_path='foo/baz/sibling.html',
dest_uri='foo/baz/sibling.html',
url='foo/baz/sibling.html',
src_uri='foo/baz/sibling.md',
page=nested2_sibling_page
),
])}
])

files = {}
files.update({os.path.normpath(file.url): file for file in mock_files})
files.update({file.src_uri: file for file in mock_files})

assert plugin.get_url_status('nested.html#nested-one', 'foo/bar/sibling.md', set(), files, False) == 0
assert plugin.get_url_status('nested.html#nested-two', 'foo/bar/sibling.md', set(), files, False) == 404
assert plugin.get_url_status('nested.html#nested-one', 'foo/bar/sibling.md', set(), files) == 0
assert plugin.get_url_status('nested.html#nested-two', 'foo/bar/sibling.md', set(), files) == 404

assert plugin.get_url_status('nested.html#nested-two', 'foo/baz/sibling.md', set(), files, False) == 0
assert plugin.get_url_status('nested.html#nested-one', 'foo/baz/sibling.md', set(), files, False) == 404
assert plugin.get_url_status('nested.html#nested-two', 'foo/baz/sibling.md', set(), files) == 0
assert plugin.get_url_status('nested.html#nested-one', 'foo/baz/sibling.md', set(), files) == 404

assert plugin.get_url_status('foo/bar/nested.html#nested-one', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('foo/baz/nested.html#nested-two', 'index.md', set(), files, False) == 0
assert plugin.get_url_status('foo/bar/nested.html#nested-one', 'index.md', set(), files) == 0
assert plugin.get_url_status('foo/baz/nested.html#nested-two', 'index.md', set(), files) == 0

assert plugin.get_url_status('/index.html', 'foo/baz/sibling.md', set(), files, False) == 0
assert plugin.get_url_status('/index.html', 'foo/baz/sibling.md', set(), files) == 0


@patch.object(htmlproofer.plugin, "log_warning", autospec=True)
Expand All @@ -334,7 +378,7 @@ def test_get_url_status__excluded_non_existing_relative_url__no_warning(log_warn
files = {}
plugin.config['raise_error_excludes'][url_status] = [url]

status = plugin.get_url_status(url, src_path, set(), files, False)
status = plugin.get_url_status(url, src_path, set(), files)

log_warning_mock.assert_not_called()
assert 0 == status
Expand All @@ -349,12 +393,12 @@ def test_get_url_status__excluded_existing_relative_url__no_warning(log_warning_
existing_page = Mock(spec=Page, markdown='')
files = {
os.path.normpath(file.url): file for file in Files([
Mock(spec=File, src_path=src_path, dest_path=url, url=url, page=existing_page)
Mock(spec=File, src_path=src_path, dest_path=url, dest_uri=url, url=url, src_uri=src_path, page=existing_page)
])
}
plugin.config['raise_error_excludes'][url_status] = [url]

status = plugin.get_url_status(url, src_path, set(), files, False)
status = plugin.get_url_status(url, src_path, set(), files)

log_warning_mock.assert_not_called()
assert 0 == status
Expand All @@ -367,7 +411,7 @@ def test_get_url_status__non_existing_relative_url__warning_and_404(log_warning_
src_path = "index.md"
files = {}

status = plugin.get_url_status(url, src_path, set(), files, False)
status = plugin.get_url_status(url, src_path, set(), files)

log_warning_mock.assert_called_once()
assert expected_url_status == status
Expand Down

0 comments on commit 5776fea

Please sign in to comment.