diff --git a/README.md b/README.md index 0eb0eb4..3ee32b6 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,6 @@ plugins: - htmlproofer ``` -To enable cross-page anchor validation, you must set `use_directory_urls = False` in `mkdocs.yml`: - -```yaml -use_directory_urls: False -``` - ## Configuring ### `enabled` @@ -147,11 +141,11 @@ plugins: ## Compatibility with `attr_list` extension -If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown. +If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown. This can be useful for multilingual documentation to keep anchors as language neutral permalinks in all languages. * A sample for a heading `# Grüße {#greetings}` (the slugified generated anchor `Gre` is overwritten with `greetings`). -* This also works for images `this is a nice image [](foo-bar.png){#nice-image}` +* This also works for images `this is a nice image [](foo-bar.png){#nice-image}` * And generall for paragraphs: ```markdown Listing: This is noteworthy. @@ -164,4 +158,4 @@ More information about plugins in the [MkDocs documentation](http://www.mkdocs.o ## Acknowledgement -This work is based on the [mkdocs-markdownextradata-plugin](https://github.com/rosscdh/mkdocs-markdownextradata-plugin) project and the [Finding and Fixing Website Link Rot with Python, BeautifulSoup and Requests](https://www.twilio.com/en-us/blog/find-fix-website-link-rot-python-beautifulsoup-requests-html) article. +This work is based on the [mkdocs-markdownextradata-plugin](https://github.com/rosscdh/mkdocs-markdownextradata-plugin) project and the [Finding and Fixing Website Link Rot with Python, BeautifulSoup and Requests](https://www.twilio.com/en-us/blog/find-fix-website-link-rot-python-beautifulsoup-requests-html) article. diff --git a/htmlproofer/plugin.py b/htmlproofer/plugin.py index 77fbbd2..59d62fa 100644 --- a/htmlproofer/plugin.py +++ b/htmlproofer/plugin.py @@ -98,14 +98,13 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None: if not self.config['enabled']: return - use_directory_urls = config.data["use_directory_urls"] - # Optimization: At this point, we have all the files, so we can create # a dictionary for faster lookups. Prior to this point, files are # still being updated so creating a dictionary before now would result # in incorrect values appearing as the key. opt_files = {} opt_files.update({os.path.normpath(file.url): file for file in self.files}) + opt_files.update({os.path.normpath(file.src_uri): file for file in self.files}) # Optimization: only parse links and headings # li, sup are used for footnotes @@ -123,7 +122,7 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None: if self.config['warn_on_ignored_urls']: log_warning(f"ignoring URL {url} from {page.file.src_path}") else: - url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files, use_directory_urls) + url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files) if self.bad_url(url_status) and self.is_error(self.config, url, url_status): self.report_invalid_url(url, url_status, page.file.src_path) @@ -161,8 +160,7 @@ def get_url_status( url: str, src_path: str, all_element_ids: Set[str], - files: Dict[str, File], - use_directory_urls: bool + files: Dict[str, File] ) -> int: if any(pat.match(url) for pat in LOCAL_PATTERNS): return 0 @@ -174,18 +172,13 @@ def get_url_status( return 0 if fragment and not path: return 0 if url[1:] in all_element_ids else 404 - elif not use_directory_urls: - # use_directory_urls = True injects too many challenges for locating the correct target - # Markdown file, so disable target anchor validation in this case. Examples include: - # ../..#BAD_ANCHOR style links to index.html and extra ../ inserted into relative - # links. + else: is_valid = self.is_url_target_valid(url, src_path, files) url_status = 404 if not is_valid and self.is_error(self.config, url, url_status): log_warning(f"Unable to locate source file for: {url}") return url_status return 0 - return 0 @staticmethod def is_url_target_valid(url: str, src_path: str, files: Dict[str, File]) -> bool: @@ -225,9 +218,14 @@ def find_source_file(url: str, src_path: str, files: Dict[str, File]) -> Optiona # Convert root/site paths search_path = os.path.normpath(url[1:]) else: - # Handle relative links by concatenating the source dir with the destination path - src_dir = urllib.parse.quote(str(pathlib.Path(src_path).parent), safe='/\\') - search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url))) + # Handle relative links by looking up the destination url for the + # src_path and getting the parent directory. + try: + dest_uri = files[src_path].dest_uri + src_dir = urllib.parse.quote(str(pathlib.Path(dest_uri).parent), safe='/\\') + search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url))) + except KeyError: + return None try: return files[search_path] diff --git a/tests/integration/docs/nested/page1.md b/tests/integration/docs/nested/page1.md index 338df37..b29e14e 100644 --- a/tests/integration/docs/nested/page1.md +++ b/tests/integration/docs/nested/page1.md @@ -8,7 +8,7 @@ you can either link to a local file without or with an anchor. * [Main Page](../index.md) * [Sub-Page](./page2.md) *
- + ![Image](../assets/hello-world.drawio.svg)
@@ -27,6 +27,6 @@ But allows valid anchors such as ## Image Link absolute/relative -![test](../assets/hello-world.drawio.svg) +![test](../assets/hello-world.drawio.svg) ![test](/assets/hello-world.drawio.svg) diff --git a/tests/unit/test_plugin.py b/tests/unit/test_plugin.py index a17f2a6..2645779 100644 --- a/tests/unit/test_plugin.py +++ b/tests/unit/test_plugin.py @@ -115,7 +115,7 @@ def test_on_post_page__plugin_disabled(): ), ) def test_get_url_status__ignore_local_servers(plugin, empty_files, url): - assert plugin.get_url_status(url, 'src/path.md', set(), empty_files, False) == 0 + assert plugin.get_url_status(url, 'src/path.md', set(), empty_files) == 0 @pytest.mark.parametrize( @@ -126,7 +126,7 @@ def test_get_url_status(validate_external: bool): plugin.load_config({'validate_external_urls': validate_external}) def get_url(): - return plugin.get_url_status('https://google.com', 'src/path.md', set(), empty_files, False) + return plugin.get_url_status('https://google.com', 'src/path.md', set(), empty_files) if validate_external: with pytest.raises(Exception): @@ -172,9 +172,9 @@ def test_contains_anchor(plugin, markdown, anchor, expected): def test_get_url_status__same_page_anchor(plugin, empty_files): - assert plugin.get_url_status('#ref', 'src/path.md', {'ref'}, empty_files, False) == 0 - assert plugin.get_url_status('##ref', 'src/path.md', {'ref'}, empty_files, False) == 404 - assert plugin.get_url_status('#ref', 'src/path.md', set(), empty_files, False) == 404 + assert plugin.get_url_status('#ref', 'src/path.md', {'ref'}, empty_files) == 0 + assert plugin.get_url_status('##ref', 'src/path.md', {'ref'}, empty_files) == 404 + assert plugin.get_url_status('#ref', 'src/path.md', set(), empty_files) == 404 @pytest.mark.parametrize( @@ -195,7 +195,7 @@ def test_get_url_status__external(plugin, empty_files, url): with patch.object(HtmlProoferPlugin, "get_external_url") as mock_get_ext_url: mock_get_ext_url.return_value = expected_status - status = plugin.get_url_status(url, src_path, set(), empty_files, False) + status = plugin.get_url_status(url, src_path, set(), empty_files) mock_get_ext_url.assert_called_once_with(url, scheme, src_path) assert status == expected_status @@ -241,39 +241,64 @@ def test_get_url_status__local_page(plugin): index_page = Mock(spec=Page, markdown='# Heading\nContent') page1_page = Mock(spec=Page, markdown='# Page One\n## Sub Heading\nContent') special_char_page = Mock(spec=Page, markdown='# Heading éèà\n## Sub Heading éèà\nContent') - files = {os.path.normpath(file.url): file for file in Files([ - Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page), - Mock(spec=File, src_path='page1.md', dest_path='page1.html', url='page1.html', page=page1_page), + mock_files = Files([ + Mock(spec=File, src_path='index.md', dest_path='index.html', + dest_uri='index.html', url='index.html', src_uri='index.md', + page=index_page), + Mock(spec=File, src_path='page1.md', dest_path='page1.html', + dest_uri='page1.html', url='page1.html', src_uri='page1.md', + page=page1_page), Mock(spec=File, src_path='Dir éèà/éèà.md', dest_path='Dir éèà/éèà.html', - url='Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html', page=special_char_page), - ])} + dest_uri='Dir éèà/éèà.html', + url='Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html', + src_uri='Dir éèà/éèà.md', page=special_char_page), + Mock(spec=File, src_path='Dir éèà/page1.md', dest_path='Dir éèà/page1.html', + dest_uri='Dir éèà/page1.html', + url='Dir%20%C3%A9%C3%A8%C3%A0/page1.html', + src_uri='Dir%20%C3%A9%C3%A8%C3%A0/page1.md', + page=special_char_page), + ]) + files = {} + files.update({os.path.normpath(file.url): file for file in mock_files}) + files.update({file.src_uri: file for file in mock_files}) - assert plugin.get_url_status('index.html', 'page1.md', set(), files, False) == 0 - assert plugin.get_url_status('index.html#heading', 'page1.md', set(), files, False) == 0 - assert plugin.get_url_status('index.html#bad-heading', 'page1.md', set(), files, False) == 404 + assert plugin.get_url_status('index.html', 'page1.md', set(), files) == 0 + assert plugin.get_url_status('index.html#heading', 'page1.md', set(), files) == 0 + assert plugin.get_url_status('index.html#bad-heading', 'page1.md', set(), files) == 404 - assert plugin.get_url_status('page1.html', 'page1.md', set(), files, False) == 0 - assert plugin.get_url_status('page1.html#sub-heading', 'page1.md', set(), files, False) == 0 - assert plugin.get_url_status('page1.html#heading', 'page1.md', set(), files, False) == 404 + assert plugin.get_url_status('page1.html', 'page1.md', set(), files) == 0 + assert plugin.get_url_status('page1.html#sub-heading', 'page1.md', set(), files) == 0 + assert plugin.get_url_status('page1.html#heading', 'page1.md', set(), files) == 404 - assert plugin.get_url_status('page2.html', 'page1.md', set(), files, False) == 404 - assert plugin.get_url_status('page2.html#heading', 'page1.md', set(), files, False) == 404 + assert plugin.get_url_status('page2.html', 'page1.md', set(), files) == 404 + assert plugin.get_url_status('page2.html#heading', 'page1.md', set(), files) == 404 - assert plugin.get_url_status('Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html#sub-heading-eea', - 'page1.md', set(), files, False) == 0 - assert plugin.get_url_status('%C3%A9%C3%A8%C3%A0.html#sub-heading-eea', 'Dir éèà/page3.md', set(), files, False) == 0 + assert plugin.get_url_status( + 'Dir%20%C3%A9%C3%A8%C3%A0/%C3%A9%C3%A8%C3%A0.html#sub-heading-eea', + 'page1.md', set(), files) == 0 + assert plugin.get_url_status( + '%C3%A9%C3%A8%C3%A0.html#sub-heading-eea', + 'Dir%20%C3%A9%C3%A8%C3%A0/page1.md', + set(), files) == 0 def test_get_url_status__non_markdown_page(plugin): index_page = Mock(spec=Page, markdown='# Heading\nContent') - files = {os.path.normpath(file.url): file for file in Files([ - Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page), - Mock(spec=File, src_path='drawing.svg', dest_path='drawing.svg', url='drawing.svg', page=None), - ])} + mock_files = Files([ + Mock(spec=File, src_path='index.md', dest_path='index.html', + dest_uri='index.html', url='index.html', src_uri='index.md', + page=index_page), + Mock(spec=File, src_path='drawing.svg', dest_path='drawing.svg', + dest_uri='index.html', url='drawing.svg', src_uri='drawing.svg', + page=None), + ]) + files = {} + files.update({os.path.normpath(file.url): file for file in mock_files}) + files.update({file.src_uri: file for file in mock_files}) - assert plugin.get_url_status('drawing.svg', 'index.md', set(), files, False) == 0 - assert plugin.get_url_status('/drawing.svg', 'index.md', set(), files, False) == 0 - assert plugin.get_url_status('not-existing.svg', 'index.md', set(), files, False) == 404 + assert plugin.get_url_status('drawing.svg', 'index.md', set(), files) == 0 + assert plugin.get_url_status('/drawing.svg', 'index.md', set(), files) == 0 + assert plugin.get_url_status('not-existing.svg', 'index.md', set(), files) == 404 def test_get_url_status__local_page_nested(plugin): @@ -282,48 +307,67 @@ def test_get_url_status__local_page_nested(plugin): nested1_sibling_page = Mock(spec=Page, markdown='# Nested Sibling') nested2_page = Mock(spec=Page, markdown='# Nested\n## Nested Two\nContent') nested2_sibling_page = Mock(spec=Page, markdown='# Nested Sibling') - files = {os.path.normpath(file.url): file for file in Files([ - Mock(spec=File, src_path='index.md', dest_path='index.html', url='index.html', page=index_page), + mock_files = Files([ + Mock( + spec=File, + src_path='index.md', + dest_path='index.html', + dest_uri='index.html', + url='index.html', + src_uri='index.md', + page=index_page), Mock( spec=File, src_path='foo/bar/nested.md', dest_path='foo/bar/nested.html', + dest_uri='foo/bar/nested.html', url='foo/bar/nested.html', + src_uri='foo/bar/nested.md', page=nested1_page ), Mock( spec=File, src_path='foo/bar/sibling.md', dest_path='foo/bar/sibling.html', + dest_uri='foo/bar/sibling.html', url='foo/bar/sibling.html', + src_uri='foo/bar/sibling.md', page=nested1_sibling_page ), Mock( spec=File, src_path='foo/baz/nested.md', dest_path='foo/baz/nested.html', + dest_uri='foo/baz/nested.html', url='foo/baz/nested.html', + src_uri='foo/baz/nested.md', page=nested2_page ), Mock( spec=File, src_path='foo/baz/sibling.md', dest_path='foo/baz/sibling.html', + dest_uri='foo/baz/sibling.html', url='foo/baz/sibling.html', + src_uri='foo/baz/sibling.md', page=nested2_sibling_page ), - ])} + ]) + + files = {} + files.update({os.path.normpath(file.url): file for file in mock_files}) + files.update({file.src_uri: file for file in mock_files}) - assert plugin.get_url_status('nested.html#nested-one', 'foo/bar/sibling.md', set(), files, False) == 0 - assert plugin.get_url_status('nested.html#nested-two', 'foo/bar/sibling.md', set(), files, False) == 404 + assert plugin.get_url_status('nested.html#nested-one', 'foo/bar/sibling.md', set(), files) == 0 + assert plugin.get_url_status('nested.html#nested-two', 'foo/bar/sibling.md', set(), files) == 404 - assert plugin.get_url_status('nested.html#nested-two', 'foo/baz/sibling.md', set(), files, False) == 0 - assert plugin.get_url_status('nested.html#nested-one', 'foo/baz/sibling.md', set(), files, False) == 404 + assert plugin.get_url_status('nested.html#nested-two', 'foo/baz/sibling.md', set(), files) == 0 + assert plugin.get_url_status('nested.html#nested-one', 'foo/baz/sibling.md', set(), files) == 404 - assert plugin.get_url_status('foo/bar/nested.html#nested-one', 'index.md', set(), files, False) == 0 - assert plugin.get_url_status('foo/baz/nested.html#nested-two', 'index.md', set(), files, False) == 0 + assert plugin.get_url_status('foo/bar/nested.html#nested-one', 'index.md', set(), files) == 0 + assert plugin.get_url_status('foo/baz/nested.html#nested-two', 'index.md', set(), files) == 0 - assert plugin.get_url_status('/index.html', 'foo/baz/sibling.md', set(), files, False) == 0 + assert plugin.get_url_status('/index.html', 'foo/baz/sibling.md', set(), files) == 0 @patch.object(htmlproofer.plugin, "log_warning", autospec=True) @@ -334,7 +378,7 @@ def test_get_url_status__excluded_non_existing_relative_url__no_warning(log_warn files = {} plugin.config['raise_error_excludes'][url_status] = [url] - status = plugin.get_url_status(url, src_path, set(), files, False) + status = plugin.get_url_status(url, src_path, set(), files) log_warning_mock.assert_not_called() assert 0 == status @@ -349,12 +393,12 @@ def test_get_url_status__excluded_existing_relative_url__no_warning(log_warning_ existing_page = Mock(spec=Page, markdown='') files = { os.path.normpath(file.url): file for file in Files([ - Mock(spec=File, src_path=src_path, dest_path=url, url=url, page=existing_page) + Mock(spec=File, src_path=src_path, dest_path=url, dest_uri=url, url=url, src_uri=src_path, page=existing_page) ]) } plugin.config['raise_error_excludes'][url_status] = [url] - status = plugin.get_url_status(url, src_path, set(), files, False) + status = plugin.get_url_status(url, src_path, set(), files) log_warning_mock.assert_not_called() assert 0 == status @@ -367,7 +411,7 @@ def test_get_url_status__non_existing_relative_url__warning_and_404(log_warning_ src_path = "index.md" files = {} - status = plugin.get_url_status(url, src_path, set(), files, False) + status = plugin.get_url_status(url, src_path, set(), files) log_warning_mock.assert_called_once() assert expected_url_status == status