Skip to content

Commit

Permalink
Added support for including/excluding tags by XPath
Browse files Browse the repository at this point in the history
  • Loading branch information
mikegoatly committed Aug 12, 2020
1 parent b6e8af6 commit 3aa0545
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 19 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ Options:
--image-output|-i <IMAGE OUTPUT LOCATION>
If no image output location is specified then they will be written to the same folder as the markdown file.
--include-tags|--it|-t <COMMA SEPARATED TAG LIST>
--include-tags|--it|-t <TAG|XPATH,[TAG|XPATH[,...]]>
If unspecified the entire body tag will be processed, otherwise only text contained in the specified tags will be processed.
--exclude-tags|--et|-e <COMMA SEPARATED TAG LIST>
--exclude-tags|--et|-e <TAG|XPATH,[TAG|XPATH[,...]]>
Allows for specific tags to be ignored.
--image-path-prefix|--ipp <IMAGE PATH PREFIX>
Expand Down Expand Up @@ -142,8 +142,8 @@ In `ConversionOptions` you can specify:
images from a different location, relative or absolute.
- `DefaultCodeLanguage`: The default code language to apply to code blocks mapped from `pre` tags.
The default is `csharp`.
- `IncludeTags`: The set of tags to include in the conversion process. If this is empty then all elements will processed.
- `ExcludeTags`: The set of tags to exclude from the conversion process. You can use this if there are certain parts of
- `IncludeTags`: The set of tags or XPaths for tags to include in the conversion process. If this is empty then all elements will be processed.
- `ExcludeTags`: The set of tags or XPaths for tags to exclude from the conversion process. You can use this if there are certain parts of
a document you don't want translating to markdown, e.g. aside, nav, etc.
- `CodeLanguageClassMap`: A dictionary mapping between class names that can appear on `pre` tags and the language they map to. E.g. you might map the class name "sh_csharp" to "csharp" and "sh_powershell" to "powershell".
- `FrontMatter`: Configuration for how Front Matter metadata should be emitted into a converted document.
Expand Down
21 changes: 15 additions & 6 deletions src/Html2md.Core/ConversionState.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
namespace Html2md
using HtmlAgilityPack;
using System.Collections.Generic;

namespace Html2md
{
internal struct ConversionState
{
Expand All @@ -9,13 +12,9 @@ private ConversionState(ConversionState previous)
this.ListItemPrefix = previous.ListItemPrefix;
this.EmitMarkDownStyles = previous.EmitMarkDownStyles;
this.LinePrefix = previous.LinePrefix;
this.NodesToExclude = previous.NodesToExclude;
}

public static ConversionState InitialState { get; } = new ConversionState
{
EmitMarkDownStyles = true
};

public string? ListItemPrefix { get; private set; }

public bool RenderingEnabled { get; private set; }
Expand All @@ -25,6 +24,16 @@ private ConversionState(ConversionState previous)
public int ListDepth { get; private set; }

public string? LinePrefix { get; private set; }
public ISet<HtmlNode> NodesToExclude { get; private set; }

/// <summary>
/// Creates the state used to start processing a node.
/// </summary>
/// <param name="nodesToExclude">The set of nodes stored on the new state as <see cref="NodesToExclude"/>.</param>
/// <returns>
/// A new <see cref="ConversionState"/> with <see cref="EmitMarkDownStyles"/> switched on; only these two
/// members are assigned, everything else keeps its default value.
/// </returns>
public static ConversionState InitialState(ISet<HtmlNode> nodesToExclude)
{
    var initial = new ConversionState();
    initial.NodesToExclude = nodesToExclude;
    initial.EmitMarkDownStyles = true;
    return initial;
}

public ConversionState WithRenderingEnabled()
{
Expand Down
2 changes: 1 addition & 1 deletion src/Html2md.Core/Html2md.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
<PackageLicenseFile>LICENSE</PackageLicenseFile>
<PackageProjectUrl>https://github.com/mikegoatly/html2md</PackageProjectUrl>
<PackageTags>convert-html convert-markdown html markdown conversion</PackageTags>
<Version>2.0.0</Version>
<Version>2.1.0</Version>
<RepositoryUrl>https://github.com/mikegoatly/html2md</RepositoryUrl>
<PackageReleaseNotes>Added support for extracting Front Matter metadata</PackageReleaseNotes>
</PropertyGroup>
Expand Down
6 changes: 4 additions & 2 deletions src/Html2md.Core/IConversionOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@ public interface IConversionOptions
IDictionary<string, string> CodeLanguageClassMap { get; }

/// <summary>
/// Gets the set of tags to include in the conversion process. If this is empty then all elements will processed.
/// Gets the set of tags to include in the conversion process. If this is empty then all elements will be processed. These
/// can be tag names or an XPath query to tags to include.
/// </summary>
ISet<string> IncludeTags { get; }

/// <summary>
/// Gets the set of tags to exclude from the conversion process. You can use this if there are certain parts of
/// a document you don't want translating to markdown, e.g. aside, nav, etc.
/// a document you don't want translating to markdown, e.g. aside, nav, etc. These
/// can be tag names or an XPath query to tags to exclude.
/// </summary>
ISet<string> ExcludeTags { get; }

Expand Down
34 changes: 31 additions & 3 deletions src/Html2md.Core/MarkdownConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ public class MarkdownConverter
private readonly IConversionOptions options;
private readonly ILogger logger;
private readonly HttpClient httpClient;
private readonly List<string> includeXPaths;
private readonly HashSet<string> includeTags;
private readonly List<string> excludeXPaths;
private readonly FrontMatterExtractor frontMatterExtractor = new FrontMatterExtractor();
private readonly HashSet<string> excludeTags;

public MarkdownConverter(IConversionOptions options, ILogger? logger = null)
: this(options, null, logger)
Expand All @@ -32,6 +36,12 @@ public MarkdownConverter(IConversionOptions options, HttpClient? httpClient = nu
this.options = options;
this.logger = logger ?? NullLogger.Instance;
this.httpClient = httpClient ?? new HttpClient();

// Split each configured set in two: entries starting with "/" are treated as XPath queries,
// the remaining entries are treated as plain tag names.
this.includeXPaths = this.options.IncludeTags.Where(t => t.StartsWith("/")).ToList();
this.includeTags = this.options.IncludeTags.Except(this.includeXPaths).ToHashSet();

this.excludeXPaths = this.options.ExcludeTags.Where(t => t.StartsWith("/")).ToList();

// The exclude tag names are the exclude entries minus the *exclude* XPaths. Subtracting the include
// XPaths here (as this line previously did) left the exclude XPath strings in the tag-name set.
this.excludeTags = this.options.ExcludeTags.Except(this.excludeXPaths).ToHashSet();
}

public async Task<ConvertionResult> ConvertAsync(IEnumerable<Uri> urls)
Expand Down Expand Up @@ -78,7 +88,25 @@ private async Task<ConvertedDocument> ConvertAsync(Uri pageUri, StringBuilder bu
}

this.logger.LogDebug("Processing page content");
this.ProcessNode(pageUri, doc.DocumentNode, builder, imageCollector, ConversionState.InitialState);

this.logger.LogTrace("Building list of explicitly included elements");

// Collect the nodes matched by each include XPath, in XPath order.
// SelectNodes can return null, so guard before adding.
var nodesToProcess = new List<HtmlNode>();
foreach (var includePath in this.includeXPaths)
{
    var includedMatches = doc.DocumentNode.SelectNodes(includePath);
    if (includedMatches != null)
    {
        nodesToProcess.AddRange(includedMatches);
    }
}

// Collect the nodes matched by each exclude XPath into a set for membership checks during processing.
var nodesToExclude = new HashSet<HtmlNode>();
foreach (var excludePath in this.excludeXPaths)
{
    var excludedMatches = doc.DocumentNode.SelectNodes(excludePath);
    if (excludedMatches != null)
    {
        nodesToExclude.UnionWith(excludedMatches);
    }
}

// With no explicitly included nodes, fall back to processing the whole document.
if (nodesToProcess.Count == 0)
{
    nodesToProcess.Add(doc.DocumentNode);
}

for (var position = 0; position < nodesToProcess.Count; position++)
{
    this.ProcessNode(pageUri, nodesToProcess[position], builder, imageCollector, ConversionState.InitialState(nodesToExclude));

    // Separate the output of consecutive nodes with a blank line; nothing is appended after the last one.
    var isLastNode = position == nodesToProcess.Count - 1;
    if (!isLastNode)
    {
        builder.AppendLine().AppendLine();
    }
}

return new ConvertedDocument(pageUri, this.RemoveRedundantWhiteSpace(builder.ToString()));
}
Expand Down Expand Up @@ -109,7 +137,7 @@ private void ProcessNode(

case HtmlNodeType.Document:
case HtmlNodeType.Element:
if (!this.options.ExcludeTags.Contains(node.Name))
if (!this.excludeTags.Contains(node.Name) && !state.NodesToExclude.Contains(node))
{
var emitNewLineAfterChildren = false;
if (this.IsIncludedTag(node.Name))
Expand Down Expand Up @@ -217,7 +245,7 @@ private void ProcessNode(

private bool IsIncludedTag(string tagName)
{
return this.options.IncludeTags.Count == 0 || this.options.IncludeTags.Contains(tagName);
return this.includeTags.Count == 0 || this.includeTags.Contains(tagName);
}

private void ProcessChildNodes(
Expand Down
2 changes: 1 addition & 1 deletion src/Html2md/Html2md.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<Copyright>Copyright Mike Goatly</Copyright>
<PackageLicenseFile>LICENSE</PackageLicenseFile>
<PackageProjectUrl>https://github.com/mikegoatly/html2md</PackageProjectUrl>
<Version>2.0.0</Version>
<Version>2.1.0</Version>
<PackageTags>convert-html convert-markdown html markdown conversion</PackageTags>
<RepositoryUrl>https://github.com/mikegoatly/html2md</RepositoryUrl>
<PackageReleaseNotes>Added support for extracting Front Matter metadata</PackageReleaseNotes>
Expand Down
4 changes: 2 additions & 2 deletions src/Html2md/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,10 @@ private static void WriteHelp()
Console.WriteLine("--image-output|-i <IMAGE OUTPUT LOCATION>");
Console.WriteLine("If no image output location is specified then they will be written to the same folder as the markdown file.");
Console.WriteLine();
Console.WriteLine("--include-tags|--it|-t <COMMA SEPARATED TAG LIST>");
Console.WriteLine("--include-tags|--it|-t <TAG|XPATH,[TAG|XPATH[,...]]>");
Console.WriteLine("If unspecified the entire body tag will be processed, otherwise only text contained in the specified tags will be processed.");
Console.WriteLine();
Console.WriteLine("--exclude-tags|--et|-e <COMMA SEPARATED TAG LIST>");
Console.WriteLine("--exclude-tags|--et|-e <TAG|XPATH,[TAG|XPATH[,...]]>");
Console.WriteLine("Allows for specific tags to be ignored.");
Console.WriteLine();
Console.WriteLine("--image-path-prefix|--ipp <IMAGE PATH PREFIX>");
Expand Down
31 changes: 31 additions & 0 deletions test/Html2md.Tests.Unit/MarkdownConverterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,37 @@ await TestConverter(
});
}

[Fact]
public async Task ShouldOnlyProcessIncludedNodesFromXPath()
{
    // Two include XPaths are supplied; the text of every element they select is expected in the output.
    const string html = "<body><article>Should appear</article><article>Should also appear</article><p>This too</p></body>";
    const string expectedMarkdown = @"Should appear
Should also appear
This too
";

    var conversionOptions = new ConversionOptions
    {
        IncludeTags = { "//article", "//p" }
    };

    await TestConverter(html, expectedMarkdown, options: conversionOptions);
}

[Fact]
public async Task ShouldExcludeSpecificNodesIndicatedByXPaths()
{
    // The article is included by XPath, while the nested comments div is excluded by XPath,
    // so only the article's own text is expected in the output.
    const string html = "<body><article>Should appear <div class='comments'>Should be ignored</div></article></body>";
    const string expectedMarkdown = @"Should appear ";

    var conversionOptions = new ConversionOptions
    {
        IncludeTags = { "//article" },
        ExcludeTags = { "//div[@class='comments']" }
    };

    await TestConverter(html, expectedMarkdown, options: conversionOptions);
}

[Fact]
public async Task ShouldNotProcessExcludedTags()
{
Expand Down

0 comments on commit 3aa0545

Please sign in to comment.