Added support for including/excluding tags by XPath

mikegoatly · Aug 12, 2020 · 3aa0545 · 3aa0545
1 parent b6e8af6
commit 3aa0545
Show file tree

Hide file tree

Showing 8 changed files with 89 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -26,10 +26,10 @@ Options:
 --image-output|-i <IMAGE OUTPUT LOCATION>
 If no image output location is specified then they will be written to the same folder as the markdown file.
 
---include-tags|--it|-t <COMMA SEPARATED TAG LIST>
+--include-tags|--it|-t <TAG|XPATH,[TAG|XPATH[,...]]>
 If unspecified the entire body tag will be processed, otherwise only text contained in the specified tags will be processed.
 
---exclude-tags|--et|-e <COMMA SEPARATED TAG LIST>
+--exclude-tags|--et|-e <TAG|XPATH,[TAG|XPATH[,...]]>
 Allows for specific tags to be ignored.
 
 --image-path-prefix|--ipp <IMAGE PATH PREFIX>
@@ -142,8 +142,8 @@ In `ConversionOptions` you can specify:
 images from a different location, relative or absolute.
 - `DefaultCodeLanguage`: The default code language to apply to code blocks mapped from `pre` tags.
 The default is `csharp`.
-- `IncludeTags`: The set of tags to include in the conversion process. If this is empty then all elements will processed.
-- `ExcludeTags`: The set of tags to exclude from the conversion process. You can use this if there are certain parts of
+- `IncludeTags`: The set of tags or XPaths for tags to include in the conversion process. If this is empty then all elements will be processed.
+- `ExcludeTags`: The set of tags or XPaths for tags to exclude from the conversion process. You can use this if there are certain parts of
 a document you don't want translating to markdown, e.g. aside, nav, etc.
 - `CodeLanguageClassMap`: A dictionary mapping between class names that can appear on `pre` tags and the language they map to.E.g. you might map the class name "sh_csharp" to "csharp" and "sh_powershell" to "powershell".
 - `FrontMatter`: Configuration for how Front Matter metadata should be emitted into a converted document.

diff --git a/src/Html2md.Core/ConversionState.cs b/src/Html2md.Core/ConversionState.cs
@@ -1,4 +1,7 @@
-namespace Html2md
+using HtmlAgilityPack;
+using System.Collections.Generic;
+
+namespace Html2md
 {
     internal struct ConversionState
     {
@@ -9,13 +12,9 @@ private ConversionState(ConversionState previous)
             this.ListItemPrefix = previous.ListItemPrefix;
             this.EmitMarkDownStyles = previous.EmitMarkDownStyles;
             this.LinePrefix = previous.LinePrefix;
+            this.NodesToExclude = previous.NodesToExclude;
         }
 
-        public static ConversionState InitialState { get; } = new ConversionState
-        {
-            EmitMarkDownStyles = true
-        };
-
         public string? ListItemPrefix { get; private set; }
 
         public bool RenderingEnabled { get; private set; }
@@ -25,6 +24,16 @@ private ConversionState(ConversionState previous)
         public int ListDepth { get; private set; }
 
         public string? LinePrefix { get; private set; }
+        public ISet<HtmlNode> NodesToExclude { get; private set; }
+
+        public static ConversionState InitialState(ISet<HtmlNode> nodesToExclude)
+        {
+            return new ConversionState
+            {
+                NodesToExclude = nodesToExclude,
+                EmitMarkDownStyles = true
+            };
+        }
 
         public ConversionState WithRenderingEnabled()
         {

diff --git a/src/Html2md.Core/Html2md.Core.csproj b/src/Html2md.Core/Html2md.Core.csproj
@@ -13,7 +13,7 @@
     <PackageLicenseFile>LICENSE</PackageLicenseFile>
     <PackageProjectUrl>https://github.com/mikegoatly/html2md</PackageProjectUrl>
     <PackageTags>convert-html convert-markdown html markdown conversion</PackageTags>
-    <Version>2.0.0</Version>
+    <Version>2.1.0</Version>
     <RepositoryUrl>https://github.com/mikegoatly/html2md</RepositoryUrl>
     <PackageReleaseNotes>Added support for extracting Front Matter metadata</PackageReleaseNotes>
   </PropertyGroup>

diff --git a/src/Html2md.Core/IConversionOptions.cs b/src/Html2md.Core/IConversionOptions.cs
@@ -25,13 +25,15 @@ public interface IConversionOptions
         IDictionary<string, string> CodeLanguageClassMap { get; }
 
         /// <summary>
-        /// Gets the set of tags to include in the conversion process. If this is empty then all elements will processed.
+        /// Gets the set of tags to include in the conversion process. If this is empty then all elements will be processed. These
+        /// can be tag names or an XPath query to tags to include.
         /// </summary>
         ISet<string> IncludeTags { get; }
 
         /// <summary>
         /// Gets the set of tags to exclude from the conversion process. You can use this if there are certain parts of
-        /// a document you don't want translating to markdown, e.g. aside, nav, etc.
+        /// a document you don't want translating to markdown, e.g. aside, nav, etc. These
+        /// can be tag names or an XPath query to tags to exclude.
         /// </summary>
         ISet<string> ExcludeTags { get; }
 

diff --git a/src/Html2md.Core/MarkdownConverter.cs b/src/Html2md.Core/MarkdownConverter.cs
@@ -20,7 +20,11 @@ public class MarkdownConverter
         private readonly IConversionOptions options;
         private readonly ILogger logger;
         private readonly HttpClient httpClient;
+        private readonly List<string> includeXPaths;
+        private readonly HashSet<string> includeTags;
+        private readonly List<string> excludeXPaths;
         private readonly FrontMatterExtractor frontMatterExtractor = new FrontMatterExtractor();
+        private readonly HashSet<string> excludeTags;
 
         public MarkdownConverter(IConversionOptions options, ILogger? logger = null)
             : this(options, null, logger)
@@ -32,6 +36,12 @@ public MarkdownConverter(IConversionOptions options, HttpClient? httpClient = nu
             this.options = options;
             this.logger = logger ?? NullLogger.Instance;
             this.httpClient = httpClient ?? new HttpClient();
+
+            // Entries that begin with "/" are treated as XPath queries; every other entry is a plain tag name.
+            this.includeXPaths = this.options.IncludeTags.Where(t => t.StartsWith("/", StringComparison.Ordinal)).ToList();
+            this.includeTags = this.options.IncludeTags.Except(this.includeXPaths).ToHashSet();
+
+            this.excludeXPaths = this.options.ExcludeTags.Where(t => t.StartsWith("/", StringComparison.Ordinal)).ToList();
+            // Subtract the *exclude* XPaths (not the include ones) so the tag-name set holds only plain tag names.
+            this.excludeTags = this.options.ExcludeTags.Except(this.excludeXPaths).ToHashSet();
         }
 
         public async Task<ConvertionResult> ConvertAsync(IEnumerable<Uri> urls)
@@ -78,7 +88,25 @@ private async Task<ConvertedDocument> ConvertAsync(Uri pageUri, StringBuilder bu
             }
 
             this.logger.LogDebug("Processing page content");
-            this.ProcessNode(pageUri, doc.DocumentNode, builder, imageCollector, ConversionState.InitialState);
+
+            this.logger.LogTrace("Building list of explicitly included elements");
+            var nodesToProcess = this.includeXPaths.SelectMany(p => doc.DocumentNode.SelectNodes(p) ?? Enumerable.Empty<HtmlNode>()).ToList();
+            var nodesToExclude = this.excludeXPaths.SelectMany(p => doc.DocumentNode.SelectNodes(p) ?? Enumerable.Empty<HtmlNode>()).ToHashSet();
+
+            if (nodesToProcess.Count == 0)
+            {
+                nodesToProcess.Add(doc.DocumentNode);
+            }
+
+            var index = 0;
+            foreach (var node in nodesToProcess)
+            {
+                this.ProcessNode(pageUri, node, builder, imageCollector, ConversionState.InitialState(nodesToExclude));
+                if (++index != nodesToProcess.Count)
+                {
+                    builder.AppendLine().AppendLine();
+                }
+            }
 
             return new ConvertedDocument(pageUri, this.RemoveRedundantWhiteSpace(builder.ToString()));
         }
@@ -109,7 +137,7 @@ private void ProcessNode(
 
                 case HtmlNodeType.Document:
                 case HtmlNodeType.Element:
-                    if (!this.options.ExcludeTags.Contains(node.Name))
+                    if (!this.excludeTags.Contains(node.Name) && !state.NodesToExclude.Contains(node))
                     {
                         var emitNewLineAfterChildren = false;
                         if (this.IsIncludedTag(node.Name))
@@ -217,7 +245,7 @@ private void ProcessNode(
 
         private bool IsIncludedTag(string tagName)
         {
-            return this.options.IncludeTags.Count == 0 || this.options.IncludeTags.Contains(tagName);
+            return this.includeTags.Count == 0 || this.includeTags.Contains(tagName);
         }
 
         private void ProcessChildNodes(

diff --git a/src/Html2md/Html2md.csproj b/src/Html2md/Html2md.csproj
@@ -15,7 +15,7 @@
     <Copyright>Copyright Mike Goatly</Copyright>
     <PackageLicenseFile>LICENSE</PackageLicenseFile>
     <PackageProjectUrl>https://github.com/mikegoatly/html2md</PackageProjectUrl>
-    <Version>2.0.0</Version>
+    <Version>2.1.0</Version>
     <PackageTags>convert-html convert-markdown html markdown conversion</PackageTags>
     <RepositoryUrl>https://github.com/mikegoatly/html2md</RepositoryUrl>
     <PackageReleaseNotes>Added support for extracting Front Matter metadata</PackageReleaseNotes>

diff --git a/src/Html2md/Program.cs b/src/Html2md/Program.cs
@@ -73,10 +73,10 @@ private static void WriteHelp()
             Console.WriteLine("--image-output|-i <IMAGE OUTPUT LOCATION>");
             Console.WriteLine("If no image output location is specified then they will be written to the same folder as the markdown file.");
             Console.WriteLine();
-            Console.WriteLine("--include-tags|--it|-t <COMMA SEPARATED TAG LIST>");
+            Console.WriteLine("--include-tags|--it|-t <TAG|XPATH,[TAG|XPATH[,...]]>");
             Console.WriteLine("If unspecified the entire body tag will be processed, otherwise only text contained in the specified tags will be processed.");
             Console.WriteLine();
-            Console.WriteLine("--exclude-tags|--et|-e <COMMA SEPARATED TAG LIST>");
+            Console.WriteLine("--exclude-tags|--et|-e <TAG|XPATH,[TAG|XPATH[,...]]>");
             Console.WriteLine("Allows for specific tags to be ignored.");
             Console.WriteLine();
             Console.WriteLine("--image-path-prefix|--ipp <IMAGE PATH PREFIX>");

diff --git a/test/Html2md.Tests.Unit/MarkdownConverterTests.cs b/test/Html2md.Tests.Unit/MarkdownConverterTests.cs
@@ -136,6 +136,37 @@ await TestConverter(
                 });
         }
 
+        [Fact]
+        public async Task ShouldOnlyProcessIncludedNodesFromXPath()
+        {
+            await TestConverter(
+                "<body><article>Should appear</article><article>Should also appear</article><p>This too</p></body>",
+                @"Should appear
+
+Should also appear
+
+This too
+
+",
+                options: new ConversionOptions
+                {
+                    IncludeTags = { "//article", "//p" }
+                });
+        }
+
+        [Fact]
+        public async Task ShouldExcludeSpecificNodesIndicatedByXPaths()
+        {
+            await TestConverter(
+                "<body><article>Should appear <div class='comments'>Should be ignored</div></article></body>",
+                @"Should appear ",
+                options: new ConversionOptions
+                {
+                    IncludeTags = { "//article" },
+                    ExcludeTags = { "//div[@class='comments']"}
+                });
+        }
+
         [Fact]
         public async Task ShouldNotProcessExcludedTags()
         {