From 8d4b04297b530bda5e2f037a6eb9cd501419da1e Mon Sep 17 00:00:00 2001
From: dankl
Date: Sun, 12 Aug 2018 20:40:00 +0200
Subject: [PATCH] Implemented wrapping content in structure so set encoding, see Issue 1 and 2 (https://github.com/dankito/Readability4J/issues/1).

---
 .../net/dankito/readability4j/Article.kt | 42 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/main/kotlin/net/dankito/readability4j/Article.kt b/src/main/kotlin/net/dankito/readability4j/Article.kt
index fef90d8..b65dd58 100644
--- a/src/main/kotlin/net/dankito/readability4j/Article.kt
+++ b/src/main/kotlin/net/dankito/readability4j/Article.kt
@@ -20,11 +20,29 @@ open class Article(
     var articleContent: Element? = null
 
     /**
-     * HTML string of processed article content
+     * HTML string of processed article content in a <div> element.
+     *
+     * Therefore no encoding is applied, see [contentWithUtf8Encoding] or issue
+     * [https://github.com/dankito/Readability4J/issues/1].
      */
     val content: String?
         get() = articleContent?.html() // TODO: but this removes paging information (pages in top node)
 
+    /**
+     * [content] returns a <div> element.
+     *
+     * As the only way in HTML to set an encoding is via <head><meta charset=""> tag, therefore no explicit
+     * encoding is applied to it.
+     * As a result non-ASCII characters may get displayed incorrectly.
+     *
+     * So this method wraps [content] in <html><head><meta charset="utf-8"/></head><body><!--
+     * content--></body></html> so that UTF-8 encoding gets applied.
+     *
+     * See [https://github.com/dankito/Readability4J/issues/1] for more info.
+     */
+    val contentWithUtf8Encoding: String?
+        get() = getContentWithEncoding("utf-8")
+
     val textContent: String?
         get() = articleContent?.text()
 
@@ -49,4 +67,26 @@ open class Article(
      */
     var dir: String? = null
 
+
+    /**
+     * Wraps [content] in a minimal HTML document whose `<meta charset>` is set to [encoding].
+     *
+     * [content] is only a `<div>` element. As the only way in HTML to set an encoding is via a
+     * `<head><meta charset="">` tag, no explicit encoding is applied to it on its own and
+     * non-ASCII characters may get displayed incorrectly.
+     *
+     * See [https://github.com/dankito/Readability4J/issues/1] for more info.
+     *
+     * @param encoding charset name written verbatim into the `<meta charset="...">` tag, e.g. "utf-8".
+     * @return the wrapped HTML document, or null if [content] is null.
+     */
+    fun getContentWithEncoding(encoding: String): String? {
+        content?.let { content ->
+            return "<html>\n  <head>\n    <meta charset=\"$encoding\"/>\n  </head>\n  <body>\n    " +
+                    "$content\n  </body>\n</html>"
+        }
+
+        return null
+    }
+
 }
\ No newline at end of file