Skip to content

Commit

Permalink
Implemented wrapping content in <html> structure to set encoding, see…
Browse files Browse the repository at this point in the history
… Issue 1 and 2 (#1).
  • Loading branch information
dankito committed Aug 12, 2018
1 parent 9d5a8c3 commit 8d4b042
Showing 1 changed file with 41 additions and 1 deletion.
42 changes: 41 additions & 1 deletion src/main/kotlin/net/dankito/readability4j/Article.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,29 @@ open class Article(
var articleContent: Element? = null // element that [content] and [textContent] are derived from; null until set

/**
 * HTML string of the processed article content, rendered from [articleContent].
 *
 * The markup is a `<div>` fragment rather than a complete document, so it carries no
 * encoding declaration. Use [contentWithUtf8Encoding] when an explicit charset is needed;
 * see [issue #1](https://github.com/dankito/Readability4J/issues/1) for background.
 *
 * Is `null` when [articleContent] is `null`.
 */
val content: String?
    get() {
        // TODO: but this removes paging information (pages in top node <div id="readability-content">)
        val root = articleContent ?: return null
        return root.html()
    }

/**
 * [content] wrapped in a minimal HTML document that declares UTF-8 as its encoding.
 *
 * [content] on its own is only a `<div>` fragment. HTML can declare an encoding solely through a
 * `<meta charset="...">` tag inside `<head>`, so the bare fragment has none and non-ASCII
 * characters may get displayed incorrectly. This property delegates to [getContentWithEncoding]
 * with `"utf-8"` to add that declaration.
 *
 * See [issue #1](https://github.com/dankito/Readability4J/issues/1) for more info.
 */
val contentWithUtf8Encoding: String?
    get() {
        return getContentWithEncoding("utf-8")
    }

/**
 * Text of the processed article content, obtained from [articleContent] via `text()`.
 *
 * Is `null` when [articleContent] is `null`.
 */
val textContent: String?
    get() {
        val root = articleContent ?: return null
        return root.text()
    }

Expand All @@ -49,4 +67,26 @@ open class Article(
*/
var dir: String? = null


/**
 * Wraps [content] in a minimal HTML document that declares the given [encoding].
 *
 * [content] on its own is only a `<div>` fragment. HTML can declare an encoding solely through a
 * `<meta charset="...">` tag inside `<head>`, so the bare fragment has none and non-ASCII
 * characters may get displayed incorrectly. The returned document has the shape
 * `<html><head><meta charset="[encoding]"/></head><body>...content...</body></html>`.
 *
 * See [issue #1](https://github.com/dankito/Readability4J/issues/1) for more info.
 *
 * @param encoding charset name written verbatim into the `<meta charset="...">` attribute,
 * e.g. `"utf-8"`. It is neither validated nor escaped.
 * @return the wrapped HTML document, or `null` when [content] is `null`.
 */
fun getContentWithEncoding(encoding: String): String? {
    // Read the property once: its getter renders the article element on every access.
    val body = content ?: return null

    // The document must end with a closing </html> tag (it used to emit a second opening <html>).
    return "<html>\n <head>\n <meta charset=\"$encoding\"/>\n </head>\n <body>\n " +
            "$body\n </body>\n</html>"
}

}

0 comments on commit 8d4b042

Please sign in to comment.