Skip to content

Commit

Permalink
Fix regression in read_html
Browse files Browse the repository at this point in the history
Oh C string handling how I loathe you
  • Loading branch information
jimhester committed Apr 4, 2020
1 parent b53cac7 commit 6543857
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# xml2 (development version)

* `read_html()` now again works with HTML files with non-ASCII encodings (#293).

# xml2 1.3.0

* Removes the Rcpp dependency
Expand Down
4 changes: 2 additions & 2 deletions src/xml2_doc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,13 @@ extern "C" SEXP doc_parse_file(
if (as_html) {
pDoc = htmlReadFile(
path,
strncmp(encoding, "", 0) == 0 ? NULL : encoding,
encoding[0] == '\0' ? NULL : encoding,
options
);
} else {
pDoc = xmlReadFile(
path,
strncmp(encoding, "", 0) == 0 ? NULL : encoding,
encoding[0] == '\0' ? NULL : encoding,
options
);
}
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test-read-xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,15 @@ test_that("read_xml and read_html fail for bad status codes", {
class = "http_404"
)
})

test_that("read_html works with non-ASCII encodings", {
tmp <- tempfile()
on.exit(unlink(tmp))

writeLines("<html><body>\U2019</body></html>", tmp)
res <- read_html(tmp, encoding = "UTF-8")

expect_equal(as.character(res, options = ""),
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html><body>\U2019</body></html>\n")
})

2 comments on commit 6543857

@jayhesselberth
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From r-lib/pkgdown#1288:

The top example still doesn't work with new changes. Is this a different problem?

library(xml2)

tmp <- tempfile()
cat("x <- 1", file = tmp)
read_html(tmp, encoding = "UTF-8")
#> {html_document}
#> <html>
#> [1] <body><p>x </p></body>

tmp <- tempfile()
cat("x = 1", file = tmp)
read_html(tmp, encoding = "UTF-8")
#> {html_document}
#> <html>
#> [1] <body><p>x = 1</p></body>

Created on 2020-04-04 by the reprex package (v0.3.0)

Session info
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 3.6.3 (2020-02-29)
#>  os       macOS Catalina 10.15.4      
#>  system   x86_64, darwin19.3.0        
#>  ui       unknown                     
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/Denver              
#>  date     2020-04-04                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version    date       lib source                        
#>  assertthat    0.2.1      2019-03-21 [1] CRAN (R 3.6.0)                
#>  backports     1.1.5      2019-10-02 [1] CRAN (R 3.6.1)                
#>  callr         3.4.2      2020-02-12 [1] CRAN (R 3.6.2)                
#>  cli           2.0.2      2020-02-28 [1] CRAN (R 3.6.2)                
#>  crayon        1.3.4      2017-09-16 [1] CRAN (R 3.6.0)                
#>  desc          1.2.0      2018-05-01 [1] CRAN (R 3.6.0)                
#>  devtools      2.2.2      2020-02-17 [1] CRAN (R 3.6.2)                
#>  digest        0.6.25     2020-02-23 [1] CRAN (R 3.6.2)                
#>  ellipsis      0.3.0      2019-09-20 [1] CRAN (R 3.6.1)                
#>  evaluate      0.14       2019-05-28 [1] CRAN (R 3.6.0)                
#>  fansi         0.4.1      2020-01-08 [1] CRAN (R 3.6.2)                
#>  fs            1.3.2      2020-03-05 [1] CRAN (R 3.6.2)                
#>  glue          1.4.0      2020-04-03 [1] CRAN (R 3.6.3)                
#>  highr         0.8        2019-03-20 [1] CRAN (R 3.6.0)                
#>  htmltools     0.4.0      2019-10-04 [1] CRAN (R 3.6.1)                
#>  knitr         1.28       2020-02-06 [1] CRAN (R 3.6.2)                
#>  magrittr      1.5        2014-11-22 [1] CRAN (R 3.6.0)                
#>  memoise       1.1.0      2017-04-21 [1] CRAN (R 3.6.0)                
#>  pkgbuild      1.0.6      2019-10-09 [1] CRAN (R 3.6.1)                
#>  pkgload       1.0.2      2018-10-29 [1] CRAN (R 3.6.0)                
#>  prettyunits   1.1.1      2020-01-24 [1] CRAN (R 3.6.2)                
#>  processx      3.4.2      2020-02-09 [1] CRAN (R 3.6.2)                
#>  ps            1.3.2      2020-02-13 [1] CRAN (R 3.6.2)                
#>  R6            2.4.1      2019-11-12 [1] CRAN (R 3.6.1)                
#>  Rcpp          1.0.4.5    2020-04-02 [1] local                         
#>  remotes       2.1.1      2020-02-15 [1] CRAN (R 3.6.2)                
#>  rlang         0.4.5.9000 2020-04-04 [1] Github (r-lib/rlang@a90b04b)  
#>  rmarkdown     2.1        2020-01-20 [1] CRAN (R 3.6.2)                
#>  rprojroot     1.3-2      2018-01-03 [1] CRAN (R 3.6.0)                
#>  sessioninfo   1.1.1      2018-11-05 [1] CRAN (R 3.6.0)                
#>  stringi       1.4.6      2020-02-17 [1] CRAN (R 3.6.2)                
#>  stringr       1.4.0      2019-02-10 [1] CRAN (R 3.6.0)                
#>  testthat      2.3.2      2020-03-02 [1] CRAN (R 3.6.2)                
#>  usethis       1.5.1.9000 2020-03-21 [1] Github (r-lib/usethis@2431e97)
#>  withr         2.1.2      2018-03-15 [1] CRAN (R 3.6.0)                
#>  xfun          0.12       2020-01-13 [1] CRAN (R 3.6.2)                
#>  xml2        * 1.3.0.9000 2020-04-04 [1] local                         
#>  yaml          2.2.1      2020-02-01 [1] CRAN (R 3.6.2)                
#> 
#> [1] /usr/local/lib/R/3.6/site-library
#> [2] /usr/local/Cellar/r/3.6.3_1/lib/R/library

@jimhester
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know, but I don't think that is due to this encoding issue

Please sign in to comment.