diff --git a/DESCRIPTION b/DESCRIPTION index 6b29cb848..9e00eb8f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,5 +83,6 @@ Authors@R: c( person("Dereck","de Mezquita", role="ctb"), person("Michael","Czekanski", role="ctb"), person("Dmitry", "Shemetov", role="ctb"), - person("Nitish", "Jha", role="ctb") + person("Nitish", "Jha", role="ctb"), + person("Joshua", "Wu", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index e62a60104..b15ebd696 100644 --- a/NEWS.md +++ b/NEWS.md @@ -60,6 +60,8 @@ 8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup. +9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/print.data.table.R b/R/print.data.table.R index 6588ca458..919c8aaed 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -229,11 +229,16 @@ format_list_item.default = function(x, ...) { # FR #1091 for pretty printing of character # TODO: maybe instead of doing "this is...", we could do "this ... test"? +# Current implementation may have issues when dealing with strings that have combinations of full-width and half-width characters, +# if this becomes a problem in the future, we could consider string traversal instead. char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) - idx = which(nchar(x) > trunc.char) - x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") + nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 + nchar_chars = nchar(x, 'char') + is_full_width = nchar_width > nchar_chars + idx = pmin(nchar_width, nchar_chars) > trunc.char + x[idx] = paste0(strtrim(x[idx], trunc.char * fifelse(is_full_width[idx], 2L, 1L)), "...") x } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0b740f605..62a33db50 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18434,3 +18434,43 @@ dt = data.table(a = 1L) test(2252.1, dt[, b:=2L], error = "\\[ was called on a data.table.*not data.table-aware.*':='") test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.table-aware.*'let'") rm(.datatable.aware) + +# tests for trunc.char handling wide characters # 5096 +accented_a = "\u0061\u0301" +ja_ichi = "\u4E00" +ja_ni = "\u4E8C" +ja_ko = "\u3053" +ja_n = "\u3093" +dots = "..." +clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output +# Tests for combining character latin a and acute accent, single row +DT = data.table(strrep(accented_a, 4L)) +test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) +test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) +test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) +# Tests for full-width japanese character ichi, single row +DT = data.table(strrep(ja_ichi, 4L)) +test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) +test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) +test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) +# Tests for multiple, different length combining character rows +DT = data.table(strrep(accented_a, 1L:4L)) +test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá")) +test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá...")) +test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á...")) +# Tests for multiple, different length full-width characters +DT = data.table(strrep(ja_ichi, 1L:4L)) +test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一")) +test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一...")) +test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", "一...", "一...")) +# Tests for combined characters, multiple columns +DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") +test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa") +test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa") +test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...") +test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...") +# Tests for multiple columns, multiple rows +DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) +test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá")) +test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá")) +test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á..."))