johnkerl · johnkerl · Mar 20, 2022 · Mar 20, 2022 · Mar 20, 2022 · Mar 20, 2022
diff --git a/docs/src/manpage.md b/docs/src/manpage.md
@@ -50,7 +50,7 @@ DESCRIPTION
        insertion-ordered hash map.  This encompasses a variety of data
        formats, including but not limited to the familiar CSV, TSV, and JSON.
        (Miller can handle positionally-indexed data as a special case.) This
-       manpage documents mlr 6.2.0.
+       manpage documents mlr 6.2.0-dev.
 
 EXAMPLES
        mlr --icsv --opprint cat example.csv
@@ -192,11 +192,11 @@ VERB LIST
        altkv bar bootstrap cat check clean-whitespace count-distinct count
        count-similar cut decimate fill-down fill-empty filter flatten format-values
        fraction gap grep group-by group-like having-fields head histogram json-parse
-       json-stringify join label least-frequent merge-fields most-frequent nest
-       nothing put regularize remove-empty-columns rename reorder repeat reshape
-       sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort
-       sort-within-records split stats1 stats2 step tac tail tee template top
-       unflatten uniq unsparsify
+       json-stringify join label latin1-to-utf8 utf8-to-latin1 least-frequent
+       merge-fields most-frequent nest nothing put regularize remove-empty-columns
+       rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
+       skip-trivial-records sort sort-within-records split stats1 stats2 step tac
+       tail tee template top unflatten uniq unsparsify
 
 FUNCTION LIST
        abs acos acosh any append apply arrayify asin asinh asserting_absent
@@ -212,16 +212,17 @@ FUNCTION LIST
        is_absent is_array is_bool is_boolean is_empty is_empty_map is_error is_float
        is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map
        is_not_null is_null is_numeric is_present is_string joink joinkv joinv
-       json_parse json_stringify leafcount length localtime2gmt localtime2sec log
-       log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5
-       mexp min mmul msub os pow qnorm reduce regextract regextract_or_else round
-       roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
-       select sgn sha1 sha256 sha512 sin sinh sort splita splitax splitkv splitkvx
-       splitnv splitnvx sqrt ssub strftime strftime_local string strip strlen
-       strptime strptime_local sub substr substr0 substr1 system systime systimeint
-       tan tanh tolower toupper truncate typeof unflatten unformat unformatx uptime
-       urand urand32 urandelement urandint urandrange version ! != !=~ % & && * ** +
-       - . .* .+ .- ./ / // &lt; &lt;&lt; &lt;= &lt;=&gt; == =~ &gt; &gt;= &gt;&gt; &gt;&gt;&gt; ?: ?? ??? ^ ^^ | || ~
+       json_parse json_stringify latin1_to_utf8 leafcount length localtime2gmt
+       localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
+       mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract
+       regextract_or_else round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms
+       sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh sort splita
+       splitax splitkv splitkvx splitnv splitnvx sqrt ssub strftime strftime_local
+       string strip strlen strptime strptime_local sub substr substr0 substr1 system
+       systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
+       unformatx uptime urand urand32 urandelement urandint urandrange utf8_to_latin1
+       version ! != !=~ % & && * ** + - . .* .+ .- ./ / // &lt; &lt;&lt; &lt;= &lt;=&gt; == =~ &gt; &gt;= &gt;&gt;
+       &gt;&gt;&gt; ?: ?? ??? ^ ^^ | || ~
 
 COMMENTS-IN-DATA FLAGS
        Miller lets you put comments in your data, such as
@@ -1319,6 +1320,20 @@ VERBS
        Options:
        -h|--help Show this message.
 
+   latin1-to-utf8
+       Usage: mlr latin1-to-utf8, with no options.
+       Recursively converts record strings from Latin-1 to UTF-8.
+       For field-level control, please see the latin1_to_utf8 DSL function.
+       Options:
+       -h|--help Show this message.
+
+   utf8-to-latin1
+       Usage: mlr utf8-to-latin1, with no options.
+       Recursively converts record strings from UTF-8 to Latin-1.
+       For field-level control, please see the utf8_to_latin1 DSL function.
+       Options:
+       -h|--help Show this message.
+
    least-frequent
        Usage: mlr least-frequent [options]
        Shows the least frequently occurring distinct values for specified field names.
@@ -2363,6 +2378,12 @@ FUNCTIONS FOR FILTER/PUT
    json_stringify
         (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output.
 
+   latin1_to_utf8
+        (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
+       Examples:
+       $y = latin1_to_utf8($x)
+       $* = latin1_to_utf8($*)
+
    leafcount
         (class=collections #args=1) Counts total number of terminal values in map/array. For single-level map/array, same as length.
 
@@ -2694,6 +2715,12 @@ FUNCTIONS FOR FILTER/PUT
    urandrange
         (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b).
 
+   utf8_to_latin1
+        (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it.
+       Examples:
+       $y = utf8_to_latin1($x)
+       $* = utf8_to_latin1($*)
+
    version
         (class=system #args=0) Returns the Miller version as a string.
 
@@ -3195,5 +3222,5 @@ SEE ALSO
 
 
 
-                                  2022-03-19                         MILLER(1)
+                                  2022-03-20                         MILLER(1)
 </pre>
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
@@ -29,7 +29,7 @@ DESCRIPTION
        insertion-ordered hash map.  This encompasses a variety of data
        formats, including but not limited to the familiar CSV, TSV, and JSON.
        (Miller can handle positionally-indexed data as a special case.) This
-       manpage documents mlr 6.2.0.
+       manpage documents mlr 6.2.0-dev.
 
 EXAMPLES
        mlr --icsv --opprint cat example.csv
@@ -171,11 +171,11 @@ VERB LIST
        altkv bar bootstrap cat check clean-whitespace count-distinct count
        count-similar cut decimate fill-down fill-empty filter flatten format-values
        fraction gap grep group-by group-like having-fields head histogram json-parse
-       json-stringify join label least-frequent merge-fields most-frequent nest
-       nothing put regularize remove-empty-columns rename reorder repeat reshape
-       sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records sort
-       sort-within-records split stats1 stats2 step tac tail tee template top
-       unflatten uniq unsparsify
+       json-stringify join label latin1-to-utf8 utf8-to-latin1 least-frequent
+       merge-fields most-frequent nest nothing put regularize remove-empty-columns
+       rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
+       skip-trivial-records sort sort-within-records split stats1 stats2 step tac
+       tail tee template top unflatten uniq unsparsify
 
 FUNCTION LIST
        abs acos acosh any append apply arrayify asin asinh asserting_absent
@@ -191,16 +191,17 @@ FUNCTION LIST
        is_absent is_array is_bool is_boolean is_empty is_empty_map is_error is_float
        is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map
        is_not_null is_null is_numeric is_present is_string joink joinkv joinv
-       json_parse json_stringify leafcount length localtime2gmt localtime2sec log
-       log10 log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5
-       mexp min mmul msub os pow qnorm reduce regextract regextract_or_else round
-       roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
-       select sgn sha1 sha256 sha512 sin sinh sort splita splitax splitkv splitkvx
-       splitnv splitnvx sqrt ssub strftime strftime_local string strip strlen
-       strptime strptime_local sub substr substr0 substr1 system systime systimeint
-       tan tanh tolower toupper truncate typeof unflatten unformat unformatx uptime
-       urand urand32 urandelement urandint urandrange version ! != !=~ % & && * ** +
-       - . .* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
+       json_parse json_stringify latin1_to_utf8 leafcount length localtime2gmt
+       localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
+       mapsum max md5 mexp min mmul msub os pow qnorm reduce regextract
+       regextract_or_else round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms
+       sec2localdate sec2localtime select sgn sha1 sha256 sha512 sin sinh sort splita
+       splitax splitkv splitkvx splitnv splitnvx sqrt ssub strftime strftime_local
+       string strip strlen strptime strptime_local sub substr substr0 substr1 system
+       systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
+       unformatx uptime urand urand32 urandelement urandint urandrange utf8_to_latin1
+       version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < << <= <=> == =~ > >= >>
+       >>> ?: ?? ??? ^ ^^ | || ~
 
 COMMENTS-IN-DATA FLAGS
        Miller lets you put comments in your data, such as
@@ -1298,6 +1299,20 @@ VERBS
        Options:
        -h|--help Show this message.
 
+   latin1-to-utf8
+       Usage: mlr latin1-to-utf8, with no options.
+       Recursively converts record strings from Latin-1 to UTF-8.
+       For field-level control, please see the latin1_to_utf8 DSL function.
+       Options:
+       -h|--help Show this message.
+
+   utf8-to-latin1
+       Usage: mlr utf8-to-latin1, with no options.
+       Recursively converts record strings from UTF-8 to Latin-1.
+       For field-level control, please see the utf8_to_latin1 DSL function.
+       Options:
+       -h|--help Show this message.
+
    least-frequent
        Usage: mlr least-frequent [options]
        Shows the least frequently occurring distinct values for specified field names.
@@ -2342,6 +2357,12 @@ FUNCTIONS FOR FILTER/PUT
    json_stringify
         (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output.
 
+   latin1_to_utf8
+        (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
+       Examples:
+       $y = latin1_to_utf8($x)
+       $* = latin1_to_utf8($*)
+
    leafcount
         (class=collections #args=1) Counts total number of terminal values in map/array. For single-level map/array, same as length.
 
@@ -2673,6 +2694,12 @@ FUNCTIONS FOR FILTER/PUT
    urandrange
         (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b).
 
+   utf8_to_latin1
+        (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it.
+       Examples:
+       $y = utf8_to_latin1($x)
+       $* = utf8_to_latin1($*)
+
    version
         (class=system #args=0) Returns the Miller version as a string.
 
@@ -3174,4 +3201,4 @@ SEE ALSO
 
 
 
-                                  2022-03-19                         MILLER(1)
+                                  2022-03-20                         MILLER(1)
diff --git a/docs/src/pix/latin1-to-utf8.png b/docs/src/pix/latin1-to-utf8.png
diff --git a/docs/src/pix/utf8-to-latin1.png b/docs/src/pix/utf8-to-latin1.png
diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md
@@ -74,7 +74,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
 * [**Hashing functions**](#hashing-functions):  [md5](#md5),  [sha1](#sha1),  [sha256](#sha256),  [sha512](#sha512).
 * [**Higher-order-functions functions**](#higher-order-functions-functions):  [any](#any),  [apply](#apply),  [every](#every),  [fold](#fold),  [reduce](#reduce),  [select](#select),  [sort](#sort).
 * [**Math functions**](#math-functions):  [abs](#abs),  [acos](#acos),  [acosh](#acosh),  [asin](#asin),  [asinh](#asinh),  [atan](#atan),  [atan2](#atan2),  [atanh](#atanh),  [cbrt](#cbrt),  [ceil](#ceil),  [cos](#cos),  [cosh](#cosh),  [erf](#erf),  [erfc](#erfc),  [exp](#exp),  [expm1](#expm1),  [floor](#floor),  [invqnorm](#invqnorm),  [log](#log),  [log10](#log10),  [log1p](#log1p),  [logifit](#logifit),  [max](#max),  [min](#min),  [qnorm](#qnorm),  [round](#round),  [roundm](#roundm),  [sgn](#sgn),  [sin](#sin),  [sinh](#sinh),  [sqrt](#sqrt),  [tan](#tan),  [tanh](#tanh),  [urand](#urand),  [urand32](#urand32),  [urandelement](#urandelement),  [urandint](#urandint),  [urandrange](#urandrange).
-* [**String functions**](#string-functions):  [capitalize](#capitalize),  [clean_whitespace](#clean_whitespace),  [collapse_whitespace](#collapse_whitespace),  [format](#format),  [gssub](#gssub),  [gsub](#gsub),  [lstrip](#lstrip),  [regextract](#regextract),  [regextract_or_else](#regextract_or_else),  [rstrip](#rstrip),  [ssub](#ssub),  [strip](#strip),  [strlen](#strlen),  [sub](#sub),  [substr](#substr),  [substr0](#substr0),  [substr1](#substr1),  [tolower](#tolower),  [toupper](#toupper),  [truncate](#truncate),  [unformat](#unformat),  [unformatx](#unformatx),  [\.](#dot).
+* [**String functions**](#string-functions):  [capitalize](#capitalize),  [clean_whitespace](#clean_whitespace),  [collapse_whitespace](#collapse_whitespace),  [format](#format),  [gssub](#gssub),  [gsub](#gsub),  [latin1_to_utf8](#latin1_to_utf8),  [lstrip](#lstrip),  [regextract](#regextract),  [regextract_or_else](#regextract_or_else),  [rstrip](#rstrip),  [ssub](#ssub),  [strip](#strip),  [strlen](#strlen),  [sub](#sub),  [substr](#substr),  [substr0](#substr0),  [substr1](#substr1),  [tolower](#tolower),  [toupper](#toupper),  [truncate](#truncate),  [unformat](#unformat),  [unformatx](#unformatx),  [utf8_to_latin1](#utf8_to_latin1),  [\.](#dot).
 * [**System functions**](#system-functions):  [hostname](#hostname),  [os](#os),  [system](#system),  [version](#version).
 * [**Time functions**](#time-functions):  [dhms2fsec](#dhms2fsec),  [dhms2sec](#dhms2sec),  [fsec2dhms](#fsec2dhms),  [fsec2hms](#fsec2hms),  [gmt2localtime](#gmt2localtime),  [gmt2sec](#gmt2sec),  [hms2fsec](#hms2fsec),  [hms2sec](#hms2sec),  [localtime2gmt](#localtime2gmt),  [localtime2sec](#localtime2sec),  [sec2dhms](#sec2dhms),  [sec2gmt](#sec2gmt),  [sec2gmtdate](#sec2gmtdate),  [sec2hms](#sec2hms),  [sec2localdate](#sec2localdate),  [sec2localtime](#sec2localtime),  [strftime](#strftime),  [strftime_local](#strftime_local),  [strptime](#strptime),  [strptime_local](#strptime_local),  [systime](#systime),  [systimeint](#systimeint),  [uptime](#uptime).
 * [**Typing functions**](#typing-functions):  [asserting_absent](#asserting_absent),  [asserting_array](#asserting_array),  [asserting_bool](#asserting_bool),  [asserting_boolean](#asserting_boolean),  [asserting_empty](#asserting_empty),  [asserting_empty_map](#asserting_empty_map),  [asserting_error](#asserting_error),  [asserting_float](#asserting_float),  [asserting_int](#asserting_int),  [asserting_map](#asserting_map),  [asserting_nonempty_map](#asserting_nonempty_map),  [asserting_not_array](#asserting_not_array),  [asserting_not_empty](#asserting_not_empty),  [asserting_not_map](#asserting_not_map),  [asserting_not_null](#asserting_not_null),  [asserting_null](#asserting_null),  [asserting_numeric](#asserting_numeric),  [asserting_present](#asserting_present),  [asserting_string](#asserting_string),  [is_absent](#is_absent),  [is_array](#is_array),  [is_bool](#is_bool),  [is_boolean](#is_boolean),  [is_empty](#is_empty),  [is_empty_map](#is_empty_map),  [is_error](#is_error),  [is_float](#is_float),  [is_int](#is_int),  [is_map](#is_map),  [is_nan](#is_nan),  [is_nonempty_map](#is_nonempty_map),  [is_not_array](#is_not_array),  [is_not_empty](#is_not_empty),  [is_not_map](#is_not_map),  [is_not_null](#is_not_null),  [is_null](#is_null),  [is_numeric](#is_numeric),  [is_present](#is_present),  [is_string](#is_string),  [typeof](#typeof).
@@ -1012,6 +1012,15 @@ gsub("prefix4529:suffix8567", "(....ix)([0-9]+)", "[\1 : \2]") gives "[prefix :
 </pre>
 
 
+### latin1_to_utf8
+<pre class="pre-non-highlight-non-pair">
+latin1_to_utf8  (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it.
+Examples:
+$y = latin1_to_utf8($x)
+$* = latin1_to_utf8($*)
+</pre>
+
+
 ### lstrip
 <pre class="pre-non-highlight-non-pair">
 lstrip  (class=string #args=1) Strip leading whitespace from string.
@@ -1130,6 +1139,15 @@ is_error(unformatx("{}h{}m{}s", "3:47:22")) gives true.
 </pre>
 
 
+### utf8_to_latin1
+<pre class="pre-non-highlight-non-pair">
+utf8_to_latin1  (class=string #args=1) Tries to convert UTF-8-encoded string to Latin-1-encoded string. If argument is array or map, recurses into it.
+Examples:
+$y = utf8_to_latin1($x)
+$* = utf8_to_latin1($*)
+</pre>
+
+
 <a id=dot> </a>
 
 ### \.

diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md
@@ -1873,6 +1873,39 @@ Alice 56  missing
 Carol 45  present
 </pre>
 
+## latin1-to-utf8
+
+<pre class="pre-highlight-in-pair">
+<b>mlr latin1-to-utf8 -h</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+Usage: mlr latin1-to-utf8, with no options.
+Recursively converts record strings from Latin-1 to UTF-8.
+For field-level control, please see the latin1_to_utf8 DSL function.
+Options:
+-h|--help Show this message.
+</pre>
+
+![pix/latin1-to-utf8.png](pix/latin1-to-utf8.png)
+
+## utf8-to-latin1
+
+<pre class="pre-highlight-in-pair">
+<b>mlr utf8-to-latin1 -h</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+Usage: mlr utf8-to-latin1, with no options.
+Recursively converts record strings from UTF-8 to Latin-1.
+For field-level control, please see the utf8_to_latin1 DSL function.
+Options:
+-h|--help Show this message.
+</pre>
+
+In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the
+Russian one is not:
+
+![pix/utf8-to-latin1.png](pix/utf8-to-latin1.png)
+
 ## least-frequent
 
 <pre class="pre-highlight-in-pair">

diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in
@@ -615,6 +615,25 @@ GENMD-RUN-COMMAND
 mlr --icsv --implicit-csv-header --opprint label name,age,status data/headerless.csv
 GENMD-EOF
 
+## latin1-to-utf8
+
+GENMD-RUN-COMMAND
+mlr latin1-to-utf8 -h
+GENMD-EOF
+
+![pix/latin1-to-utf8.png](pix/latin1-to-utf8.png)
+
+## utf8-to-latin1
+
+GENMD-RUN-COMMAND
+mlr utf8-to-latin1 -h
+GENMD-EOF
+
+In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the
+Russian one is not:
+
+![pix/utf8-to-latin1.png](pix/utf8-to-latin1.png)
+
 ## least-frequent
 
 GENMD-RUN-COMMAND

diff --git a/docs/src/special-symbols-and-formatting.md b/docs/src/special-symbols-and-formatting.md
@@ -170,6 +170,8 @@ The
 [`gssub`](reference-dsl-builtin-functions.md#gssub)
 functions exist precisely for this reason: so you don't have to escape anything.
 
+## Latin-1 and UTF-8 character encodings
+
 The `ssub` and `gssub` functions are also handy for dealing with non-UTF-8 strings such as Latin 1, since Go's
 `regexp` library -- which Miller uses -- requires UTF-8 strings. For example:
 
@@ -186,6 +188,24 @@ The `ssub` and `gssub` functions are also handy for dealing with non-UTF-8 strin
 Kaðlín og Þormundr
 </pre>
 
+More generally, though, we have the DSL functions
+[`latin1_to_utf8`](reference-dsl-builtin-functions.md#latin1_to_utf8) and
+[`utf8_to_latin1`](reference-dsl-builtin-functions.md#utf8_to_latin1)
+and the verbs
+[`latin1-to-utf8`](reference-verbs.md#latin1-to-utf8) and
+[`utf8-to-latin1`](reference-verbs.md#utf8-to-latin1). The functions let you fix encodings on a
+field-by-field basis; the verbs apply the conversion to all records (with less keystroking).
+(Latin-1 is also known as [ISO/IEC 8859-1](https://en.wikipedia.org/wiki/ISO/IEC_8859-1).)
+
+In this example, all the inputs are convertible from Latin-1 to UTF-8:
+
+![pix/latin1-to-utf8.png](pix/latin1-to-utf8.png)
+
+In this example, the English and German pangrams are convertible from UTF-8 to Latin-1, but the
+Russian one is not:
+
+![pix/utf8-to-latin1.png](pix/utf8-to-latin1.png)
+
 ## How to apply math to regex output?
 
 * Use parentheses for capture groups