From cc1cd954eac65f4421bb81d6b6a7a110481b1f61 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 23 Dec 2024 12:27:08 -0500 Subject: [PATCH] Fix unflatten with field names like `.` `.x` or `x..y` (#1735) * Fix unflatten with field name like `.` `.x` or `x..y` * docs & test data --- docs/src/data/flatten-dots.csv | 2 + docs/src/flatten-unflatten.md | 53 +++++++++++++++++ docs/src/flatten-unflatten.md.in | 27 +++++++++ docs/src/manpage.md | 10 ++-- docs/src/manpage.txt | 10 ++-- docs/src/reference-main-flag-list.md | 6 +- man/manpage.txt | 10 ++-- man/mlr.1 | 12 ++-- pkg/cli/option_parse.go | 6 +- pkg/mlrval/mlrmap_flatten_unflatten.go | 57 ++++++++++++++----- test/cases/verb-flatten-unflatten/0011/expout | 9 ++- test/input/unflatten-input-2.xtab | 5 ++ 12 files changed, 164 insertions(+), 43 deletions(-) create mode 100644 docs/src/data/flatten-dots.csv diff --git a/docs/src/data/flatten-dots.csv b/docs/src/data/flatten-dots.csv new file mode 100644 index 0000000000..6a79471498 --- /dev/null +++ b/docs/src/data/flatten-dots.csv @@ -0,0 +1,2 @@ +a,b.,.c,.,d..e,f.g +1,2,3,4,5,6 diff --git a/docs/src/flatten-unflatten.md b/docs/src/flatten-unflatten.md index 7a3c138d2f..ff428ca39f 100644 --- a/docs/src/flatten-unflatten.md +++ b/docs/src/flatten-unflatten.md @@ -348,6 +348,59 @@ a.1,a.3,a.5 ] +## Non-inferencing cases + +An additional heuristic is that if a field name starts with a `.`, ends with +a `.`, or has two or more consecutive `.` characters, no attempt is made +to unflatten it on conversion from non-JSON to JSON. + +
+cat data/flatten-dots.csv
+
+
+a,b.,.c,.,d..e,f.g
+1,2,3,4,5,6
+
+ +
+mlr --icsv --oxtab cat data/flatten-dots.csv
+
+
+a    1
+b.   2
+.c   3
+.    4
+d..e 5
+f.g  6
+
+ +
+mlr --icsv --ojson cat data/flatten-dots.csv
+
+
+[
+{
+  "a": 1,
+  "b.": 2,
+  ".c": 3,
+  ".": 4,
+  "d..e": 5,
+  "f": {
+    "g": 6
+  }
+}
+]
+
+ +## Non-inferencing cases + +An additional heuristic is that if a field name starts with a `.`, ends with +a `.`, or has two or more consecutive `.` characters, no attempt is made +to unflatten it on conversion from non-JSON to JSON. + +## Manual control + + ## Manual control To see what our options are for manually controlling flattening and diff --git a/docs/src/flatten-unflatten.md.in b/docs/src/flatten-unflatten.md.in index 68033d594d..152efadba1 100644 --- a/docs/src/flatten-unflatten.md.in +++ b/docs/src/flatten-unflatten.md.in @@ -156,6 +156,33 @@ GENMD-RUN-COMMAND mlr --c2j cat data/non-consecutive.csv GENMD-EOF +## Non-inferencing cases + +An additional heuristic is that if a field name starts with a `.`, ends with +a `.`, or has two or more consecutive `.` characters, no attempt is made +to unflatten it on conversion from non-JSON to JSON. + +GENMD-RUN-COMMAND +cat data/flatten-dots.csv +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --oxtab cat data/flatten-dots.csv +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --ojson cat data/flatten-dots.csv +GENMD-EOF + +## Non-inferencing cases + +An additional heuristic is that if a field name starts with a `.`, ends with +a `.`, or has two or more consecutive `.` characters, no attempt is made +to unflatten it on conversion from non-JSON to JSON. + +## Manual control + + ## Manual control To see what our options are for manually controlling flattening and diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 2092d8abfa..b9af6e51cb 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -424,7 +424,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p 1mFLATTEN-UNFLATTEN FLAGS0m These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). - See the Flatten/unflatten doc page for more information. 
+ See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. --flatsep or --jflatsep {string} Separator for flattening multi-level JSON keys, e.g. @@ -435,10 +435,10 @@ This is simply a copy of what you should see on running `man mlr` at a command p then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. - --no-auto-unflatten When input non-JSON and output is JSON, suppress the - default auto-unflatten behavior. Default: if the + --no-auto-unflatten When input is non-JSON and output is JSON, suppress + the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to - `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With + `$y=[7,8,9]`. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. @@ -3737,5 +3737,5 @@ This is simply a copy of what you should see on running `man mlr` at a command p MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - 2024-11-23 4mMILLER24m(1) + 2024-12-23 4mMILLER24m(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 35680317d6..aa0b21b9ba 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -403,7 +403,7 @@ 1mFLATTEN-UNFLATTEN FLAGS0m These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). - See the Flatten/unflatten doc page for more information. + See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. --flatsep or --jflatsep {string} Separator for flattening multi-level JSON keys, e.g. @@ -414,10 +414,10 @@ then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. 
- --no-auto-unflatten When input non-JSON and output is JSON, suppress the - default auto-unflatten behavior. Default: if the + --no-auto-unflatten When input is non-JSON and output is JSON, suppress + the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to - `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With + `$y=[7,8,9]`. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. @@ -3716,4 +3716,4 @@ MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - 2024-11-23 4mMILLER24m(1) + 2024-12-23 4mMILLER24m(1) diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index fdea7b253b..7258cce089 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -195,14 +195,14 @@ are overridden in all cases by setting output format to `format2`. These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). -See the Flatten/unflatten doc page for more information. +See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. **Flags:** * `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`. -* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. -* `--no-auto-unflatten`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. 
 With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. +* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. +* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. ## Format-conversion keystroke-saver flags diff --git a/man/manpage.txt b/man/manpage.txt index 35680317d6..aa0b21b9ba 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -403,7 +403,7 @@ 1mFLATTEN-UNFLATTEN FLAGS0m These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). - See the Flatten/unflatten doc page for more information. + See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. --flatsep or --jflatsep {string} Separator for flattening multi-level JSON keys, e.g. @@ -414,10 +414,10 @@ then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. - --no-auto-unflatten When input non-JSON and output is JSON, suppress the - default auto-unflatten behavior. Default: if the + --no-auto-unflatten When input is non-JSON and output is JSON, suppress + the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to - `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With + `$y=[7,8,9]`. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. 
@@ -3716,4 +3716,4 @@ MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - 2024-11-23 4mMILLER24m(1) + 2024-12-23 4mMILLER24m(1) diff --git a/man/mlr.1 b/man/mlr.1 index 67cc66732a..967d53e310 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2024-11-23 +.\" Date: 2024-12-23 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2024-11-23" "\ \&" "\ \&" +.TH "MILLER" "1" "2024-12-23" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -492,7 +492,7 @@ are overridden in all cases by setting output format to `format2`. .nf These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening). -See the Flatten/unflatten doc page for more information. +See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information. --flatsep or --jflatsep {string} Separator for flattening multi-level JSON keys, e.g. @@ -503,10 +503,10 @@ See the Flatten/unflatten doc page for more information. then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. ---no-auto-unflatten When input non-JSON and output is JSON, suppress the - default auto-unflatten behavior. Default: if the +--no-auto-unflatten When input is non-JSON and output is JSON, suppress + the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to - `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With + `$y=[7,8,9]`. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. 
.fi diff --git a/pkg/cli/option_parse.go b/pkg/cli/option_parse.go index 34db19a774..41be332b6d 100644 --- a/pkg/cli/option_parse.go +++ b/pkg/cli/option_parse.go @@ -2877,7 +2877,7 @@ var OutputColorizationFlagSection = FlagSection{ func FlattenUnflattenPrintInfo() { fmt.Println("These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).") fmt.Println() - fmt.Println("See the Flatten/unflatten doc page for more information.") + fmt.Println("See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.") } func init() { FlattenUnflattenFlagSection.Sort() } @@ -2901,7 +2901,7 @@ var FlattenUnflattenFlagSection = FlagSection{ { name: "--no-auto-flatten", - help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.", + help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.AutoFlatten = false *pargi += 1 @@ -2910,7 +2910,7 @@ var FlattenUnflattenFlagSection = FlagSection{ { name: "--no-auto-unflatten", - help: "When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.", + help: "When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. 
 With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.", parser: func(args []string, argc int, pargi *int, options *TOptions) { options.WriterOptions.AutoUnflatten = false *pargi += 1 diff --git a/pkg/mlrval/mlrmap_flatten_unflatten.go b/pkg/mlrval/mlrmap_flatten_unflatten.go index 579522f223..4e5d117d2e 100644 --- a/pkg/mlrval/mlrmap_flatten_unflatten.go +++ b/pkg/mlrval/mlrmap_flatten_unflatten.go @@ -106,7 +106,18 @@ func (mlrmap *Mlrmap) isFlattenable() bool { // For mlr unflatten without -f. This undoes Unflatten. This is for conversion // from non-JSON to JSON. If there are fields x.a, x.b, x.c, etc. they're put // into a single field x with map-valued value keyed by "a", "b", "c". - +// +// There is a heuristic here though. Miller is (wildly) multi-format and needs +// to accommodate all manner of data. In the JSON world, "." is the default +// delimiter for nested data, and we're here to handle that. But in the R world, +// "." is just like "_" in other languages: witness "data.frame" rather than +// "data_frame". If the "." was intended as punctuation, in, say, a field named +// "a.b" with value 3, then unflatten-to-JSON will make `{"a": {"b": 3}}`. This +// is just our default behavior; users can use --no-auto-unflatten. Weirder +// are field names like ".", ".x", "x.", "x..y", etc. The heuristic here +// is that when we split on "." and any of the pieces around/between the dots +// are empty strings, we don't try to unflatten that field. +// // Special case: if the resulting string keys are string representations of 1, // 2, 3, etc -- without gaps -- then the map is converted to an array. // @@ -134,22 +145,38 @@ func (mlrmap *Mlrmap) CopyUnflattened( // We'll come through this loop once for x.a, another for x.b, etc. for pe := mlrmap.Head; pe != nil; pe = pe.Next { - // Is the field name something dot something? 
- if strings.Contains(pe.Key, separator) { - arrayOfIndices := SplitAXHelper(pe.Key, separator) - arrayval := arrayOfIndices.intf.([]*Mlrval) - lib.InternalCodingErrorIf(len(arrayval) < 1) - // If the input field name was "x.a" then remember the "x". - baseIndex := arrayval[0].String() - affectedBaseIndices[baseIndex] = true - // Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc. - other.PutIndexed( - CopyMlrvalArray(arrayval), - unflattenTerminal(pe.Value).Copy(), - ) - } else { + // If there are no dots in the field name, treat it as a terminal. + if !strings.Contains(pe.Key, separator) { + other.PutReference(pe.Key, unflattenTerminal(pe.Value)) + continue + } + + arrayOfIndices := SplitAXHelper(pe.Key, separator) + arrayval := arrayOfIndices.intf.([]*Mlrval) + lib.InternalCodingErrorIf(len(arrayval) < 1) + + // Check for "" in any of the split pieces; treat the field as terminal if so. + legitDots := true + for i, _ := range arrayval { + piece := arrayval[i].String() + if piece == "" { + legitDots = false + break + } + } + if !legitDots { other.PutReference(pe.Key, unflattenTerminal(pe.Value)) + continue } + + // If the input field name was "x.a" then remember the "x". + baseIndex := arrayval[0].String() + affectedBaseIndices[baseIndex] = true + // Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc. + other.PutIndexed( + CopyMlrvalArray(arrayval), + unflattenTerminal(pe.Value).Copy(), + ) } // Go through all the field names which were turned into maps -- e.g. 
"x" diff --git a/test/cases/verb-flatten-unflatten/0011/expout b/test/cases/verb-flatten-unflatten/0011/expout index 9a45bc1869..18f7372233 100644 --- a/test/cases/verb-flatten-unflatten/0011/expout +++ b/test/cases/verb-flatten-unflatten/0011/expout @@ -24,6 +24,13 @@ "wrapper": { "empty3": {}, "emtpy4": [] - } + }, + "x": { + "y": 1 + }, + "@": 2, + "x@": 3, + "@y": 4, + "x@@y": 5 } ] diff --git a/test/input/unflatten-input-2.xtab b/test/input/unflatten-input-2.xtab index 97b1941e1e..21ea4bd2b9 100644 --- a/test/input/unflatten-input-2.xtab +++ b/test/input/unflatten-input-2.xtab @@ -13,3 +13,8 @@ empty1 {} empty2 [] wrapper@empty3 {} wrapper@emtpy4 [] +x@y 1 +@ 2 +x@ 3 +@y 4 +x@@y 5