Skip to content

Commit

Permalink
#99: Rename duplicate ingest headers (#283)
Browse files Browse the repository at this point in the history
* CSV now renames duplicate ingest headers

* Fix broken test

* xlsx ingester now handles duplicate col names

* Update CHANGELOG

* Additional tests for ingest.column.rename

* Removed dead comment in grammar
  • Loading branch information
neilotoole authored Jul 4, 2023
1 parent b4cc109 commit 4ffaae9
Show file tree
Hide file tree
Showing 27 changed files with 469 additions and 83 deletions.
26 changes: 25 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

Breaking changes are annotated with ☢️.

## Upcoming

### Added

- [#99]: The [CSV](https://sq.io/docs/drivers/csv) and [XLSX](https://sq.io/docs/drivers/xlsx)
drivers can now handle duplicate header column names. For example, given a CSV file:

```csv
actor_id,first_name,actor_id
1,PENELOPE,1
2,NICK,2
```

The duplicate column will be renamed, so the ingested header row becomes:

```csv
actor_id,first_name,actor_id_1
```

The renaming behavior is controlled by a new option `ingest.column.rename`
([docs](https://sq.io/docs/config/#ingestcolumnrename)).


## [v0.40.0] - 2023-07-03

This release features a complete overhaul of the [`join`](https://sq.io/docs/query/#joins)
Expand All @@ -18,7 +41,7 @@ mechanism.
particularly useful, but it's a building block for [multiple joins](https://github.com/neilotoole/sq/issues/12).

```shell
$ sq `@sakila | .actor:a | .a.first_name`
$ sq '@sakila | .actor:a | .a.first_name'
```

- New option `result.column.rename` that exposes a template used to rename
Expand Down Expand Up @@ -659,6 +682,7 @@ make working with lots of sources much easier.
[#91]: https://github.com/neilotoole/sq/pull/91
[#95]: https://github.com/neilotoole/sq/issues/95
[#98]: https://github.com/neilotoole/sq/issues/98
[#99]: https://github.com/neilotoole/sq/issues/99
[#123]: https://github.com/neilotoole/sq/issues/123
[#142]: https://github.com/neilotoole/sq/issues/142
[#144]: https://github.com/neilotoole/sq/issues/144
Expand Down
7 changes: 4 additions & 3 deletions cli/config/yamlstore/upgrades/v0.34.0/upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ import (
"testing"
"time"

"github.com/neilotoole/sq/libsq/driver"

"github.com/neilotoole/sq/cli"
"github.com/neilotoole/sq/cli/config/yamlstore"
v0_34_0 "github.com/neilotoole/sq/cli/config/yamlstore/upgrades/v0.34.0"
"github.com/neilotoole/sq/drivers"
"github.com/neilotoole/sq/drivers/csv"
"github.com/neilotoole/sq/drivers/xlsx"
"github.com/neilotoole/sq/libsq/core/options"
Expand Down Expand Up @@ -80,12 +81,12 @@ func TestUpgrade(t *testing.T) {
src1 := cfg.Collection.Sources()[1]
require.Equal(t, handleCSV, src1.Handle)
require.Equal(t, csv.TypeCSV, src1.Type)
require.Equal(t, true, src1.Options[drivers.OptIngestHeader.Key()])
require.Equal(t, true, src1.Options[driver.OptIngestHeader.Key()])

src2 := cfg.Collection.Sources()[2]
require.Equal(t, handleXLSX, src2.Handle)
require.Equal(t, xlsx.Type, src2.Type)
require.Equal(t, false, src2.Options[drivers.OptIngestHeader.Key()])
require.Equal(t, false, src2.Options[driver.OptIngestHeader.Key()])

wantCfgRaw, err := os.ReadFile(filepath.Join("testdata", "want.sq.yml"))
require.NoError(t, err)
Expand Down
6 changes: 3 additions & 3 deletions cli/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (

"github.com/neilotoole/sq/libsq/core/timez"

"github.com/neilotoole/sq/drivers"
"github.com/neilotoole/sq/drivers/csv"
"github.com/neilotoole/sq/libsq/core/errz"
"github.com/neilotoole/sq/libsq/core/options"
Expand Down Expand Up @@ -162,8 +161,9 @@ func RegisterDefaultOpts(reg *options.Registry) {
driver.OptTuningErrgroupLimit,
driver.OptTuningRecChanSize,
OptTuningFlushThreshold,
drivers.OptIngestHeader,
drivers.OptIngestSampleSize,
driver.OptIngestHeader,
driver.OptIngestColRename,
driver.OptIngestSampleSize,
csv.OptDelim,
csv.OptEmptyAsNull,
)
Expand Down
2 changes: 1 addition & 1 deletion cli/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ func TestRegisterDefaultOpts(t *testing.T) {
log.Debug("options.Registry (after)", "reg", reg)

keys := reg.Keys()
require.Len(t, keys, 32)
require.Len(t, keys, 33)

for _, opt := range reg.Opts() {
opt := opt
Expand Down
4 changes: 1 addition & 3 deletions cli/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ import (

"github.com/neilotoole/sq/cli/run"

"github.com/neilotoole/sq/drivers"

"github.com/neilotoole/sq/cli/config/yamlstore"
v0_34_0 "github.com/neilotoole/sq/cli/config/yamlstore/upgrades/v0.34.0"
"github.com/neilotoole/sq/libsq/core/lg/slogbuf"
Expand Down Expand Up @@ -175,7 +173,7 @@ func FinishRunInit(ctx context.Context, ru *run.Run) error {
dr.AddProvider(json.TypeJSON, jsonp)
dr.AddProvider(json.TypeJSONA, jsonp)
dr.AddProvider(json.TypeJSONL, jsonp)
sampleSize := drivers.OptIngestSampleSize.Get(cfg.Options)
sampleSize := driver.OptIngestSampleSize.Get(cfg.Options)
ru.Files.AddDriverDetectors(
json.DetectJSON(sampleSize),
json.DetectJSONA(sampleSize),
Expand Down
44 changes: 44 additions & 0 deletions drivers/csv/csv_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
package csv_test

import (
"context"
"path/filepath"
"testing"

"github.com/neilotoole/sq/libsq/driver"

"github.com/neilotoole/sq/cli/testrun"

"github.com/neilotoole/sq/libsq/core/timez"

"github.com/neilotoole/sq/libsq/core/stringz"
Expand Down Expand Up @@ -88,3 +94,41 @@ func TestEmptyAsNull(t *testing.T) {
require.EqualValues(t, want[i], rec0[i], "field [%d]", i)
}
}

// TestIngestDuplicateColumns checks that a CSV source whose header row
// repeats a column name is ingested with the repeated column renamed,
// and that the rename template can be overridden via config.
func TestIngestDuplicateColumns(t *testing.T) {
	ctx := context.Background()

	// Step 1: add the fixture file as a source.
	addRun := testrun.New(ctx, t, nil)
	fixture := filepath.Join("testdata", "actor_duplicate_cols.csv")
	require.NoError(t, addRun.Exec("add", fixture, "--handle", "@actor_dup"))

	// Step 2: query the source as CSV using the default rename template.
	// Each run is seeded from the previous one.
	queryRun := testrun.New(ctx, t, addRun).Hush()
	require.NoError(t, queryRun.Exec("--csv", ".data"))
	rows := queryRun.MustReadCSV()
	require.Equal(t,
		[]string{"actor_id", "first_name", "last_name", "last_update", "actor_id_1"},
		rows[0],
	)

	// The payload must be intact too: one row per actor, plus the header row.
	require.Len(t, rows, sakila.TblActorCount+1)
	require.Equal(t,
		[]string{"1", "PENELOPE", "GUINESS", "2020-02-15T06:59:28Z", "1"},
		rows[1],
	)

	// Step 3: switch to a different rename template...
	const altTemplate = "x_{{.Name}}{{with .Recurrence}}_{{.}}{{end}}"

	configRun := testrun.New(ctx, t, queryRun)
	require.NoError(t, configRun.Exec("config", "set", driver.OptIngestColRename.Key(), altTemplate))

	// ...and confirm the new template is reflected in the output headers.
	requeryRun := testrun.New(ctx, t, configRun)
	require.NoError(t, requeryRun.Exec("--csv", ".data"))
	rows = requeryRun.MustReadCSV()
	require.Equal(t,
		[]string{"x_actor_id", "x_first_name", "x_last_name", "x_last_update", "x_actor_id_1"},
		rows[0],
	)
}
9 changes: 5 additions & 4 deletions drivers/csv/detect_header.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ import (
"context"
"strings"

"github.com/neilotoole/sq/libsq/driver"

"github.com/neilotoole/sq/libsq/core/lg"
"github.com/neilotoole/sq/libsq/core/lg/lga"

"github.com/neilotoole/sq/drivers"
"github.com/neilotoole/sq/libsq/core/options"

"github.com/neilotoole/sq/libsq/core/errz"
Expand All @@ -18,10 +19,10 @@ import (
// set in opts, or if detectHeaderRow detects that the first
// row of recs seems to be a header.
func hasHeaderRow(ctx context.Context, recs [][]string, opts options.Options) (bool, error) {
if drivers.OptIngestHeader.IsSet(opts) {
b := drivers.OptIngestHeader.Get(opts)
if driver.OptIngestHeader.IsSet(opts) {
b := driver.OptIngestHeader.Get(opts)
lg.FromContext(ctx).Debug("CSV ingest header explicitly specified: skipping header detection",
lga.Key, drivers.OptIngestHeader.Key(),
lga.Key, driver.OptIngestHeader.Key(),
lga.Val, b)
return b, nil
}
Expand Down
8 changes: 5 additions & 3 deletions drivers/csv/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ import (

"github.com/neilotoole/sq/libsq/core/record"

"github.com/neilotoole/sq/drivers"

"github.com/neilotoole/sq/libsq/core/kind"
"github.com/neilotoole/sq/libsq/core/stringz"

Expand Down Expand Up @@ -74,7 +72,7 @@ func ingestCSV(ctx context.Context, src *source.Source, openFn source.FileOpenFu
}

cr := newCSVReader(r, delim)
recs, err := readRecords(cr, drivers.OptIngestSampleSize.Get(src.Options))
recs, err := readRecords(cr, driver.OptIngestSampleSize.Get(src.Options))
if err != nil {
return err
}
Expand All @@ -99,6 +97,10 @@ func ingestCSV(ctx context.Context, src *source.Source, openFn source.FileOpenFu
}
}

if header, err = driver.MungeIngestColNames(ctx, header); err != nil {
return err
}

kinds, mungers, err := detectColKinds(recs)
if err != nil {
return err
Expand Down
Loading

0 comments on commit 4ffaae9

Please sign in to comment.