Skip to content

Commit

Permalink
New contains DSL function (#1374)
Browse files Browse the repository at this point in the history
* New `contains` DSL function

* unit-test files, and docs
  • Loading branch information
johnkerl authored Aug 28, 2023
1 parent 5b29169 commit 5146dd7
Show file tree
Hide file tree
Showing 11 changed files with 198 additions and 100 deletions.
56 changes: 32 additions & 24 deletions docs/src/manpage.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,28 +209,28 @@ MILLER(1) MILLER(1)
asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty
asserting_not_map asserting_not_null asserting_null asserting_numeric
asserting_present asserting_string atan atan2 atanh bitcount boolean
capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1
flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys
get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec
hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean
is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present
is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8
leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5
mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate
nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm
reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms
sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256
sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx
splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime
strftime_local string strip strlen strpntime strpntime_local strptime
strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system
systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
unformatx upntime uptime urand urand32 urandelement urandint urandrange
utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // <
<< <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
capitalize cbrt ceil clean_whitespace collapse_whitespace concat contains cos
cosh count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp
expm1 flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms
get_keys get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt
hms2fsec hms2sec hostname index int invqnorm is_absent is_array is_bool
is_boolean is_empty is_empty_map is_error is_float is_int is_map is_nan
is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
strfntime_local strftime strftime_local string strip strlen strpntime
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~

1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
Expand Down Expand Up @@ -2311,6 +2311,14 @@ MILLER(1) MILLER(1)
concat([1,2],3) is [1,2,3]
concat([1,2],[3]) is [1,2,3]

1mcontains0m
(class=string #args=2) Returns true if the first argument contains the second as a substring. This is like saying `index(arg1, arg2) >= 0` but with less keystroking.
Examples:
contains("abcde", "e") gives true
contains("abcde", "x") gives false
contains(12345, 34) gives true
contains("forêt", "ê") gives true

1mcos0m
(class=math #args=1) Trigonometric cosine.

Expand Down Expand Up @@ -2461,7 +2469,7 @@ MILLER(1) MILLER(1)
(class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.
Examples:
index("abcde", "e") gives 5
index("abcde", "x") gives 01
index("abcde", "x") gives -1
index(12345, 34) gives 3
index("forêt", "t") gives 5

Expand Down Expand Up @@ -3634,5 +3642,5 @@ MILLER(1) MILLER(1)



2023-08-27 MILLER(1)
2023-08-28 MILLER(1)
</pre>
56 changes: 32 additions & 24 deletions docs/src/manpage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -188,28 +188,28 @@ MILLER(1) MILLER(1)
asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty
asserting_not_map asserting_not_null asserting_null asserting_numeric
asserting_present asserting_string atan atan2 atanh bitcount boolean
capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh
count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1
flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys
get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec
hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean
is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map
is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present
is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8
leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10
log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5
mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate
nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm
reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms
sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256
sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx
splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime
strftime_local string strip strlen strpntime strpntime_local strptime
strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system
systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat
unformatx upntime uptime urand urand32 urandelement urandint urandrange
utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // <
<< <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~
capitalize cbrt ceil clean_whitespace collapse_whitespace concat contains cos
cosh count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp
expm1 flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms
get_keys get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt
hms2fsec hms2sec hostname index int invqnorm is_absent is_array is_bool
is_boolean is_empty is_empty_map is_error is_float is_int is_map is_nan
is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null
is_numeric is_present is_string joink joinkv joinv json_parse json_stringify
kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec
localtime2sec log log10 log1p logifit lstrip madd mapdiff mapexcept mapselect
mapsum max maxlen md5 mean meaneb median mexp min minlen mmul mode msub
nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime null_count os percentile
percentiles pow qnorm reduce regextract regextract_or_else rightpad round
roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime
select sgn sha1 sha256 sha512 sin sinh skewness sort sort_collection splita
splitax splitkv splitkvx splitnv splitnvx sqrt ssub stddev strfntime
strfntime_local strftime strftime_local string strip strlen strpntime
strpntime_local strptime strptime_local sub substr substr0 substr1 sum sum2
sum3 sum4 sysntime system systime systimeint tan tanh tolower toupper truncate
typeof unflatten unformat unformatx upntime uptime urand urand32 urandelement
urandint urandrange utf8_to_latin1 variance version ! != !=~ % & && * ** + - .
.* .+ .- ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~

1mCOMMENTS-IN-DATA FLAGS0m
Miller lets you put comments in your data, such as
Expand Down Expand Up @@ -2290,6 +2290,14 @@ MILLER(1) MILLER(1)
concat([1,2],3) is [1,2,3]
concat([1,2],[3]) is [1,2,3]

1mcontains0m
(class=string #args=2) Returns true if the first argument contains the second as a substring. This is like saying `index(arg1, arg2) >= 0` but with less keystroking.
Examples:
contains("abcde", "e") gives true
contains("abcde", "x") gives false
contains(12345, 34) gives true
contains("forêt", "ê") gives true

1mcos0m
(class=math #args=1) Trigonometric cosine.

Expand Down Expand Up @@ -2440,7 +2448,7 @@ MILLER(1) MILLER(1)
(class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.
Examples:
index("abcde", "e") gives 5
index("abcde", "x") gives 01
index("abcde", "x") gives -1
index(12345, 34) gives 3
index("forêt", "t") gives 5

Expand Down Expand Up @@ -3613,4 +3621,4 @@ MILLER(1) MILLER(1)



2023-08-27 MILLER(1)
2023-08-28 MILLER(1)
15 changes: 13 additions & 2 deletions docs/src/reference-dsl-builtin-functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
* [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort).
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange).
* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version).
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime).
* [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof).
Expand Down Expand Up @@ -1219,6 +1219,17 @@ collapse_whitespace (class=string #args=1) Strip repeated whitespace from strin
</pre>


### contains
<pre class="pre-non-highlight-non-pair">
contains (class=string #args=2) Returns true if the first argument contains the second as a substring. This is like saying `index(arg1, arg2) >= 0` but with less keystroking.
Examples:
contains("abcde", "e") gives true
contains("abcde", "x") gives false
contains(12345, 34) gives true
contains("forêt", "ê") gives true
</pre>


### format
<pre class="pre-non-highlight-non-pair">
format (class=string #args=variadic) Using first argument as format string, interpolate remaining arguments in place of each "{}" in the format string. Too-few arguments are treated as the empty string; too-many arguments are discarded.
Expand Down Expand Up @@ -1254,7 +1265,7 @@ gsub("prefix4529:suffix8567", "(....ix)([0-9]+)", "[\1 : \2]") gives "[prefix :
index (class=string #args=2) Returns the index (1-based) of the second argument within the first. Returns -1 if the second argument isn't a substring of the first. Stringifies non-string inputs. Uses UTF-8 encoding to count characters, not bytes.
Examples:
index("abcde", "e") gives 5
index("abcde", "x") gives 01
index("abcde", "x") gives -1
index(12345, 34) gives 3
index("forêt", "t") gives 5
</pre>
Expand Down
14 changes: 14 additions & 0 deletions internal/pkg/bifs/strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,20 @@ func BIF_index(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
return mlrval.FromInt(lib.UTF8Strlen(sinput1[:iindex]) + 1)
}

// ================================================================

// BIF_contains implements the DSL function contains(string, substring).
//
// An absent first argument is passed through as absent, and an error first
// argument is passed through as error. Otherwise both arguments are
// stringified and the result is a boolean telling whether the second occurs
// anywhere within the first.
func BIF_contains(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
	switch {
	case input1.IsAbsent():
		return mlrval.ABSENT
	case input1.IsError():
		return mlrval.ERROR
	}

	haystack := input1.String()
	needle := input2.String()
	return mlrval.FromBool(strings.Contains(haystack, needle))
}

// ================================================================
func BIF_truncate(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
if input1.IsErrorOrAbsent() {
Expand Down
14 changes: 13 additions & 1 deletion internal/pkg/dsl/cst/builtin_function_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -546,11 +546,23 @@ Arrays are new in Miller 6; the substr function is older.`,
binaryFunc: bifs.BIF_index,
examples: []string{
`index("abcde", "e") gives 5`,
`index("abcde", "x") gives 01`,
`index("abcde", "x") gives -1`,
`index(12345, 34) gives 3`,
`index("forêt", "t") gives 5`,
},
},
{
name: "contains",
class: FUNC_CLASS_STRING,
help: `Returns true if the first argument contains the second as a substring. This is like saying ` + "`index(arg1, arg2) >= 0`" + `but with less keystroking.`,
binaryFunc: bifs.BIF_contains,
examples: []string{
`contains("abcde", "e") gives true`,
`contains("abcde", "x") gives false`,
`contains(12345, 34) gives true`,
`contains("forêt", "ê") gives true`,
},
},

{
name: "tolower",
Expand Down
Loading

0 comments on commit 5146dd7

Please sign in to comment.