Skip to content

Commit

Permalink
strings: add eachsplit for iterative splitting
Browse files Browse the repository at this point in the history
  • Loading branch information
anaveragehuman committed Jan 14, 2021
1 parent a3369df commit 9132a21
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 36 deletions.
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,7 @@ export
codeunits,
digits,
digits!,
eachsplit,
escape_string,
hex2bytes,
hex2bytes!,
Expand Down
107 changes: 71 additions & 36 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,74 @@ function rpad(
r == 0 ? string(s, p^q) : string(s, p^q, first(p, r))
end

"""
    eachsplit(str::AbstractString, dlm; limit::Integer=0)
    eachsplit(str::AbstractString; limit::Integer=0)

Split `str` on occurrences of the delimiter(s) `dlm` and return an iterator over the
substrings. `dlm` can be any of the formats allowed by [`findnext`](@ref)'s first argument
(i.e. as a string, regular expression or a function), or as a single character or collection
of characters.

If `dlm` is omitted, it defaults to [`isspace`](@ref).

The iterator will return a maximum of `limit` results if the keyword argument is supplied.
The default of `limit=0` implies no maximum.

See also [`split`](@ref).

# Examples
```jldoctest
julia> a = "Ma.rch"
"Ma.rch"

julia> collect(eachsplit(a, "."))
2-element Vector{SubString{String}}:
 "Ma"
 "rch"
```
"""
function eachsplit end

"""
    SplitIterator{S<:AbstractString,F}

Lazy iterator over the substrings of `str` that are delimited by matches of `splitter`.
Instances are created by [`eachsplit`](@ref).
"""
struct SplitIterator{S<:AbstractString,F}
    str::S       # string being split
    splitter::F  # delimiter, in any form accepted as the first argument of `findnext`
    limit::Int   # maximum number of substrings to yield; 0 means no maximum
end

# Every element is built with `SubString(iter.str, ...)`, so the element type is concrete.
# A `SubString` taken from a `SubString{T}` is again a `SubString{T}`, hence the second
# method.
eltype(::Type{<:SplitIterator{S}}) where {S} = SubString{S}
eltype(::Type{<:SplitIterator{<:SubString{T}}}) where {T} = SubString{T}

IteratorSize(::Type{<:SplitIterator}) = SizeUnknown()

# Iteration state is `(i, k, n)`:
#   i: index at which the next substring to be returned starts
#   k: index from which the search for the next delimiter resumes
#   n: number of substrings returned so far
function iterate(iter::SplitIterator, (i, k, n)=(1, 1, 0))
    # After the final substring the state carries `i == ncodeunits(str) + 2`.
    i - 1 > ncodeunits(iter.str)::Int && return nothing
    r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
    if r === nothing || n == iter.limit - 1 || first(r) > lastindex(iter.str)
        # No further delimiter, only one slot left under `limit`, or a match that lies
        # past the last character: the remainder of the string is the final element.
        return (@inbounds SubString(iter.str, i), (ncodeunits(iter.str) + 2, k, n + 1))
    end

    j = first(r)                               # start of the delimiter match
    stop = nextind(iter.str, last(r))::Int     # first index after the match
    # An empty match (`stop <= j`) must still advance the search position,
    # otherwise the same match would be found again.
    resume = stop <= j ? nextind(iter.str, j)::Int : stop
    if i < stop
        substr = @inbounds SubString(iter.str, i, prevind(iter.str, j)::Int)
        # Every returned substring, empty or not, counts towards `limit`.
        return (substr, (stop, resume, n + 1))
    else
        # The match does not extend past `i`; nothing to return yet, keep searching.
        return iterate(iter, (i, resume, n))
    end
end

# `SplitIterator` stores the limit as an `Int`, and the implicit constructor of a
# parametric struct does not convert its arguments, so normalize any other `Integer`
# type (e.g. `Int32`, `UInt`) before constructing the iterator.
eachsplit(str::T, splitter; limit::Integer=0) where {T<:AbstractString} =
    SplitIterator(str, splitter, Int(limit))

# A collection of characters splits on any of its members.
eachsplit(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}}; limit::Integer=0) where {T<:AbstractString} =
    eachsplit(str, in(splitter); limit)

# A single character splits on exactly that character.
eachsplit(str::T, splitter::AbstractChar; limit::Integer=0) where {T<:AbstractString} =
    eachsplit(str, isequal(splitter); limit)

# a bit oddball, but standard behavior in Perl, Ruby & Python:
eachsplit(str::AbstractString; limit::Integer=0) = eachsplit(str, isspace; limit)

"""
split(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true)
split(str::AbstractString; limit::Integer=0, keepempty::Bool=false)
Expand Down Expand Up @@ -400,42 +468,9 @@ function split end

# Generic delimiter: anything `findfirst`/`findnext` accept as a pattern.
function split(str::T, splitter;
               limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
    # Splitting a `SubString` yields pieces of that same type; any other string type is
    # wrapped in `SubString{T}`.
    pieces = T <: SubString ? T[] : SubString{T}[]
    return _split(str, splitter, limit, keepempty, pieces)
end

# Collection of characters: split on any member of the collection.
function split(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}};
               limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
    pieces = T <: SubString ? T[] : SubString{T}[]
    return _split(str, in(splitter), limit, keepempty, pieces)
end

# Single character: split on exactly that character.
function split(str::T, splitter::AbstractChar;
               limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
    pieces = T <: SubString ? T[] : SubString{T}[]
    return _split(str, isequal(splitter), limit, keepempty, pieces)
end

# Split `str` at each match of `splitter`, pushing the pieces onto `strs` and returning it.
# Splitting stops early once `strs` holds `limit - 1` entries; the rest of the string then
# becomes the last piece. Empty pieces are skipped unless `keepempty` is true.
function _split(str::AbstractString, splitter::F, limit::Integer, keepempty::Bool, strs::Vector) where F
    # Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
    # and prevents a major invalidation risk (1550 MethodInstances)
    i = 1 # firstindex(str)
    n = lastindex(str)::Int
    r = findfirst(splitter,str)::Union{Nothing,Int,UnitRange{Int}}
    if r !== nothing
        # j: start of the current match, k: first index after it
        j, k = first(r), nextind(str,last(r))::Int
        while 0 < j <= n && length(strs) != limit-1
            if i < k
                if keepempty || i < j
                    push!(strs, @inbounds SubString(str,i,prevind(str,j)::Int))
                end
                i = k
            end
            # an empty match must still advance the search position
            (k <= j) && (k = nextind(str,j)::Int)
            r = findnext(splitter,str,k)::Union{Nothing,Int,UnitRange{Int}}
            r === nothing && break
            j, k = first(r), nextind(str,last(r))::Int
        end
    end
    # whatever follows the last consumed delimiter is the final piece
    if keepempty || i <= ncodeunits(str)::Int
        push!(strs, @inbounds SubString(str,i))
    end
    return strs
end

# a bit oddball, but standard behavior in Perl, Ruby & Python:
Expand Down

0 comments on commit 9132a21

Please sign in to comment.