Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add temporal clustering #370

Merged
merged 2 commits into from
Jun 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
BlackBoxOptim = "a134a8b2-14d6-55f6-9291-3336d3ab0209"
Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Expand Down
1 change: 1 addition & 0 deletions src/analysis/analysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,5 +83,6 @@ end
include("pareto.jl")
include("sensitivity.jl")
include("intervention.jl")
include("clustering.jl")

end
114 changes: 114 additions & 0 deletions src/analysis/clustering.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
using Distances
using Clustering


"""
    complexity(x::AbstractMatrix{T})::Vector{Float64} where {T <: Real}

Compute the Complexity Estimate (CE) of every column of `x`.

`x` has shape \$T ⋅ S\$, where \$T\$ is the total number of time steps and \$S\$ is the
number of scenarios. For each scenario, the CE is the square root of the sum of squared
differences between consecutive time steps.

# Arguments
- `x` : series matrix of shape \$T ⋅ S\$

# Returns
Vector of \$S\$ elements, one complexity value per scenario (column).

# Examples
```julia-repl
julia> complexity([[1; 2; 3] [1; 3; 4]])
2-element Vector{Float64}:
 1.4142135623730951
 2.23606797749979
```
"""
function complexity(x::AbstractMatrix{T})::Vector{Float64} where {T <: Real}
    # Differences between consecutive time steps (rows), taken on a plain Matrix copy
    steps = diff(Matrix(x); dims=1)

    # Column-wise sum of squared steps; result is a 1 × S row
    squared_totals = sum(abs2, steps; dims=1)

    return vec(sqrt.(squared_totals))
end

"""
    correlation_factor(ce_i::Real, ce_j::Real)::Float64

Compute the Correlation Factor (CF) between two time series complexities `ce_i` and
`ce_j`, i.e. the ratio of the larger complexity to the smaller one.

When both complexities are equal the factor is `1.0`. This includes the case where both
are zero (two constant series), which would otherwise evaluate `0 / 0` and yield `NaN`.
If exactly one complexity is zero the ratio is unbounded and `Inf` is returned.

# Arguments
- `ce_i` : Complexity of time series `i`
- `ce_j` : Complexity of time series `j`

# Returns
Float64

# Examples
```julia-repl
julia> ce = complexity([[1; 2; 3] [1; 3; 4]]);

julia> correlation_factor(ce[1], ce[2])
1.5811388300841895
```
"""
function correlation_factor(ce_i::Real, ce_j::Real)::Float64
    # Equal complexities need no correction; this also avoids 0 / 0 = NaN
    ce_i == ce_j && return 1.0

    return max(ce_i, ce_j) / min(ce_i, ce_j)
end

"""
    complexity_invariance_distance(data::AbstractMatrix{T})::AbstractMatrix{Float64} where {T <: Real}

Compute the Complexity Invariance Distance (CID) between every pair of columns of `data`.

For columns `i` and `j` the CID is the Euclidean distance between the two columns
multiplied by the correlation factor of their complexities. The result is symmetric and
its diagonal is zero (distance of a scenario to itself).

# Arguments
- `data` : Matrix of \$T ⋅ S\$, where \$T\$ is total number of time steps and \$S\$ is
  number of scenarios

# Returns
Matrix of complexity invariance distances (\$S ⋅ S\$).
"""
function complexity_invariance_distance(data::AbstractMatrix{T})::AbstractMatrix{Float64} where {T <: Real}
    ce = complexity(data)

    # Start from zeros so the diagonal (self-distance) never needs computing
    n_scens = size(data, 2)
    cid_matrix = zeros(Float64, n_scens, n_scens)

    # Each unordered pair is visited once (i < j) and mirrored into both triangles.
    # Views avoid copying a column on every iteration.
    cols = axes(data, 2)
    @views for j in cols
        for i in first(cols):(j - 1)
            ed = euclidean(data[:, i], data[:, j])
            cf = correlation_factor(ce[i], ce[j])

            # Complexity Invariance Distance
            cid = ed * cf
            cid_matrix[i, j] = cid
            cid_matrix[j, i] = cid
        end
    end

    return cid_matrix
end

"""
    time_series_clustering(data::AbstractMatrix{T}, n_clusters::Int64)::Vector{Int64} where {T <: Real}

Group \$S\$ scenarios, each a series of \$T\$ time steps, using hierarchical clustering
with average linkage on their pairwise Complexity Invariance Distances.

# Arguments
- `data` : Matrix of \$T ⋅ S\$, where \$T\$ is total number of time steps and \$S\$ is
  number of scenarios
- `n_clusters` : Number of clusters determined _a priori_.

# Returns
Vector of cluster ids indicating which cluster each scenario belongs to.

# References
1. Steinmann, P., Auping, W.L., Kwakkel, J.H., 2020.
   Behavior-based scenario discovery using time series clustering.
   Technological Forecasting and Social Change 156, 120052.
   https://doi.org/10.1016/j.techfore.2020.120052

2. Batista, G.E.A.P.A., Keogh, E.J., Tataw, O.M., de Souza, V.M.A., 2014.
   CID: an efficient complexity-invariant distance for time series.
   Data Min Knowl Disc 28, 634-669.
   https://doi.org/10.1007/s10618-013-0312-3
"""
function time_series_clustering(
    data::AbstractMatrix{T},
    n_clusters::Int64
)::Vector{Int64} where {T <: Real}
    # Pairwise CID distances between scenarios (columns of `data`)
    pairwise_cid = complexity_invariance_distance(data)

    # Build the dendrogram from the distance matrix using average linkage
    tree = hclust(pairwise_cid; linkage=:average)

    # Cut the dendrogram so that exactly `n_clusters` groups remain
    return cutree(tree; k=n_clusters)
end
53 changes: 53 additions & 0 deletions test/clustering.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

@testset "Temporal clustering" begin
    # Three scenarios (columns) with three time steps (rows) each
    d1 = [1.0; 2.0; 3.0]
    d2 = [10.0; 20.0; 30.0]
    d3 = [1.0; 5.0; 8.0]

    test_data::Matrix = [d1 d2 d3]

    @testset "Compute CE (Complexity)" begin
        # Compute CE for test_data
        ce = ADRIA.analysis.complexity(test_data)

        # CE is a Vector with one entry per scenario (column of test_data)
        @test ce isa Vector
        @test length(ce) == size(test_data, 2)

        # Expected results (approximate comparison: values come out of sqrt)
        @test ce[1] ≈ sqrt(2.0)
        @test ce[2] ≈ sqrt(200.0)
        @test ce[3] ≈ sqrt(25.0)
    end

    @testset "Compute CF (Correlation Factor)" begin
        # mock ce vector
        ce = [2.5, 207.0, 25.0, 25.0]

        # Expected Results
        @test ADRIA.analysis.correlation_factor(ce[1], ce[2]) ≈ 207.0 / 2.5
        @test ADRIA.analysis.correlation_factor(ce[2], ce[3]) ≈ 207.0 / 25.0
        @test ADRIA.analysis.correlation_factor(ce[1], ce[3]) ≈ 25.0 / 2.5
        @test ADRIA.analysis.correlation_factor(ce[3], ce[4]) == 1

        # CF does not depend on argument order
        @test ADRIA.analysis.correlation_factor(ce[2], ce[1]) ≈ 207.0 / 2.5
    end

    @testset "Comput CID Matrix (Complexity Invariance Matrix)" begin
        cid = ADRIA.analysis.complexity_invariance_distance(test_data)

        # CID is a Matrix (S, S), where S is the number of scenarios
        @test size(cid, 1) == size(cid, 2) == size(test_data, 2)

        # All CID are non-negative (single check instead of one testset per element)
        @test all(>=(0), cid)

        # CID ij and ji entries are the same
        @test cid[1, 2] == cid[2, 1] >= 0
        @test cid[1, 3] == cid[3, 1] >= 0
        @test cid[2, 3] == cid[3, 2] >= 0
        @test cid == permutedims(cid)

        # CID (i,i) is null
        @test cid[1, 1] == cid[2, 2] == cid[3, 3] == 0.0

        # Distinct scenarios are strictly separated
        @test cid[1, 2] > 0
        @test cid[1, 3] > 0
        @test cid[2, 3] > 0
    end

    @testset "Time series clustering" begin
        clusters = ADRIA.analysis.time_series_clustering(test_data, 2)

        # One cluster id per scenario, and exactly the requested number of clusters
        @test clusters isa Vector{Int64}
        @test length(clusters) == size(test_data, 2)
        @test length(unique(clusters)) == 2

        # d1 and d3 are by far the closest pair under CID, so they share a cluster
        @test clusters[1] == clusters[3]
        @test clusters[1] != clusters[2]
    end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,6 @@ include("metrics.jl")
include("growth.jl")
include("spec.jl")
include("sampling.jl")
include("clustering.jl")

# include("example_run.jl")