open-AIMS · ConnectedSystems · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023
diff --git a/Project.toml b/Project.toml
@@ -9,6 +9,7 @@ AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
 BlackBoxOptim = "a134a8b2-14d6-55f6-9291-3336d3ab0209"
 Bootstrap = "e28b5b4c-05e8-5b66-bc03-6f0c0a0a06e0"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

diff --git a/src/analysis/analysis.jl b/src/analysis/analysis.jl
@@ -83,5 +83,6 @@ end
 include("pareto.jl")
 include("sensitivity.jl")
 include("intervention.jl")
+include("clustering.jl")
 
 end
diff --git a/src/analysis/clustering.jl b/src/analysis/clustering.jl
@@ -0,0 +1,114 @@
+using Distances
+using Clustering
+
+
+"""
+	complexity(x::AbstractMatrix{T})::Vector{Float64} where {T <: Real}
+
+Compute the Complexity (CE) of every column of `x`: the square root of the sum of squared
+differences between consecutive time steps of that column.
+
+# Arguments
+- `x` : series matrix of shape \$T ⋅ S\$, where \$T\$ is total number of time steps and
+  \$S\$ is number of scenarios
+
+# Returns
+Vector of \$S\$ elements, one complexity value per scenario (column).
+
+# Examples
+```julia-repl
+julia> complexity([[1; 2; 3] [1; 3; 4]])
+2-element Vector{Float64}:
+ 1.4142135623730951
+ 2.23606797749979
+```
+"""
+function complexity(x::AbstractMatrix{T})::Vector{Float64} where {T <: Real}
+	# `Matrix(x)` builds a plain matrix first (presumably so labelled/wrapped array types
+	# are differenced like ordinary arrays - TODO confirm against callers).
+	step_changes = diff(Matrix(x); dims = 1)
+
+	# `sum(abs2, ...)` squares while reducing, so no temporary squared matrix is allocated.
+	return vec(sqrt.(sum(abs2, step_changes; dims = 1)))
+end
+
+"""
+	correlation_factor(ce_i::T, ce_j::T)::Float64 where {T<:Real}
+
+Compute Correlation Factor (CF) between two time series complexities `ce_i` and `ce_j`,
+i.e. the larger complexity divided by the smaller one.
+
+# Arguments
+- `ce_i` : Complexity of time series `i`
+- `ce_j` : Complexity of time series `j`
+
+# Returns
+Float64. `1.0` when both complexities are equal (including both being zero).
+
+# Examples
+```julia-repl
+julia> correlation_factor(2.0, 5.0)
+2.5
+
+julia> correlation_factor(0.0, 0.0)
+1.0
+```
+"""
+function correlation_factor(ce_i::T, ce_j::T)::Float64 where {T <: Real}
+	# Equal complexities have a factor of one. Handling this up front avoids 0 / 0 = NaN
+	# when both series are flat (complexity zero).
+	ce_i == ce_j && return 1.0
+
+	return max(ce_i, ce_j) / min(ce_i, ce_j)
+end
+
+"""
+	complexity_invariance_distance(data::AbstractMatrix{T})::AbstractMatrix{Float64} where {T<:Real}
+
+Compute Complexity Invariance Distance (CID) between every pair of columns of `data`.
+For columns `i` and `j` this is their Euclidean distance multiplied by the correlation
+factor of their complexities.
+
+# Arguments
+- `data` : Matrix of \$T ⋅ S\$, where \$T\$ is total number of time steps and \$S\$ is
+  number of scenarios
+
+# Returns
+Symmetric matrix of complexity invariance distances (\$S ⋅ S\$) with a zero diagonal.
+"""
+function complexity_invariance_distance(data::AbstractMatrix{T})::AbstractMatrix{Float64} where {T <: Real}
+	ce = complexity(data)
+
+	# Every entry starts at zero; the diagonal (a scenario against itself) is never
+	# overwritten, so it stays exactly zero.
+	n_scenarios = size(data, 2)
+	cid_matrix = zeros(Float64, n_scenarios, n_scenarios)
+
+	for j in axes(data, 2), i in axes(data, 2)
+		# CID is symmetric: compute each unordered pair once and mirror the result.
+		i < j || continue
+
+		# Views avoid copying both columns for every pair.
+		ed = @views euclidean(data[:, i], data[:, j])
+		cf = correlation_factor(ce[i], ce[j])
+
+		# Complexity Invariance Distance
+		cid = ed * cf
+		cid_matrix[i, j] = cid
+		cid_matrix[j, i] = cid
+	end
+
+	return cid_matrix
+end
+
+"""
+	time_series_clustering(data::AbstractMatrix{T}, n_clusters::Int64)::Vector{Int64} where {T<:Real}
+
+Group \$S\$ scenarios, each a series of \$T\$ time steps, using average-linkage hierarchical
+clustering on their pairwise Complexity Invariance Distances.
+
+# Arguments
+- `data` : Matrix of \$T ⋅ S\$, where \$T\$ is total number of time steps and \$S\$ is
+  number of scenarios
+- `n_clusters` : Number of clusters, chosen _a priori_.
+
+# Returns
+Vector of cluster ids indicating which cluster each scenario belongs to.
+
+# References
+1. Steinmann, P., Auping, W.L., Kwakkel, J.H., 2020.
+   Behavior-based scenario discovery using time series clustering.
+   Technological Forecasting and Social Change 156, 120052.
+   https://doi.org/10.1016/j.techfore.2020.120052
+
+2. Batista, G.E.A.P.A., Keogh, E.J., Tataw, O.M., de Souza, V.M.A., 2014.
+   CID: an efficient complexity-invariant distance for time series.
+   Data Min Knowl Disc 28, 634-669.
+   https://doi.org/10.1007/s10618-013-0312-3
+"""
+function time_series_clustering(data::AbstractMatrix{T}, n_clusters::Int64)::Vector{Int64} where {T<:Real}
+	# Pairwise CID between scenarios (columns of `data`)
+	cid = complexity_invariance_distance(data)
+
+	# Build the average-linkage dendrogram, then cut it into `n_clusters` groups
+	return cutree(hclust(cid; linkage = :average); k = n_clusters)
+end
diff --git a/test/clustering.jl b/test/clustering.jl
@@ -0,0 +1,53 @@
+
+@testset "Temporal clustering" begin
+    # Three scenarios (columns), each with three time steps (rows)
+    d1 = [1.; 2.; 3.]
+    d2 = [10.; 20.; 30.]
+    d3 = [1.; 5.; 8.]
+
+    test_data::Matrix = [d1 d2 d3]
+
+    @testset "Compute CE (Complexity)" begin
+        # Compute CE for test_data
+        ce = ADRIA.analysis.complexity(test_data)
+
+        # CE is a Vector with one entry per scenario (column) of test_data
+        @test ce isa Vector
+        @test length(ce) == size(test_data, 2)
+
+        # Expected results (approximate comparison: values are floating-point roots)
+        @test ce[1] ≈ sqrt(2.)
+        @test ce[2] ≈ sqrt(200.)
+        @test ce[3] ≈ sqrt(25.)
+    end
+
+    @testset "Compute CF (Correlation Factor)" begin
+        # mock ce vector
+        ce = [2.5, 207.0, 25.0, 25.0]
+
+        # Expected Results
+        @test ADRIA.analysis.correlation_factor(ce[1], ce[2]) ≈ 207.0 / 2.5
+        @test ADRIA.analysis.correlation_factor(ce[2], ce[3]) ≈ 207.0 / 25.0
+        @test ADRIA.analysis.correlation_factor(ce[1], ce[3]) ≈ 25.0 / 2.5
+        @test ADRIA.analysis.correlation_factor(ce[3], ce[4]) ≈ 1.0
+    end
+
+    @testset "Comput CID Matrix (Complexity Invariance Matrix)" begin
+        cid = ADRIA.analysis.complexity_invariance_distance(test_data)
+
+        # CID is a Matrix (S,S), where S is the number of scenarios
+        @test size(cid, 1) == size(cid, 2) == size(test_data, 2)
+
+        # All CID are non-negative
+        @test all(cid .>= 0)
+
+        # CID ij and ji entries are the same
+        @test cid[1, 2] == cid[2, 1]
+        @test cid[1, 3] == cid[3, 1]
+        @test cid[2, 3] == cid[3, 2]
+
+        # CID (i,i) is null
+        @test cid[1, 1] == cid[2, 2] == cid[3, 3] == 0.0
+
+        # Expected values: Euclidean distance times the ratio of complexities
+        @test cid[1, 2] ≈ sqrt(1134.0) * (sqrt(200.0) / sqrt(2.0))
+        @test cid[1, 3] ≈ sqrt(34.0) * (sqrt(25.0) / sqrt(2.0))
+        @test cid[2, 3] ≈ sqrt(790.0) * (sqrt(200.0) / sqrt(25.0))
+    end
+
+    @testset "Cluster scenarios" begin
+        clusters = ADRIA.analysis.time_series_clustering(test_data, 2)
+
+        # One cluster id per scenario
+        @test length(clusters) == size(test_data, 2)
+
+        # d1 and d3 have by far the smallest CID, so they share a cluster apart from d2
+        @test clusters[1] == clusters[3]
+        @test clusters[1] != clusters[2]
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -59,5 +59,6 @@ include("metrics.jl")
 include("growth.jl")
 include("spec.jl")
 include("sampling.jl")
+include("clustering.jl")
 
 # include("example_run.jl")