Implement the Mahalanobis distance (#6) #7

Merged 6 commits on Nov 28, 2023
30 changes: 16 additions & 14 deletions Project.toml
@@ -10,6 +10,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LibPQ = "194296ae-ab2e-5f79-8cd4-7183a0a5a0d1"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MCTS = "e12ccd36-dcad-5f33-8774-9175229e7b33"
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
POMDPSimulators = "e0d0a172-29c6-5d4e-96d0-f262df5d01fd"
@@ -24,23 +25,24 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Clustering = "0.15"
Combinatorics = "1.0"
DataFrames = "1.6"
Distances = "0.10"
HTTP = "1.9"
JSON = "0.21"
LibPQ = "1.16"
LinearAlgebra = "1.9"
MCTS = "0.5"
MLJ = "0.20"
POMDPSimulators = "0.3"
POMDPTools = "0.1"
POMDPs = "0.9"
Plots = "1.38"
Random = "1.9"
Reexport = "1.2"
Requires = "1.3"
ScientificTypes = "3.0"
Statistics = "1.9"
StatsBase = "0.34"
julia = "1.9"
5 changes: 3 additions & 2 deletions docs/src/api.md
@@ -31,8 +31,9 @@ CEED.GenerativeDesigns.efficient_value

```@docs
CEED.GenerativeDesigns.DistanceBased
CEED.GenerativeDesigns.QuadraticDistance
CEED.GenerativeDesigns.DiscreteDistance
CEED.GenerativeDesigns.MahalanobisDistance
CEED.GenerativeDesigns.Exponential
```

65 changes: 50 additions & 15 deletions docs/src/tutorials/GenerativeDesigns.jl
@@ -48,6 +48,8 @@

# Therefore, given an experimental state with readouts over the feature set $F \subseteq X$, we can calculate the total distance from the entity recorded in the $j$-th row as $d_j = \sum_{x\in F} \rho_x (\hat x, x_j)$, where $\hat x$ and $x_j$ denote the readout for feature $x$ for the entity being tested and for the entity recorded in the $j$-th row, respectively.

# Alternatively, we could use the [Mahalanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance#Definition).
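
# For reference, over a set of numeric features, the Mahalanobis distance between a readout $\hat x$ and the $j$-th historical row is defined as $d_j = \sqrt{(\hat x - x_j)^\top \Sigma^{-1} (\hat x - x_j)}$, where $\Sigma$ is a covariance matrix estimated from the historical data.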

# Next, we convert distances $d_j$ into probabilistic weights $w_j$. By default, we use a rescaled exponential function, i.e., we put $w_j = \exp(-\lambda d_j)$ for some $\lambda>0$. Notably, $\lambda$'s value determines how belief is distributed across the historical entities. Larger values of $\lambda$ concentrate the belief tightly around the 'closest' historical entities, while smaller values distribute more belief to more distant entities.
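
# As a quick standalone illustration of this weighting (with made-up distances, independent of the tutorial's pipeline), note how increasing $\lambda$ concentrates the normalized belief on the nearest row:

# ```julia
# d = [0.1, 0.5, 2.0]  # total distances d_j for three historical rows
# for λ in (1, 5, 20)
#     w = exp.(-λ .* d)  # unnormalized weights w_j = exp(-λ d_j)
#     println(λ, " => ", round.(w ./ sum(w); digits = 3))
# end
# ```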

# Importantly, the proper choice of the distance functionals and the 'similarity' functional discussed above is a question of hyperparameter optimization.
@@ -124,10 +126,44 @@ using CEED, CEED.GenerativeDesigns

# Note that internally, a state of the decision process is represented as a tuple `(evidence, costs)`.

# You can specify the method for computing the distance using the `distance` keyword. By default, the Kronecker delta and quadratic distance will be utilised for categorical and continuous features, respectively.

(; sampler, uncertainty, weights) = DistanceBased(
    data;
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
);

# Alternatively, you can provide a dictionary of `feature => distance` pairs. The implemented distance functionals are `DiscreteDistance(; λ)` and `QuadraticDistance(; λ, standardize=true)`. In that case, the specified distance will be applied to the respective feature, after which the distances will be collated across the range of features.

# The above call is therefore equivalent to:

numeric_feats = filter(c -> eltype(data[!, c]) <: Real, names(data))
categorical_feats = setdiff(names(data), numeric_feats)

DistanceBased(
    data;
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
    distance = merge(
        Dict(c => DiscreteDistance() for c in categorical_feats),
        Dict(c => QuadraticDistance() for c in numeric_feats),
    ),
);

# You can also use the Mahalanobis distance (`MahalanobisDistance(; diagonal)`). For example, we could write:

DistanceBased(
    data[!, ["RestingBP", "MaxHR", "Cholesterol", "FastingBS", "HeartDisease"]]; # the Mahalanobis distance only works with numeric features, so we select a few, along with the target variable
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
    distance = MahalanobisDistance(; diagonal = 1),
);

# The package offers additional flexibility by allowing an experiment to yield readouts over multiple features at the same time. In our scenario, we can consider the features `RestingECG`, `Oldpeak`, `ST_Slope`, and `MaxHR` to be obtained from a single experiment `ECG`.

# We specify the experiments along with the associated features:
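
# A plausible shape for this dictionary is sketched below. The exact definition lives in the part of the diff elided here; the experiment names and feature groupings follow this tutorial, but the `experiment => (features, cost)` schema and the cost values are placeholder assumptions:

# ```julia
# experiments = Dict(
#     # experiment => (features it reveals, cost); a cost may also be a (money, time) tuple
#     "BloodPressure" => (["RestingBP"], 1.0),
#     "ECG" => (["RestingECG", "Oldpeak", "ST_Slope", "MaxHR"], 5.0),
#     "BloodCholesterol" => (["Cholesterol"], 20.0),
#     "HeartDisease" => 100.0,  # the target itself may be acquired directly, at a cost
# )
# ```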

@@ -200,11 +236,11 @@ solver = GenerativeDesigns.DPWSolver(;
    tree_in_info = true,
)
designs = efficient_designs(
    experiments;
    sampler,
    uncertainty,
    thresholds = 6,
    evidence,
    solver,
    mdp_options = (; max_parallel = 1),
    repetitions = 5,
@@ -238,18 +274,18 @@ experiments = Dict(
seed!(1)
## use fewer iterations to speed up the build process
solver = GenerativeDesigns.DPWSolver(;
    n_iterations = 2_000,
    exploration_constant = 5.0,
    tree_in_info = true,
)
designs = efficient_designs(
    experiments;
    sampler,
    uncertainty,
    thresholds = 6,
    evidence,
    solver,
    mdp_options = (; max_parallel = 2, costs_tradeoff = (0, 1.0)),
    repetitions = 5,
);

@@ -273,13 +309,12 @@ end
#
seed!(1)
## use fewer iterations to speed up the build process
solver = GenerativeDesigns.DPWSolver(; n_iterations = 2_000, depth = 4, tree_in_info = true)
design = efficient_value(
    experiments;
    sampler,
    value,
    evidence,
    solver,
    repetitions = 5,
    mdp_options = (; discount = 0.8),
68 changes: 53 additions & 15 deletions docs/src/tutorials/GenerativeDesigns.md
@@ -52,6 +52,8 @@ For each feature $x\in X$, we consider a function $\rho_x$, which measures the d

Therefore, given an experimental state with readouts over the feature set $F \subseteq X$, we can calculate the total distance from the entity recorded in the $j$-th row as $d_j = \sum_{x\in F} \rho_x (\hat x, x_j)$, where $\hat x$ and $x_j$ denote the readout for feature $x$ for the entity being tested and for the entity recorded in the $j$-th row, respectively.

Alternatively, we could use the [Mahalanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance#Definition).

Next, we convert distances $d_j$ into probabilistic weights $w_j$. By default, we use a rescaled exponential function, i.e., we put $w_j = \exp(-\lambda d_j)$ for some $\lambda>0$. Notably, $\lambda$'s value determines how belief is distributed across the historical entities. Larger values of $\lambda$ concentrate the belief tightly around the 'closest' historical entities, while smaller values distribute more belief to more distant entities.

Importantly, the proper choice of the distance functionals and the 'similarity' functional discussed above is a question of hyperparameter optimization.
@@ -135,13 +137,50 @@ In what follows, we obtain three functions:

Note that internally, a state of the decision process is represented as a tuple `(evidence, costs)`.

You can specify the method for computing the distance using the `distance` keyword. By default, the Kronecker delta and quadratic distance will be utilised for categorical and continuous features, respectively.

````@example GenerativeDesigns
(; sampler, uncertainty, weights) = DistanceBased(
    data;
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
);
nothing #hide
````

Alternatively, you can provide a dictionary of `feature => distance` pairs. The implemented distance functionals are `DiscreteDistance(; λ)` and `QuadraticDistance(; λ, standardize=true)`. In that case, the specified distance will be applied to the respective feature, after which the distances will be collated across the range of features.

The above call is therefore equivalent to:

````@example GenerativeDesigns
numeric_feats = filter(c -> eltype(data[!, c]) <: Real, names(data))
categorical_feats = setdiff(names(data), numeric_feats)

DistanceBased(
    data;
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
    distance = merge(
        Dict(c => DiscreteDistance() for c in categorical_feats),
        Dict(c => QuadraticDistance() for c in numeric_feats),
    ),
);
nothing #hide
````

You can also use the Mahalanobis distance (`MahalanobisDistance(; diagonal)`). For example, we could write:

````@example GenerativeDesigns
DistanceBased(
    data[!, ["RestingBP", "MaxHR", "Cholesterol", "FastingBS", "HeartDisease"]]; # the Mahalanobis distance only works with numeric features, so we select a few, along with the target variable
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
    distance = MahalanobisDistance(; diagonal = 1),
);
nothing #hide
````
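
For intuition, the quantity this distance computes can be sketched in a few lines of plain Julia. This is only an illustration of the textbook definition on hypothetical data, not CEED's internal implementation, and it does not model the `diagonal` keyword:

````julia
using LinearAlgebra, Statistics

X = rand(100, 4)                      # hypothetical numeric history (rows = entities)
S = cov(X)                            # covariance matrix estimated from the history
x, y = X[1, :], X[2, :]               # a new readout and one historical row
d = sqrt((x - y)' * inv(S) * (x - y)) # Mahalanobis distance between x and y
````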

The package offers additional flexibility by allowing an experiment to yield readouts over multiple features at the same time. In our scenario, we can consider the features `RestingECG`, `Oldpeak`, `ST_Slope`, and `MaxHR` to be obtained from a single experiment `ECG`.

We specify the experiments along with the associated features:

Expand Down Expand Up @@ -229,11 +268,11 @@ solver = GenerativeDesigns.DPWSolver(;
    tree_in_info = true,
)
designs = efficient_designs(
    experiments;
    sampler,
    uncertainty,
    thresholds = 6,
    evidence,
    solver,
    mdp_options = (; max_parallel = 1),
    repetitions = 5,
@@ -276,18 +315,18 @@ We have to provide the maximum number of concurrent experiments. Additionally, w
seed!(1)
# use fewer iterations to speed up the build process
solver = GenerativeDesigns.DPWSolver(;
    n_iterations = 2_000,
    exploration_constant = 5.0,
    tree_in_info = true,
)
designs = efficient_designs(
    experiments;
    sampler,
    uncertainty,
    thresholds = 6,
    evidence,
    solver,
    mdp_options = (; max_parallel = 2, costs_tradeoff = (0, 1.0)),
    repetitions = 5,
);
nothing #hide
@@ -318,13 +357,12 @@ In the following example, we also limit the maximum rollout horizon to 4.
````@example GenerativeDesigns
seed!(1)
# use fewer iterations to speed up the build process
solver = GenerativeDesigns.DPWSolver(; n_iterations = 2_000, depth = 4, tree_in_info = true)
design = efficient_value(
    experiments;
    sampler,
    value,
    evidence,
    solver,
    repetitions = 5,
    mdp_options = (; discount = 0.8),
38 changes: 21 additions & 17 deletions src/GenerativeDesigns/EfficientValueMDP.jl
@@ -1,5 +1,5 @@
"""
    EfficientValueMDP(costs; sampler, value, evidence=Evidence(), <keyword arguments>)

Structure that parametrizes the experimental decision-making process. It is used in the object interface of POMDPs.

@@ -10,12 +10,12 @@ Internally, the reward associated with a particular experimental `evidence` and
# Arguments

- `costs`: a dictionary containing pairs `experiment => cost`, where `cost` can either be a scalar cost (modelled as a monetary cost) or a tuple `(monetary cost, execution time)`.

# Keyword Arguments

- `sampler`: a function of `(evidence, features, rng)`, in which `evidence` denotes the current experimental evidence, `features` represent the set of features we want to sample from, and `rng` is a random number generator; it returns a dictionary mapping the features to outcomes.
- `value`: a function of `(evidence)`; it quantifies the utility of experimental evidence.
- `evidence=Evidence()`: initial experimental evidence.
- `max_parallel`: maximum number of parallel experiments.
- `discount`: this is the discounting factor utilized in reward computation.
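
# Example

A minimal construction sketch, assuming `sampler` and `value` are functions with the signatures described above; the experiment names and costs are placeholders:

```julia
mdp = EfficientValueMDP(
    Dict("ECG" => (5.0, 30.0), "BloodPressure" => 1.0); # hypothetical costs
    sampler,
    value,
    max_parallel = 2,
    discount = 0.95,
)
```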
"""
@@ -36,10 +36,10 @@ struct EfficientValueMDP <: POMDPs.MDP{State,Vector{String}}
    value::Function

    function EfficientValueMDP(
        costs;
        sampler,
        value,
        evidence = Evidence(),
        max_parallel::Int = 1,
        discount = 1.0,
    )
@@ -114,7 +114,7 @@ function POMDPs.reward(m::EfficientValueMDP, previous_state::State, _, state::St
end

"""
    efficient_value(costs; sampler, value, evidence=Evidence(), <keyword arguments>)

Estimate the maximum value of experimental evidence (such as clinical utility), adjusted for experimental costs.

@@ -123,48 +123,52 @@ Internally, an instance of the `EfficientValueMDP` structure is created and a su
# Arguments

- `costs`: a dictionary containing pairs `experiment => cost`, where `cost` can either be a scalar cost (modelled as a monetary cost) or a tuple `(monetary cost, execution time)`.

# Keyword Arguments

- `sampler`: a function of `(evidence, features, rng)`, in which `evidence` denotes the current experimental evidence, `features` represent the set of features we want to sample from, and `rng` is a random number generator; it returns a dictionary mapping the features to outcomes.
- `value`: a function of `(evidence, (monetary costs, execution time))`; it quantifies the utility of experimental evidence.
- `evidence=Evidence()`: initial experimental evidence.
- `solver=default_solver`: a POMDPs.jl compatible solver used to solve the decision process. The default solver is [`DPWSolver`](https://juliapomdp.github.io/MCTS.jl/dev/dpw/).
- `repetitions=0`: number of rollouts used to estimate the expected experimental cost.
- `mdp_options`: a `NamedTuple` of additional keyword arguments that will be passed to the constructor of [`EfficientValueMDP`](@ref).

# Example

```julia
(; sampler, uncertainty, weights) = DistanceBased(
    data;
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
);
value = (evidence, costs) -> (1 - uncertainty(evidence) + 0.005 * sum(costs));
# initialize evidence
evidence = Evidence("Age" => 35, "Sex" => "M")
# set up solver (or use default)
solver =
    GenerativeDesigns.DPWSolver(; n_iterations = 10_000, depth = 3, tree_in_info = true)
design = efficient_value(
    experiments;
    sampler,
    value,
    evidence,
    solver, # planner
    mdp_options = (; max_parallel = 1),
    repetitions = 5,
)
```
"""
function efficient_value(
    costs;
    sampler,
    value,
    evidence = Evidence(),
    solver = default_solver,
    repetitions = 0,
    mdp_options = (;),
)
    mdp = EfficientValueMDP(costs; sampler, value, evidence, mdp_options...)

    # planner
    planner = solve(solver, mdp)