-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathAnalysisGraph.hpp
1542 lines (1296 loc) · 60.1 KB
/
AnalysisGraph.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#pragma once
#include <Eigen/Dense>
#include <unsupported/Eigen/MatrixFunctions>
#include <boost/graph/graph_traits.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/for_each.hpp>
#include <boost/range/iterator_range.hpp>
#include <range/v3/all.hpp>
#include "graphviz_interface.hpp"
#include "DiGraph.hpp"
#include "Tran_Mat_Cell.hpp"
#include <fmt/format.h>
#include <nlohmann/json.hpp>
// NOTE(review): tuning_param is not referenced anywhere in the visible part
// of this header; its role is presumably defined by the code that uses it —
// confirm.
const double tuning_param = 1.0;
// Criteria used to initialize β (see AnalysisGraph::initialize_parameters()
// and AnalysisGraph::init_betas_to()).
enum InitialBeta { ZERO, ONE, HALF, MEAN, MEDIAN, PRIOR, RANDOM };
//enum class InitialBeta : char { ZERO, ONE, HALF, MEAN, MEDIAN, PRIOR, RANDOM };
// Passed to AnalysisGraph::initialize_parameters() as initial_derivative;
// presumably the criteria used to initialize the derivatives — confirm.
enum InitialDerivative { DERI_ZERO, DERI_PRIOR };
// Return type of construct_adjective_response_map().
// Presumably { adjective --> [response_0, response_1, ... ] } — confirm.
using AdjectiveResponseMap =
    std::unordered_map<std::string, std::vector<double>>;

// A multimap is used so that an indicator may carry more than one
// observation at the same time point.
// Access (a concept is a vertex in the CAG)
// [ concept ][ indicator ][ epoch --> observation ]
using ConceptIndicatorData =
    std::vector<std::vector<std::multimap<long, double>>>;

// The sequence of dates for which data points are available.
// Data points are sorted according to dates.
// Access:
// [ concept ][ indicator ][ epoch ]
// NOTE(review): the type has only two vector levels, so the three-level
// access pattern above cannot be literal — confirm the intended layout.
using ConceptIndicatorEpochs = std::vector<std::vector<long>>;

// Access
// [ timestep ][ concept ][ indicator ][ observation ]
using ObservedStateSequence =
    std::vector<std::vector<std::vector<std::vector<double>>>>;

// Three nested vectors of double. AnalysisGraph stores one of these per
// sample and indexes each as [ time step ][ vertex ][ indicator ].
using PredictedObservedStateSequence =
    std::vector<std::vector<std::vector<double>>>;

// A pair of (string, int, string) tuples.
// Presumably (adjective, polarity, concept) for the two ends of a causal
// relation — confirm.
using CausalFragment = std::pair<std::tuple<std::string, int, std::string>,
                                 std::tuple<std::string, int, std::string>>;

// { concept_name --> (ind_name, [obs_0, obs_1, ... ])}
using ConceptIndicatorAlignedData =
    std::unordered_map<std::string,
                       std::pair<std::string, std::vector<double>>>;

// ([string, ...], [int, ...], string)
using EventCollection =
    std::tuple<std::vector<std::string>, std::vector<int>, std::string>;

// A pair of EventCollections.
using CausalFragmentCollection = std::pair<EventCollection, EventCollection>;

// Access
// [ sample ][ time_step ]{ vertex_name --> { indicator_name --> pred}}
// [ sample ][ time_step ][ vertex_name ][ indicator_name ]
using FormattedPredictionResult = std::vector<std::vector<
    std::unordered_map<std::string, std::unordered_map<std::string, double>>>>;

// Access
// [ vertex_name ][ timestep ][ sample ]
using FormattedProjectionResult =
    std::unordered_map<std::string, std::vector<std::vector<double>>>;

// Access
// get<0>:
//   Training range
//   <<start_year, start_month>, <end_year, end_month>>
// get<1>:
//   Sequence of prediction time steps
//   [yyyy-mm₀, yyyy-mm₁, yyyy-mm₂, yyyy-mm₃, .....]
// get<2>:
//   Prediction results
//   [ sample ][ time_step ]{ vertex_name --> { indicator_name --> pred}}
//   [ sample ][ time_step ][ vertex_name ][ indicator_name ]
using Prediction =
    std::tuple<std::pair<std::pair<int, int>, std::pair<int, int>>,
               std::vector<std::string>,
               FormattedPredictionResult>;

// ---- Types used to format the AnalysisGraph state for output ----

// [ concept name ] --> [ ind1, ind2, ... ]
using ConceptIndicators =
    std::unordered_map<std::string, std::vector<std::string>>;

// List of edges [(source, target), ...]
using Edges = std::vector<std::pair<std::string, std::string>>;

// List of adjectives [(source, target), ...]
using Adjectives = std::vector<std::pair<std::string, std::string>>;

// List of polarities [(source, target), ...]
using Polarities = std::vector<std::pair<int, int>>;

// One (theta prior KDEs, sampled thetas) pair per edge, ordered like the
// edges in the Edges vector:
//   first  : vector of theta prior KDEs
//   second : vector of sampled thetas
// [([p1, p2, ...], [s1, s2, ...]), ... ]
using Thetas =
    std::vector<std::pair<std::vector<double>, std::vector<double>>>;

// Sampled derivatives for each concept
// Access
// [ concept name ] --> [s_1, s_2, ..., s_res ]
using Derivatives = std::unordered_map<std::string, std::vector<double>>;

// Data
// Access
// [ indicator name ] --> {
//                          [ "Time Step" ] --> [ts1, ts2, ...]
//                          [ "Data" ]      --> [ d1, d2, ...]
//                        }
using Data = std::unordered_map<
    std::string,
    std::unordered_map<std::string, std::vector<double>>>;

// Predictions
// Access
// [ indicator name ] --> {
//                          [ ts ] --> [p_1, p_2, p_3, ..., p_res]
//                        }
using Predictions = std::unordered_map<
    std::string,
    std::unordered_map<int, std::vector<double>>>;

// [ string ] --> { [ string ] --> [ double, ... ] }
using CredibleIntervals = std::unordered_map<
    std::string,
    std::unordered_map<std::string, std::vector<double>>>;

// The complete exported state. Elements, in order:
using CompleteState = std::tuple<
    ConceptIndicators,
    Edges,               // List of edges [(source, target), ...]
    Adjectives,
    Polarities,
    Thetas,              // Theta priors and samples for each edge
                         // [(priors, samples), ... ]
    Derivatives,
    std::vector<long>,   // Data year month range
                         // (was: std::vector<std::string>)
    Data,
    std::vector<double>, // Prediction year month range
                         // (was: std::vector<std::string>)
    Predictions,
    CredibleIntervals>;

// Access
// [ prediction time step ] --> list of tuples with
//   get<0>: concept name
//   get<1>: indicator name
//   get<2>: value
using ConstraintSchedule = std::unordered_map<
    int,
    std::vector<std::tuple<std::string, std::string, double>>>;
// Edge descriptor and edge iterator types of DiGraph, the graph type that
// AnalysisGraph stores its CAG in.
using EdgeDescriptor = boost::graph_traits<DiGraph>::edge_descriptor;
using EdgeIterator = boost::graph_traits<DiGraph>::edge_iterator;
// Iterator over a multimap from (int, int) pairs to (int, int) pairs;
// this is the container type of AnalysisGraph::beta2cell.
using MMapIterator =
    std::multimap<std::pair<int, int>, std::pair<int, int>>::iterator;
// Declaration only: returns an AdjectiveResponseMap. The meaning of
// n_kernels is determined by the definition, which is not in this header.
AdjectiveResponseMap construct_adjective_response_map(size_t n_kernels);
/**
* The AnalysisGraph class is the main model/interface for Delphi.
*/
class AnalysisGraph {
private:
// True only when Delphi is run through the CauseMos HMI.
bool causemos_call = false;
DiGraph graph;
// Handle to the random number generator singleton object
RNG* rng_instance = nullptr;
std::mt19937 rand_num_generator;
// Uniform distribution used by the MCMC sampler
std::uniform_real_distribution<double> uni_dist;
// Normal distribution used to perturb β
std::normal_distribution<double> norm_dist;
// Uniform discrete distribution used by the MCMC sampler
// to perturb the initial latent state
std::uniform_int_distribution<int> uni_disc_dist;
// Sampling resolution
size_t res;
/*
============================================================================
Meta Data Structures
============================================================================
*/
// Maps each concept name to the id of the vertex
// that represents that concept in the CAG
// concept name --> CAG vertex id
std::unordered_map<std::string, int> name_to_vertex = {};
// Keeps track of indicators in CAG to ensure there are no duplicates.
std::unordered_set<std::string> indicators_in_CAG;
// A_beta_factors is a 2D array (std::vector of std::vectors) that keeps track
// of the β factors involved with each cell of the transition matrix A.
//
// According to our current model, which uses variables and their partial
// derivatives with respect to each other ( x --> y, βxy = ∂y/∂x ),
// at most half of the transition matrix cells can be affected by βs.
// According to the way we organize the transition matrix, the cells
// A[row][col] where row is an even index and col is an odd index
// are such cells.
//
// Each cell of matrix A_beta_factors represent all the directed paths
// starting at the vertex equal to the column index of the matrix and
// ending at the vertex equal to the row index of the matrix.
//
// Each cell of matrix A_beta_factors is an object of Tran_Mat_Cell class.
std::vector<std::vector<std::shared_ptr<Tran_Mat_Cell>>> A_beta_factors;
// A set of (row, column) numbers of the 2D matrix A_beta_factors
// where the cell (row, column) depends on β factors.
std::set<std::pair<int, int>> beta_dependent_cells;
// Maps each β to all the transition matrix cells that are dependent on it.
std::multimap<std::pair<int, int>, std::pair<int, int>> beta2cell;
std::unordered_set<int> dependent_nodes = {};
std::unordered_set<int> independent_nodes = {};
std::vector<double> generated_latent_sequence;
int generated_concept;
/*
============================================================================
Sampler Related Variables
============================================================================
*/
// Keep track whether the model is trained.
// Used to check whether there is a trained model before calling
// generate_prediction()
bool trained = false;
int n_timesteps = 0;
int pred_timesteps = 0;
std::pair<std::pair<int, int>, std::pair<int, int>> training_range;
std::vector<std::string> pred_range;
long train_start_epoch = -1;
long train_end_epoch = -1;
double pred_start_timestep = -1;
std::vector<double> observation_timestep_gaps;
std::unordered_map<double, Eigen::MatrixXd> e_A_ts;
long modeling_period = 1; // Number of epochs per one modeling timestep
std::unordered_map<int, std::function<double(unsigned int, double)>> external_concepts;
std::vector<unsigned int> concept_sample_pool;
double t = 0.0;
double delta_t = 1.0;
double log_likelihood = 0.0;
double previous_log_likelihood = 0.0;
// To decide whether to perturb a θ or a derivative
// If coin_flip < coin_flip_thresh perturb θ else perturb derivative
double coin_flip = 0;
double coin_flip_thresh = 0.5;
// Remember the old θ and the edge where we perturbed the θ.
// We need this to revert the system to the previous state if the proposal
// gets rejected.
std::pair<EdgeDescriptor, double> previous_theta;
// Remember the old derivative and the concept we perturbed the derivative
int changed_derivative = 0;
double previous_derivative = 0;
// Latent state that is evolved by sampling.
Eigen::VectorXd s0;
Eigen::VectorXd s0_prev;
double derivative_prior_variance = 0.1;
// Transition matrix that is evolved by sampling.
// Since variable A has been already used locally in other methods,
// I chose to name this A_original. After refactoring the code, we could
// rename this to A.
Eigen::MatrixXd A_original;
// Determines whether to use the continuous version or the discretized
// version of the solution for the system of differential equations.
//
// continuous = true:
// Continuous version of the solution. We use the continuous form of the
// transition matrix and matrix exponential.
//
// continuous = false:
// Discretized version of the solution. We use the discretized version of
// the transition matrix and repeated matrix multiplication.
//
// A_discretized = I + A_continuous * Δt
bool continuous = true;
// Access this as
// current_latent_state
Eigen::VectorXd current_latent_state;
// Access this as
// observed_state_sequence[ time step ][ vertex ][ indicator ][ observation ]
ObservedStateSequence observed_state_sequence;
// Access this as
// predicted_latent_state_sequences[ sample ][ time step ]
std::vector<std::vector<Eigen::VectorXd>> predicted_latent_state_sequences;
// Access this as
// predicted_observed_state_sequences
// [ sample ][ time step ][ vertex ][ indicator ]
std::vector<PredictedObservedStateSequence>
predicted_observed_state_sequences;
PredictedObservedStateSequence test_observed_state_sequence;
// Implementing constraints or interventions.
// -------------------------------------------------------------------------
// We are implementing two ways to constrain the model.
// 1. One-off constraints.
// A latent state is clamped to a constrained value just for the
// specified time step and released to evolve from the subsequent time
// step onward until the next constrained time step and so on.
// E.g. Getting a one time grant.
// 2. Perpetual constraints
// Once a latent state gets clamped at a value at a particular time
// step, it stays clamped at that value in subsequent time steps until
// another constraint overwrites the current constraint or the end of
// the prediction time is reached.
// NOTE: Currently we do not have a way to have a semi-perpetual
// constraint: A constraint is applied perpetually for some number of
// continuous time steps and then switched off. With a little bit of
// work we can implement this. We just need a special constraint value
// to signal end of a constraint. One suggestion is to use NaN.
// E.g. Maintaining a water level of a reservoir at a certain amount.
//
// NOTE: We apply either One-off or Perpetual constraints to all the
// concepts. The current design does not permit applying mixed
// constraints such that some concepts are constrained one-off while
// some others are constrained perpetual. With a little bit more work,
// we could also achieve this. Moving the constraint type into the
// constraint information data structure would work for keeping track
// of mixed constraint types:
// std::unordered_map<int, std::vector<std::tuple<int, double, bool>>>
// Then we would have to update the constraint processing logic
// accordingly.
// -------------------------------------------------------------------------
//
// NOTE: This implementation of the constraints does not work at all with
// multiple indicators being attached to a single concept. Constraining the
// concept affects all of its indicators, so we cannot target the constraint
// at one particular indicator. In the current model we might achieve this by
// constraining the scaling factor (which we incorrectly call the
// indicator mean).
// Currently we are doing:
// constrained latent state = constrained indicator value / scaling factor
// The constraining that might work with multiple indicators per concept:
// constrained scaling factor = constrained indicator value / latent state
// -------------------------------------------------------------------------
//
// Implementing the One-off constraints:
// -------------------------------------------------------------------------
// To store constraints (or interventions)
// For some time steps of the prediction range, latent state values could be
// constrained to a value external to what the LDS predicts that value
// should be. When prediction happens, if constraints are present at a time
// step for some concepts, the predicted latent state values for those
// concepts are overwritten by the constraints supplied in this data
// structure.
// Access
// [ time step ] --> [(concept id, constrained value), ... ]
// one_off_constraints.at(time step)
std::unordered_map<int, std::vector<std::pair<int, double>>>
one_off_constraints;
//
// Implementing Perpetual constraints:
// -------------------------------------------------------------------------
// Access
// [ concept id ] --> constrained value
// perpetual_constraints.at(concept id)
std::unordered_map<int, double> perpetual_constraints;
//
// Deciding which type of constraints to enforce
// one_off_constraints is empty => unconstrained prediction
// is_one_off_constraints = true => One-off constraints
// is_one_off_constraints = false => Perpetual constraints
bool is_one_off_constraints = true;
//
// Deciding whether to clamp the latent variable or the derivative
// true => clamp at derivative
// false => clamp at latent variable
bool clamp_at_derivative = true;
//
// When we are clamping derivatives the clamp sticks since derivatives never
// change in our current model. So for one-off clamping, we have to reset the
// derivative back to original after the clamping step. This variable
// remembers the time step to reset the clamped derivatives.
int rest_derivative_clamp_ts = -1;
std::vector<Eigen::MatrixXd> transition_matrix_collection;
std::vector<Eigen::VectorXd> initial_latent_state_collection;
std::vector<std::vector<double>> latent_mean_collection;
std::vector<std::vector<double>> latent_std_collection;
std::vector<Eigen::VectorXd> synthetic_latent_state_sequence;
bool synthetic_data_experiment = false;
/*
============================================================================
Private: Integration with Uncharted's CauseMos interface
(in causemos_integration.cpp)
============================================================================
*/
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
create-model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
/** Extracts concept to indicator mapping and the indicator observation
* sequences from the create model JSON input received from the CauseMos
* HMI. The JSON input specifies time as POSIX time stamps in milliseconds.
* Also the JSON input does not mention anything about the observation
* frequency, missing data points, or whether observation sequences for
* multiple indicators are time aligned (e.g. Whether they have the same
* starting and ending time, whether data points for a single indicator are
* ordered in chronologically increasing order).
*
* This method does not assume any of these unspoken qualities. This method
* reads in the observations from JSON and populates an internal, temporary,
* intermediate data structure, time aligning the observations.
*
* All the parameters except the first are used to return the results back to
* the caller. The caller should declare these variables and pass them here so
* that after the execution of this method, the caller can access the results.
*
* @param json_indicators : conceptIndicators portion of the JSON
* input received from the HMI.
* @param concept_indicator_data : This data structure gets filled with
* chronologically ordered observation
* sequences for all the indicators,
* segmented according to the concepts they
* attach to. This is a temporary
* intermediate data structure used to time
* align observation and accumulate multiple
* observations for an indicator at a time
* point. Observed state sequence is filled
* using data in this data structure.
* @param concept_indicator_epochs : This data structure gets filled with
* epochs where observations
* are available for each indicator. Each
* indicator gets a separate sequence of
* chronologically ordered epochs.
* These are used to assess the best
* frequency to align observations across
* all the indicators.
* @returns void
*
*/
void extract_concept_indicator_mapping_and_observations_from_json(
const nlohmann::json &json_indicators,
ConceptIndicatorData &concept_indicator_data,
ConceptIndicatorEpochs &concept_indicator_epochs);
static double epoch_to_timestep(long epoch, long train_start_epoch, long modeling_frequency);
/** Infer the best sampling period to align observations to be used as the
* modeling frequency from all the observation sequences.
*
* We consider the sequence of epochs where observations are available and
* then the gaps in epochs between adjacent observations. We take the most
* frequent gap as the modeling frequency. When more than one gap is most
* frequent, we take the smallest such gap.
*
* NOTE: Some thought about how to use this information:
* shortest_gap = longest_gap ⇒ no missing data
* shortest_gap < longest_gap ⇒ missing data
* 1 < shortest_gap < longest_gap
* Best frequency to model at is the greatest common divisor of all
* gaps. For example if we see gaps 4, 6, 10 then gcd(4, 6, 10) = 2
* and modeling at a frequency of 2 months starting from the start
* date would allow us to capture all the observation sequences while
* aligning them with each other.
*
* All the parameters except the first are used to return the results back to
* the caller. The caller should declare these variables and pass them here so
* that after the execution of this method, the caller can access the results.
*
* @param concept_indicator_epochs : Chronologically ordered observation epoch
* sequences for each indicator extracted
* from the JSON data in the create model
* request. This data structure is populated
* by AnalysisGraph::
* extract_concept_indicator_mapping_and_observations_from_json().
* @param shortest_gap : Least number of epochs between any two
* consecutive observations.
* @param longest_gap : Most number of epochs between any two
* consecutive observations.
* @param frequent_gap : Most frequent number of epochs between
* two consecutive observations.
* @param highest_frequency : Number of times the frequent_gap is seen
* in all the observation sequences.
* @returns epochs_sorted : A sorted list of epochs where observations
* are present for at least one indicator
*/
std::vector<long>
infer_modeling_period(
const ConceptIndicatorEpochs &concept_indicator_epochs,
long &shortest_gap,
long &longest_gap,
long &frequent_gap,
int &highest_frequency);
/**
* Set the observed state sequence from the create model JSON input received
* from the HMI.
* The training start and end epochs are extracted from the
* observation sequences for indicators provided in the JSON input.
* The sequence includes both ends of the range.
*
* NOTE: When Delphi is run locally, the observed state sequence is set in a
* separate method:
* AnalysisGraph::set_observed_state_sequence_from_data(), whose
* code can be found in train_model.cpp.
* It would be better if we could combine these two methods into one.
*
* @param json_indicators : JSON concept-indicator mapping and observations
* @returns void
*
*/
void
set_observed_state_sequence_from_json_dict(const nlohmann::json &json_indicators);
/** Construct an AnalysisGraph object from JSON exported by CauseMos. */
void from_causemos_json_dict(const nlohmann::json &json_data,
double belief_score_cutoff,
double grounding_score_cutoff);
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
create-experiment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
std::pair<int, int> timestamp_to_year_month(long timestamp);
void extract_projection_constraints(
const nlohmann::json &projection_constraints, long skip_steps);
FormattedProjectionResult run_causemos_projection_experiment_from_json_dict(
const nlohmann::json &json_data);
FormattedProjectionResult format_projection_result();
void sample_transition_matrix_collection_from_prior();
/*
============================================================================
Private: Model serialization (in serialize.cpp)
============================================================================
*/
void from_delphi_json_dict(const nlohmann::json &json_data, bool verbose);
/*
============================================================================
Private: Utilities (in graph_utils.cpp)
============================================================================
*/
void clear_state();
void initialize_random_number_generator();
void remove_node(int node_id);
// Allocate a num_verts x num_verts 2D array (std::vector of std::vectors)
void allocate_A_beta_factors();
/**
* Finds all the simple paths starting at the start vertex and
* ending at the end vertex.
* Uses find_all_paths_between_util() as a helper to recursively find the
* paths
*/
void find_all_paths_between(int start, int end, int cutoff);
/**
 * Recursively finds all the simple paths starting at the start vertex and
 * ending at the end vertex. Used by find_all_paths_between()
 * Paths found are added to the Tran_Mat_Cell object that is tracking the
 * transition matrix cell (2*end, 2*start)
 *
 * @param start  : Start vertex of the path
 * @param end    : End vertex of the path
 * @param path   : A path starting at vertex start that is being explored
 * @param cutoff : NOTE(review): not documented in the original comment;
 *                 presumably the maximum length of the paths to explore —
 *                 confirm against the definition (graph_utils.cpp).
 *
 * @return void
 */
void find_all_paths_between_util(int start,
int end,
std::vector<int>& path,
int cutoff);
/**
* Utility function that converts a time range given a start date and end date
* into an integer value.
* At the moment returns the number of months within the time range.
* This should be the number of training data time points we have
*
* @param start_year : Start year of the training data sequence
* @param start_month : Start month of the training data sequence
* @param end_year : End year of the training data sequence
* @param end_month : End month of the training data sequence
*
* @return : Number of months in the training data sequence
* Including both start and end months
*/
int calculate_num_timesteps(int start_year,
int start_month,
int end_year,
int end_month);
/*
============================================================================
Private: Subgraphs (in subgraphs.cpp)
============================================================================
*/
void get_subgraph(int vert,
std::unordered_set<int>& vertices_to_keep,
int cutoff,
bool inward);
void get_subgraph_between(int start,
int end,
std::vector<int>& path,
std::unordered_set<int>& vertices_to_keep,
int cutoff);
/*
============================================================================
Private: Accessors
============================================================================
*/
int num_nodes() { return boost::num_vertices(graph); }
int get_vertex_id(std::string concept) {
using namespace fmt::literals;
try {
return this->name_to_vertex.at(concept);
}
catch (const std::out_of_range& oor) {
throw std::out_of_range("Concept \"{}\" not in CAG!"_format(concept));
}
}
// Range over the vertex descriptors of the CAG, built from the iterator
// pair returned by boost::vertices().
auto node_indices() const {
  const auto vertex_iterators = boost::vertices(this->graph);
  return boost::make_iterator_range(vertex_iterators);
}
auto nodes() {
using boost::adaptors::transformed;
return this->node_indices() |
transformed([&](int v) -> Node& { return (*this)[v]; });
};
// Range over the names of the nodes of the CAG: the `name` member of every
// Node produced by nodes(), returned as a std::string.
auto node_names() {
  using boost::adaptors::transformed;
  // Take each Node by const reference: taking it by value copied the whole
  // Node just to read its name. Nothing is captured, so the capture list is
  // empty.
  return this->nodes() |
         transformed(
             [](const auto& node) -> std::string { return node.name; });
}
// Total degree of a vertex: its in-degree plus its out-degree.
int get_degree(int vertex_id) {
  const auto inward = boost::in_degree(vertex_id, this->graph);
  const auto outward = boost::out_degree(vertex_id, this->graph);
  return inward + outward;
}
auto out_edges(int i) {
return boost::make_iterator_range(boost::out_edges(i, graph));
}
// The Node at the source end of edge e.
Node& source(EdgeDescriptor e) {
  const auto source_vertex = boost::source(e, this->graph);
  return (*this)[source_vertex];
}
// The Node at the target end of edge e.
Node& target(EdgeDescriptor e) {
  const auto target_vertex = boost::target(e, this->graph);
  return (*this)[target_vertex];
}
// Range over the vertices adjacent to vertex i, built from the iterator
// pair returned by boost::adjacent_vertices().
auto successors(int i) {
  const auto adjacent = boost::adjacent_vertices(i, this->graph);
  return boost::make_iterator_range(adjacent);
}
// Successors of the concept named node_name.
// The name is resolved through get_vertex_id(), so an unknown name raises
// std::out_of_range with a message that names the missing concept (the bare
// name_to_vertex.at() used before raised the same exception type, but with
// a generic message).
auto successors(const std::string& node_name) {
  return this->successors(this->get_vertex_id(node_name));
}
std::vector<Node> get_successor_list(std::string node) {
std::vector<Node> successors = {};
for (int successor : this->successors(node)) {
successors.push_back((*this)[successor]);
}
return successors;
}
// Range over the vertices that have an edge into vertex i, built from the
// iterator pair returned by boost::inv_adjacent_vertices().
auto predecessors(int i) {
  const auto inverse_adjacent = boost::inv_adjacent_vertices(i, this->graph);
  return boost::make_iterator_range(inverse_adjacent);
}
// Predecessors of the concept named node_name.
// The name is resolved through get_vertex_id(), so an unknown name raises
// std::out_of_range with a message that names the missing concept (the bare
// name_to_vertex.at() used before raised the same exception type, but with
// a generic message).
auto predecessors(const std::string& node_name) {
  return this->predecessors(this->get_vertex_id(node_name));
}
std::vector<Node> get_predecessor_list(std::string node) {
std::vector<Node> predecessors = {};
for (int predecessor : this->predecessors(node)) {
predecessors.push_back((*this)[predecessor]);
}
return predecessors;
}
// β of the edge source --> target, i.e. ∂target / ∂source.
// It is read from A_original at row 2 * (target vertex id) and
// column 2 * (source vertex id) + 1.
// get_vertex_id() throws std::out_of_range for a name that is not in the
// CAG.
double get_beta(std::string source_vertex_name,
                std::string target_vertex_name) {
  const int row = 2 * get_vertex_id(target_vertex_name);
  const int col = 2 * get_vertex_id(source_vertex_name) + 1;
  return this->A_original(row, col);
}
/*
============================================================================
Private: Get Training Data Sequence (in train_model.cpp)
============================================================================
*/
/**
 * Set the observed state sequence for a given time range from data.
 * The sequence includes both ends of the range.
 * See data.hpp::get_observations_for() for missing data rules.
 * Note: units are automatically set according
 * to the parameterization of the given CAG.
 *
 * NOTE(review): the time range is not a parameter of this method (the
 * start/end year and month that this comment used to list are not in the
 * signature); presumably it comes from state already stored on the object,
 * such as training_range — confirm against train_model.cpp.
 *
 * @param country : Country where the data is about
 *                  (defaults to "South Sudan")
 * @param state   : State where the data is about (defaults to "")
 * @param county  : County where the data is about (defaults to "")
 *
 */
void
set_observed_state_sequence_from_data(std::string country = "South Sudan",
std::string state = "",
std::string county = "");
/**
 * Get the observed state (values for all the indicators)
 * for a given time point from data.
 * See data.hpp::get_observations_for() for missing data rules.
 * Note: units are automatically set according
 * to the parameterization of the given CAG.
 *
 * @param year    : Year of the time point data is extracted
 * @param month   : Month of the time point data is extracted
 * @param country : Country where the data is about
 * @param state   : State where the data is about (defaults to "")
 * @param county  : County where the data is about (defaults to "")
 *
 * @return : Observed state std::vector for the specified location
 *           on the specified time point.
 *           Access it as: [ vertex id ][ indicator id ][ observation ]
 *           NOTE(review): the return type has three vector levels; the
 *           innermost one presumably holds the observations of that
 *           indicator at this time point, mirroring ObservedStateSequence —
 *           confirm.
 */
std::vector<std::vector<std::vector<double>>>
get_observed_state_from_data(int year,
int month,
std::string country,
std::string state = "",
std::string county = "");
/*
============================================================================
Private: Initializing model parameters (in parameter_initialization.cpp)
============================================================================
*/
/**
* Initialize all the parameters and hyper-parameters of the Delphi model.
*
* NOTE(review): the training time range (start/end year and month) is not a
* parameter of this declaration.
*
* @param res : Sampling resolution. The number of samples to retain.
* Defaults to 200.
* @param initial_beta: Criteria to initialize β.
* Defaults to InitialBeta::ZERO.
* @param initial_derivative: Defaults to InitialDerivative::DERI_ZERO.
* NOTE(review): presumably the criteria to initialize
* the derivatives - confirm.
* @param use_heuristic : Informs how to handle missing observations.
* false => let them be missing (default).
* true => fill them. See
* data.hpp::get_observations_for() for missing data
* rules.
* @param use_continuous: Choose between continuous vs discretized versions
* of the differential equation solution.
* Default (true) is to use the continuous version with
* matrix exponential.
*/
void initialize_parameters(int res = 200,
InitialBeta initial_beta = InitialBeta::ZERO,
InitialDerivative initial_derivative = InitialDerivative::DERI_ZERO,
bool use_heuristic = false,
bool use_continuous = true);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably computes and stores the mean and standard deviation of each
// indicator - confirm in parameter_initialization.cpp.
void set_indicator_means_and_standard_deviations();
/**
* To help experiment with initializing βs to different values.
*
* @param ib: Criteria to initialize β. Defaults to InitialBeta::MEAN.
* Note that this default differs from the one declared for
* initialize_parameters(), whose initial_beta defaults to
* InitialBeta::ZERO.
*/
void init_betas_to(InitialBeta ib = InitialBeta::MEAN);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably builds the probability density functions of the θ parameters
// - confirm in parameter_initialization.cpp.
void construct_theta_pdfs();
/*
============================================================================
Private: Training by MCMC Sampling (in sampling.cpp)
============================================================================
*/
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably sets up the β-independent part of the transition matrix
// - confirm in sampling.cpp.
void set_base_transition_matrix();
// Sample elements of the stochastic transition matrix from the
// prior distribution, based on gradable adjectives.
// NOTE(review): the name suggests the matrix is filled in from the current
// β values rather than freshly sampled from the prior; the description above
// may be out of date - confirm in sampling.cpp.
void set_transition_matrix_from_betas();
// NOTE(review): undocumented at this declaration. Presumably accumulates the
// log likelihood contribution of a single timestep on behalf of
// set_log_likelihood() - confirm in sampling.cpp.
//
// @param ts: presumably the index of the timestep to process - confirm.
void set_log_likelihood_helper(int ts);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably computes the log likelihood of the current model state and
// stores it on the object (it returns nothing) - confirm in sampling.cpp.
void set_log_likelihood();
/**
* Run Bayesian inference - sample from the posterior distribution.
*
* Takes no arguments and returns nothing.
* NOTE(review): the outcome of the sampling step is presumably recorded on
* the object - confirm in sampling.cpp.
*/
void sample_from_posterior();
/**
* Sample a new transition matrix from the proposal distribution,
* given a current candidate transition matrix.
* In practice, this amounts to:
* Selecting a random β.
* Perturbing it a bit.
* Updating all the transition matrix cells that are dependent on it.
*/
// TODO: Needs testing
// TODO: Before calling sample_from_proposal() we must call
// AnalysisGraph::find_all_paths()
// TODO: Before calling sample_from_proposal(), we must assign initial βs and
// run Tran_Mat_Cell::compute_cell() to initialize the first transition
// matrix.
// TODO: Update Tran_Mat_Cell::compute_cell() to calculate the proper value.
// At the moment it just computes the sum of the lengths of all the paths
// related to this cell
void sample_from_proposal();
/**
* Find all the transition matrix (A) cells that are dependent on the β
* attached to the provided edge and update them.
* Acts upon this->A_original
*
* @param e: The directed edge ≡ β that has been perturbed.
* Passed by value as an EdgeDescriptor.
*/
void update_transition_matrix_cells(EdgeDescriptor e);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably returns the change in log prior caused by the most recent
// proposal - confirm in sampling.cpp.
double calculate_delta_log_prior();
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably undoes the most recent proposal (e.g. when a sample is
// rejected) - confirm in sampling.cpp.
void revert_back_to_previous_state();
/*
============================================================================
Private: Modeling independent nodes (in independent_nodes.cpp)
============================================================================
*/
// NOTE(review): undocumented at this declaration. Going by the name
// ("gussian" is presumably a misspelling of "gaussian"; the identifier is left
// as is because callers depend on it), this presumably generates a sequence
// of num_timesteps values drawn from a Gaussian with the given mean and
// standard deviation - confirm in independent_nodes.cpp.
//
// @param mean : presumably the mean of the distribution - confirm.
// @param std : presumably the standard deviation of the distribution - confirm.
// @param num_timesteps : presumably the number of values to generate - confirm.
void generate_from_data_mean_and_std_gussian(double mean,
double std,
int num_timesteps);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably generates latent sequences for the independent nodes of the CAG
// - confirm in independent_nodes.cpp.
//
// @param samp : presumably the index of the sample to use - confirm.
// @param num_timesteps : presumably the length of each generated sequence
// - confirm.
void generate_independent_node_latent_sequences(int samp, int num_timesteps);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably updates the latent state of one independent node at one timestep
// using generated derivatives - confirm in independent_nodes.cpp.
//
// @param ts : presumably the timestep to update - confirm.
// @param concept_id : presumably the id of the independent node - confirm.
// @param latent_sequence : Taken by non-const reference, so the callee is
// able to modify the caller's vector.
void update_independent_node_latent_state_with_generated_derivatives(
int ts, int concept_id, std::vector<double>& latent_sequence);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably updates the latent state at one timestep using generated
// derivatives - confirm in independent_nodes.cpp.
//
// @param ts: presumably the timestep to update - confirm.
void update_latent_state_with_generated_derivatives(int ts);
/*
============================================================================
Private: Prediction (in prediction.cpp)
============================================================================
*/
/**
* Generate a collection of latent state sequences from the likelihood
* model given a collection of sampled
* (initial latent state, transition matrix) pairs.
*
* NOTE(review): prediction_timesteps, total_timesteps and project are not
* parameters of this declaration; the number of prediction timesteps is
* presumably read from state held elsewhere on the object - confirm in
* prediction.cpp.
*
* @param initial_prediction_step: The initial prediction timestep relative
* to training timesteps.
* Declared as a double.
*/
void generate_latent_state_sequences(double initial_prediction_step);
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably perturbs the predicted latent state of one sample at one
// timestep - confirm in prediction.cpp.
//
// @param timestep : presumably the prediction timestep to perturb - confirm.
// @param sample_number : presumably the index of the sample to perturb
// - confirm.
void perturb_predicted_latent_state_at(int timestep, int sample_number);
/**
* Generate observed state sequences given predicted latent state
* sequences using the emission model.
*
* Takes no arguments and returns nothing.
* NOTE(review): the predicted latent state sequences are presumably read
* from, and the generated observed state sequences stored on, the object
* - confirm in prediction.cpp.
*/
void generate_observed_state_sequences();
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably generates the observed state that corresponds to a single
// latent state - confirm in prediction.cpp.
//
// @param latent_state: Latent state vector, passed by value.
//
// @return Nested std::vector of doubles. NOTE(review): presumably indexed by
// [ vertex id ][ indicator id ], like the value returned by
// sample_observed_state() - confirm.
std::vector<std::vector<double>>
generate_observed_state(Eigen::VectorXd latent_state);
/**
* Format the prediction result into a format Python callers favor.
*
* Takes no arguments.
* NOTE(review): the number of timesteps in the predicted sequence is not a
* parameter of this declaration; it is presumably read from state held
* elsewhere on the object - confirm in prediction.cpp.
*
* @return Re-formatted prediction result.
* Access it as:
* [ sample number ][ time point ][ vertex name ][ indicator name ]
*/
FormattedPredictionResult format_prediction_result();
// NOTE(review): undocumented at this declaration. Going by the name and
// parameters, this presumably runs the model over the period from
// (start_year, start_month) to (end_year, end_month) - confirm in
// prediction.cpp, including whether both ends of the period are included.
//
// @param start_year : presumably the year the period starts - confirm.
// @param start_month : presumably the month the period starts - confirm.
// @param end_year : presumably the year the period ends - confirm.
// @param end_month : presumably the month the period ends - confirm.
void run_model(int start_year,
int start_month,
int end_year,
int end_month);
// NOTE(review): undocumented at this declaration. Going by the name and
// parameters, this presumably registers a constraint that clamps one
// indicator of one concept to a fixed value at one step - confirm in
// prediction.cpp.
//
// @param step : presumably the timestep the constraint applies to - confirm.
// @param concept_name : presumably the name of the concept (vertex) that
// owns the indicator - confirm.
// @param indicator_name : presumably the name of the indicator to clamp
// - confirm.
// @param indicator_clamp_value : presumably the value the indicator is
// clamped to - confirm.
void add_constraint(int step, std::string concept_name, std::string indicator_name,
double indicator_clamp_value);
/*
============================================================================
Private: Synthetic Data Experiment (in synthetic_data.cpp)
============================================================================
*/
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably sets the initial latent state to random values - confirm in
// synthetic_data.cpp.
void set_random_initial_latent_state();
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably generates a synthetic latent state sequence and stores it on
// the object (it returns nothing) - confirm in synthetic_data.cpp.
void generate_synthetic_latent_state_sequence();
// NOTE(review): undocumented at this declaration. Going by the name, this
// presumably derives a synthetic observed state sequence from the synthetic
// latent state sequence - confirm in synthetic_data.cpp.
void
generate_synthetic_observed_state_sequence_from_synthetic_latent_state_sequence();
// TODO: Needs testing
/**
* Sample observed state std::vector.
* This is the implementation of the emission function.
*
* @param latent_state: Latent state vector (an Eigen::VectorXd, passed by
* value).
* This has 2 * number of vertices in the CAG entries.
* Even indices track the state of each vertex.
* Odd indices track the state of the derivative.
*
* @return Observed state std::vector. Observed state for each indicator for
* each vertex. Indexed by: [ vertex id ][ indicator id ]
*/
std::vector<std::vector<double>>
sample_observed_state(Eigen::VectorXd latent_state);
/*