Merge pull request #17 from handwerkerd/gen-req-metrics-dh

tsalo · web-flow · commit 8613505b1c7f · 2024-02-28T10:41:29.000-05:00
Getting generate metrics fully running
diff --git a/docs/building_decision_trees.rst b/docs/building_decision_trees.rst
@@ -210,11 +210,9 @@ that is used to check whether results are plausible & can help avoid mistakes.
 
 - necessary_metrics
     A list of the necessary metrics in the component table that will be used
-    by the tree. If a metric doesn't exist then this will raise an error instead
-    of executing a tree. (Depending on future code development, this could
-    potentially be used to run ``tedana`` by specifying a decision tree and
-    metrics are calculated based on the contents of this field.) If a necessary
-    metric isn't used, there will be a warning.
+    by the tree. This field defines what metrics will be calculated on each ICA
+    component. If a metric doesn't exist then this will raise an error instead
+    of executing a tree. If a necessary metric isn't used, there will be a warning.
 
 - generated_metrics
     An optional initial field. It lists metrics that are to be calculated as
@@ -378,7 +376,7 @@ dataframe column that is True or False for the components in ``decide_comps`` ba
 the function's criteria.
 That column is an input to :func:`~tedana.selection.selection_utils.change_comptable_classifications`,
 which will update the component_table classifications, update the classification history
-in component_status_table, and update the component classification_tags. Components not
+in ``selector.component_status_table_``, and update the component classification_tags. Components not
 in ``decide_comps`` retain their existing classifications and tags.
 :func:`~tedana.selection.selection_utils.change_comptable_classifications`
 also returns and should assign values to
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
     "nibabel>=2.5.1,<=5.2.0",
     "nilearn>=0.7,<=0.10.3",
     "numpy>=1.16,<=1.26.4",
-    "pandas>=2.0,<=2.2.0",
+    "pandas>=2.0,<=2.2.1",
     "pybtex",
     "pybtex-apa-style",
     "scikit-learn>=0.21, <=1.4.1.post1",
diff --git a/tedana/io.py b/tedana/io.py
@@ -643,7 +643,7 @@ def writeresults(ts, mask, comptable, mmix, io_generator):
     =========================================    ===========================================
     Filename                                     Content
     =========================================    ===========================================
-    desc-denoised_bold.nii.gz              Denoised time series.
+    desc-denoised_bold.nii.gz                    Denoised time series.
 
     desc-optcomAccepted_bold.nii.gz              High-Kappa time series. (only with verbose)
     desc-optcomRejected_bold.nii.gz              Low-Kappa time series. (only with verbose)
diff --git a/tedana/selection/component_selector.py b/tedana/selection/component_selector.py
@@ -272,8 +272,8 @@ def select(self, component_table, cross_component_metrics={}, status_table=None)
         -----
         Adds to the ``ComponentSelector``:
 
-        - component_status_table_: empty dataframe or contents of inputted status_table
-        - cross_component_metrics_: empty dict or contents of inputed values
+        - ``component_status_table_``: empty dataframe or contents of inputted status_table
+        - ``cross_component_metrics_``: empty dict or contents of inputted values
         - used_metrics: empty set
 
         Any parameter that is used by a decision tree node function can be passed
@@ -307,19 +307,18 @@ def select(self, component_table, cross_component_metrics={}, status_table=None)
 
         When this is run, multiple elements in `ComponentSelector` will change including:
 
-        - component_table_: ``classification`` column with ``accepted`` or ``rejected`` labels
+        - ``component_table_``: ``classification`` column with ``accepted`` or ``rejected`` labels
           and ``classification_tags`` column with can hold multiple comma-separated labels
           explaining why a classification happened
-        - cross_component_metrics_: Any values that were calculated based on the metric
+        - ``cross_component_metrics_``: Any values that were calculated based on the metric
           values across components or by direct user input
-        - component_status_table: Contains the classification statuses at each node in
+        - ``component_status_table_``: Contains the classification statuses at each node in
           the decision tree
         - used_metrics: A list of metrics used in the selection process
         - nodes: The original tree definition with an added ``outputs`` key listing
           everything that changed in each node
-        - current_node_idx_: The total number of nodes run in ``ComponentSelector``
+        - ``current_node_idx_``: The total number of nodes run in ``ComponentSelector``
         """
-        self.__dict__.update(cross_component_metrics)
         self.cross_component_metrics_ = cross_component_metrics
 
         # Construct an un-executed selector
@@ -349,8 +348,13 @@ def select(self, component_table, cross_component_metrics={}, status_table=None)
             self.start_idx_ = 0
         else:
             # Since a status table exists, we need to skip nodes up to the
-            # point where the last tree finished
-            self.start_idx_ = len(self.tree["nodes"])
+            # point where the last tree finished. Nodes that were executed
+            # have an output field. Identify the last node with an output field
+            tmp_idx = len(self.tree["nodes"]) - 1
+            while ("outputs" not in self.tree["nodes"][tmp_idx]) and (tmp_idx > 0):
+                tmp_idx -= 1
+            # start at the first node that does not have an output field
+            self.start_idx_ = tmp_idx + 1
             LGR.info(f"Start is {self.start_idx_}")
             self.component_status_table_ = status_table
 
@@ -440,7 +444,7 @@ def check_null(self, params, fcn):
         for key, val in params.items():
             if val is None:
                 try:
-                    params[key] = getattr(self, key)
+                    params[key] = self.cross_component_metrics_[key]
                 except AttributeError:
                     raise ValueError(
                         f"Parameter {key} is required in node {fcn}, but not defined. "
diff --git a/tedana/selection/selection_nodes.py b/tedana/selection/selection_nodes.py
@@ -717,7 +717,7 @@ def calc_kappa_elbow(
     outputs = {
         "decision_node_idx": selector.current_node_idx_,
         "node_label": None,
-        "n_echos": selector.n_echos,
+        "n_echos": selector.cross_component_metrics_["n_echos"],
         "used_metrics": {"kappa"},
         "calc_cross_comp_metrics": [
             "kappa_elbow_kundu",
@@ -775,7 +775,11 @@ def calc_kappa_elbow(
             outputs["kappa_allcomps_elbow"],
             outputs["kappa_nonsig_elbow"],
             outputs["varex_upper_p"],
-        ) = kappa_elbow_kundu(selector.component_table_, selector.n_echos, comps2use=comps2use)
+        ) = kappa_elbow_kundu(
+            selector.component_table_,
+            selector.cross_component_metrics_["n_echos"],
+            comps2use=comps2use,
+        )
         selector.cross_component_metrics_["kappa_elbow_kundu"] = outputs["kappa_elbow_kundu"]
         selector.cross_component_metrics_["kappa_allcomps_elbow"] = outputs["kappa_allcomps_elbow"]
         selector.cross_component_metrics_["kappa_nonsig_elbow"] = outputs["kappa_nonsig_elbow"]
@@ -845,7 +849,7 @@ def calc_rho_elbow(
     outputs = {
         "decision_node_idx": selector.current_node_idx_,
         "node_label": None,
-        "n_echos": selector.n_echos,
+        "n_echos": selector.cross_component_metrics_["n_echos"],
         "calc_cross_comp_metrics": [
             elbow_name,
             "rho_allcomps_elbow",
@@ -900,7 +904,7 @@ def calc_rho_elbow(
             outputs["elbow_f05"],
         ) = rho_elbow_kundu_liberal(
             selector.component_table_,
-            selector.n_echos,
+            selector.cross_component_metrics_["n_echos"],
             rho_elbow_type=rho_elbow_type,
             comps2use=comps2use,
             subset_comps2use=subset_comps2use,
@@ -1214,7 +1218,7 @@ def calc_varex_thresh(
     num_highest_var_comps : :obj:`str` :obj:`int`
         percentile can be calculated on the num_highest_var_comps components with the
         lowest variance. Either input an integer directly or input a string that is
-        a parameter stored in selector.cross_component_metrics_ ("num_acc_guess" in
+        a parameter stored in ``selector.cross_component_metrics_`` ("num_acc_guess" in
         original decision tree). Default=None
     %(log_extra_info)s
     %(custom_node_label)s
diff --git a/tedana/selection/selection_utils.py b/tedana/selection/selection_utils.py
@@ -14,13 +14,13 @@
 ##############################################################
 
 
-def selectcomps2use(comptable, decide_comps):
+def selectcomps2use(component_table, decide_comps):
     """Get a list of component numbers that fit the classification types in ``decide_comps``.
 
     Parameters
     ----------
-    comptable : :obj:`~pandas.DataFrame`
-        Only uses the component_table in this object
+    component_table : :obj:`~pandas.DataFrame`
+        The component_table with metrics and labels for each ICA component
     decide_comps : :obj:`str` or :obj:`list[str]` or :obj:`list[int]`
         This is string or a list of strings describing what classifications
         of components to operate on, using default or intermediate_classification
@@ -34,31 +34,31 @@ def selectcomps2use(comptable, decide_comps):
     comps2use : :obj:`list[int]`
         A list of component indices with classifications included in decide_comps
     """
-    if "classification" not in comptable:
-        raise ValueError("comptable needs a 'classification' column to run selectcomps2use")
+    if "classification" not in component_table:
+        raise ValueError("component_table needs a 'classification' column to run selectcomps2use")
 
     if isinstance(decide_comps, (str, int)):
         decide_comps = [decide_comps]
 
     if isinstance(decide_comps, list) and (decide_comps[0] == "all"):
         # All components with any string in the classification field are set to True
-        comps2use = list(range(comptable.shape[0]))
+        comps2use = list(range(component_table.shape[0]))
 
     elif isinstance(decide_comps, list) and all(isinstance(elem, str) for elem in decide_comps):
         comps2use = []
         for didx in range(len(decide_comps)):
-            newcomps2use = comptable.index[
-                comptable["classification"] == decide_comps[didx]
+            newcomps2use = component_table.index[
+                component_table["classification"] == decide_comps[didx]
             ].tolist()
             comps2use = list(set(comps2use + newcomps2use))
 
     elif isinstance(decide_comps, list) and all(isinstance(elem, int) for elem in decide_comps):
         # decide_comps is already a list of indices
-        if len(comptable) <= max(decide_comps):
+        if len(component_table) <= max(decide_comps):
             raise ValueError(
                 "decide_comps for selectcomps2use is selecting for a component with index"
                 f"{max(decide_comps)} (0 indexing) which is greater than the number "
-                f"of components: {len(comptable)}"
+                f"of components: {len(component_table)}"
             )
         elif min(decide_comps) < 0:
             raise ValueError(
@@ -100,8 +100,8 @@ def change_comptable_classifications(
     Parameters
     ----------
     selector : :obj:`tedana.selection.component_selector.ComponentSelector`
-        The attributes used are component_table, component_status_table, and
-        current_node_idx_
+        The attributes used are ``component_table_``, ``component_status_table_``, and
+        ``current_node_idx_``
     if_true, if_false : :obj:`str`
         If the condition in this step is true or false, give the component
         the label in this string. Options are 'accepted', 'rejected',
@@ -123,12 +123,12 @@ def change_comptable_classifications(
     Returns
     -------
     selector : :obj:`tedana.selection.component_selector.ComponentSelector`
-        component_table["classifications"] will reflect any new
+        ``component_table_["classification"]`` will reflect any new
         classifications.
-        component_status_table will have a new column titled
-        "Node current_node_idx_" that is a copy of the updated classifications
+        ``component_status_table_`` will have a new column titled
+        "Node ``current_node_idx_``" that is a copy of the updated classifications
         column.
-        component_table["classification_tags"] will be updated to include any
+        ``component_table_["classification_tags"]`` will be updated to include any
         new tags. Each tag should appear only once in the string and tags will
         be separated by commas.
     n_true, n_false : :obj:`int`
@@ -178,8 +178,8 @@ def comptable_classification_changer(
     Parameters
     ----------
     selector : :obj:`tedana.selection.component_selector.ComponentSelector`
-        The attributes used are component_table, component_status_table, and
-        current_node_idx_
+        The attributes used are ``component_table_``, ``component_status_table_``, and
+        ``current_node_idx_``
     boolstate : :obj:`bool`
         Change classifications only for True or False components in
         decision_boolean based on this variable
@@ -207,12 +207,12 @@ def comptable_classification_changer(
     -------
     selector : :obj:`tedana.selection.component_selector.ComponentSelector`
         Operates on the True OR False components depending on boolstate
-        component_table["classifications"] will reflect any new
+        ``component_table_["classification"]`` will reflect any new
         classifications.
-        component_status_table will have a new column titled
-        "Node current_node_idx_" that is a copy of the updated classifications
+        ``component_status_table_`` will have a new column titled
+        "Node ``current_node_idx_``" that is a copy of the updated classifications
         column.
-        component_table["classification_tags"] will be updated to include any
+        ``component_table_["classification_tags"]`` will be updated to include any
         new tags. Each tag should appear only once in the string and tags will
         be separated by commas.
 
@@ -363,7 +363,7 @@ def log_decision_tree_step(
     ----------
     function_name_idx : :obj:`str`
         The name of the function that should be logged. By convention, this
-        be "Step current_node_idx_: function_name"
+        should be "Step ``current_node_idx_``: function_name"
     comps2use : :obj:`list[int]` or -1
         A list of component indices that should be used by a function.
         Only used to report no components found if empty and report
diff --git a/tedana/tests/test_component_selector.py b/tedana/tests/test_component_selector.py
@@ -38,6 +38,7 @@ def dicts_to_test(treechoice):
         "missing_req_param": A missing required param in a decision node function
         "missing_function": An undefined decision node function
         "missing_key": A dict missing one of the required keys (report)
+        "null_value": A parameter in one node improperly has a null value
 
     Returns
     -------
@@ -267,6 +268,13 @@ def test_check_null_succeeds():
     selector = component_selector.ComponentSelector(tree="minimal")
     selector.tree = dicts_to_test("null_value")
 
+    # "left" is missing from the function definition in node
+    # but is found as an initialized cross component metric
+    # so this should execute successfully
+    selector.cross_component_metrics_ = {
+        "left": 3,
+    }
+
     params = selector.tree["nodes"][0]["parameters"]
     functionname = selector.tree["nodes"][0]["functionname"]
     selector.check_null(params, functionname)
@@ -295,8 +303,11 @@ def test_are_all_components_accepted_or_rejected():
 def test_selector_properties_smoke():
     """Tests to confirm properties match expected results."""
 
+    # Runs on un-executed component table to smoke test three class
+    # functions that are used to count various types of component
+    # classifications in the component table
     selector = component_selector.ComponentSelector(tree="minimal")
-    selector.select(component_table=sample_comptable(), cross_component_metrics={"n_echos": 3})
+    selector.component_table_ = sample_comptable()
 
     assert selector.n_comps_ == 21
 
diff --git a/tedana/tests/test_selection_utils.py b/tedana/tests/test_selection_utils.py
@@ -55,7 +55,17 @@ def sample_selector(options=None):
         "test_elbow": 21,
     }
     selector = ComponentSelector(tree=tree)
-    selector.select(component_table=component_table, cross_component_metrics=xcomp)
+
+    # Add an un-executed component table, cross component metrics, and status table
+    selector.component_table_ = component_table.copy()
+    selector.cross_component_metrics_ = xcomp
+    selector.component_status_table_ = selector.component_table_[
+        ["Component", "classification"]
+    ].copy()
+    selector.component_status_table_ = selector.component_status_table_.rename(
+        columns={"classification": "initialized classification"}
+    )
+
     selector.current_node_idx_ = 0
 
     return selector
diff --git a/tedana/workflows/tedana.py b/tedana/workflows/tedana.py
@@ -499,6 +499,9 @@ def tedana_workflow(
     if isinstance(data, str):
         data = [data]
 
+    LGR.info("Initializing and validating component selection tree")
+    selector = ComponentSelector(tree)
+
     LGR.info(f"Loading input data: {[f for f in data]}")
     catd, ref_img = io.load_data(data, n_echos=n_echos)
 
@@ -630,8 +633,8 @@ def tedana_workflow(
     # optimally combine data
     data_oc = combine.make_optcom(catd, tes, masksum_denoise, t2s=t2s_full, combmode=combmode)
 
-    # regress out global signal unless explicitly not desired
     if "gsr" in gscontrol:
+        # regress out global signal
         catd, data_oc = gsc.gscontrol_raw(catd, data_oc, n_echos, io_generator)
 
     fout = io_generator.save_file(data_oc, "combined img")
@@ -669,8 +672,6 @@ def tedana_workflow(
             # Estimate betas and compute selection metrics for mixing matrix
             # generated from dimensionally reduced data using full data (i.e., data
             # with thermal noise)
-            LGR.info("Making second component selection guess from ICA results")
-            selector = ComponentSelector(tree)
             necessary_metrics = selector.necessary_metrics
             # The figures require some metrics that might not be used by the decision tree.
             extra_metrics = ["variance explained", "normalized variance explained", "kappa", "rho"]
@@ -686,6 +687,7 @@ def tedana_workflow(
                 "ICA",
                 metrics=necessary_metrics,
             )
+            LGR.info("Selecting components from ICA results")
             selector = selection.automatic_selection(
                 comptable,
                 selector,
@@ -704,6 +706,9 @@ def tedana_workflow(
             # If we're going to restart, temporarily allow force overwrite
             if keep_restarting:
                 io_generator.overwrite = True
+                # Create a re-initialized selector object if rerunning
+                selector = ComponentSelector(tree)
+
             RepLGR.disabled = True  # Disable the report to avoid duplicate text
         RepLGR.disabled = False  # Re-enable the report after the while loop is escaped
         io_generator.overwrite = overwrite  # Re-enable original overwrite behavior