v0.3.2 (#368)

* Updates for v0.3.2 * Bump version to v0.3.2 --------- Co-authored-by: kshakir <github@kshakir.org> Co-authored-by: alecw <alecw@users.noreply.github.com> Co-authored-by: Jura Pintar <jpintar@users.noreply.github.com> Co-authored-by: aawdeh <aseel.awdeh@gmail.com>
broadinstitute · Jun 24, 2024 · 04c2f5b · 04c2f5b
1 parent 9a464cd
commit 04c2f5b
Show file tree

Hide file tree

Showing 10 changed files with 354 additions and 43 deletions.
diff --git a/cellbender/VERSION.txt b/cellbender/VERSION.txt
@@ -1 +1 @@
-0.3.1.dev0
+0.3.2
diff --git a/cellbender/remove_background/cli.py b/cellbender/remove_background/cli.py
@@ -117,14 +117,22 @@ def validate_args(args) -> argparse.Namespace:
         args.fpr = fpr_list_correct_dtypes
 
         # Ensure that "exclude_features" specifies allowed features.
-        # As of CellRanger 6.0, the possible features are:
+        # As of CellRanger 7.2, the possible features are:
         #     Gene Expression
         #     Antibody Capture
         #     CRISPR Guide Capture
         #     Custom
         #     Peaks
+        #     Multiplexing Capture
+        #     VDJ
+        #     VDJ-T
+        #     VDJ-T-GD
+        #     VDJ-B
+        #     Antigen Capture
         allowed_features = ['Gene Expression', 'Antibody Capture',
-                            'CRISPR Guide Capture', 'Custom', 'Peaks']
+                            'CRISPR Guide Capture', 'Custom', 'Peaks',
+                            'Multiplexing Capture', 'VDJ', 'VDJ-T',
+                            'VDJ-T-GD', 'VDJ-B', 'Antigen Capture']
         for feature in args.exclude_features:
             if feature not in allowed_features:
                 sys.stdout.write(f"Specified '{feature}' using --exclude-feature-types, "

diff --git a/cellbender/remove_background/tests/benchmarking/run_benchmark.py b/cellbender/remove_background/tests/benchmarking/run_benchmark.py
@@ -141,7 +141,9 @@ def cromshell_submit(wdl: str,
     submit_cmd = ['cromshell', 'submit',
                   tmp_wdl,
                   inputs,
+                  '--options-json',
                   options,
+                  '--dependencies-zip',
                   dependencies_zip]
 
     # submit job

diff --git a/cellbender/remove_background/tests/benchmarking/run_benchmark_result_tabulation.py b/cellbender/remove_background/tests/benchmarking/run_benchmark_result_tabulation.py
@@ -69,7 +69,7 @@ def get_cromshell_output_h5(workflow: str, grep: str = '_out.h5') -> Union[str,
     """Use cromshell list-outputs to get the relevant file gsURL"""
 
     output = grep_from_command(['cromshell', 'list-outputs', workflow], grep=grep)
-    out = output[:-1].decode().split('\n')
+    out = output.decode().lstrip('run_cellbender_benchmark.h5_array: ').rstrip('\n').split('\n')
     if len(out) > 1:
         return out
     else:
@@ -95,18 +95,18 @@ def metadata_from_workflow_id(workflow: str) -> Tuple[str, str, Optional[str]]:
     # git hash
     output = grep_from_command(['cromshell', 'metadata', workflow],
                                grep='"git_hash":')
-    git_hash = output[17:-3].decode()
+    git_hash = output.decode().split('"git_hash": ')[-1].lstrip('"').split('"')[0]
 
     # input file
     output = grep_from_command(['cromshell', 'metadata', workflow],
                                grep='run_cellbender_benchmark.cb.input_file_unfiltered')
-    input_file = output[58:-3].decode()
+    input_file = 'gs://' + output.decode().split('gs://')[-1].split('"')[0]
 
     # truth file
     output = grep_from_command(['cromshell', 'metadata', workflow],
                                grep='run_cellbender_benchmark.cb.truth_file')
     if 'null' not in output.decode():
-        truth_file = output[47:-3].decode()
+        truth_file = 'gs://' + output.decode().split('gs://')[-1].split('"')[0]
     else:
         truth_file = None
 

diff --git a/cellbender/remove_background/train.py b/cellbender/remove_background/train.py
@@ -152,7 +152,6 @@ def run_training(model: RemoveBackgroundPyroModel,
 
     # Initialize the train ELBO and lr lists as empty (test ELBO is tracked in model.loss['test']['elbo']).
     train_elbo = []
-    test_elbo = []
     lr = []
     epoch_checkpoint_freq = 1000  # a large number... it will be recalculated
 
@@ -212,16 +211,15 @@ def run_training(model: RemoveBackgroundPyroModel,
                 if epoch % test_freq == 0:
                     model.eval()
                     total_epoch_loss_test = evaluate_epoch(svi, test_loader)
-                    test_elbo.append(-total_epoch_loss_test)
                     model.loss['test']['epoch'].append(epoch)
                     model.loss['test']['elbo'].append(-total_epoch_loss_test)
                     logger.info("[epoch %03d] average test loss: %.4f"
                                 % (epoch, total_epoch_loss_test))
 
                     # Check whether test ELBO has spiked beyond specified conditions.
-                    if (epoch_elbo_fail_fraction is not None) and (len(test_elbo) > 2):
-                        current_diff = max(0., test_elbo[-2] - test_elbo[-1])
-                        overall_diff = np.abs(test_elbo[-2] - test_elbo[0])
+                    if (epoch_elbo_fail_fraction is not None) and (len(model.loss['test']['elbo']) > 2):
+                        current_diff = max(0., model.loss['test']['elbo'][-2] - model.loss['test']['elbo'][-1])
+                        overall_diff = np.abs(model.loss['test']['elbo'][-2] - model.loss['test']['elbo'][0])
                         fractional_spike = current_diff / overall_diff
                         if fractional_spike > epoch_elbo_fail_fraction:
                             raise ElboException(
@@ -245,15 +243,20 @@ def run_training(model: RemoveBackgroundPyroModel,
 
         # Check on the final test ELBO to see if it meets criteria.
         if final_elbo_fail_fraction is not None:
-            best_test_elbo = max(test_elbo)
-            if test_elbo[-1] < best_test_elbo:
-                final_best_diff = best_test_elbo - test_elbo[-1]
-                initial_best_diff = best_test_elbo - test_elbo[0]
-                if (final_best_diff / initial_best_diff) > final_elbo_fail_fraction:
+            best_test_elbo = max(model.loss['test']['elbo'])
+            if model.loss['test']['elbo'][-1] < best_test_elbo:
+                final_best_diff = best_test_elbo - model.loss['test']['elbo'][-1]
+                initial_best_diff = best_test_elbo - model.loss['test']['elbo'][0]
+                if initial_best_diff == 0:
                     raise ElboException(
-                        f'Training failed because final test loss {test_elbo[-1]:.2f} '
+                        f"Training failed because there was no improvement from the initial test loss {model.loss['test']['elbo'][0]:.2f}. "
+                        f"Final test loss was {model.loss['test']['elbo'][-1]}"
+                    )
+                elif (final_best_diff / initial_best_diff) > final_elbo_fail_fraction:
+                    raise ElboException(
+                        f"Training failed because final test loss {model.loss['test']['elbo'][-1]:.2f} "
                         f'is not sufficiently close to best test loss {best_test_elbo:.2f}, '
-                        f'compared to the initial test loss {test_elbo[0]:.2f}. '
+                        f"compared to the initial test loss {model.loss['test']['elbo'][0]:.2f}. "
                         f'Fractional difference is {final_best_diff / initial_best_diff:.2f}, '
                         f'which is > specified final_elbo_fail_fraction {final_elbo_fail_fraction:.2f}'
                     )
@@ -284,14 +287,14 @@ def run_training(model: RemoveBackgroundPyroModel,
     logger.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
 
     # Check final ELBO meets conditions.
-    if (final_elbo_fail_fraction is not None) and (len(test_elbo) > 1):
-        best_test_elbo = max(test_elbo)
-        if -test_elbo[-1] >= -best_test_elbo * (1 + final_elbo_fail_fraction):
-            raise ElboException(f'Training failed because final test loss ({-test_elbo[-1]:.4f}) '
+    if (final_elbo_fail_fraction is not None) and (len(model.loss['test']['elbo']) > 1):
+        best_test_elbo = max(model.loss['test']['elbo'])
+        if -model.loss['test']['elbo'][-1] >= -best_test_elbo * (1 + final_elbo_fail_fraction):
+            raise ElboException(f"Training failed because final test loss ({-model.loss['test']['elbo'][-1]:.4f}) "
                                 f'exceeds best test loss ({-best_test_elbo:.4f}) by >= '
                                 f'{100 * final_elbo_fail_fraction:.1f}%')
 
     # Free up all the GPU memory we can once training is complete.
     torch.cuda.empty_cache()
 
-    return train_elbo, test_elbo
+    return train_elbo, model.loss['test']['elbo']
diff --git a/docs/source/_static/remove_background/v0.3.2_hgmm.png b/docs/source/_static/remove_background/v0.3.2_hgmm.png
diff --git a/docs/source/changelog/index.rst b/docs/source/changelog/index.rst
@@ -11,33 +11,37 @@ edge case bug fixes, speedups, and small new features might bump up the last
 digit of the version number. For example, the difference between 0.2.1 and 0.2.0
 represents this kind of small change.
 
-Version 0.1.0
+
+Version 0.3.2
 -------------
 
-This was the initial release. The output count matrix was constructed via
-imputation, so that there were no explicit guarantees that CellBender would
-only subtract counts and never add.
+Small improvements aimed at reducing memory footprint, along with bug fixes.
 
-This version has been deprecated, and we do not recommend using it any longer.
+Improvements:
 
-- Imputes the "denoised" count matrix using a variational autoencoder
+- Make posterior generation more memory efficient
 
-Version 0.2.0
--------------
+New features:
 
-A significant overhaul of the model and the output generation procedure were
-undertaken to explicitly guarantee that CellBender only subtracts counts and
-never adds. The output is not constructed by imputation or smoothing, and
-CellBender intentionally tries to modify the raw data as little as possible in
-order to achieve denoising. A nominal false positive rate is approximately
-controlled at the level of the entire dataset, to prevent removal of too much
-signal.
+- WDL workflow updates to facilitate automatic retries on failure
+- Expanded the list of allowed feature types to match 2024.04 CellRanger definitions
 
-- Uses a variational autoencoder as a prior
+Bug fixes:
 
-- Computes the "denoised" count matrix using a MAP estimate and posterior regularization
+- Fix bug with MTX inputs for WDL
+- Fix Windows bug during posterior generation
+- Fix report generation bugs on Mac and Windows
+
+
+(Version 0.3.1 -- redacted)
+---------------------------
+
+WARNING: redacted
+
+If you managed to obtain a copy of v0.3.1 before it was redacted, do not use it.  An integer 
+overflow bug caused outputs to be incorrect in nearly all cases. For more information, see
+`github issue 347 here <https://github.com/broadinstitute/CellBender/pull/347>`_.
 
-  - CellBender never adds counts
 
 Version 0.3.0
 -------------
@@ -84,6 +88,37 @@ a workflow using Google Colab on a GPU for free.
   hundreds of samples in automated pipelines. This file can be parsed to look for
   indications that a sample may need to be re-run.
 
+
+Version 0.2.0
+-------------
+
+A significant overhaul of the model and the output generation procedure was
+undertaken to explicitly guarantee that CellBender only subtracts counts and
+never adds. The output is not constructed by imputation or smoothing, and
+CellBender intentionally tries to modify the raw data as little as possible in
+order to achieve denoising. A nominal false positive rate is approximately
+controlled at the level of the entire dataset, to prevent removal of too much
+signal.
+
+- Uses a variational autoencoder as a prior
+
+- Computes the "denoised" count matrix using a MAP estimate and posterior regularization
+
+  - CellBender never adds counts
+
+
+Version 0.1.0
+-------------
+
+This was the initial release. The output count matrix was constructed via
+imputation, so that there were no explicit guarantees that CellBender would
+only subtract counts and never add.
+
+This version has been deprecated, and we do not recommend using it any longer.
+
+- Imputes the "denoised" count matrix using a variational autoencoder
+
+
 Human-mouse mixture benchmark
 -----------------------------
 
@@ -137,3 +172,14 @@ v0.3.0
 
 .. image:: /_static/remove_background/v0.3.0_hgmm.png
    :width: 750 px
+
+This represents a real improvement over the results published in the paper.
+
+v0.3.2
+~~~~~~
+
+.. image:: /_static/remove_background/v0.3.2_hgmm.png
+   :width: 750 px
+
+This appears identical to v0.3.0, as the changes were intended to fix bugs and 
+reduce memory footprint.
diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
@@ -185,4 +185,5 @@ The information contained in the posterior can be used to
 quantitatively answer questions such as "What is the probability that the
 number of viral gene counts in this cell is nonzero?" For help with these kinds
 of computations, please open a
-`github issue <https://github.com/broadinstitute/CellBender/issues>`_.
+`github issue <https://github.com/broadinstitute/CellBender/issues>`_, or see 
+the `semi-worked example on the github issue here <https://github.com/broadinstitute/CellBender/issues/299>`_.
diff --git a/wdl/cellbender_remove_background.wdl b/wdl/cellbender_remove_background.wdl
@@ -90,6 +90,7 @@ task run_cellbender_remove_background_gpu {
             git clone -q https://github.com/broadinstitute/CellBender.git /cromwell_root/CellBender
             cd /cromwell_root/CellBender
             git checkout -q ~{dev_git_hash__}
+            yes | pip install -U pip setuptools
             yes | pip install --no-cache-dir -U -e /cromwell_root/CellBender
             pip list
             cd /cromwell_root