openpipelines-bio · DriesSchaumont · Aug 14, 2024 · Mar 11, 2024 · Mar 13, 2024 · Mar 14, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -115,6 +115,13 @@ after concatenation (PR #807)
 
 * Docker image names now use `/` instead of `_` between the name of the component and the namespace (PR #712).
 
+## MINOR CHANGES
+
+* `neighbors/find_neighbors` component: Modified to include results of KNN in the output file (PR #748).
+  Two new optional arguments set the `.obsm` slots in which the KNN results are stored:
+  - `obsm_knn_indices`
+  - `obsm_knn_distances`
+
 ## BUG FIXES
 
 * `rna_singlesample`: fixed a bug where selecting the column for the filtering with mitochondrial fractions 
@@ -126,6 +133,8 @@ after concatenation (PR #807)
 
 * `dimred/tsne` component: Added a tSNE dimensionality reduction component (PR #742).
 
+* `dimred/densmap` component: Added a densMAP dimensionality reduction component (PR #748).
+
 # openpipelines 1.0.0-rc2
 
 ## BUG FIXES

diff --git a/resources_test_scripts/pbmc_1k_protein_v3.sh b/resources_test_scripts/pbmc_1k_protein_v3.sh
@@ -52,31 +52,46 @@ target/docker/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu \
   --output "${OUT}_raw_feature_bc_matrix.h5mu"
 
 # run single sample
-NXF_VER=21.10.6 nextflow \
+nextflow \
   run . \
-  -main-script src/workflows/multiomics/rna_singlesample/main.nf \
+  -main-script target/nextflow/workflows/rna/rna_singlesample/main.nf \
   -profile docker \
   --id pbmc_1k_protein_v3_uss \
   --input "${OUT}_filtered_feature_bc_matrix.h5mu" \
   --output "`basename $OUT`_uss.h5mu" \
   --publishDir `dirname $OUT` \
   -resume
 
+# add the sample ID to the mudata object
+nextflow \
+  run . \
+  -main-script target/nextflow/metadata/add_id/main.nf \
+  -profile docker \
+  --id pbmc_1k_protein_v3_uss \
+  --input "${OUT}_uss.h5mu" \
+  --input_id "pbmc_1k_protein_v3_uss" \
+  --output "`basename $OUT`_uss_with_id.h5mu" \
+  --output_compression "gzip" \
+  --publishDir `dirname $OUT` \
+  -resume
+
 # run multisample
-NXF_VER=21.10.6 nextflow \
+nextflow \
   run . \
-  -main-script src/workflows/multiomics/rna_multisample/main.nf \
+  -main-script target/nextflow/workflows/rna/rna_multisample/main.nf \
   -profile docker \
   --id pbmc_1k_protein_v3_ums \
-  --input "${OUT}_uss.h5mu" \
+  --input "${OUT}_uss_with_id.h5mu" \
   --output "`basename $OUT`_ums.h5mu" \
   --publishDir `dirname $OUT` \
   -resume
 
-# run integration
-NXF_VER=21.10.6 nextflow \
+rm "${OUT}_uss_with_id.h5mu"
+
+# run dimred
+nextflow \
   run . \
-  -main-script src/workflows/multiomics/integration/main.nf \
+  -main-script target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf \
   -profile docker \
   --id pbmc_1k_protein_v3_mms \
   --input "${OUT}_ums.h5mu" \

diff --git a/src/dimred/densmap/config.vsh.yaml b/src/dimred/densmap/config.vsh.yaml
@@ -0,0 +1,176 @@
+functionality:
+  name: densmap
+  namespace: "dimred"
+  description: |
+    A modification of UMAP that adds an extra cost term in order to preserve information 
+    about the relative local density of the data. It is performed on the same inputs as UMAP.
+  authors:
+    - __merge__: /src/authors/jakub_majercik.yaml
+      roles: [ maintainer ]
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: "--input"
+          type: file
+          description: Input h5mu file
+          direction: input
+          required: true
+          example: input.h5mu
+
+        - name: "--modality"
+          type: string
+          default: "rna"
+          required: false
+
+        - name: "--uns_neighbors"
+          type: string
+          default: "neighbors"
+          description: The `.uns` neighbors slot as output by the `find_neighbors` component.
+
+    - name: Outputs
+      arguments:
+        - name: "--output"
+          alternatives: ["-o"]
+          type: file
+          description: Output h5mu file.
+          direction: output
+          required: true
+          example: output.h5mu
+
+        - name: "--output_compression"
+          type: string
+          description: The compression format to be used on the output h5mu object.
+          choices: ["gzip", "lzf"]
+          required: false
+          example: "gzip"
+
+        - name: "--obsm_output"
+          type: string
+          description: The .obsm key to use for storing the densMAP results.
+          default: "X_densmap"
+
+    - name: Arguments UMAP
+      arguments:
+        - name: "--min_dist"
+          type: double
+          description: |
+            The effective minimum distance between embedded points. Smaller values will result 
+            in a more clustered/clumped embedding where nearby points on the manifold are drawn 
+            closer together, while larger values will result in a more even dispersal of points. 
+            The value should be set relative to the spread value, which determines the scale at 
+            which embedded points will be spread out. 
+          default: 0.5
+          min: 0.0
+          max: 10.0
+
+        - name: "--spread"
+          type: double
+          description: | 
+            The effective scale of embedded points. In combination with `min_dist` this 
+            determines how clustered/clumped the embedded points are.
+          default: 1.0
+          min: 0.0
+          max: 10.0
+
+        - name: "--num_components"
+          type: integer
+          description: The number of dimensions of the embedding.
+          default: 2
+          min: 1
+
+        - name: "--max_iter"
+          type: integer
+          description: | 
+            The number of iterations (epochs) of the optimization. Called `n_epochs` 
+            in the original UMAP. Default is set to 500 if 
+            neighbors['connectivities'].shape[0] <= 10000, else 200.
+          default: 0
+          min: 0
+          max: 1000
+
+        - name: "--alpha"
+          type: double
+          description: The initial learning rate for the embedding optimization.
+          default: 1.0
+
+        - name: "--gamma"
+          type: double
+          description: | 
+            Weighting applied to negative samples in low dimensional embedding optimization. 
+            Values higher than one will result in greater weight being given to negative samples.
+          default: 1.0
+
+        - name: "--negative_sample_rate"
+          type: integer
+          description: |
+            The number of negative samples to select per positive sample
+            in the optimization process. Increasing this value will result
+            in greater repulsive force being applied, greater optimization
+            cost, but slightly more accuracy.
+          default: 5
+
+        - name: "--init_pos"
+          type: string
+          description: |
+            How to initialize the low dimensional embedding. Called `init` in the original UMAP. Options are:
+
+            * Any key from `.obsm`
+            * `'paga'`: positions from `paga()`
+            * `'spectral'`: use a spectral embedding of the graph
+            * `'random'`: assign initial embedding positions at random.
+
+          default: spectral
+          choices: [ 'paga', 'spectral', 'random' ]
+
+    - name: Arguments densMAP
+      arguments:
+        - name: "--lambda"
+          type: double
+          description: |
+            Controls the regularization weight of the density correlation term in densMAP. 
+            Higher values prioritize density preservation over the UMAP objective, and vice versa 
+            for values closer to zero. Setting this parameter to zero is equivalent to running 
+            the original UMAP algorithm.
+          default: 2.0
+          min: 0.01
+          max: 10.0
+
+        - name: "--fraction"
+          type: double
+          description: | 
+            Controls the fraction of epochs (between 0 and 1) where the density-augmented objective 
+            is used in densMAP. The first (1 - fraction) fraction of epochs optimize the original 
+            UMAP objective before introducing the density correlation term.
+          default: 0.3
+
+        - name: "--var_shift"
+          type: double
+          description: | 
+            A small constant added to the variance of local radii in the embedding when calculating 
+            the density correlation objective to prevent numerical instability from dividing by a 
+            small number.
+          default: 0.1
+
+  resources:
+    - type: python_script
+      path: script.py
+    - path: /src/utils/setup_logger.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: /resources_test/pbmc_1k_protein_v3
+platforms:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+      - type: python
+        packages:
+          - umap-learn
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
+    __merge__: [/src/base/requirements/python_test_setup.yaml, .]
+  - type: nextflow
+    directives:
+      label: [highcpu, midmem]
diff --git a/src/dimred/densmap/script.py b/src/dimred/densmap/script.py
@@ -0,0 +1,109 @@
+from umap import UMAP
+import mudata as mu
+import sys
+import anndata as ad
+
+## VIASH START
+# Debug-only placeholder values so the script can be run directly.
+# Every key the script reads is listed here; the values mirror the defaults
+# declared in config.vsh.yaml ('output_compression' has no default there).
+par = {
+  'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu',
+  'modality': 'rna',
+  'uns_neighbors': 'neighbors',
+  'output': 'output.h5mu',
+  'output_compression': None,
+  'obsm_output': 'X_densmap',
+  'min_dist': 0.5,
+  'spread': 1.0,
+  'num_components': 2,
+  'max_iter': 0,
+  'alpha': 1.0,
+  'gamma': 1.0,
+  'negative_sample_rate': 5,
+  'init_pos': 'spectral',
+  'lambda': 2.0,
+  'fraction': 0.3,
+  'var_shift': 0.1
+}
+# 'meta' is read right after this block (sys.path.append(meta["resources_dir"])).
+meta = {
+  'resources_dir': '.'
+}
+## VIASH END
+
+# Add the component's resources directory to the import path (needed for the
+# `setup_logger` import that is currently commented out below).
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    """Configure the root logger to write INFO-level records to stdout.
+
+    Returns:
+        The root logger, with a stdout stream handler attached that formats
+        records as "<timestamp> <level> <message>".
+    """
+    import logging
+    from sys import stdout
+
+    # getLogger() without a name returns the root logger.
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    # NOTE: each call adds another handler; the script calls this exactly once.
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+# Main flow: read the MuData file, compute a densMAP embedding for one modality
+# from the precomputed KNN results, store embedding + parameters, write output.
+logger.info("Reading %s", par["input"])
+mdata = mu.read_h5mu(par["input"])
+
+if par['modality'] not in mdata.mod:
+    raise ValueError(f"Modality '{par['modality']}' not found in the input data.")
+
+logger.info("Computing densMAP for modality '%s'", par['modality'])
+data = mdata.mod[par['modality']]
+
+neigh_key = par["uns_neighbors"]
+
+if neigh_key not in data.uns:
+    raise ValueError(f"'{neigh_key}' was not found in .mod['{par['modality']}'].uns. Set the correct key or run 'find_neighbors' first.")
+
+# Read all neighbor metadata from the slot selected with --uns_neighbors,
+# never from a hard-coded 'neighbors' key.
+neighbors = data.uns[neigh_key]
+
+if 'use_rep' not in neighbors.get('params', {}):
+    raise ValueError(f"'use_rep' was not found in .mod['{par['modality']}'].uns['{neigh_key}'].params. Set the correct key or run PCA first.")
+
+pca_key = neighbors['params']['use_rep']
+
+# The neighbors slot only stores the *names* of the .obsm slots that hold the
+# KNN indices and distances; fail with a clear message if they are missing.
+for required_key in ('knn_indices_key', 'knn_distances_key'):
+    if required_key not in neighbors:
+        raise ValueError(f"'{required_key}' was not found in .mod['{par['modality']}'].uns['{neigh_key}']. Make sure 'find_neighbors' stored the KNN results.")
+
+knn_indices_key = neighbors['knn_indices_key']
+knn_distances_key = neighbors['knn_distances_key']
+
+for obsm_key in (pca_key, knn_indices_key, knn_distances_key):
+    if obsm_key not in data.obsm:
+        raise ValueError(f"'{obsm_key}' was not found in .mod['{par['modality']}'].obsm.")
+
+metric = neighbors.get("metric", "euclidean")
+metric_kwds = neighbors.get("metric_kwds", {})
+
+# --max_iter defaults to 0, documented as "use the size-based default".
+# umap-learn only selects that default when n_epochs is None; a literal 0
+# would request zero optimization epochs, so map 0 to None.
+n_epochs = par["max_iter"] or None
+
+# NOTE(review): the config lists 'paga' as a choice for --init_pos, but the
+# value is passed to UMAP(init=...) unchanged -- confirm umap-learn accepts it.
+X_densmap = UMAP(
+  min_dist=par["min_dist"],
+  spread=par["spread"],
+  n_components=par["num_components"],
+  n_epochs=n_epochs,
+  learning_rate=par["alpha"],
+  repulsion_strength=par["gamma"],
+  negative_sample_rate=par["negative_sample_rate"],
+  init=par["init_pos"],
+  metric=metric,
+  metric_kwds=metric_kwds,
+  densmap=True,
+  dens_lambda=par["lambda"],
+  dens_frac=par["fraction"],
+  dens_var_shift=par["var_shift"],
+  precomputed_knn=(
+    data.obsm[knn_indices_key],
+    data.obsm[knn_distances_key]
+  )
+).fit_transform(data.obsm[pca_key])
+
+logger.info("Writing densMAP embeddings to .mod[%s].obsm[%s]", par['modality'], par['obsm_output'])
+data.obsm[par['obsm_output']] = X_densmap
+
+logger.info("Writing densMAP metadata to .mod[%s].uns['densmap']", par['modality'])
+data.uns['densmap'] = {
+  'params': {
+    'min_dist': par["min_dist"],
+    'spread': par["spread"],
+    'n_components': par["num_components"],
+    # Stored as passed by the user; 0 means the automatic default was used.
+    'n_epochs': par["max_iter"],
+    'learning_rate': par["alpha"],
+    'repulsion_strength': par["gamma"],
+    'negative_sample_rate': par["negative_sample_rate"],
+    'init': par["init_pos"],
+    'metric': metric,
+    'metric_kwds': metric_kwds,
+    'dens_lambda': par["lambda"],
+    'dens_frac': par["fraction"],
+    'dens_var_shift': par["var_shift"],
+    'knn_indices_key': knn_indices_key,
+    'knn_distances_key': knn_distances_key
+  }
+}
+
+logger.info("Writing to %s.", par["output"])
+mdata.write_h5mu(filename=par["output"], compression=par["output_compression"])
+
+logger.info("Finished")