diff --git a/.vscode/settings.json b/.vscode/settings.json index 338af49..6a2331d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,6 @@ { "python.pythonPath": "/home/kevin/anaconda3/envs/scaden/bin/python", - "python.linting.pylintEnabled": false, + "python.linting.pylintEnabled": true, "python.linting.enabled": true, - "python.linting.flake8Enabled": true + "python.linting.flake8Enabled": false } \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 212a600..481b06a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ -FROM continuumio/miniconda3 +FROM ubuntu -COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/scaden/bin:$PATH \ No newline at end of file +RUN apt-get update && apt-get upgrade -y +RUN apt-get install python3 -y +RUN apt-get install python3-pip -y +RUN pip3 install scaden \ No newline at end of file diff --git a/README.md b/README.md index 38151de..d565492 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,13 @@ ![Scaden](docs/img/scaden_logo.png) -![MIT](https://anaconda.org/bioconda/scaden/badges/license.svg) -![Install with Bioconda](https://anaconda.org/bioconda/scaden/badges/installer/conda.svg) + +![Scaden version](https://img.shields.io/badge/scaden-v0.9.5-cyan) +![MIT](https://img.shields.io/badge/License-MIT-black) +![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) +![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) +![Docker build](https://img.shields.io/docker/cloud/build/kevinmenden/scaden) +![Downloads](https://static.pepy.tech/personalized-badge/scaden?period=total&units=international_system&left_color=blue&right_color=green&left_text=Downloads) + ## Single-cell assisted deconvolutional network Scaden is a deep-learning based algorithm for cell type deconvolution of bulk RNA-seq samples. 
It was developed diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..2f0e14a --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,31 @@ +# Changelog + +### Version 0.9.6 ++ fixed Dockerfile (switched to pip installation) ++ added better error messages to `simulate` command ++ cleaned up dependencies + +### Version 0.9.5 ++ added `scaden simulate` command to perform bulk simulation and training file creation ++ added `--seed` parameter to allow reproducible Scaden runs + +### Version 0.9.4 ++ fixed dependencies (added python>=3.6 requirement) + +### Version 0.9.3 ++ upgrade to Tensorflow 2 ++ cleaned up dependencies + +### Version 0.9.2 ++ RAM usage improvement + +### Version 0.9.1 ++ Added automatic removal of duplicate genes in Mixture file ++ Changed name of final prediction file ++ Added Scaden logo to main script + +### Version 0.9.0 +This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not +contain thorough error messages, plotting functionality and a solid helper function for generating training data. These are all features +planned for the release of v.1.0.0. +The core functionality of Scaden is, however, implemented and fully operational. Please check the [Usage](usage) section to learn how to use Scaden. 
\ No newline at end of file diff --git a/docs/index.md b/docs/index.md index be88f3c..dc26879 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,35 +8,3 @@ at the [DZNE Tübingen](https://www.dzne.de/en/about-us/sites/tuebingen/) and th A pre-print describing the method is available on Biorxiv: [Deep-learning-based cell composition analysis from tissue expression profiles](https://www.biorxiv.org/content/10.1101/659227v1) - - - - - -## Changelog - -### Version 0.9.5 -+ added `scaden simulate` command to perform bulk simulation and training file creation -+ added `--seed` parameter to allow reproducible Scaden runs - -### Version 0.9.4 -+ fixed dependencies (added python>=3.6 requirement) - -### Version 0.9.3 -+ upgrade to Tensorflow 2 -+ cleaned up dependencies - -### Version 0.9.2 -+ RAM usage improvement - -### Version 0.9.1 -+ Added automatic removal of duplicate genes in Mixture file -+ Changed name of final prediction file -+ Added Scaden logo to main script - - -### Version 0.9.0 -This is the initial release version of Scaden. While this version contains full functionality for pre-processing, training and prediction, it does not -contain thorough error messages, plotting functionality and a solid helper function for generation training data. These are all features -planned for the release of v.1.0.0. -The core functionality of Scaden is, however, implemented and fully operational. Please check the [Usage](usage) section to learn how to use Scaden. \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index c656db0..23afed2 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -3,18 +3,16 @@ Scaden be easily installed on a Linux system, and should also work on Mac. There are currently two options for installing Scaden, either using [Bioconda](https://bioconda.github.io/) or via [pip](https://pypi.org/). -## Bioconda -Installation via Bioconda is the preferred route of installation, and we highly recommend using conda. 
To install Scaden, use: - -`conda install -c bioconda scaden` +## pip +To install Scaden via pip, simply run the following command: -It is always recommended to create a separate conda environment for installation. +`pip install scaden` -## pip -If you don't want to use conda, you can also install Scaden using pip: +## Bioconda +You can also install Scaden via Bioconda, using: -`pip install scaden` +`conda install -c bioconda scaden` ## Docker diff --git a/docs/usage.md b/docs/usage.md index ab71666..a64bfda 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -96,10 +96,9 @@ Once you have done this, you can use Scaden's command `scaden simulate` to gener The first step is to process your scRNA-seq dataset(s) you want to use for training. I used Scanpy for this, and would therefore recommend to do the same, but you can of course use other software for this purpose. I've uploaded the scripts I used to preprocess the data used for the Scaden paper [here](https://doi.org/10.6084/m9.figshare.8234030.v1). Mainly you have to normalize your count data -and create a file containing the cell type labels. The file for the cell type labels should be of size (n x 2), where n is the number of cells -you have in your data. The two columns correspond to a label for your cells, and a 'Celltype' column. In fact, the only necessary column is the 'Celltype' -column, which Scaden uses to extract the information. The count data should be of size (n x g), where g is the number of genes and n is the number of samples. -The order must be the same as for the cell type labels. +and create a file containing the cell type labels. +The file for the cell type labels should be of size (n x 1), where n is the number of cells +you have in your data. The single column in this file should be labeled 'Celltype'. You can have extra columns if you like, as long as you have a 'Celltype' column which specifies the cell type label in the correct order. 
The count data should be of size (n x g), where g is the number of genes and n is the number of samples. The order must be the same as for the cell type labels. #### Bulk simulation Once the data is processed, you can use the command `scaden simulate` to generate your artificial bulk samples for training. @@ -116,6 +115,12 @@ As example, you can generate 1000 artificial bulk samples from 100 cells per sam scaden simulate --cells 100 --n_samples 1000 --data --pattern ``` +An example for a pattern would be `*_counts.txt`. This pattern would find a dataset consisting of the following two files: +* `dataset_counts.txt` +* `dataset_celltypes.txt` + +Make sure to include an `*` in your pattern! + This command will create the artificial samples in the current working directory. You can also specificy an output directory using the `--out` parameter. Scaden will also directly create a .h5ad file in this directory, which is the file you will need for training. By default, this file will be called `data.h5ad`, however you can change the prefix using the `--prefix` flag. 
diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 515d843..0000000 --- a/environment.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: scaden -channels: - - bioconda - - r - - defaults - - conda-forge -dependencies: - - scaden=0.9.4=py_0 -prefix: /home/kevin/anaconda3/envs/scaden - diff --git a/mkdocs.yml b/mkdocs.yml index 5b64afe..e78c266 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,4 +4,5 @@ nav: - Installation: installation.md - Usage: usage.md - Datasets: datasets.md + - Changelog: changelog.md theme: readthedocs diff --git a/scaden/model/scaden.py b/scaden/model/scaden.py index ab252a8..d224f92 100644 --- a/scaden/model/scaden.py +++ b/scaden/model/scaden.py @@ -295,7 +295,7 @@ def train(self, input_path, train_datasets): pd.DataFrame(self.sig_genes).to_csv(self.model_dir + "/genes.txt", sep="\t") - def predict(self, input_path, out_name="cdn_predictions.txt"): + def predict(self, input_path, out_name="scaden_predictions.txt"): """ Perform prediction with a pre-trained model :param out_dir: path to store results in diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py index 3db2d3c..8cc495c 100644 --- a/scaden/preprocessing/bulk_simulation.py +++ b/scaden/preprocessing/bulk_simulation.py @@ -160,6 +160,22 @@ def filter_matrix_signature(mat, genes): mat = mat[genes] return mat +def load_celltypes(path, name): + """ Load the cell type information """ + try: + y = pd.read_table(path) + # Check if has Celltype column + if not 'Celltype' in y.columns: + logger.error(f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column.") + sys.exit() + except FileNotFoundError as e: + logger.error(f"No celltypes file found for {name}. 
It should be called {name}_celltypes.txt.") + sys.exit(e) + + return y + + + def load_dataset(name, dir, pattern): """ @@ -172,12 +188,7 @@ def load_dataset(name, dir, pattern): pattern = pattern.replace("*", "") print("Loading " + name + " dataset ...") - try: - y = pd.read_table(dir + name + "_celltypes.txt") - except FileNotFoundError as e: - logger.error(f"No celltypes file found for {name}. It should be called {name}_celltypes.txt.") - sys.exit() - + y = load_celltypes(dir + name + "_celltypes.txt", name) x = pd.read_table(dir + name + pattern, index_col=0) return (x, y) @@ -285,7 +296,7 @@ def simulate_bulk( datasets = [x.split("_")[0] for x in files] if len(datasets) == 0: - logging.error("No datasetes fround! Have you specified the pattern correctly?") + logging.error("No datasets fround! Have you specified the pattern correctly?") sys.exit() print("Datasets: " + str(datasets)) diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py index 3ef423b..6ce1656 100644 --- a/scaden/preprocessing/create_h5ad_file.py +++ b/scaden/preprocessing/create_h5ad_file.py @@ -27,7 +27,7 @@ def parse_data(x_path, y_path): x = pd.read_table(x_path, sep="\t") y = pd.read_table(y_path, sep="\t") except FileNotFoundError as e: - logging.error(f"Could not find simulated data files: {e}") + logging.error(f" Could not find simulated data files: {e}") sys.exit() labels = list(y.columns) diff --git a/setup.py b/setup.py index 03cd660..e6c6e4a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = '0.9.5' +version = '0.9.6' with open("README.md", "r", encoding="UTF-8") as fh: @@ -30,13 +30,10 @@ 'pandas', 'numpy', 'scikit-learn', - 'scipy', 'tensorflow>=2.0', 'anndata', 'tqdm', - 'click' - ], - extras_require = { - 'scanpy': ["scanpy", "matplotlib", "seaborn"] - } + 'click', + 'h5py~=2.10.0' + ] )