fix tests with reqs (#567)
* fix tests, fix mnist demo notebook
lucidtronix committed May 21, 2024
1 parent 1d00147 commit 308faa3
Showing 3 changed files with 65 additions and 65 deletions.
2 changes: 1 addition & 1 deletion docker/vm_boot_images/config/tensorflow-requirements.txt
@@ -2,7 +2,7 @@ pydot
nibabel==4.0.2
pydicom==1.2.2
hyperopt==0.1.2
seaborn==0.11.2
seaborn
scikit-image
biosppy
vtk==9.2.6
124 changes: 61 additions & 63 deletions notebooks/ML4H_Model_Factory_Intro.ipynb
@@ -28,7 +28,7 @@
"import os\n",
"import sys\n",
"import pickle\n",
"import random\n",
"import gzip\n",
"from typing import List, Dict, Callable\n",
"from collections import defaultdict, Counter\n",
"\n",
@@ -57,7 +57,7 @@
"outputs": [],
"source": [
"# Constants\n",
"HD5_FOLDER = './tensors/'\n",
"HD5_FOLDER = './mnist_tensors/'\n",
"OUTPUT_FOLDER = './outputs/'"
]
},
@@ -119,35 +119,48 @@
"metadata": {},
"outputs": [],
"source": [
"DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3}\n",
"VARIANT_LABELS = {'NOT_SNP': 0, 'NOT_INDEL': 1, 'SNP': 2, 'INDEL': 3}\n",
"def load_data(dataset):\n",
" ''' Loads the dataset\n",
" :param dataset: the path to the dataset (here MNIST)'''\n",
" data_dir, data_file = os.path.split(dataset)\n",
" if data_dir == \"\" and not os.path.isfile(dataset):\n",
" # Check if dataset is in the data directory.\n",
" new_path = os.path.join(\"data\", dataset)\n",
" if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':\n",
" dataset = new_path\n",
"\n",
" if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':\n",
" from urllib.request import urlretrieve\n",
" origin = ('http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz')\n",
" print('Downloading data from %s' % origin)\n",
" if not os.path.exists(os.path.dirname(dataset)):\n",
" os.makedirs(os.path.dirname(dataset))\n",
" urlretrieve(origin, dataset)\n",
"\n",
"def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:\n",
" return np.array(hd5[tm.name])\n",
" print('loading data...')\n",
" f = gzip.open(dataset, 'rb')\n",
" if sys.version_info[0] == 3:\n",
" u = pickle._Unpickler(f)\n",
" u.encoding = 'latin1'\n",
" train_set, valid_set, test_set = u.load()\n",
" else:\n",
" train_set, valid_set, test_set = pickle.load(f)\n",
" f.close()\n",
"\n",
" return train_set, valid_set, test_set\n",
"\n",
"reference = TensorMap('reference', shape=(128, len(DNA_SYMBOLS)), tensor_from_file=tensor_from_hd5)\n",
"read_tensor = TensorMap('read_tensor', shape=(128, 128, 15), tensor_from_file=tensor_from_hd5)\n",
"\n",
"\n",
"def variant_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:\n",
" one_hot = np.zeros(tm.shape, dtype=np.float32)\n",
" variant_str = str(hd5['variant_label'][()], 'utf-8')\n",
" for channel in tm.channel_map:\n",
" if channel.lower() == variant_str.lower():\n",
" one_hot[tm.channel_map[channel]] = 1.0\n",
" if one_hot.sum() != 1:\n",
" raise ValueError(f'TensorMap {tm.name} missing or invalid label: {variant_str} one_hot: {one_hot}')\n",
" return one_hot\n",
"\n",
"\n",
"variant_label = TensorMap(\n",
" 'variant_label', Interpretation.CATEGORICAL,\n",
" shape=(len(VARIANT_LABELS),),\n",
" tensor_from_file=variant_label_from_hd5,\n",
" channel_map=VARIANT_LABELS,\n",
")"
"def mnist_as_hd5(hd5_folder):\n",
" train, _, _ = load_data('mnist.pkl.gz')\n",
" mnist_images = train[0].reshape((-1, 28, 28, 1))\n",
" if not os.path.exists(hd5_folder):\n",
" os.makedirs(hd5_folder)\n",
" for i, mnist_image in enumerate(mnist_images):\n",
" with h5py.File(os.path.join(hd5_folder, f'{i}.hd5'), 'w') as hd5:\n",
" hd5.create_dataset('mnist_image', data=mnist_image)\n",
" hd5.create_dataset('mnist_label', data=[train[1][i]])\n",
" if (i+1) % 5000 == 0:\n",
" print(f'Wrote {i+1} MNIST images and labels as HD5 files')"
]
},
{
@@ -163,9 +176,7 @@
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(HD5_FOLDER):\n",
" os.makedirs(HD5_FOLDER)\n",
"!tar -zxvf ./hg002_na24385_ml4h_tensors_v2021_10_14.tar.gz -C ./tensors/"
"mnist_as_hd5(HD5_FOLDER)"
]
},
{
@@ -183,14 +194,6 @@
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1D CNN for Classification of Genomic Variants\n",
"Jupyter is great, but can complicate productionizing code. We try to mitigate this by interacting with the jupyter notebook as if it were a command line call to one of ml4h's modes. "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -199,31 +202,26 @@
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--batch_size', '16',\n",
" '--epochs', '12',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--id', 'learn_1d_cnn'\n",
" '--id', 'learn_2d_cnn'\n",
" ]\n",
"args = parse_args()\n",
"metrics = train_multimodal_multitask(args)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.read_tensor',\n",
" '--output_tensors', 'gatk.variant_label',\n",
"metrics = train_multimodal_multitask(args)\n",
"\n",
"sys.argv = ['train',\n",
" '--tensors', HD5_FOLDER,\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--activation', 'mish',\n",
" '--dense_blocks', '64', '64', '64',\n",
" '--batch_size', '16',\n",
" '--epochs', '12',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--id', 'learn_2d_cnn'\n",
" '--id', 'learn_2d_cnn2'\n",
" ]\n",
"args = parse_args()\n",
"metrics = train_multimodal_multitask(args)"
Expand All @@ -244,12 +242,12 @@
"source": [
"sys.argv = ['compare_scalar', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference', 'gatk.read_tensor',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--id', 'gatk_model_comparison',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--id', 'mnist_model_comparison',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--model_files', f'{OUTPUT_FOLDER}learn_1d_cnn/learn_1d_cnn.h5',\n",
" f'{OUTPUT_FOLDER}learn_2d_cnn/learn_2d_cnn.h5',\n",
" '--model_files', f'{OUTPUT_FOLDER}learn_2d_cnn/learn_2d_cnn.h5',\n",
" f'{OUTPUT_FOLDER}learn_2d_cnn2/learn_2d_cnn2.h5',\n",
" '--test_steps', '100', \n",
" '--batch_size', '16',\n",
" ]\n",
@@ -275,8 +273,8 @@
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--activation', 'swish',\n",
" '--conv_layers', '32',\n",
@@ -289,7 +287,7 @@
" '--inspect_model',\n",
" '--epochs', '1',\n",
" '--batch_size', '4',\n",
" '--id', 'hypertuned_1d',\n",
" '--id', 'hypertuned_2d',\n",
" ]\n",
"args = parse_args()\n",
"generate_train, generate_valid, generate_test = test_train_valid_tensor_generators(**args.__dict__)\n",
@@ -300,7 +298,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"After running the cell above the diagram of the model architecture will be saved at: `./outputs/hypertuned_1d/architecture_graph_hypertuned_1d.png`"
"After running the cell above the diagram of the model architecture will be saved at: `./outputs/hypertuned_2d/architecture_graph_hypertuned_2d.png`"
]
}
],
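As a quick sanity check (illustrative only, not part of the committed notebook), one of the files written by mnist_as_hd5 above can be read back with h5py to confirm the dataset names and shapes the training cells depend on:

import h5py

# Assumes mnist_as_hd5(HD5_FOLDER) has already run with HD5_FOLDER = './mnist_tensors/'.
with h5py.File('./mnist_tensors/0.hd5', 'r') as hd5:
    print(hd5['mnist_image'].shape)  # expected: (28, 28, 1)
    print(hd5['mnist_label'][0])     # a single integer class label, 0-9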
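The command-line names 'mnist.mnist_image' and 'mnist.mnist_label' used in the training cells are resolved by ml4h to TensorMap objects. Below is a minimal sketch of what those maps could look like, mirroring the removed GATK definitions and the datasets written by mnist_as_hd5; the import path ml4h.TensorMap and the exact definitions shipped in ml4h.tensormap.mnist are assumptions and may differ from the repo.

from typing import Dict

import h5py
import numpy as np
from ml4h.TensorMap import TensorMap, Interpretation


def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    # Read the dataset named after the TensorMap straight out of the HD5 file.
    return np.array(hd5[tm.name])


# 28x28 grayscale image, stored by mnist_as_hd5 under the key 'mnist_image'.
mnist_image = TensorMap('mnist_image', shape=(28, 28, 1), tensor_from_file=tensor_from_hd5)


def label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    # One-hot encode the integer class label stored under 'mnist_label'.
    one_hot = np.zeros(tm.shape, dtype=np.float32)
    one_hot[int(hd5['mnist_label'][0])] = 1.0
    return one_hot


mnist_label = TensorMap(
    'mnist_label', Interpretation.CATEGORICAL,
    shape=(10,),
    channel_map={f'digit_{i}': i for i in range(10)},
    tensor_from_file=label_from_hd5,
)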
4 changes: 3 additions & 1 deletion setup.py
@@ -3,6 +3,7 @@

here = pathlib.Path(__file__).parent.resolve()
# Get the requirements from the requirements file
requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8')
long_description = (here / 'README.md').read_text(encoding='utf-8')
setup(
name='ml4h',
@@ -12,6 +13,7 @@
long_description_content_type='text/markdown',
url='https://github.com/broadinstitute/ml4h',
python_requires='>=3.6',
install_requires=["ml4ht", "tensorflow", "pytest", "numcodecs"], # requirements
#install_requires=["ml4ht", "tensorflow", "pytest", "numcodecs"], # requirements
install_requires=requirements,
packages=find_packages(),
)
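Here read_text returns the requirements file as one newline-delimited string, which setuptools generally accepts for install_requires. An explicit list is a common alternative; a minimal sketch of that approach (not what this commit does), skipping blank lines and comments:

import pathlib

here = pathlib.Path(__file__).parent.resolve()
requirements_text = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8')
requirements = [
    line.strip()
    for line in requirements_text.splitlines()
    if line.strip() and not line.strip().startswith('#')
]
# Then pass the list to setuptools: setup(..., install_requires=requirements, ...)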
