fix tests with reqs (#567)
* fix tests, fix mnist demo notebook
lucidtronix committed May 21, 2024
1 parent 1d00147 commit 308faa3
Showing 3 changed files with 65 additions and 65 deletions.
2 changes: 1 addition & 1 deletion docker/vm_boot_images/config/tensorflow-requirements.txt
@@ -2,7 +2,7 @@ pydot
nibabel==4.0.2
pydicom==1.2.2
hyperopt==0.1.2
seaborn==0.11.2
seaborn
scikit-image
biosppy
vtk==9.2.6
124 changes: 61 additions & 63 deletions notebooks/ML4H_Model_Factory_Intro.ipynb
@@ -28,7 +28,7 @@
"import os\n",
"import sys\n",
"import pickle\n",
"import random\n",
"import gzip\n",
"from typing import List, Dict, Callable\n",
"from collections import defaultdict, Counter\n",
"\n",
@@ -57,7 +57,7 @@
"outputs": [],
"source": [
"# Constants\n",
"HD5_FOLDER = './tensors/'\n",
"HD5_FOLDER = './mnist_tensors/'\n",
"OUTPUT_FOLDER = './outputs/'"
]
},
@@ -119,35 +119,48 @@
"metadata": {},
"outputs": [],
"source": [
"DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3}\n",
"VARIANT_LABELS = {'NOT_SNP': 0, 'NOT_INDEL': 1, 'SNP': 2, 'INDEL': 3}\n",
"def load_data(dataset):\n",
" ''' Loads the dataset\n",
" :param dataset: the path to the dataset (here MNIST)'''\n",
" data_dir, data_file = os.path.split(dataset)\n",
" if data_dir == \"\" and not os.path.isfile(dataset):\n",
" # Check if dataset is in the data directory.\n",
" new_path = os.path.join(\"data\", dataset)\n",
" if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':\n",
" dataset = new_path\n",
"\n",
" if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':\n",
" from urllib.request import urlretrieve\n",
" origin = ('http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz')\n",
" print('Downloading data from %s' % origin)\n",
" if not os.path.exists(os.path.dirname(dataset)):\n",
" os.makedirs(os.path.dirname(dataset))\n",
" urlretrieve(origin, dataset)\n",
"\n",
"def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:\n",
" return np.array(hd5[tm.name])\n",
" print('loading data...')\n",
" f = gzip.open(dataset, 'rb')\n",
" if sys.version_info[0] == 3:\n",
" u = pickle._Unpickler(f)\n",
" u.encoding = 'latin1'\n",
" train_set, valid_set, test_set = u.load()\n",
" else:\n",
" train_set, valid_set, test_set = pickle.load(f)\n",
" f.close()\n",
"\n",
" return train_set, valid_set, test_set\n",
"\n",
"reference = TensorMap('reference', shape=(128, len(DNA_SYMBOLS)), tensor_from_file=tensor_from_hd5)\n",
"read_tensor = TensorMap('read_tensor', shape=(128, 128, 15), tensor_from_file=tensor_from_hd5)\n",
"\n",
"\n",
"def variant_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:\n",
" one_hot = np.zeros(tm.shape, dtype=np.float32)\n",
" variant_str = str(hd5['variant_label'][()], 'utf-8')\n",
" for channel in tm.channel_map:\n",
" if channel.lower() == variant_str.lower():\n",
" one_hot[tm.channel_map[channel]] = 1.0\n",
" if one_hot.sum() != 1:\n",
" raise ValueError(f'TensorMap {tm.name} missing or invalid label: {variant_str} one_hot: {one_hot}')\n",
" return one_hot\n",
"\n",
"\n",
"variant_label = TensorMap(\n",
" 'variant_label', Interpretation.CATEGORICAL,\n",
" shape=(len(VARIANT_LABELS),),\n",
" tensor_from_file=variant_label_from_hd5,\n",
" channel_map=VARIANT_LABELS,\n",
")"
"def mnist_as_hd5(hd5_folder):\n",
" train, _, _ = load_data('mnist.pkl.gz')\n",
" mnist_images = train[0].reshape((-1, 28, 28, 1))\n",
" if not os.path.exists(hd5_folder):\n",
" os.makedirs(hd5_folder)\n",
" for i, mnist_image in enumerate(mnist_images):\n",
" with h5py.File(os.path.join(hd5_folder, f'{i}.hd5'), 'w') as hd5:\n",
" hd5.create_dataset('mnist_image', data=mnist_image)\n",
" hd5.create_dataset('mnist_label', data=[train[1][i]])\n",
" if (i+1) % 5000 == 0:\n",
" print(f'Wrote {i+1} MNIST images and labels as HD5 files')"
]
},
{
@@ -163,9 +176,7 @@
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(HD5_FOLDER):\n",
" os.makedirs(HD5_FOLDER)\n",
"!tar -zxvf ./hg002_na24385_ml4h_tensors_v2021_10_14.tar.gz -C ./tensors/"
"mnist_as_hd5(HD5_FOLDER)"
]
},
{
@@ -183,14 +194,6 @@
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1D CNN for Classification of Genomic Variants\n",
"Jupyter is great, but can complicate productionizing code. We try to mitigate this by interacting with the jupyter notebook as if it were a command line call to one of ml4h's modes. "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -199,31 +202,26 @@
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--batch_size', '16',\n",
" '--epochs', '12',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--id', 'learn_1d_cnn'\n",
" '--id', 'learn_2d_cnn'\n",
" ]\n",
"args = parse_args()\n",
"metrics = train_multimodal_multitask(args)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.read_tensor',\n",
" '--output_tensors', 'gatk.variant_label',\n",
"metrics = train_multimodal_multitask(args)\n",
"\n",
"sys.argv = ['train',\n",
" '--tensors', HD5_FOLDER,\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--activation', 'mish',\n",
" '--dense_blocks', '64', '64', '64',\n",
" '--batch_size', '16',\n",
" '--epochs', '12',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--id', 'learn_2d_cnn'\n",
" '--id', 'learn_2d_cnn2'\n",
" ]\n",
"args = parse_args()\n",
"metrics = train_multimodal_multitask(args)"
Expand All @@ -244,12 +242,12 @@
"source": [
"sys.argv = ['compare_scalar', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference', 'gatk.read_tensor',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--id', 'gatk_model_comparison',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--id', 'mnist_model_comparison',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--model_files', f'{OUTPUT_FOLDER}learn_1d_cnn/learn_1d_cnn.h5',\n",
" f'{OUTPUT_FOLDER}learn_2d_cnn/learn_2d_cnn.h5',\n",
" '--model_files', f'{OUTPUT_FOLDER}learn_2d_cnn/learn_2d_cnn.h5',\n",
" f'{OUTPUT_FOLDER}learn_2d_cnn2/learn_2d_cnn2.h5',\n",
" '--test_steps', '100', \n",
" '--batch_size', '16',\n",
" ]\n",
@@ -275,8 +273,8 @@
"source": [
"sys.argv = ['train', \n",
" '--tensors', HD5_FOLDER, \n",
" '--input_tensors', 'gatk.reference',\n",
" '--output_tensors', 'gatk.variant_label',\n",
" '--input_tensors', 'mnist.mnist_image',\n",
" '--output_tensors', 'mnit.mnist_label',\n",
" '--output_folder', OUTPUT_FOLDER,\n",
" '--activation', 'swish',\n",
" '--conv_layers', '32',\n",
@@ -289,7 +287,7 @@
" '--inspect_model',\n",
" '--epochs', '1',\n",
" '--batch_size', '4',\n",
" '--id', 'hypertuned_1d',\n",
" '--id', 'hypertuned_2d',\n",
" ]\n",
"args = parse_args()\n",
"generate_train, generate_valid, generate_test = test_train_valid_tensor_generators(**args.__dict__)\n",
@@ -300,7 +298,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"After running the cell above the diagram of the model architecture will be saved at: `./outputs/hypertuned_1d/architecture_graph_hypertuned_1d.png`"
"After running the cell above the diagram of the model architecture will be saved at: `./outputs/hypertuned_2d/architecture_graph_hypertuned_2d.png`"
]
}
],
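As a quick sanity check (illustrative only, not part of the committed notebook), one of the files written by mnist_as_hd5 above can be read back with h5py to confirm the dataset names and shapes the training cells depend on:

import h5py

# Assumes mnist_as_hd5(HD5_FOLDER) has already run with HD5_FOLDER = './mnist_tensors/'.
with h5py.File('./mnist_tensors/0.hd5', 'r') as hd5:
    print(hd5['mnist_image'].shape)  # expected: (28, 28, 1)
    print(hd5['mnist_label'][0])     # a single integer class label, 0-9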
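The command-line names 'mnist.mnist_image' and 'mnist.mnist_label' used in the training cells are resolved by ml4h to TensorMap objects. Below is a minimal sketch of what those maps could look like, mirroring the removed GATK definitions and the datasets written by mnist_as_hd5; the import path ml4h.TensorMap and the exact definitions shipped in ml4h.tensormap.mnist are assumptions and may differ from the repo.

from typing import Dict

import h5py
import numpy as np
from ml4h.TensorMap import TensorMap, Interpretation


def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    # Read the dataset named after the TensorMap straight out of the HD5 file.
    return np.array(hd5[tm.name])


# 28x28 grayscale image, stored by mnist_as_hd5 under the key 'mnist_image'.
mnist_image = TensorMap('mnist_image', shape=(28, 28, 1), tensor_from_file=tensor_from_hd5)


def label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    # One-hot encode the integer class label stored under 'mnist_label'.
    one_hot = np.zeros(tm.shape, dtype=np.float32)
    one_hot[int(hd5['mnist_label'][0])] = 1.0
    return one_hot


mnist_label = TensorMap(
    'mnist_label', Interpretation.CATEGORICAL,
    shape=(10,),
    channel_map={f'digit_{i}': i for i in range(10)},
    tensor_from_file=label_from_hd5,
)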
4 changes: 3 additions & 1 deletion setup.py
@@ -3,6 +3,7 @@

here = pathlib.Path(__file__).parent.resolve()
# Get the requirements from the requirements file
requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8')
long_description = (here / 'README.md').read_text(encoding='utf-8')
setup(
name='ml4h',
@@ -12,6 +13,7 @@
long_description_content_type='text/markdown',
url='https://github.com/broadinstitute/ml4h',
python_requires='>=3.6',
install_requires=["ml4ht", "tensorflow", "pytest", "numcodecs"], # requirements
#install_requires=["ml4ht", "tensorflow", "pytest", "numcodecs"], # requirements
install_requires=requirements,
packages=find_packages(),
)
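Here read_text returns the requirements file as one newline-delimited string, which setuptools generally accepts for install_requires. An explicit list is a common alternative; a minimal sketch of that approach (not what this commit does), skipping blank lines and comments:

import pathlib

here = pathlib.Path(__file__).parent.resolve()
requirements_text = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8')
requirements = [
    line.strip()
    for line in requirements_text.splitlines()
    if line.strip() and not line.strip().startswith('#')
]
# Then pass the list to setuptools: setup(..., install_requires=requirements, ...)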
