
BYOD Penobscot #390

Merged · 5 commits · Jul 2, 2020
19 changes: 18 additions & 1 deletion README.md
@@ -127,7 +127,9 @@ cd ..
```
Refer to the script itself for more argument options.

#### Bring your own SEG-Y data
#### Bring Your Own Data [BYOD]

##### Bring your own SEG-Y data

If you want to train these models using your own seismic and label data, the files need to be prepped and
converted to npy files. Typically, the [segyio](https://pypi.org/project/segyio/) library can be used to open SEG-Y files that follow the standard, but more often than not there are non-standard settings or missing traces that will cause segyio to fail. If this happens with your data, read these notebooks and scripts to help prepare your data files (a minimal segyio sketch follows the list):
@@ -137,6 +139,21 @@
* [segy_convert_sample notebook](contrib/segyconverter/segy_convert_sample.ipynb) - Details on SEG-Y data conversion
* [segy_sample_files notebook](contrib/segyconverter/segy_sample_files.ipynb) - Create test SEG-Y files that describe the scenarios that may cause issues when converting the data to numpy arrays
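
For SEG-Y files that do follow the standard, the conversion itself is short. Below is a minimal sketch (file names are illustrative); `ignore_geometry=True` lets segyio load trace data even when the inline/crossline geometry headers are unreliable:

```python
import numpy as np
import segyio

# open without enforcing survey geometry, so files with missing traces
# or non-standard headers still load (as a flat list of traces)
with segyio.open("my_survey.segy", "r", ignore_geometry=True) as f:
    traces = f.trace.raw[:]  # 2D array: n_traces x n_samples

np.save("my_survey.npy", traces.astype("float32"))
```

Note that with `ignore_geometry=True` you get a flat trace array rather than a 3D cube; reshaping it into an inline x crossline x depth volume is exactly what the notebooks above walk through.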

##### Penobscot example

We also offer starter code to convert the [Penobscot](https://arxiv.org/abs/1905.04307) dataset (available [here](https://zenodo.org/record/3924682))
into the tensor format used by the Dutch F3 dataset - once converted, you can run Penobscot through the same
mechanisms as the Dutch F3 dataset. The rough sequence of steps is:

```bash
conda activate seismic-interpretation
cd scripts
wget -o /dev/null -O dataset.h5 "https://zenodo.org/record/3924682/files/dataset.h5?download=1"
# convert penobscot
python byod_penobscot.py --filename dataset.h5 --outdir <where to output data>
# preprocess for experiments
python prepare_dutchf3.py split_train_val patch --data_dir=<outdir from the previous step> --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100
```
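
Before launching training, it can be worth sanity-checking the converted tensors. A minimal sketch, assuming `--outdir` above was set to the (hypothetical) path `interim/penobscot`:

```python
import numpy as np

# path is hypothetical - substitute the value passed to --outdir above
seismic = np.load("interim/penobscot/train/train_seismic.npy")
labels = np.load("interim/penobscot/train/train_labels.npy")

assert seismic.shape == labels.shape  # inline x crossline x depth
print("volume shape:", seismic.shape)
print("amplitude range:", seismic.min(), seismic.max())  # rescaled to [-1, 1]
print("label classes:", np.unique(labels))  # 7 classes after merging two of the original 8
```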

### Run Examples

149 changes: 149 additions & 0 deletions scripts/byod_penobscot.py
@@ -0,0 +1,149 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""
Run example:
python byod_penobscot.py --filename <input HDF5 file> --outdir <where to output data>
python prepare_dutchf3.py split_train_val patch --data_dir=<outdir from the previous step> --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100

"""
""" libraries """
import argparse
import logging
import os

import h5py
import numpy as np

np.set_printoptions(linewidth=200)

# toggle to WARNING when running in production, or use CLI
logging.getLogger().setLevel(logging.DEBUG)
# logging.getLogger().setLevel(logging.WARNING)

parser = argparse.ArgumentParser()

""" useful information when running from a GIT folder."""
myname = os.path.realpath(__file__)
mypath = os.path.dirname(myname)
myname = os.path.basename(myname)

def main(args):
    """
    Transforms Penobscot HDF5 dataset into DeepSeismic Tensor Format
    """

    logging.info("loading data")
    f = h5py.File(args.filename, "r")
    data = f["features"][:, :, :, 0]
    labels = f["label"][:, :, :]
    assert labels.min() == 0
    n_classes = labels.max() + 1
    assert n_classes == N_CLASSES

    # inline x depth x crossline, make it inline x crossline x depth
    data = np.swapaxes(data, 1, 2)
    labels = np.swapaxes(labels, 1, 2)

    # make the data cube contiguous in memory so it is fast to access
    data = np.ascontiguousarray(data, "float32")
    labels = np.ascontiguousarray(labels, "uint8")

    # combine classes 4 and 5 (index 3 and 4) - shift others down
    labels[labels > 3] -= 1

    # rescale the data to the [range_min, range_max] range
    range_min, range_max = -1.0, 1.0
    data_std = (data - data.min()) / (data.max() - data.min())
    data = data_std * (range_max - range_min) + range_min

"""
# cut off a buffer zone around the volume (to avoid mislabeled data):
buffer = 25
data = data[:, buffer:-buffer, buffer:-buffer]
labels = labels[:, buffer:-buffer, buffer:-buffer]
"""

# time by crosslines by inlines
n_inlines = data.shape[0]
n_crosslines = data.shape[1]

inline_cut = int(np.floor(n_inlines * INLINE_FRACTION))
crossline_cut = int(np.floor(n_crosslines*CROSSLINE_FRACTION))

data_train = data[0:inline_cut,0:crossline_cut,:]
data_test1 = data[inline_cut:n_inlines,:,:]
data_test2 = data[:,crossline_cut:n_crosslines,:]

labels_train = labels[0:inline_cut, 0:crossline_cut, :]
labels_test1 = labels[inline_cut:n_inlines, :, :]
labels_test2 = labels[:, crossline_cut:n_crosslines, :]

    def mkdir(dirname):

        if os.path.isdir(dirname) and os.path.exists(dirname):
            return

        if not os.path.isdir(dirname) and os.path.exists(dirname):
            # a regular file is in the way and os.mkdir below will fail
            logging.info("remove file %s and run this script again", dirname)

        os.mkdir(dirname)

    mkdir(args.outdir)
    mkdir(os.path.join(args.outdir, "splits"))
    mkdir(os.path.join(args.outdir, "train"))
    mkdir(os.path.join(args.outdir, "test_once"))

    np.save(os.path.join(args.outdir, "train", "train_seismic.npy"), data_train)
    np.save(os.path.join(args.outdir, "train", "train_labels.npy"), labels_train)

    np.save(os.path.join(args.outdir, "test_once", "test1_seismic.npy"), data_test1)
    np.save(os.path.join(args.outdir, "test_once", "test1_labels.npy"), labels_test1)

    np.save(os.path.join(args.outdir, "test_once", "test2_seismic.npy"), data_test2)
    np.save(os.path.join(args.outdir, "test_once", "test2_labels.npy"), labels_test2)

    # compute class weights: less frequent classes receive larger weights
    class_values, class_count = np.unique(labels[:], return_counts=True)
    class_weights = 1 - class_count / np.sum(class_count)
    logging.info("CLASS WEIGHTS TO USE")
    logging.info(class_weights)
    logging.info("MEAN")
    logging.info(data.mean())
    logging.info("STANDARD DEVIATION")
    logging.info(data.std())

""" GLOBAL VARIABLES """
INLINE_FRACTION = 0.7
CROSSLINE_FRACTION = 1.0
N_CLASSES = 8

parser.add_argument("--filename", help="Name of HDF5 data", type=str, required=True)
parser.add_argument("--outdir", help="Output data directory location", type=str, required=True)

""" main wrapper with profiler """
if __name__ == "__main__":
main(parser.parse_args())

# pretty printing of the stack
"""
try:
    logging.info('before main')
    main(parser.parse_args())
    logging.info('after main')
except:
    for frame in traceback.extract_tb(sys.exc_info()[2]):
        fname, lineno, fn, text = frame
        print("Error in %s on line %d" % (fname, lineno))
"""
# optionally enable profiling information
# import cProfile
# name = <insert_name_here>
# cProfile.run('main.run()', name + '.prof')
# import pstats
# p = pstats.Stats(name + '.prof')
# p.sort_stats('cumulative').print_stats(10)
# p.sort_stats('time').print_stats()
3 changes: 3 additions & 0 deletions scripts/gen_synthetic_data.py
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

""" Please see the def main() function for code description."""

""" libraries """
8 changes: 8 additions & 0 deletions tests/cicd/main_build.yml
@@ -52,6 +52,14 @@ jobs:

./tests/cicd/src/scripts/get_data_for_builds.sh ${DATA_ROOT}

# taken from https://zenodo.org/record/3924682
# paper https://arxiv.org/abs/1905.04307
# TODO: enable when Penobscot is ready to be provided in the repo - rough sequence of steps below
# cd scripts
# wget -o /dev/null -O dataset.h5 "https://zenodo.org/record/3924682/files/dataset.h5?download=1"
# python byod_penobscot.py --filename dataset.h5 --outdir <where to output data>
# python prepare_dutchf3.py split_train_val patch --data_dir=<outdir from the previous step> --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100

# copy your model files like so - using dummy file to illustrate
azcopy --quiet --source:https://$(storagename).blob.core.windows.net/models/model --source-key $(storagekey) --destination /home/alfred/models/your_model_name

4 changes: 3 additions & 1 deletion tests/cicd/src/scripts/get_data_for_builds.sh
@@ -50,4 +50,6 @@ python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --label_
DATA_CHECKERBOARD="${DATA_CHECKERBOARD}/data"
# repeat for checkerboard dataset
python prepare_dutchf3.py split_train_val section --data_dir=${DATA_CHECKERBOARD} --label_file=train/train_labels.npy --output_dir=splits --split_direction=both
python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_CHECKERBOARD} --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100 --split_direction=both --section_stride=100