From 5f7d1aa296ceeb7eafd41669faa1060406cb7a34 Mon Sep 17 00:00:00 2001 From: risquass Date: Wed, 12 Feb 2020 14:15:17 -0300 Subject: [PATCH 01/21] select contiguous data splits for test and train --- scripts/prepare_dutchf3.py | 322 ++++++++++++++++++++++++---------- tests/test_prepare_dutchf3.py | 286 ++++++++++++++++++++++++++++++ 2 files changed, 512 insertions(+), 96 deletions(-) create mode 100644 tests/test_prepare_dutchf3.py diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py index 2e9e57f9..bf5c0886 100644 --- a/scripts/prepare_dutchf3.py +++ b/scripts/prepare_dutchf3.py @@ -9,49 +9,61 @@ import logging.config import math import warnings -from os import path +from os import path, mkdir import fire import numpy as np from sklearn.model_selection import train_test_split -def _get_splits_path(data_dir): - return path.join(data_dir, "splits") - - -def _get_labels_path(data_dir): - return path.join(data_dir, "train", "train_labels.npy") - - def _write_split_files(splits_path, train_list, test_list, loader_type): - file_object = open(path.join(splits_path, loader_type + "_train_val.txt"), "w") + if not path.isdir(splits_path): + mkdir(splits_path) + file_object = open(path.join(splits_path, + loader_type + "_train_val.txt"), "w") file_object.write("\n".join(train_list + test_list)) file_object.close() - file_object = open(path.join(splits_path, loader_type + "_train.txt"), "w") + file_object = open(path.join(splits_path, + loader_type + "_train.txt"), "w") file_object.write("\n".join(train_list)) file_object.close() - file_object = open(path.join(splits_path, loader_type + "_val.txt"), "w") + file_object = open(path.join(splits_path, + loader_type + "_val.txt"), "w") file_object.write("\n".join(test_list)) file_object.close() -def _get_aline_range(aline, per_val): - # Inline sections - test_aline = math.floor(aline * per_val / 2) - test_aline_range = itertools.chain(range(0, test_aline), range(aline - test_aline, aline)) - train_aline_range = range(test_aline, aline - test_aline) - - return train_aline_range, test_aline_range +def _get_aline_range(aline, per_val, slice_steps): + try: + if slice_steps < 1: + raise ValueError('slice_steps cannot be zero or a negative number') + # Inline and Crossline sections + test_aline = math.floor(aline * per_val / 2) + test_aline_range = itertools.chain(range(0, test_aline), + range(aline - test_aline, aline)) + train_aline_range = range(test_aline, aline - test_aline, slice_steps) + return train_aline_range, test_aline_range + except (Exception, ValueError): + raise -def split_section_train_val(data_dir, per_val=0.2, log_config=None): +def split_section_train_val(data_dir, output_dir, label_file, per_val=0.2, + log_config=None, slice_steps=1): """Generate train and validation files for Netherlands F3 dataset. Args: data_dir (str): data directory path - per_val (float, optional): the fraction of the volume to use for validation. - Defaults to 0.2. + output_dir (str): directory under data_dir to store the split files + label_file (str): npy files with labels. Stored in data_dir + stride (int): stride to use when sectioning of the volume + patch_size (int): size of patch to extract + per_val (float, optional): the fraction of the volume to use for + validation. Defaults to 0.2. + log_config (str): path to log configurations + slice_steps (int): increment to the slices count. + If slice_steps > 1 the function will skip: + slice_steps - 1 slice. + Defaults to 1, do not skip any slice. 
""" if log_config is not None: @@ -62,19 +74,23 @@ def split_section_train_val(data_dir, per_val=0.2, log_config=None): logger.info("Splitting data into sections .... ") logger.info(f"Reading data from {data_dir}") - labels_path = _get_labels_path(data_dir) - logger.info(f"Loading {labels_path}") - labels = np.load(labels_path) + # labels_path = _get_labels_path(data_dir) + logger.info(f"Loading {label_file}") + labels = np.load(label_file) logger.debug(f"Data shape [iline|xline|depth] {labels.shape}") iline, xline, _ = labels.shape # Inline sections - train_iline_range, test_iline_range = _get_aline_range(iline, per_val) + train_iline_range, test_iline_range = _get_aline_range(iline, + per_val, + slice_steps) train_i_list = ["i_" + str(i) for i in train_iline_range] test_i_list = ["i_" + str(i) for i in test_iline_range] # Xline sections - train_xline_range, test_xline_range = _get_aline_range(xline, per_val) + train_xline_range, test_xline_range = _get_aline_range(xline, + per_val, + slice_steps) train_x_list = ["x_" + str(x) for x in train_xline_range] test_x_list = ["x_" + str(x) for x in test_xline_range] @@ -82,19 +98,29 @@ def split_section_train_val(data_dir, per_val=0.2, log_config=None): test_list = test_x_list + test_i_list # write to files to disk - splits_path = _get_splits_path(data_dir) - _write_split_files(splits_path, train_list, test_list, "section") + # splits_path = _get_splits_path(data_dir) + # _write_split_files(splits_path, train_list, test_list, "section") + logger.info(f"Writing {output_dir}") + _write_split_files(output_dir, train_list, test_list, "section") -def split_patch_train_val(data_dir, stride, patch, per_val=0.2, log_config=None): +def split_patch_train_val(data_dir, output_dir, label_file, stride, patch_size, + slice_steps=1, per_val=0.2, log_config=None): """Generate train and validation files for Netherlands F3 dataset. Args: data_dir (str): data directory path + output_dir (str): directory under data_dir to store the split files + label_file (str): npy files with labels. Stored in data_dir stride (int): stride to use when sectioning of the volume - patch (int): size of patch to extract - per_val (float, optional): the fraction of the volume to use for validation. - Defaults to 0.2. + patch_size (int): size of patch to extract + per_val (float, optional): the fraction of the volume to use for + validation. Defaults to 0.2. + log_config (str): path to log configurations + slice_steps (int): increment to the slices count. + If slice_steps > 1 the function will skip: + slice_steps - 1 slice. + Defaults to 1, do not skip any slice. """ if log_config is not None: @@ -105,53 +131,109 @@ def split_patch_train_val(data_dir, stride, patch, per_val=0.2, log_config=None) logger.info("Splitting data into patches .... 
") logger.info(f"Reading data from {data_dir}") - labels_path = _get_labels_path(data_dir) - logger.info(f"Loading {labels_path}") - labels = np.load(labels_path) + # labels_path = _get_labels_path(data_dir) + logger.info(f"Loading {label_file}") + labels = np.load(label_file) logger.debug(f"Data shape [iline|xline|depth] {labels.shape}") iline, xline, depth = labels.shape # Inline sections - train_iline_range, test_iline_range = _get_aline_range(iline, per_val) + train_iline_range, test_iline_range = _get_aline_range(iline, + per_val, + slice_steps) # Xline sections - train_xline_range, test_xline_range = _get_aline_range(xline, per_val) + train_xline_range, test_xline_range = _get_aline_range(xline, + per_val, + slice_steps) # Generate patches from sections - # Process inlines - horz_locations = range(0, xline - stride, stride) - vert_locations = range(0, depth - stride, stride) - logger.debug("Generating Inline patches") - logger.debug(horz_locations) + # Vertical locations is common to all patches processed + vert_locations = range(0, depth - patch_size, stride) logger.debug(vert_locations) + # Process inlines + # iline = xline x depth + test_iline = math.floor(xline * per_val / 2) + logger.debug(test_iline) + def _i_extract_patches(iline_range, horz_locations, vert_locations): for i in iline_range: - locations = ([j, k] for j in horz_locations for k in vert_locations) + locations = ([j, k] for j in horz_locations + for k in vert_locations) for j, k in locations: yield "i_" + str(i) + "_" + str(j) + "_" + str(k) + # Process inlines - train + horz_locations_train = range(test_iline, xline - patch_size, stride) + logger.debug("Generating Inline patches") + logger.debug("Generating Inline patches - Train") + logger.debug(horz_locations_train) + train_i_list = list(_i_extract_patches(train_iline_range, + horz_locations_train, + vert_locations)) + # Process inlines - test - begining + test_iline_range = list(test_iline_range) - test_i_list = list(_i_extract_patches(test_iline_range, horz_locations, vert_locations)) - train_i_list = list(_i_extract_patches(train_iline_range, horz_locations, vert_locations)) + + # test_iline - define size of the test set for the fist part + + logger.debug("Generating Inline patches - Test") + horz_locations_test_begin = range(0, test_iline, max(1,stride)) + test_i_list = list(_i_extract_patches(test_iline_range, + horz_locations_test_begin, + vert_locations)) + # Process inlines - test - end + horz_locations_test_end = range(xline - stride + 1, xline, max(1,stride)) + test_i_list += list(_i_extract_patches(test_iline_range, + horz_locations_test_end, + vert_locations)) + logger.debug(train_iline_range) logger.debug(test_iline_range) # Process crosslines - horz_locations = range(0, iline - stride, stride) - vert_locations = range(0, depth - stride, stride) - logger.debug("Generating Crossline patches") - logger.debug(horz_locations) - logger.debug(vert_locations) - def _x_extract_patches(xline_range, horz_locations, vert_locations): for j in xline_range: - locations = ([i, k] for i in horz_locations for k in vert_locations) + locations = ([i, k] for i in horz_locations + for k in vert_locations) for i, k in locations: yield "x_" + str(i) + "_" + str(j) + "_" + str(k) + + logger.debug("Generating Crossline patches") + logger.debug("Generating Crossline patches - Train") + # xline = iline x depth + test_xline = math.floor(iline * per_val / 2) + logger.debug(test_xline) + + # Process xlines - train + horz_locations_train = range(test_xline, iline - patch_size, 
stride) + logger.debug(horz_locations_train) + train_x_list = list(_x_extract_patches(train_xline_range, + horz_locations_train, + vert_locations)) + + # Process xlines - test - begining + test_xline_range = list(test_xline_range) - test_x_list = list(_x_extract_patches(test_xline_range, horz_locations, vert_locations)) - train_x_list = list(_x_extract_patches(train_xline_range, horz_locations, vert_locations)) + + # test_iline - define size of the test set for the fist part + logger.debug("Generating Inline patches - Test") + horz_locations_test_begin = range(0, test_xline, max(1,stride)) + test_x_list = list(_i_extract_patches(test_xline_range, + horz_locations_test_begin, + vert_locations)) + # Process xlines - test - end + horz_locations_test_end = range(iline - stride + 1, iline, max(1,stride)) + test_x_list += list(_i_extract_patches(test_xline_range, + horz_locations_test_end, + vert_locations)) + + + test_xline_range = list(test_xline_range) + test_x_list = list(_x_extract_patches(test_xline_range, + horz_locations_test_end, + vert_locations)) logger.debug(train_xline_range) logger.debug(test_xline_range) @@ -159,12 +241,15 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations): test_list = test_x_list + test_i_list # write to files to disk: - # NOTE: This isn't quite right we should calculate the patches again for the whole volume - splits_path = _get_splits_path(data_dir) - _write_split_files(splits_path, train_list, test_list, "patch") + # NOTE: This isn't quite right we should calculate the patches + # again for the whole volume + # splits_path = _get_splits_path(data_dir) + # _write_split_files(splits_path, train_list, test_list, "patch") + logger.info(f"Writing {output_dir}") + _write_split_files(output_dir, train_list, test_list, "patch") - -_LOADER_TYPES = {"section": split_section_train_val, "patch": split_patch_train_val} +_LOADER_TYPES = {"section": split_section_train_val, + "patch": split_patch_train_val} def get_split_function(loader_type): @@ -176,23 +261,27 @@ def run_split_func(loader_type, *args, **kwargs): split_func(*args, **kwargs) -def split_alaudah_et_al_19(data_dir, stride, fraction_validation=0.2, loader_type="patch", log_config=None): - """Generate train and validation files (with overlap) for Netherlands F3 dataset. - The original split method from https://github.com/olivesgatech/facies_classification_benchmark +def split_alaudah_et_al_19(data_dir, output_dir, labels_file, stride, + patch_size, fraction_validation=0.2, + loader_type="patch", log_config=None): + """Generate train and validation files (with overlap) for Netherlands F3 + dataset. The original split method from + https://github.com/olivesgatech/facies_classification_benchmark DON'T USE, SEE NOTES BELOW Args: data_dir (str): data directory path stride (int): stride to use when sectioning of the volume - fraction_validation (float, optional): the fraction of the volume to use for validation. - Defaults to 0.2. - loader_type (str, optional): type of data loader, can be "patch" or "section". - Defaults to "patch". + patch_size (int): size of patch to extract + fraction_validation (float, optional): the fraction of the volume to + use for validation. Defaults to 0.2. + loader_type (str, optional): type of data loader, can be "patch" + or "section". Defaults to "patch". log_config (str, optional): path to log config. Defaults to None. Notes: - Only kept for reproducibility. It generates overlapping train and val which makes - validation results unreliable. 
+ Only kept for reproducibility. It generates overlapping train and + val which makes validation results unreliable. """ if log_config is not None: @@ -211,9 +300,9 @@ def split_alaudah_et_al_19(data_dir, stride, fraction_validation=0.2, loader_typ logger.info("Reading data from {data_dir}") - labels_path = _get_labels_path(data_dir) - logger.info("Loading {labels_path}") - labels = np.load(labels_path) + # labels_path = _get_labels_path(data_dir) + logger.info("Loading {labels_file}") + labels = np.load(labels_file) iline, xline, depth = labels.shape logger.debug(f"Data shape [iline|xline|depth] {labels.shape}") @@ -222,29 +311,33 @@ def split_alaudah_et_al_19(data_dir, stride, fraction_validation=0.2, loader_typ x_list = ["x_" + str(x) for x in range(xline)] elif loader_type == "patch": i_list = [] - horz_locations = range(0, xline - stride, stride) - vert_locations = range(0, depth - stride, stride) + horz_locations = range(0, xline - patch_size + 1, stride) + vert_locations = range(0, depth - patch_size + 1, stride) logger.debug("Generating Inline patches") logger.debug(horz_locations) logger.debug(vert_locations) for i in range(iline): # for every inline: # images are references by top-left corner: - locations = [[j, k] for j in horz_locations for k in vert_locations] - patches_list = ["i_" + str(i) + "_" + str(j) + "_" + str(k) for j, k in locations] + locations = [[j, k] for j in horz_locations + for k in vert_locations] + patches_list = ["i_" + str(i) + "_" + str(j) + "_" + str(k) + for j, k in locations] i_list.append(patches_list) # flatten the list i_list = list(itertools.chain(*i_list)) x_list = [] - horz_locations = range(0, iline - stride, stride) - vert_locations = range(0, depth - stride, stride) + horz_locations = range(0, iline - patch_size + 1, stride) + vert_locations = range(0, depth - patch_size + 1, stride) for j in range(xline): # for every xline: # images are references by top-left corner: - locations = [[i, k] for i in horz_locations for k in vert_locations] - patches_list = ["x_" + str(i) + "_" + str(j) + "_" + str(k) for i, k in locations] + locations = [[i, k] for i in horz_locations + for k in vert_locations] + patches_list = ["x_" + str(i) + "_" + str(j) + "_" + str(k) + for i, k in locations] x_list.append(patches_list) # flatten the list @@ -253,47 +346,84 @@ def split_alaudah_et_al_19(data_dir, stride, fraction_validation=0.2, loader_typ list_train_val = i_list + x_list # create train and test splits: - train_list, test_list = train_test_split(list_train_val, test_size=fraction_validation, shuffle=True) + train_list, test_list = train_test_split(list_train_val, + test_size=fraction_validation, + shuffle=True) # write to files to disk: - splits_path = _get_splits_path(data_dir) - _write_split_files(splits_path, train_list, test_list, loader_type) + # splits_path = _get_splits_path(data_dir) + # _write_split_files(splits_path, train_list, test_list, loader_type) + logger.info(f"Writing {output_dir}") + _write_split_files(output_dir, train_list, test_list, loader_type) # TODO: Try https://github.com/Chilipp/docrep for doscstring reuse class SplitTrainValCLI(object): - def section(self, data_dir, per_val=0.2, log_config="logging.conf"): - """Generate section based train and validation files for Netherlands F3 dataset. + def section(self, data_dir, label_file, per_val=0.2, + log_config="logging.conf", output_dir=None, + slice_steps=1): + """Generate section based train and validation files for Netherlands F3 + dataset. 
Args: data_dir (str): data directory path - per_val (float, optional): the fraction of the volume to use for validation. - Defaults to 0.2. + output_dir (str): directory under data_dir to store the split files + label_file (str): npy files with labels. Stored in data_dir + stride (int): stride to use when sectioning of the volume + patch_size (int): size of patch to extract + per_val (float, optional): the fraction of the volume to use for + validation. Defaults to 0.2. + log_config (str): path to log configurations + slice_steps (int): increment to the slices count. + If slice_steps > 1 the function will skip: + slice_steps - 1 slice. + Defaults to 1, do not skip any slice. log_config (str): path to log configurations """ - return split_section_train_val(data_dir, per_val=per_val, log_config=log_config) + if data_dir is not None: + label_file = path.join(data_dir, label_file) + output_dir = path.join(data_dir, output_dir) + return split_section_train_val(data_dir, output_dir, label_file, + slice_steps, per_val, log_config) - def patch(self, data_dir, stride, patch, per_val=0.2, log_config="logging.conf"): - """Generate patch based train and validation files for Netherlands F3 dataset. + def patch(self, label_file, stride, patch_size, + per_val=0.2, log_config="train/deepseismic/configs/logging.conf", + data_dir=None, output_dir=None, slice_steps=1): + """Generate train and validation files for Netherlands F3 dataset. Args: data_dir (str): data directory path + output_dir (str): directory under data_dir to store the split files + label_file (str): npy files with labels. Stored in data_dir stride (int): stride to use when sectioning of the volume - patch (int): size of patch to extract - per_val (float, optional): the fraction of the volume to use for validation. - Defaults to 0.2. + patch_size (int): size of patch to extract + per_val (float, optional): the fraction of the volume to use for + validation. Defaults to 0.2. log_config (str): path to log configurations + slice_steps (int): increment to the slices count. + If slice_steps > 1 the function will skip: + slice_steps - 1 slice. + Defaults to 1, do not skip any slice. 
""" - return split_patch_train_val(data_dir, stride, patch, per_val=per_val, log_config=log_config) + if data_dir is not None: + label_file = path.join(data_dir, label_file) + output_dir = path.join(data_dir, output_dir) + return split_patch_train_val(output_dir, label_file, + stride, patch_size, + per_val, log_config, + slice_steps) if __name__ == "__main__": """Example: python prepare_data.py split_train_val section --data-dir=/mnt/dutch or - python prepare_data.py split_train_val patch --data-dir=/mnt/dutch --stride=50 --patch=100 + python prepare_dutchf3.py split_train_val patch --output_dir=splits \ + --data_dir=data/ --slice_steps=2 --stride=50 \ + --patch size=100 --label_file=label_file.npy """ fire.Fire( - {"split_train_val": SplitTrainValCLI, "split_alaudah_et_al_19": split_alaudah_et_al_19,} + {"split_train_val": SplitTrainValCLI, + "split_alaudah_et_al_19": split_alaudah_et_al_19} ) diff --git a/tests/test_prepare_dutchf3.py b/tests/test_prepare_dutchf3.py new file mode 100644 index 00000000..e268ddbf --- /dev/null +++ b/tests/test_prepare_dutchf3.py @@ -0,0 +1,286 @@ +"""Test the extract functions against a variety of SEGY files and trace_header scenarioes +""" +import pytest +import numpy as np +import pandas as pd +import tempfile +import scripts.prepare_dutchf3 as prep_dutchf3 + +# Setup +OUTPUT = None +ILINE = XLINE = DEPTH = 111 +ALINE = np.zeros((ILINE, XLINE, DEPTH)) +STRIDE = 100 +PATCH = 50 +PER_VAL = 0.2 +LOG_CONFIG = None + + +def test_get_aline_range_step_one(): + + """check if it includes the step in the range if step = 1 + """ + SLICE_STEPS = 1 + + # Test + output_iline = prep_dutchf3._get_aline_range(ILINE, PER_VAL, SLICE_STEPS) + output_xline = prep_dutchf3._get_aline_range(XLINE, PER_VAL, SLICE_STEPS) + + assert str(output_iline[0].step) == str(SLICE_STEPS) + assert str(output_xline[0].step) == str(SLICE_STEPS) + + +def test_get_aline_range_step_zero(): + + """check if a ValueError exception is raised when slice_steps = 0 + """ + with pytest.raises(ValueError, match=r'slice_steps cannot be zero or a negative number'): + SLICE_STEPS = 0 + + # Test + output_iline = prep_dutchf3._get_aline_range(ILINE, PER_VAL, SLICE_STEPS) + output_xline = prep_dutchf3._get_aline_range(XLINE, PER_VAL, SLICE_STEPS) + + assert output_iline + assert output_xline + + +def test_get_aline_range_negative_step(): + + """check if a ValueError exception is raised when slice_steps = -1 + """ + with pytest.raises(ValueError, match='slice_steps cannot be zero or a negative number'): + SLICE_STEPS = -1 + + # Test + output_iline = prep_dutchf3._get_aline_range(ILINE, PER_VAL, SLICE_STEPS) + output_xline = prep_dutchf3._get_aline_range(XLINE, PER_VAL, SLICE_STEPS) + + assert output_iline + assert output_xline + + +def test_get_aline_range_float_step(): + + """check if a ValueError exception is raised when slice_steps = 1.1 + """ + with pytest.raises(TypeError, match="'float' object cannot be interpreted as an integer"): + SLICE_STEPS = 1. 
+ + # Test + output_iline = prep_dutchf3._get_aline_range(ILINE, PER_VAL, SLICE_STEPS) + output_xline = prep_dutchf3._get_aline_range(XLINE, PER_VAL, SLICE_STEPS) + + assert output_iline + assert output_xline + + +def test_get_aline_range_single_digit_step(): + + """check if it includes the step in the range if 1 < step < 10 + """ + SLICE_STEPS = 1 + # Test + output_iline = prep_dutchf3._get_aline_range(ILINE, PER_VAL, SLICE_STEPS) + output_xline = prep_dutchf3._get_aline_range(XLINE, PER_VAL, SLICE_STEPS) + + assert str(output_iline[0].step) == str(SLICE_STEPS) + assert str(output_xline[0].step) == str(SLICE_STEPS) + + +def test_get_aline_range_double_digit_step(): + + """check if it includes the step in the range if step > 10 + """ + SLICE_STEPS = 17 + # Test + output_iline = prep_dutchf3._get_aline_range(ILINE, PER_VAL, SLICE_STEPS) + output_xline = prep_dutchf3._get_aline_range(XLINE, PER_VAL, SLICE_STEPS) + + assert str(output_iline[0].step) == str(SLICE_STEPS) + assert str(output_xline[0].step) == str(SLICE_STEPS) + + +def test_prepare_dutchf3_patch_step_1(): + + """check a complete run for the script in case further changes are needed + """ + # setting a value to SLICE_STEPS as needed to test the values + SLICE_STEPS = 1 + + # use a temp dir that will be discarded at the end of the execution + with tempfile.TemporaryDirectory() as tmpdirname: + + # saving the file to be used by the script + label_file = tmpdirname + '/label_file.npy' + np.save(label_file, ALINE) + + # stting the output directory to be used by the script + output = tmpdirname + '/split' + + # calling the main function of the script without SLICE_STEPS, to check default value + prep_dutchf3.split_patch_train_val(data_dir=tmpdirname, output_dir=output, label_file=label_file, + slice_steps=SLICE_STEPS, stride=STRIDE, patch_size=PATCH, per_val=PER_VAL,log_config=LOG_CONFIG) + + # reading the file and splitting the data + patch_train = pd.read_csv(output + '/patch_train.txt', header=None, names=['row', 'a', 'b']) + patch_train = pd.DataFrame(patch_train.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z']) + + # test patch_train and slice_steps=1 + y = list(sorted(set(patch_train.y.astype(int)))) + x = list(sorted(set(patch_train.x.astype(int)),reverse=True)) + assert (float(y[2]) - float(y[1])) == float(SLICE_STEPS) + assert (float(x[1]) - float(x[2])) == float(SLICE_STEPS) + + # reading the file and splitting the data + patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b']) + patch_val = pd.DataFrame(patch_val.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z']) + + # test patch_val and slice_steps=1 + y = list(sorted(set(patch_val.y.astype(int)))) + x = list(sorted(set(patch_val.x.astype(int)),reverse=True)) + assert (float(y[2]) - float(y[1])) == float(SLICE_STEPS) + assert (float(x[1]) - float(x[2])) == float(SLICE_STEPS) + +def test_prepare_dutchf3_patch_step_2(): + + """check a complete run for the script in case further changes are needed + """ + # setting a value to SLICE_STEPS as needed to test the values + SLICE_STEPS = 2 + + # use a temp dir that will be discarded at the end of the execution + with tempfile.TemporaryDirectory() as tmpdirname: + + # saving the file to be used by the script + label_file = tmpdirname + '/label_file.npy' + np.save(label_file, ALINE) + + # stting the output directory to be used by the script + output = tmpdirname + '/split' + + # calling the main function of the script without SLICE_STEPS, to check default value + 
prep_dutchf3.split_patch_train_val(data_dir=tmpdirname, output_dir=output, label_file=label_file, + slice_steps=SLICE_STEPS, stride=STRIDE, patch_size=PATCH, per_val=PER_VAL,log_config=LOG_CONFIG) + + # reading the file and splitting the data + patch_train = pd.read_csv(output + '/patch_train.txt', header=None, names=['row', 'a', 'b']) + patch_train = pd.DataFrame(patch_train.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z']) + + # test patch_train and slice_steps=2 + y = list(sorted(set(patch_train.y.astype(int)))) + x = list(sorted(set(patch_train.x.astype(int)),reverse=True)) + assert (float(y[2]) - float(y[1])) == float(SLICE_STEPS) + assert (float(x[1]) - float(x[2])) == float(SLICE_STEPS) + + # reading the file and splitting the data + patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b']) + patch_val = pd.DataFrame(patch_val.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z']) + + # test patch_val and slice_steps=2 + y = list(sorted(set(patch_val.y.astype(int)))) + x = list(sorted(set(patch_val.x.astype(int)),reverse=True)) + assert (float(y[2]) - float(y[1])) != float(SLICE_STEPS) + assert (float(x[1]) - float(x[2])) != float(SLICE_STEPS) + +def test_prepare_dutchf3_patch_train_and_test_sets(): + + """check a complete run for the script in case further changes are needed + """ + # setting a value to SLICE_STEPS as needed to test the values + SLICE_STEPS = 2 + + # use a temp dir that will be discarded at the end of the execution + with tempfile.TemporaryDirectory() as tmpdirname: + + # saving the file to be used by the script + label_file = tmpdirname + '/label_file.npy' + np.save(label_file, ALINE) + + # stting the output directory to be used by the script + output = tmpdirname + '/split' + + # calling the main function of the script without SLICE_STEPS, to check default value + prep_dutchf3.split_patch_train_val(data_dir=tmpdirname, output_dir=output, label_file=label_file, + slice_steps=SLICE_STEPS, stride=STRIDE, patch_size=PATCH, per_val=PER_VAL,log_config=LOG_CONFIG) + + # reading the file and splitting the data + patch_train = pd.read_csv(output + '/patch_train.txt', header=None, names=['row', 'a', 'b']) + patch_train = pd.DataFrame(patch_train.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z']) + + # reading the file and splitting the data + patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b']) + patch_val = pd.DataFrame(patch_val.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z']) + + y_train = set(patch_train.y) + x_train = set(patch_train.x) + y_val = set(patch_val.y) + x_val = set(patch_val.x) + + # The sets must not contain values common to both + assert y_train & y_val == set() + assert x_train & x_val == set() + + +def test_prepare_dutchf3_section_step_1(): + + """check a complete run for the script in case further changes are needed + """ + # setting a value to SLICE_STEPS as needed to test the values + SLICE_STEPS = 1 + + # use a temp dir that will be discarded at the end of the execution + with tempfile.TemporaryDirectory() as tmpdirname: + + # saving the file to be used by the script + label_file = tmpdirname + '/label_file.npy' + np.save(label_file, ALINE) + + # stting the output directory to be used by the script + output = tmpdirname + '/split' + + # calling the main function of the script without SLICE_STEPS, to check default value + prep_dutchf3.split_section_train_val(data_dir=tmpdirname, output_dir=output, 
label_file=label_file,slice_steps=SLICE_STEPS, per_val=PER_VAL, log_config=LOG_CONFIG) + + # reading the file and splitting the data + section_train = pd.read_csv(output + '/section_train.txt', header=None, names=['row']) + section_train = pd.DataFrame(section_train.row.str.split('_').tolist(), columns=['aline', 'section']) + + section_val = pd.read_csv(output + '/section_val.txt', header=None, names=['row']) + section_val = pd.DataFrame(section_val.row.str.split('_').tolist(), columns=['aline', 'section']) + + # test + assert (float(section_train.section[1]) - float(section_train.section[0])) % float(SLICE_STEPS) == 0.0 + assert (float(section_val.section[1]) - float(section_val.section[0])) % float(SLICE_STEPS) == 0.0 + +def test_prepare_dutchf3_section_step_2(): + + """check a complete run for the script in case further changes are needed + """ + # setting a value to SLICE_STEPS as needed to test the values + SLICE_STEPS = 2 + + # use a temp dir that will be discarded at the end of the execution + with tempfile.TemporaryDirectory() as tmpdirname: + + # saving the file to be used by the script + label_file = tmpdirname + '/label_file.npy' + np.save(label_file, ALINE) + + # stting the output directory to be used by the script + output = tmpdirname + '/split' + + # calling the main function of the script without SLICE_STEPS, to check default value + prep_dutchf3.split_section_train_val(data_dir=tmpdirname, output_dir=output, label_file=label_file, + slice_steps=SLICE_STEPS, per_val=PER_VAL, log_config=LOG_CONFIG) + + # reading the file and splitting the data + section_train = pd.read_csv(output + '/section_train.txt', header=None, names=['row']) + section_train = pd.DataFrame(section_train.row.str.split('_').tolist(), columns=['aline', 'section']) + + section_val = pd.read_csv(output + '/section_val.txt', header=None, names=['row']) + section_val = pd.DataFrame(section_val.row.str.split('_').tolist(), columns=['aline', 'section']) + + # test + assert (float(section_train.section[1]) - float(section_train.section[0])) % float(SLICE_STEPS) == 0.0 + assert (float(section_val.section[1]) - float(section_val.section[0])) % float(SLICE_STEPS) > 0.0 From 85ed0b13f0680c1d31db69fbb2b4a3fdb38c9603 Mon Sep 17 00:00:00 2001 From: risquass Date: Wed, 12 Feb 2020 16:54:12 -0300 Subject: [PATCH 02/21] revert changes to split_alaudah_et_al_19 method --- scripts/prepare_dutchf3.py | 71 +++++++++++++------------------------- 1 file changed, 24 insertions(+), 47 deletions(-) diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py index bf5c0886..8ab95094 100644 --- a/scripts/prepare_dutchf3.py +++ b/scripts/prepare_dutchf3.py @@ -98,8 +98,6 @@ def split_section_train_val(data_dir, output_dir, label_file, per_val=0.2, test_list = test_x_list + test_i_list # write to files to disk - # splits_path = _get_splits_path(data_dir) - # _write_split_files(splits_path, train_list, test_list, "section") logger.info(f"Writing {output_dir}") _write_split_files(output_dir, train_list, test_list, "section") @@ -172,12 +170,10 @@ def _i_extract_patches(iline_range, horz_locations, vert_locations): train_i_list = list(_i_extract_patches(train_iline_range, horz_locations_train, vert_locations)) - # Process inlines - test - begining - - test_iline_range = list(test_iline_range) - # test_iline - define size of the test set for the fist part + test_iline_range = list(test_iline_range) + # Process inlines - test - begining logger.debug("Generating Inline patches - Test") horz_locations_test_begin = range(0, 
test_iline, max(1,stride)) test_i_list = list(_i_extract_patches(test_iline_range, @@ -213,11 +209,10 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations): horz_locations_train, vert_locations)) - # Process xlines - test - begining - + # test_xline - define size of the test set for the fist part test_xline_range = list(test_xline_range) - # test_iline - define size of the test set for the fist part + # Process xlines - test - begining logger.debug("Generating Inline patches - Test") horz_locations_test_begin = range(0, test_xline, max(1,stride)) test_x_list = list(_i_extract_patches(test_xline_range, @@ -229,7 +224,6 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations): horz_locations_test_end, vert_locations)) - test_xline_range = list(test_xline_range) test_x_list = list(_x_extract_patches(test_xline_range, horz_locations_test_end, @@ -243,8 +237,6 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations): # write to files to disk: # NOTE: This isn't quite right we should calculate the patches # again for the whole volume - # splits_path = _get_splits_path(data_dir) - # _write_split_files(splits_path, train_list, test_list, "patch") logger.info(f"Writing {output_dir}") _write_split_files(output_dir, train_list, test_list, "patch") @@ -261,27 +253,22 @@ def run_split_func(loader_type, *args, **kwargs): split_func(*args, **kwargs) -def split_alaudah_et_al_19(data_dir, output_dir, labels_file, stride, - patch_size, fraction_validation=0.2, - loader_type="patch", log_config=None): - """Generate train and validation files (with overlap) for Netherlands F3 - dataset. The original split method from - https://github.com/olivesgatech/facies_classification_benchmark +def split_alaudah_et_al_19(data_dir, stride, patch_size, fraction_validation=0.2, loader_type="patch", log_config=None): + """Generate train and validation files (with overlap) for Netherlands F3 dataset. + The original split method from https://github.com/olivesgatech/facies_classification_benchmark DON'T USE, SEE NOTES BELOW - Args: data_dir (str): data directory path stride (int): stride to use when sectioning of the volume patch_size (int): size of patch to extract - fraction_validation (float, optional): the fraction of the volume to - use for validation. Defaults to 0.2. - loader_type (str, optional): type of data loader, can be "patch" - or "section". Defaults to "patch". + fraction_validation (float, optional): the fraction of the volume to use for validation. + Defaults to 0.2. + loader_type (str, optional): type of data loader, can be "patch" or "section". + Defaults to "patch". log_config (str, optional): path to log config. Defaults to None. - Notes: - Only kept for reproducibility. It generates overlapping train and - val which makes validation results unreliable. + Only kept for reproducibility. It generates overlapping train and val which makes + validation results unreliable. 
""" if log_config is not None: @@ -300,9 +287,9 @@ def split_alaudah_et_al_19(data_dir, output_dir, labels_file, stride, logger.info("Reading data from {data_dir}") - # labels_path = _get_labels_path(data_dir) - logger.info("Loading {labels_file}") - labels = np.load(labels_file) + labels_path = _get_labels_path(data_dir) + logger.info("Loading {labels_path}") + labels = np.load(labels_path) iline, xline, depth = labels.shape logger.debug(f"Data shape [iline|xline|depth] {labels.shape}") @@ -319,10 +306,8 @@ def split_alaudah_et_al_19(data_dir, output_dir, labels_file, stride, for i in range(iline): # for every inline: # images are references by top-left corner: - locations = [[j, k] for j in horz_locations - for k in vert_locations] - patches_list = ["i_" + str(i) + "_" + str(j) + "_" + str(k) - for j, k in locations] + locations = [[j, k] for j in horz_locations for k in vert_locations] + patches_list = ["i_" + str(i) + "_" + str(j) + "_" + str(k) for j, k in locations] i_list.append(patches_list) # flatten the list @@ -334,10 +319,8 @@ def split_alaudah_et_al_19(data_dir, output_dir, labels_file, stride, for j in range(xline): # for every xline: # images are references by top-left corner: - locations = [[i, k] for i in horz_locations - for k in vert_locations] - patches_list = ["x_" + str(i) + "_" + str(j) + "_" + str(k) - for i, k in locations] + locations = [[i, k] for i in horz_locations for k in vert_locations] + patches_list = ["x_" + str(i) + "_" + str(j) + "_" + str(k) for i, k in locations] x_list.append(patches_list) # flatten the list @@ -346,15 +329,11 @@ def split_alaudah_et_al_19(data_dir, output_dir, labels_file, stride, list_train_val = i_list + x_list # create train and test splits: - train_list, test_list = train_test_split(list_train_val, - test_size=fraction_validation, - shuffle=True) + train_list, test_list = train_test_split(list_train_val, test_size=fraction_validation, shuffle=True) # write to files to disk: - # splits_path = _get_splits_path(data_dir) - # _write_split_files(splits_path, train_list, test_list, loader_type) - logger.info(f"Writing {output_dir}") - _write_split_files(output_dir, train_list, test_list, loader_type) + splits_path = _get_splits_path(data_dir) + _write_split_files(splits_path, train_list, test_list, loader_type) # TODO: Try https://github.com/Chilipp/docrep for doscstring reuse @@ -369,8 +348,6 @@ def section(self, data_dir, label_file, per_val=0.2, data_dir (str): data directory path output_dir (str): directory under data_dir to store the split files label_file (str): npy files with labels. Stored in data_dir - stride (int): stride to use when sectioning of the volume - patch_size (int): size of patch to extract per_val (float, optional): the fraction of the volume to use for validation. Defaults to 0.2. 
log_config (str): path to log configurations @@ -416,7 +393,7 @@ def patch(self, label_file, stride, patch_size, if __name__ == "__main__": """Example: - python prepare_data.py split_train_val section --data-dir=/mnt/dutch + python prepare_data.py split_train_val section --data_dir=/mnt/dutch or python prepare_dutchf3.py split_train_val patch --output_dir=splits \ --data_dir=data/ --slice_steps=2 --stride=50 \ From 51e6c678322b0c7a71f139fb70286df127a970e3 Mon Sep 17 00:00:00 2001 From: risquass Date: Wed, 12 Feb 2020 16:55:07 -0300 Subject: [PATCH 03/21] changed data-dir to data_dir as arg to prepare_dutchf3.py --- README.md | 4 ++-- contrib/scripts/ablation.sh | 8 ++++---- tests/cicd/main_build.yml | 12 +++++++++--- tests/cicd/src/scripts/get_data_for_builds.sh | 4 ++-- tests/test_prepare_dutchf3.py | 2 ++ 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 767cb66c..8b4e1acc 100644 --- a/README.md +++ b/README.md @@ -142,11 +142,11 @@ To prepare the data for the experiments (e.g. split into train/val/test), please cd scripts # For section-based experiments -python prepare_dutchf3.py split_train_val section --data-dir=${data_dir}/data +python prepare_dutchf3.py split_train_val section --data_dir=${data_dir}/data # For patch-based experiments -python prepare_dutchf3.py split_train_val patch --data-dir=${data_dir}/data --stride=50 --patch=100 +python prepare_dutchf3.py split_train_val patch --data_dir=${data_dir}/data --stride=50 --patch=100 # go back to repo root cd .. diff --git a/contrib/scripts/ablation.sh b/contrib/scripts/ablation.sh index 81fcdaa6..63991519 100755 --- a/contrib/scripts/ablation.sh +++ b/contrib/scripts/ablation.sh @@ -3,22 +3,22 @@ source activate seismic-interpretation # Patch_Size 100: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data-dir=/mnt/dutch --stride=50 --patch=100 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=100 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 100 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 100 --cfg 'configs/hrnet.yaml' # Patch_Size 150: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data-dir=/mnt/dutch --stride=50 --patch=150 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=150 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 150 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 150 --cfg 'configs/hrnet.yaml' # Patch_Size 200: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data-dir=/mnt/dutch --stride=50 --patch=200 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=200 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 200 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 200 --cfg 'configs/hrnet.yaml' # Patch_Size 250: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data-dir=/mnt/dutch --stride=50 --patch=250 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=250 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch 
TRAIN.PATCH_SIZE 250 TRAIN.AUGMENTATIONS.RESIZE.HEIGHT 250 TRAIN.AUGMENTATIONS.RESIZE.WIDTH 250 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 250 TRAIN.AUGMENTATIONS.RESIZE.HEIGHT 250 TRAIN.AUGMENTATIONS.RESIZE.WIDTH 250 --cfg 'configs/hrnet.yaml' diff --git a/tests/cicd/main_build.yml b/tests/cicd/main_build.yml index 4e0f145c..4e89388b 100644 --- a/tests/cicd/main_build.yml +++ b/tests/cicd/main_build.yml @@ -5,11 +5,13 @@ pr: - master - staging +- contrib # Any commit to this branch will trigger the build. trigger: - master - staging +- contrib jobs: # partially disable setup for now - done manually on build VM @@ -48,11 +50,15 @@ jobs: name: deepseismicagentpool steps: - bash: | - echo "Starting unit tests" + echo "Starting cv_lib unit tests" source activate seismic-interpretation pytest --durations=0 cv_lib/tests/ - echo "Unit test job passed" - + echo "cv_lib unit test job passed" + echo "" + echo "Starting scripts unit tests" + source activate seismic-interpretation + pytest --durations=0 tests/ + echo "Script unit test job passed" ################################################################################################### # LOCAL PATCH JOBS diff --git a/tests/cicd/src/scripts/get_data_for_builds.sh b/tests/cicd/src/scripts/get_data_for_builds.sh index e9d498bd..da11188e 100755 --- a/tests/cicd/src/scripts/get_data_for_builds.sh +++ b/tests/cicd/src/scripts/get_data_for_builds.sh @@ -39,5 +39,5 @@ DATA_F3="${DATA_F3}/data" # test preprocessing scripts cd scripts python prepare_penobscot.py split_inline --data-dir=${DATA_PENOBSCOT} --val-ratio=.1 --test-ratio=.2 -python prepare_dutchf3.py split_train_val section --data-dir=${DATA_F3} -python prepare_dutchf3.py split_train_val patch --data-dir=${DATA_F3} --stride=50 --patch=100 +python prepare_dutchf3.py split_train_val section --data_dir=${DATA_F3} +python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --stride=50 --patch=100 diff --git a/tests/test_prepare_dutchf3.py b/tests/test_prepare_dutchf3.py index e268ddbf..a6c9737b 100644 --- a/tests/test_prepare_dutchf3.py +++ b/tests/test_prepare_dutchf3.py @@ -1,3 +1,5 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
"""Test the extract functions against a variety of SEGY files and trace_header scenarioes """ import pytest From a81ab3bcc9695c9ed74be7f298ddd4c1df6b1248 Mon Sep 17 00:00:00 2001 From: risquass Date: Wed, 12 Feb 2020 17:22:58 -0300 Subject: [PATCH 04/21] changed data-dir to data_dir as arg to prepare_dutchf3.py --- contrib/scripts/ablation.sh | 8 ++++---- tests/cicd/main_build.yml | 1 - tests/cicd/src/scripts/get_data_for_builds.sh | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/contrib/scripts/ablation.sh b/contrib/scripts/ablation.sh index 63991519..6d5a0245 100755 --- a/contrib/scripts/ablation.sh +++ b/contrib/scripts/ablation.sh @@ -3,22 +3,22 @@ source activate seismic-interpretation # Patch_Size 100: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=100 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch_size=100 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 100 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 100 --cfg 'configs/hrnet.yaml' # Patch_Size 150: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=150 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch_size=150 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 150 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 150 --cfg 'configs/hrnet.yaml' # Patch_Size 200: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=200 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch_size=200 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 200 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 200 --cfg 'configs/hrnet.yaml' # Patch_Size 250: Patch vs Section Depth -python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch=250 +python scripts/prepare_dutchf3.py split_train_val patch --data_dir=/mnt/dutch --stride=50 --patch_size=250 python train.py OUTPUT_DIR /data/output/hrnet_patch TRAIN.DEPTH patch TRAIN.PATCH_SIZE 250 TRAIN.AUGMENTATIONS.RESIZE.HEIGHT 250 TRAIN.AUGMENTATIONS.RESIZE.WIDTH 250 --cfg 'configs/hrnet.yaml' python train.py OUTPUT_DIR /data/output/hrnet_section TRAIN.DEPTH section TRAIN.PATCH_SIZE 250 TRAIN.AUGMENTATIONS.RESIZE.HEIGHT 250 TRAIN.AUGMENTATIONS.RESIZE.WIDTH 250 --cfg 'configs/hrnet.yaml' diff --git a/tests/cicd/main_build.yml b/tests/cicd/main_build.yml index 4e89388b..a2b2c13b 100644 --- a/tests/cicd/main_build.yml +++ b/tests/cicd/main_build.yml @@ -14,7 +14,6 @@ trigger: - contrib jobs: -# partially disable setup for now - done manually on build VM - job: setup timeoutInMinutes: 10 displayName: Setup diff --git a/tests/cicd/src/scripts/get_data_for_builds.sh b/tests/cicd/src/scripts/get_data_for_builds.sh index da11188e..bacaa98a 100755 --- a/tests/cicd/src/scripts/get_data_for_builds.sh +++ b/tests/cicd/src/scripts/get_data_for_builds.sh @@ -40,4 +40,4 @@ DATA_F3="${DATA_F3}/data" cd scripts python prepare_penobscot.py split_inline --data-dir=${DATA_PENOBSCOT} --val-ratio=.1 
--test-ratio=.2 python prepare_dutchf3.py split_train_val section --data_dir=${DATA_F3} -python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --stride=50 --patch=100 +python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --stride=50 --patch_size=100 From a73737475536a108245428248f0efc3305e6a225 Mon Sep 17 00:00:00 2001 From: risquass Date: Wed, 12 Feb 2020 17:39:42 -0300 Subject: [PATCH 05/21] fix merge errors --- tests/cicd/aml_build.yml | 54 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tests/cicd/aml_build.yml diff --git a/tests/cicd/aml_build.yml b/tests/cicd/aml_build.yml new file mode 100644 index 00000000..a443e124 --- /dev/null +++ b/tests/cicd/aml_build.yml @@ -0,0 +1,54 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# Pull request against these branches will trigger this build +pr: +- master +- staging +- contrib + +# Any commit to this branch will trigger the build. +trigger: +- master +- staging +- contrib + +jobs: + +# partially disable setup for now - done manually on build VM +- job: setup + timeoutInMinutes: 10 + displayName: Setup + pool: + name: deepseismicagentpool + + steps: + - bash: | + # terminate as soon as any internal script fails + set -e + + echo "Running setup..." + pwd + ls + git branch + uname -ra + + # ENABLE ALL FOLLOWING CODE WHEN YOU'RE READY TO ADD AML BUILD - disabled right now + # ./scripts/env_reinstall.sh + # use hardcoded root for now because not sure how env changes under ADO policy + # DATA_ROOT="/home/alfred/data_dynamic" + # ./tests/cicd/src/scripts/get_data_for_builds.sh ${DATA_ROOT} + # copy your model files like so - using dummy file to illustrate + # azcopy --quiet --source:https://$(storagename).blob.core.windows.net/models/model --source-key $(storagekey) --destination /home/alfred/models/your_model_name + +- job: AML_job_placeholder + dependsOn: setup + timeoutInMinutes: 5 + displayName: AML job placeholder + pool: + name: deepseismicagentpool + steps: + - bash: | + # UNCOMMENT THIS WHEN YOU HAVE UNCOMMENTED THE SETUP JOB + # source activate seismic-interpretation + echo "TADA!!" 
From da9784286d8d02953c97710235290ba1b4c2c053 Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:06:31 -0300 Subject: [PATCH 06/21] fix merge errors - files not related to change --- README.md | 3 ++- examples/interpretation/README.md | 8 +++++--- .../F3_block_training_and_evaluation_local.ipynb | 5 +++-- .../notebooks/HRNet_Penobscot_demo_notebook.ipynb | 8 ++++++-- tests/cicd/component_governance.yml | 2 ++ tests/cicd/notebooks_build.yml | 13 ++++++++++--- tests/cicd/src/conftest.py | 7 ++++++- tests/cicd/src/notebook_integration_tests.py | 12 +++++++++--- 8 files changed, 43 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 8b4e1acc..2b0e219a 100644 --- a/README.md +++ b/README.md @@ -296,6 +296,8 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope | **Core Tests** | master | [![Build Status](https://dev.azure.com/best-practices/deepseismic/_apis/build/status/microsoft.Tests%20(seismic-deeplearning)?branchName=master)](https://dev.azure.com/best-practices/deepseismic/_build/latest?definitionId=126&branchName=master) | | **Notebook Tests** | staging | [![Build Status](https://dev.azure.com/best-practices/deepseismic/_apis/build/status/microsoft.Notebooks%20(seismic-deeplearning)?branchName=staging)](https://dev.azure.com/best-practices/deepseismic/_build/latest?definitionId=125&branchName=staging) | | **Notebook Tests** | master | [![Build Status](https://dev.azure.com/best-practices/deepseismic/_apis/build/status/microsoft.Notebooks%20(seismic-deeplearning)?branchName=master)](https://dev.azure.com/best-practices/deepseismic/_build/latest?definitionId=125&branchName=master) | +| **Azure ML Tests** | staging | TODO add badge link | +| **Azure ML Tests** | master | TODO add badge link | # Troubleshooting @@ -411,4 +413,3 @@ which will indicate that anaconda folder is __/anaconda__. We'll refer to this l - diff --git a/examples/interpretation/README.md b/examples/interpretation/README.md index 2706244f..4efe6b09 100644 --- a/examples/interpretation/README.md +++ b/examples/interpretation/README.md @@ -1,5 +1,7 @@ -The repository contains the following notebook examples -* [Dutch F3 dataset](notebooks/F3_block_training_and_evaluation_local.ipynb): This notebook illustrates section and patch based segmentation approaches on the [Dutch F3](https://terranubis.com/datainfo/Netherlands-Offshore-F3-Block-Complete) open dataset. +The folder contains notebook examples illustrating the use of segmentation algorithms on openly available datasets. Make sure you have followed the [set up instructions](../README.md) before running these examples. We provide the following notebook examples + +* [Dutch F3 dataset](notebooks/F3_block_training_and_evaluation_local.ipynb): This notebook illustrates section and patch based segmentation approaches on the [Dutch F3](https://terranubis.com/datainfo/Netherlands-Offshore-F3-Block-Complete) open dataset. This notebook uses denconvolution based segmentation algorithm on 2D patches. The notebook will guide you through visualization of the input volume, setting up model training and evaluation. 
+ * [Penobscot dataset](notebooks/HRNet_Penobscot_demo_notebook.ipynb): -This notebook illustrates the use of HRNet based segmentation algorithm on the [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset +In this notebook, we demonstrate how to train an [HRNet](https://github.com/HRNet/HRNet-Semantic-Segmentation) model for facies prediction using [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset. The Penobscot 3D seismic dataset was acquired in the Scotian shelf, offshore Nova Scotia, Canada. This notebook illustrates the use of HRNet based segmentation algorithm on the dataset. Details of HRNet based model can be found [here](https://arxiv.org/abs/1904.04514) diff --git a/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb b/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb index b2c54194..c4a280ee 100644 --- a/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb +++ b/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb @@ -262,7 +262,8 @@ "# The number of epochs to run in training\n", "max_epochs = config.TRAIN.END_EPOCH \n", "max_snapshots = config.TRAIN.SNAPSHOTS\n", - "dataset_root = config.DATASET.ROOT" + "dataset_root = config.DATASET.ROOT\n", + "model_pretrained = None" ] }, { @@ -1021,4 +1022,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb b/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb index 95f0b9b8..191c7be0 100644 --- a/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb +++ b/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb @@ -175,7 +175,8 @@ "# The number of epochs to run in training\n", "max_epochs = config.TRAIN.END_EPOCH \n", "max_snapshots = config.TRAIN.SNAPSHOTS\n", - "dataset_root = config.DATASET.ROOT" + "dataset_root = config.DATASET.ROOT\n", + "model_pretrained = config.MODEL.PRETRAINED" ] }, { @@ -186,7 +187,7 @@ "source": [ "# make sure data location exists and we're specifying a pre-trained model\n", "assert path.exists(dataset_root) , \"Path defined in DATASET.ROOT:%s does not exist\"%(dataset_root)\n", - "assert (not config.MODEL.PRETRAINED or path.exists(config.MODEL.PRETRAINED)), \"Model pre-trained path has to be empty or should exist: %s\"%(config.MODEL.PRETRAINED) " + "assert (not model_pretrained or path.exists(model_pretrained)), \"Model pre-trained path has to be empty or should exist: %s\"%(model_pretrained) " ] }, { @@ -420,6 +421,9 @@ "metadata": {}, "outputs": [], "source": [ + "# our models currently work through common config file, in order to run this notebook from Papermill\n", + "# in our tests we need to update the config with papermill setting. 
\n", + "config.MODEL.PRETRAINED = model_pretrained\n", "model = getattr(models, config.MODEL.NAME).get_seg_model(config)\n", "\n", "device = \"cpu\"\n", diff --git a/tests/cicd/component_governance.yml b/tests/cicd/component_governance.yml index cae6b7a9..172fb40a 100644 --- a/tests/cicd/component_governance.yml +++ b/tests/cicd/component_governance.yml @@ -10,10 +10,12 @@ pr: - master - staging +- contrib trigger: - master - staging +- contrib pool: vmImage: 'ubuntu-latest' diff --git a/tests/cicd/notebooks_build.yml b/tests/cicd/notebooks_build.yml index 48a7c9a3..bd3c1007 100644 --- a/tests/cicd/notebooks_build.yml +++ b/tests/cicd/notebooks_build.yml @@ -5,15 +5,16 @@ pr: - master - staging +- contrib # Any commit to this branch will trigger the build. trigger: - master - staging +- contrib jobs: -# partially disable setup for now - done manually on build VM - job: setup timeoutInMinutes: 10 displayName: Setup @@ -50,7 +51,10 @@ jobs: steps: - bash: | source activate seismic-interpretation - pytest -s tests/cicd/src/notebook_integration_tests.py --nbname examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb --dataset_root /home/alfred/data_dynamic/penobscot + pytest -s tests/cicd/src/notebook_integration_tests.py \ + --nbname examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb \ + --dataset_root /home/alfred/data_dynamic/penobscot \ + --model_pretrained /home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth - job: F3_block_training_and_evaluation_local dependsOn: setup @@ -61,4 +65,7 @@ jobs: steps: - bash: | source activate seismic-interpretation - pytest -s tests/cicd/src/notebook_integration_tests.py --nbname examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb --dataset_root /home/alfred/data_dynamic/dutch_f3/data + pytest -s tests/cicd/src/notebook_integration_tests.py \ + --nbname examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb \ + --dataset_root /home/alfred/data_dynamic/dutch_f3/data \ + --model_pretrained /home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth \ No newline at end of file diff --git a/tests/cicd/src/conftest.py b/tests/cicd/src/conftest.py index c222d3b9..b84e9c34 100644 --- a/tests/cicd/src/conftest.py +++ b/tests/cicd/src/conftest.py @@ -7,6 +7,7 @@ def pytest_addoption(parser): parser.addoption("--nbname", action="store", type=str, default=None) parser.addoption("--dataset_root", action="store", type=str, default=None) + parser.addoption("--model_pretrained", action="store", type=str, default=None) @pytest.fixture @@ -18,6 +19,10 @@ def nbname(request): def dataset_root(request): return request.config.getoption("--dataset_root") +@pytest.fixture +def model_pretrained(request): + return request.config.getoption("--model_pretrained") + """ def pytest_generate_tests(metafunc): @@ -29,4 +34,4 @@ def pytest_generate_tests(metafunc): option_value = metafunc.config.option.dataset_root if 'dataset_root' in metafunc.fixturenames and option_value is not None: metafunc.parametrize("dataset_root", [option_value]) -""" +""" \ No newline at end of file diff --git a/tests/cicd/src/notebook_integration_tests.py b/tests/cicd/src/notebook_integration_tests.py index 1c0ccbf8..4bddc97d 100644 --- a/tests/cicd/src/notebook_integration_tests.py +++ b/tests/cicd/src/notebook_integration_tests.py @@ -9,11 +9,17 @@ # don't add any markup as this just runs any notebook which name is supplied # @pytest.mark.integration # @pytest.mark.notebooks -def test_notebook_run(nbname, dataset_root): +def 
test_notebook_run(nbname, dataset_root, model_pretrained): pm.execute_notebook( nbname, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, - parameters={"max_iterations": 3, "max_epochs": 1, "max_snapshots": 1, "dataset_root": dataset_root}, + parameters={ + "max_iterations": 3, + "max_epochs": 1, + "max_snapshots": 1, + "dataset_root": dataset_root, + "model_pretrained": model_pretrained, + }, cwd="examples/interpretation/notebooks", - ) + ) \ No newline at end of file From 123db24d5536848c28b5b41b108cc6e467e7d97e Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:14:34 -0300 Subject: [PATCH 07/21] fix merge errors - other diff - files not related to change --- README.md | 2 +- examples/interpretation/README.md | 2 +- .../notebooks/F3_block_training_and_evaluation_local.ipynb | 2 +- .../notebooks/HRNet_Penobscot_demo_notebook.ipynb | 2 +- tests/cicd/notebooks_build.yml | 6 +++--- tests/cicd/src/conftest.py | 2 +- tests/cicd/src/notebook_integration_tests.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2b0e219a..f1c480a3 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ python prepare_dutchf3.py split_train_val section --data_dir=${data_dir}/data # For patch-based experiments -python prepare_dutchf3.py split_train_val patch --data_dir=${data_dir}/data --stride=50 --patch=100 +python prepare_dutchf3.py split_train_val patch --data_dir=${data_dir}/data --stride=50 --patch_size=100 # go back to repo root cd .. diff --git a/examples/interpretation/README.md b/examples/interpretation/README.md index 4efe6b09..edd043eb 100644 --- a/examples/interpretation/README.md +++ b/examples/interpretation/README.md @@ -4,4 +4,4 @@ The folder contains notebook examples illustrating the use of segmentation algor * [Penobscot dataset](notebooks/HRNet_Penobscot_demo_notebook.ipynb): -In this notebook, we demonstrate how to train an [HRNet](https://github.com/HRNet/HRNet-Semantic-Segmentation) model for facies prediction using [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset. The Penobscot 3D seismic dataset was acquired in the Scotian shelf, offshore Nova Scotia, Canada. This notebook illustrates the use of HRNet based segmentation algorithm on the dataset. Details of HRNet based model can be found [here](https://arxiv.org/abs/1904.04514) +In this notebook, we demonstrate how to train an [HRNet](https://github.com/HRNet/HRNet-Semantic-Segmentation) model for facies prediction using [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset. The Penobscot 3D seismic dataset was acquired in the Scotian shelf, offshore Nova Scotia, Canada. This notebook illustrates the use of HRNet based segmentation algorithm on the dataset. 
Details of HRNet based model can be found [here](https://arxiv.org/abs/1904.04514) \ No newline at end of file diff --git a/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb b/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb index c4a280ee..61270609 100644 --- a/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb +++ b/examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb @@ -1022,4 +1022,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb b/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb index 191c7be0..fed506bd 100644 --- a/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb +++ b/examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb @@ -666,4 +666,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/tests/cicd/notebooks_build.yml b/tests/cicd/notebooks_build.yml index bd3c1007..8dd03519 100644 --- a/tests/cicd/notebooks_build.yml +++ b/tests/cicd/notebooks_build.yml @@ -55,7 +55,7 @@ jobs: --nbname examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb \ --dataset_root /home/alfred/data_dynamic/penobscot \ --model_pretrained /home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth - + - job: F3_block_training_and_evaluation_local dependsOn: setup timeoutInMinutes: 5 @@ -63,9 +63,9 @@ jobs: pool: name: deepseismicagentpool steps: - - bash: | + - bash: | source activate seismic-interpretation pytest -s tests/cicd/src/notebook_integration_tests.py \ --nbname examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb \ --dataset_root /home/alfred/data_dynamic/dutch_f3/data \ - --model_pretrained /home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth \ No newline at end of file + --model_pretrained /home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth diff --git a/tests/cicd/src/conftest.py b/tests/cicd/src/conftest.py index b84e9c34..95f8eea3 100644 --- a/tests/cicd/src/conftest.py +++ b/tests/cicd/src/conftest.py @@ -34,4 +34,4 @@ def pytest_generate_tests(metafunc): option_value = metafunc.config.option.dataset_root if 'dataset_root' in metafunc.fixturenames and option_value is not None: metafunc.parametrize("dataset_root", [option_value]) -""" \ No newline at end of file +""" diff --git a/tests/cicd/src/notebook_integration_tests.py b/tests/cicd/src/notebook_integration_tests.py index 4bddc97d..12593f75 100644 --- a/tests/cicd/src/notebook_integration_tests.py +++ b/tests/cicd/src/notebook_integration_tests.py @@ -22,4 +22,4 @@ def test_notebook_run(nbname, dataset_root, model_pretrained): "model_pretrained": model_pretrained, }, cwd="examples/interpretation/notebooks", - ) \ No newline at end of file + ) From e88b5b6d654fbb9f2555ce360811625d4c4eb200 Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:16:14 -0300 Subject: [PATCH 08/21] fix merge errors - other diff - files not related to change --- examples/interpretation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/interpretation/README.md b/examples/interpretation/README.md index edd043eb..4efe6b09 100644 --- a/examples/interpretation/README.md +++ b/examples/interpretation/README.md @@ -4,4 +4,4 @@ The folder contains notebook examples illustrating the use of segmentation algor * [Penobscot 
dataset](notebooks/HRNet_Penobscot_demo_notebook.ipynb): -In this notebook, we demonstrate how to train an [HRNet](https://github.com/HRNet/HRNet-Semantic-Segmentation) model for facies prediction using [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset. The Penobscot 3D seismic dataset was acquired in the Scotian shelf, offshore Nova Scotia, Canada. This notebook illustrates the use of HRNet based segmentation algorithm on the dataset. Details of HRNet based model can be found [here](https://arxiv.org/abs/1904.04514) \ No newline at end of file +In this notebook, we demonstrate how to train an [HRNet](https://github.com/HRNet/HRNet-Semantic-Segmentation) model for facies prediction using [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset. The Penobscot 3D seismic dataset was acquired in the Scotian shelf, offshore Nova Scotia, Canada. This notebook illustrates the use of HRNet based segmentation algorithm on the dataset. Details of HRNet based model can be found [here](https://arxiv.org/abs/1904.04514) From 66ce29e0ddf338c3f4edfa3d79c00473a4dd2557 Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:22:58 -0300 Subject: [PATCH 09/21] fix merge errors - other diff - files not related to change --- examples/interpretation/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/interpretation/README.md b/examples/interpretation/README.md index 4efe6b09..01bc2dff 100644 --- a/examples/interpretation/README.md +++ b/examples/interpretation/README.md @@ -5,3 +5,4 @@ The folder contains notebook examples illustrating the use of segmentation algor * [Penobscot dataset](notebooks/HRNet_Penobscot_demo_notebook.ipynb): In this notebook, we demonstrate how to train an [HRNet](https://github.com/HRNet/HRNet-Semantic-Segmentation) model for facies prediction using [Penobscot](https://terranubis.com/datainfo/Penobscot) dataset. The Penobscot 3D seismic dataset was acquired in the Scotian shelf, offshore Nova Scotia, Canada. This notebook illustrates the use of HRNet based segmentation algorithm on the dataset. Details of HRNet based model can be found [here](https://arxiv.org/abs/1904.04514) + From a9176e090e70d8d86f8ff147a68feaf1172b70ae Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:25:28 -0300 Subject: [PATCH 10/21] fix merge errors - other diff - files not related to change --- tests/cicd/notebooks_build.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/cicd/notebooks_build.yml b/tests/cicd/notebooks_build.yml index 8dd03519..3d394cea 100644 --- a/tests/cicd/notebooks_build.yml +++ b/tests/cicd/notebooks_build.yml @@ -5,16 +5,15 @@ pr: - master - staging -- contrib # Any commit to this branch will trigger the build. 
trigger: - master - staging -- contrib jobs: +# partially disable setup for now - done manually on build VM - job: setup timeoutInMinutes: 10 displayName: Setup @@ -55,7 +54,7 @@ jobs: --nbname examples/interpretation/notebooks/HRNet_Penobscot_demo_notebook.ipynb \ --dataset_root /home/alfred/data_dynamic/penobscot \ --model_pretrained /home/alfred/models/hrnetv2_w48_imagenet_pretrained.pth - + - job: F3_block_training_and_evaluation_local dependsOn: setup timeoutInMinutes: 5 @@ -63,7 +62,7 @@ jobs: pool: name: deepseismicagentpool steps: - - bash: | + - bash: | source activate seismic-interpretation pytest -s tests/cicd/src/notebook_integration_tests.py \ --nbname examples/interpretation/notebooks/F3_block_training_and_evaluation_local.ipynb \ From 91c49e97ac19d155dbcefc430cbfbc3140cef3c3 Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:26:48 -0300 Subject: [PATCH 11/21] fix merge errors - other diff - files not related to change --- tests/cicd/notebooks_build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/cicd/notebooks_build.yml b/tests/cicd/notebooks_build.yml index 3d394cea..eac04aab 100644 --- a/tests/cicd/notebooks_build.yml +++ b/tests/cicd/notebooks_build.yml @@ -5,15 +5,16 @@ pr: - master - staging +- contrib # Any commit to this branch will trigger the build. trigger: - master - staging +- contrib jobs: -# partially disable setup for now - done manually on build VM - job: setup timeoutInMinutes: 10 displayName: Setup From 0da3ab4b3fd9b6db3c77d552f2b04215431bb12b Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 09:45:08 -0300 Subject: [PATCH 12/21] update script with new required parameter label_file --- tests/cicd/src/scripts/get_data_for_builds.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cicd/src/scripts/get_data_for_builds.sh b/tests/cicd/src/scripts/get_data_for_builds.sh index bacaa98a..d7a0dabb 100755 --- a/tests/cicd/src/scripts/get_data_for_builds.sh +++ b/tests/cicd/src/scripts/get_data_for_builds.sh @@ -39,5 +39,5 @@ DATA_F3="${DATA_F3}/data" # test preprocessing scripts cd scripts python prepare_penobscot.py split_inline --data-dir=${DATA_PENOBSCOT} --val-ratio=.1 --test-ratio=.2 -python prepare_dutchf3.py split_train_val section --data_dir=${DATA_F3} -python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --stride=50 --patch_size=100 +python prepare_dutchf3.py split_train_val section --data_dir=${DATA_F3} --label_file=${DATA_F3}/train/train_labels.npy +python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --label_file=${DATA_F3}/train/train_labels.npy --stride=50 --patch_size=100 From 909c3d8db10b84629f228acccf66fe48e68e9927 Mon Sep 17 00:00:00 2001 From: risquass Date: Thu, 13 Feb 2020 10:02:04 -0300 Subject: [PATCH 13/21] ignoring split_alaudah_et_al_19 as it is not updated --- scripts/prepare_dutchf3.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py index 869da009..2c94f888 100644 --- a/scripts/prepare_dutchf3.py +++ b/scripts/prepare_dutchf3.py @@ -401,6 +401,8 @@ def patch(self, label_file, stride, patch_size, --patch size=100 --label_file=label_file.npy """ fire.Fire( - {"split_train_val": SplitTrainValCLI, - "split_alaudah_et_al_19": split_alaudah_et_al_19} + {"split_train_val": SplitTrainValCLI} + # commenting the following line as this was not updated with + # the new parameters names + # "split_alaudah_et_al_19": split_alaudah_et_al_19} ) From 
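Aside on the `fire.Fire` change above: passing a dict maps each key to a command group, and a class value exposes its methods as subcommands, which is why dropping the `split_alaudah_et_al_19` entry removes that CLI path entirely. A minimal sketch of this routing (toy `Splits` class in a hypothetical `toy_cli.py`, not the real `SplitTrainValCLI`):

import fire

class Splits:
    # each public method becomes a subcommand of `split_train_val`
    def section(self, data_dir, label_file, slice_steps=1):
        return f"would split {label_file} under {data_dir}, step {slice_steps}"

if __name__ == "__main__":
    # usage: python toy_cli.py split_train_val section --data_dir=d --label_file=l.npy
    fire.Fire({"split_train_val": Splits})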
From ecac9e8bb4f5af7cb8a5ef18aab239a4e3364af7 Mon Sep 17 00:00:00 2001
From: risquass
Date: Thu, 13 Feb 2020 10:22:26 -0300
Subject: [PATCH 14/21] updated arguments used

---
 README.md                                     | 5 +++--
 scripts/prepare_dutchf3.py                    | 7 ++++---
 tests/cicd/src/scripts/get_data_for_builds.sh | 4 ++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index f1c480a3..bfc8c424 100644
--- a/README.md
+++ b/README.md
@@ -142,11 +142,12 @@ To prepare the data for the experiments (e.g. split into train/val/test), please
 cd scripts
 
 # For section-based experiments
-python prepare_dutchf3.py split_train_val section --data_dir=${data_dir}/data
+python prepare_dutchf3.py split_train_val section --data_dir=${data_dir} --label_file=train/train_labels.npy --output_dir=splits
 
 
 # For patch-based experiments
-python prepare_dutchf3.py split_train_val patch --data_dir=${data_dir}/data --stride=50 --patch_size=100
+python prepare_dutchf3.py split_train_val patch --data_dir=${data_dir} --label_file=train/train_labels.npy --output_dir=splits \
+--stride=50 --patch_size=100
 
 # go back to repo root
 cd ..
diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py
index 2c94f888..6896092d 100644
--- a/scripts/prepare_dutchf3.py
+++ b/scripts/prepare_dutchf3.py
@@ -394,11 +394,12 @@ def patch(self, label_file, stride, patch_size,
 
 if __name__ == "__main__":
     """Example:
-    python prepare_data.py split_train_val section --data_dir=/mnt/dutch
+    python prepare_data.py split_train_val section --data_dir=/mnt/dutch \
+        --label_file=label_file.npy --output_dir=splits --slice_steps=2
     or
-    python prepare_dutchf3.py split_train_val patch --output_dir=splits \
-        --data_dir=data/ --slice_steps=2 --stride=50 \
-        --patch size=100 --label_file=label_file.npy
+    python prepare_dutchf3.py split_train_val patch --output_dir=splits \
+        --data_dir=data/ --slice_steps=2 --stride=50 --patch_size=100 \
+        --label_file=label_file.npy
     """
     fire.Fire(
         {"split_train_val": SplitTrainValCLI}
diff --git a/tests/cicd/src/scripts/get_data_for_builds.sh b/tests/cicd/src/scripts/get_data_for_builds.sh
index d7a0dabb..e9302e14 100755
--- a/tests/cicd/src/scripts/get_data_for_builds.sh
+++ b/tests/cicd/src/scripts/get_data_for_builds.sh
@@ -39,5 +39,5 @@ DATA_F3="${DATA_F3}/data"
 # test preprocessing scripts
 cd scripts
 python prepare_penobscot.py split_inline --data-dir=${DATA_PENOBSCOT} --val-ratio=.1 --test-ratio=.2
-python prepare_dutchf3.py split_train_val section --data_dir=${DATA_F3} --label_file=${DATA_F3}/train/train_labels.npy
-python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --label_file=${DATA_F3}/train/train_labels.npy --stride=50 --patch_size=100
+python prepare_dutchf3.py split_train_val section --data_dir=${DATA_F3} --label_file=train/train_labels.npy --output_dir=splits
+python prepare_dutchf3.py split_train_val patch --data_dir=${DATA_F3} --label_file=train/train_labels.npy --output_dir=splits --stride=50 --patch_size=100

From 22817f6c8121e47c32c3048cbb4c249b3b851a86 Mon Sep 17 00:00:00 2001
From: risquass
Date: Thu, 13 Feb 2020 11:21:23 -0300
Subject: [PATCH 15/21] fix command line arguments

---
 scripts/prepare_dutchf3.py | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py
index 6896092d..d8ed6d14 100644
--- a/scripts/prepare_dutchf3.py
+++ b/scripts/prepare_dutchf3.py
@@ -361,11 +361,15 @@ def section(self, data_dir, label_file, per_val=0.2,
         if data_dir is not None:
             label_file = path.join(data_dir, label_file)
             output_dir = path.join(data_dir, output_dir)
-        return split_section_train_val(data_dir, output_dir, label_file,
-                                       slice_steps, per_val, log_config)
+        return split_section_train_val(data_dir=data_dir,
+                                       output_dir=output_dir,
+                                       label_file=label_file,
+                                       slice_steps=slice_steps,
+                                       per_val=per_val,
+                                       log_config=log_config)
 
     def patch(self, label_file, stride, patch_size,
-              per_val=0.2, log_config="train/deepseismic/configs/logging.conf",
+              per_val=0.2, log_config="logging.conf",
               data_dir=None, output_dir=None, slice_steps=1):
         """Generate train and validation files for Netherlands F3 dataset.
 
@@ -386,20 +390,25 @@ def patch(self, label_file, stride, patch_size,
         if data_dir is not None:
             label_file = path.join(data_dir, label_file)
             output_dir = path.join(data_dir, output_dir)
-        return split_patch_train_val(output_dir, label_file,
-                                     stride, patch_size,
-                                     per_val, log_config,
-                                     slice_steps)
+
+        return split_patch_train_val(data_dir=data_dir,
+                                     output_dir=output_dir,
+                                     label_file=label_file,
+                                     stride=stride,
+                                     patch_size=patch_size,
+                                     slice_steps=slice_steps,
+                                     per_val=per_val,
+                                     log_config=log_config)
 
 
 if __name__ == "__main__":
     """Example:
-    python prepare_data.py split_train_val section --data_dir=/mnt/dutch \
+    python prepare_data.py split_train_val section --data_dir=data \
         --label_file=label_file.npy --output_dir=splits --slice_steps=2
     or
-    python prepare_dutchf3.py split_train_val patch --output_dir=splits \
-        --data_dir=data/ --slice_steps=2 --stride=50 --patch_size=100 \
-        --label_file=label_file.npy
+    python prepare_dutchf3.py split_train_val patch --data_dir=data \
+        --label_file=label_file.npy --output_dir=splits --stride=50 \
+        --patch_size=100 --slice_steps=2
     """
     fire.Fire(
         {"split_train_val": SplitTrainValCLI}
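Aside on PATCH 15: calling `split_*_train_val` with explicit keywords guards against the positional mix-up the patch fixes, where values silently landed in the wrong parameters after the signatures were reordered. A short sketch of that failure mode (hypothetical `split` function, not the real signature):

def split(data_dir, output_dir, label_file, slice_steps=1, per_val=0.2):
    return slice_steps, per_val

# positional call: per_val lands in slice_steps' slot without any error
assert split("data", "splits", "labels.npy", 0.2, 1) == (0.2, 1)

# keyword call: immune to parameter reordering
assert split(data_dir="data", output_dir="splits", label_file="labels.npy",
             slice_steps=1, per_val=0.2) == (1, 0.2)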
From 5a88ad0641eccca39ac22b26620e8e721779709f Mon Sep 17 00:00:00 2001
From: risquass
Date: Thu, 13 Feb 2020 14:38:41 -0300
Subject: [PATCH 16/21] changed TEST to VALIDATION for clarity in the code

---
 scripts/prepare_dutchf3.py | 112 ++++++++++++++++++-------------------
 1 file changed, 55 insertions(+), 57 deletions(-)

diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py
index d8ed6d14..a5b8d3ca 100644
--- a/scripts/prepare_dutchf3.py
+++ b/scripts/prepare_dutchf3.py
@@ -16,12 +16,12 @@ from sklearn.model_selection import train_test_split
 
 
-def _write_split_files(splits_path, train_list, test_list, loader_type):
+def _write_split_files(splits_path, train_list, val_list, loader_type):
     if not path.isdir(splits_path):
         mkdir(splits_path)
     file_object = open(path.join(splits_path,
                        loader_type + "_train_val.txt"), "w")
-    file_object.write("\n".join(train_list + test_list))
+    file_object.write("\n".join(train_list + val_list))
     file_object.close()
     file_object = open(path.join(splits_path,
                        loader_type + "_train.txt"), "w")
@@ -29,7 +29,7 @@ def _write_split_files(splits_path, train_list, test_list, loader_type):
     file_object.close()
     file_object = open(path.join(splits_path,
                        loader_type + "_val.txt"), "w")
-    file_object.write("\n".join(test_list))
+    file_object.write("\n".join(val_list))
     file_object.close()
 
@@ -38,11 +38,11 @@ def _get_aline_range(aline, per_val, slice_steps):
         if slice_steps < 1:
             raise ValueError('slice_steps cannot be zero or a negative number')
         # Inline and Crossline sections
-        test_aline = math.floor(aline * per_val / 2)
-        test_aline_range = itertools.chain(range(0, test_aline),
-                                           range(aline - test_aline, aline))
-        train_aline_range = range(test_aline, aline - test_aline, slice_steps)
-        return train_aline_range, test_aline_range
+        val_aline = math.floor(aline * per_val / 2)
+        val_aline_range = itertools.chain(range(0, val_aline),
+                                          range(aline - val_aline, aline))
+        train_aline_range = range(val_aline, aline - val_aline, slice_steps)
+        return train_aline_range, val_aline_range
     except (Exception, ValueError):
         raise
 
@@ -74,32 +74,31 @@ def split_section_train_val(data_dir, output_dir, label_file, per_val=0.2,
     logger.info("Splitting data into sections .... ")
     logger.info(f"Reading data from {data_dir}")
 
-    # labels_path = _get_labels_path(data_dir)
     logger.info(f"Loading {label_file}")
     labels = np.load(label_file)
     logger.debug(f"Data shape [iline|xline|depth] {labels.shape}")
 
     iline, xline, _ = labels.shape
     # Inline sections
-    train_iline_range, test_iline_range = _get_aline_range(iline,
+    train_iline_range, val_iline_range = _get_aline_range(iline,
                                                            per_val,
                                                            slice_steps)
     train_i_list = ["i_" + str(i) for i in train_iline_range]
-    test_i_list = ["i_" + str(i) for i in test_iline_range]
+    val_i_list = ["i_" + str(i) for i in val_iline_range]
 
     # Xline sections
-    train_xline_range, test_xline_range = _get_aline_range(xline,
+    train_xline_range, val_xline_range = _get_aline_range(xline,
                                                            per_val,
                                                            slice_steps)
     train_x_list = ["x_" + str(x) for x in train_xline_range]
-    test_x_list = ["x_" + str(x) for x in test_xline_range]
+    val_x_list = ["x_" + str(x) for x in val_xline_range]
 
     train_list = train_x_list + train_i_list
-    test_list = test_x_list + test_i_list
+    val_list = val_x_list + val_i_list
 
     # write to files to disk
     logger.info(f"Writing {output_dir}")
-    _write_split_files(output_dir, train_list, test_list, "section")
+    _write_split_files(output_dir, train_list, val_list, "section")
 
 
 def split_patch_train_val(data_dir, output_dir, label_file, stride, patch_size,
@@ -129,19 +128,18 @@ def split_patch_train_val(data_dir, output_dir, label_file, stride, patch_size,
     logger.info("Splitting data into patches .... ")
     logger.info(f"Reading data from {data_dir}")
 
-    # labels_path = _get_labels_path(data_dir)
     logger.info(f"Loading {label_file}")
     labels = np.load(label_file)
     logger.debug(f"Data shape [iline|xline|depth] {labels.shape}")
 
     iline, xline, depth = labels.shape
 
     # Inline sections
-    train_iline_range, test_iline_range = _get_aline_range(iline,
+    train_iline_range, val_iline_range = _get_aline_range(iline,
                                                            per_val,
                                                            slice_steps)
 
     # Xline sections
-    train_xline_range, test_xline_range = _get_aline_range(xline,
+    train_xline_range, val_xline_range = _get_aline_range(xline,
                                                            per_val,
                                                            slice_steps)
 
@@ -152,8 +150,8 @@ def split_patch_train_val(data_dir, output_dir, label_file, stride, patch_size,
     # Process inlines
     # iline = xline x depth
-    test_iline = math.floor(xline * per_val / 2)
-    logger.debug(test_iline)
+    val_iline = math.floor(xline * per_val / 2)
+    logger.debug(val_iline)
 
     def _i_extract_patches(iline_range, horz_locations, vert_locations):
         for i in iline_range:
@@ -163,30 +161,30 @@ def _i_extract_patches(iline_range, horz_locations, vert_locations):
             yield "i_" + str(i) + "_" + str(j) + "_" + str(k)
 
     # Process inlines - train
-    horz_locations_train = range(test_iline, xline - patch_size, stride)
+    horz_locations_train = range(val_iline, xline - patch_size, stride)
     logger.debug("Generating Inline patches")
     logger.debug("Generating Inline patches - Train")
     logger.debug(horz_locations_train)
     train_i_list = list(_i_extract_patches(train_iline_range,
                                            horz_locations_train,
                                            vert_locations))
-    # test_iline - define size of the test set for the fist part
-    test_iline_range = list(test_iline_range)
-
-    # Process inlines - test - begining
-    logger.debug("Generating Inline patches - Test")
-    horz_locations_test_begin = range(0, test_iline, max(1,stride))
-    test_i_list = list(_i_extract_patches(test_iline_range,
-                                          horz_locations_test_begin,
+    # val_iline - define size of the validation set for the first part
+    val_iline_range = list(val_iline_range)
+
+    # Process inlines - validation - beginning
+    logger.debug("Generating Inline patches - validation")
+    horz_locations_val_begin = range(0, val_iline, max(1,stride))
+    val_i_list = list(_i_extract_patches(val_iline_range,
+                                         horz_locations_val_begin,
                                           vert_locations))
-    # Process inlines - test - end
-    horz_locations_test_end = range(xline - stride + 1, xline, max(1,stride))
-    test_i_list += list(_i_extract_patches(test_iline_range,
-                                           horz_locations_test_end,
+    # Process inlines - validation - end
+    horz_locations_val_end = range(xline - stride + 1, xline, max(1,stride))
+    val_i_list += list(_i_extract_patches(val_iline_range,
+                                          horz_locations_val_end,
                                            vert_locations))
     logger.debug(train_iline_range)
-    logger.debug(test_iline_range)
+    logger.debug(val_iline_range)
 
     # Process crosslines
     def _x_extract_patches(xline_range, horz_locations, vert_locations):
@@ -199,46 +197,46 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations):
     logger.debug("Generating Crossline patches")
     logger.debug("Generating Crossline patches - Train")
     # xline = iline x depth
-    test_xline = math.floor(iline * per_val / 2)
-    logger.debug(test_xline)
+    val_xline = math.floor(iline * per_val / 2)
+    logger.debug(val_xline)
 
     # Process xlines - train
-    horz_locations_train = range(test_xline, iline - patch_size, stride)
+    horz_locations_train = range(val_xline, iline - patch_size, stride)
     logger.debug(horz_locations_train)
     train_x_list = list(_x_extract_patches(train_xline_range,
                                            horz_locations_train,
                                            vert_locations))
-    # test_xline - define size of the test set for the fist part
-    test_xline_range = list(test_xline_range)
+    # val_xline - define size of the validation set for the first part
+    val_xline_range = list(val_xline_range)
 
-    # Process xlines - test - begining
-    logger.debug("Generating Inline patches - Test")
-    horz_locations_test_begin = range(0, test_xline, max(1,stride))
-    test_x_list = list(_i_extract_patches(test_xline_range,
-                                          horz_locations_test_begin,
+    # Process xlines - validation - beginning
+    logger.debug("Generating Inline patches - validation")
+    horz_locations_val_begin = range(0, val_xline, max(1,stride))
+    val_x_list = list(_i_extract_patches(val_xline_range,
+                                         horz_locations_val_begin,
                                           vert_locations))
-    # Process xlines - test - end
-    horz_locations_test_end = range(iline - stride + 1, iline, max(1,stride))
-    test_x_list += list(_i_extract_patches(test_xline_range,
-                                           horz_locations_test_end,
+    # Process xlines - validation - end
+    horz_locations_val_end = range(iline - stride + 1, iline, max(1,stride))
+    val_x_list += list(_i_extract_patches(val_xline_range,
+                                          horz_locations_val_end,
                                            vert_locations))
 
-    test_xline_range = list(test_xline_range)
-    test_x_list = list(_x_extract_patches(test_xline_range,
-                                          horz_locations_test_end,
+    val_xline_range = list(val_xline_range)
+    val_x_list = list(_x_extract_patches(val_xline_range,
+                                         horz_locations_val_end,
                                           vert_locations))
     logger.debug(train_xline_range)
-    logger.debug(test_xline_range)
+    logger.debug(val_xline_range)
 
     train_list = train_x_list + train_i_list
-    test_list = test_x_list + test_i_list
+    val_list = val_x_list + val_i_list
 
     # write to files to disk:
     # NOTE: This isn't quite right we should calculate the patches
     # again for the whole volume
     logger.info(f"Writing {output_dir}")
-    _write_split_files(output_dir, train_list, test_list, "patch")
+    _write_split_files(output_dir, train_list, val_list, "patch")
 
 
 _LOADER_TYPES = {"section": split_section_train_val,
                  "patch": split_patch_train_val}
@@ -330,12 +328,12 @@ def split_alaudah_et_al_19(data_dir, stride, patch_size, fraction_validation=0.2
 
     list_train_val = i_list + x_list
 
-    # create train and test splits:
-    train_list, test_list = train_test_split(list_train_val, test_size=fraction_validation, shuffle=True)
+    # create train and validation splits:
+    train_list, val_list = train_val_split(list_train_val, val_size=fraction_validation, shuffle=True)
 
     # write to files to disk:
     splits_path = _get_splits_path(data_dir)
-    _write_split_files(splits_path, train_list, test_list, loader_type)
+    _write_split_files(splits_path, train_list, val_list, loader_type)
 
 
 # TODO: Try https://github.com/Chilipp/docrep for docstring reuse

From 9f8c023e397c7a92862cd1caddde46ed09248e25 Mon Sep 17 00:00:00 2001
From: risquass
Date: Thu, 13 Feb 2020 17:01:21 -0300
Subject: [PATCH 17/21] included job to run scripts unit tests

---
 tests/cicd/main_build.yml | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/cicd/main_build.yml b/tests/cicd/main_build.yml
index f8dbc1e9..32ad2b11 100644
--- a/tests/cicd/main_build.yml
+++ b/tests/cicd/main_build.yml
@@ -41,10 +41,10 @@ jobs:
       azcopy --quiet --source:https://$(storagename).blob.core.windows.net/models/model --source-key $(storagekey) --destination /home/alfred/models/your_model_name
 
-- job: unit_tests_job
+- job: cv_lib_unit_tests_job
   dependsOn: setup
   timeoutInMinutes: 5
-  displayName: Unit Tests Job
+  displayName: cv_lib Unit Tests Job
   pool:
     name: deepseismicagentpool
   steps:
@@ -53,7 +53,15 @@ jobs:
       source activate seismic-interpretation
       pytest --durations=0 cv_lib/tests/
       echo "cv_lib unit test job passed"
-      echo ""
+
+- job: scripts_unit_tests_job
+  dependsOn: setup
+  timeoutInMinutes: 5
+  displayName: Scripts Unit Tests Job
+  pool:
+    name: deepseismicagentpool
+  steps:
+  - bash: |
       echo "Starting scripts unit tests"
       source activate seismic-interpretation
       pytest --durations=0 tests/

From ca8127026b44d5d54d15ed0738752fb86198f8ea Mon Sep 17 00:00:00 2001
From: risquass
Date: Thu, 13 Feb 2020 17:18:29 -0300
Subject: [PATCH 18/21] fix train_test_split call that was changed by mistake

---
 scripts/prepare_dutchf3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py
index a5b8d3ca..20841088 100644
--- a/scripts/prepare_dutchf3.py
+++ b/scripts/prepare_dutchf3.py
@@ -329,7 +329,7 @@ def split_alaudah_et_al_19(data_dir, stride, patch_size, fraction_validation=0.2
     list_train_val = i_list + x_list
 
     # create train and validation splits:
-    train_list, val_list = train_val_split(list_train_val, val_size=fraction_validation, shuffle=True)
+    train_list, val_list = train_test_split(list_train_val, test_size=fraction_validation, shuffle=True)
 
     # write to files to disk:
     splits_path = _get_splits_path(data_dir)
From 753e1ee5dc3a4f81f35f70cf41a1e83082f60ef8 Mon Sep 17 00:00:00 2001
From: risquass
Date: Fri, 14 Feb 2020 19:27:53 -0300
Subject: [PATCH 19/21] Fix val/train split and add tests

---
 scripts/prepare_dutchf3.py    | 59 +++++++++++++++--------------------
 tests/test_prepare_dutchf3.py | 50 +++++++++++++++++++++++------
 2 files changed, 66 insertions(+), 43 deletions(-)

diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py
index 20841088..12a12096 100644
--- a/scripts/prepare_dutchf3.py
+++ b/scripts/prepare_dutchf3.py
@@ -40,8 +40,11 @@ def _get_aline_range(aline, per_val, slice_steps):
         # Inline and Crossline sections
         val_aline = math.floor(aline * per_val / 2)
         val_aline_range = itertools.chain(range(0, val_aline),
-                                          range(aline - val_aline, aline))
+                                           range(aline - val_aline, aline))
         train_aline_range = range(val_aline, aline - val_aline, slice_steps)
+
+        print("aline: ", aline)
+        print("val_aline: ", val_aline)
         return train_aline_range, val_aline_range
     except (Exception, ValueError):
         raise
@@ -55,8 +58,6 @@ def split_section_train_val(data_dir, output_dir, label_file, per_val=0.2,
         data_dir (str): data directory path
         output_dir (str): directory under data_dir to store the split files
         label_file (str): npy files with labels. Stored in data_dir
-        stride (int): stride to use when sectioning of the volume
-        patch_size (int): size of patch to extract
         per_val (float, optional): the fraction of the volume to use for
             validation. Defaults to 0.2.
         log_config (str): path to log configurations
@@ -145,14 +146,10 @@ def split_patch_train_val(data_dir, output_dir, label_file, stride, patch_size,
 
     # Generate patches from sections
     # Vertical locations is common to all patches processed
-    vert_locations = range(0, depth - patch_size, stride)
+    vert_locations = range(0, depth - patch_size, patch_size)
     logger.debug(vert_locations)
 
     # Process inlines
-    # iline = xline x depth
-    val_iline = math.floor(xline * per_val / 2)
-    logger.debug(val_iline)
-
     def _i_extract_patches(iline_range, horz_locations, vert_locations):
         for i in iline_range:
             locations = ([j, k] for j in horz_locations
@@ -161,27 +158,28 @@ def _i_extract_patches(iline_range, horz_locations, vert_locations):
             yield "i_" + str(i) + "_" + str(j) + "_" + str(k)
 
     # Process inlines - train
-    horz_locations_train = range(val_iline, xline - patch_size, stride)
     logger.debug("Generating Inline patches")
     logger.debug("Generating Inline patches - Train")
+    # iline = xline x depth
+    val_iline = math.floor(xline * per_val / 2)
+    logger.debug(val_iline)
+
+    # Process ilines - train
+    horz_locations_train = range(val_iline, xline - val_iline, max(1,patch_size))
     logger.debug(horz_locations_train)
     train_i_list = list(_i_extract_patches(train_iline_range,
                                            horz_locations_train,
                                            vert_locations))
+
     # val_iline - define size of the validation set for the first part
     val_iline_range = list(val_iline_range)
 
-    # Process inlines - validation - beginning
-    logger.debug("Generating Inline patches - validation")
-    horz_locations_val_begin = range(0, val_iline, max(1,stride))
+    # Process inlines - validation
+    horz_locations_val_end = range(xline - val_iline, xline, max(1,patch_size))
+    val_iline_range = list(val_iline_range)
     val_i_list = list(_i_extract_patches(val_iline_range,
-                                         horz_locations_val_begin,
-                                          vert_locations))
-    # Process inlines - validation - end
-    horz_locations_val_end = range(xline - stride + 1, xline, max(1,stride))
-    val_i_list += list(_i_extract_patches(val_iline_range,
-                                          horz_locations_val_end,
-                                           vert_locations))
+                                         horz_locations_val_end,
+                                         vert_locations))
 
     logger.debug(train_iline_range)
     logger.debug(val_iline_range)
@@ -201,7 +199,7 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations):
     logger.debug(val_xline)
 
     # Process xlines - train
-    horz_locations_train = range(val_xline, iline - patch_size, stride)
+    horz_locations_train = range(val_xline, iline - val_xline, max(1,patch_size))
     logger.debug(horz_locations_train)
     train_x_list = list(_x_extract_patches(train_xline_range,
                                            horz_locations_train,
@@ -210,28 +208,23 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations):
     # val_xline - define size of the validation set for the first part
     val_xline_range = list(val_xline_range)
 
-    # Process xlines - validation - beginning
-    logger.debug("Generating Inline patches - validation")
-    horz_locations_val_begin = range(0, val_xline, max(1,stride))
-    val_x_list = list(_i_extract_patches(val_xline_range,
-                                         horz_locations_val_begin,
-                                          vert_locations))
-    # Process xlines - validation - end
-    horz_locations_val_end = range(iline - stride + 1, iline, max(1,stride))
-    val_x_list += list(_i_extract_patches(val_xline_range,
-                                          horz_locations_val_end,
-                                           vert_locations))
-
+    # Process xlines - validation
+    horz_locations_val_end = range(iline - val_xline, iline, max(1,patch_size))
     val_xline_range = list(val_xline_range)
     val_x_list = list(_x_extract_patches(val_xline_range,
                                          horz_locations_val_end,
                                           vert_locations))
+
     logger.debug(train_xline_range)
     logger.debug(val_xline_range)
 
     train_list = train_x_list + train_i_list
     val_list = val_x_list + val_i_list
 
+    logger.debug(train_list)
+    logger.debug(val_list)
+
+
     # write to files to disk:
     # NOTE: This isn't quite right we should calculate the patches
     # again for the whole volume
@@ -413,4 +406,4 @@ def patch(self, label_file, stride, patch_size,
         # commenting the following line as this was not updated with
         # the new parameters names
         # "split_alaudah_et_al_19": split_alaudah_et_al_19}
-    )
+    )
\ No newline at end of file
diff --git a/tests/test_prepare_dutchf3.py b/tests/test_prepare_dutchf3.py
index a6c9737b..90542f77 100644
--- a/tests/test_prepare_dutchf3.py
+++ b/tests/test_prepare_dutchf3.py
@@ -7,10 +7,13 @@
 import pandas as pd
 import tempfile
 import scripts.prepare_dutchf3 as prep_dutchf3
+import math
 
 # Setup
 OUTPUT = None
-ILINE = XLINE = DEPTH = 111
+ILINE = 551
+XLINE = 1008
+DEPTH = 351
 ALINE = np.zeros((ILINE, XLINE, DEPTH))
 STRIDE = 100
 PATCH = 50
@@ -131,8 +134,8 @@ def test_prepare_dutchf3_patch_step_1():
     # test patch_train and slice_steps=1
     y = list(sorted(set(patch_train.y.astype(int))))
     x = list(sorted(set(patch_train.x.astype(int)),reverse=True))
-    assert (float(y[2]) - float(y[1])) == float(SLICE_STEPS)
-    assert (float(x[1]) - float(x[2])) == float(SLICE_STEPS)
+    assert (int(y[2]) - int(y[1])) == SLICE_STEPS
+    assert (int(x[1]) - int(x[2])) == SLICE_STEPS
 
     # reading the file and splitting the data
     patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b'])
@@ -141,8 +144,15 @@ def test_prepare_dutchf3_patch_step_1():
     # test patch_val and slice_steps=1
     y = list(sorted(set(patch_val.y.astype(int))))
     x = list(sorted(set(patch_val.x.astype(int)),reverse=True))
-    assert (float(y[2]) - float(y[1])) == float(SLICE_STEPS)
-    assert (float(x[1]) - float(x[2])) == float(SLICE_STEPS)
+    assert (int(y[2]) - int(y[1])) == SLICE_STEPS
+    assert (int(x[1]) - int(x[2])) == SLICE_STEPS
+
+    # test validation set is, at least, PER_VAL
+    # print(len(set(patch_train.y)), len(set(patch_val.y)))
+    PER_VAL_CHK = len(set(patch_train.y))/(len(set(patch_train.y))+len(set(patch_val.y))) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
+    PER_VAL_CHK = len(set(patch_train.x))/(len(set(patch_train.x))+len(set(patch_val.x))) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
 
 
 def test_prepare_dutchf3_patch_step_2():
@@ -172,8 +182,8 @@ def test_prepare_dutchf3_patch_step_2():
     # test patch_train and slice_steps=2
     y = list(sorted(set(patch_train.y.astype(int))))
     x = list(sorted(set(patch_train.x.astype(int)),reverse=True))
-    assert (float(y[2]) - float(y[1])) == float(SLICE_STEPS)
-    assert (float(x[1]) - float(x[2])) == float(SLICE_STEPS)
+    assert (int(y[2]) - int(y[1])) == SLICE_STEPS
+    assert (int(x[1]) - int(x[2])) == SLICE_STEPS
 
     # reading the file and splitting the data
     patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b'])
@@ -182,15 +192,22 @@ def test_prepare_dutchf3_patch_step_2():
     # test patch_val and slice_steps=2
     y = list(sorted(set(patch_val.y.astype(int))))
     x = list(sorted(set(patch_val.x.astype(int)),reverse=True))
-    assert (float(y[2]) - float(y[1])) != float(SLICE_STEPS)
-    assert (float(x[1]) - float(x[2])) != float(SLICE_STEPS)
+    assert (int(y[2]) - int(y[1])) != SLICE_STEPS
+    assert (int(x[1]) - int(x[2])) != SLICE_STEPS
+
+    # test validation set is, at least, PER_VAL
+    PER_VAL_CHK = len(set(patch_train.y))/(len(set(patch_train.y))+len(set(patch_val.y))) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
+    PER_VAL_CHK = len(set(patch_train.x))/(len(set(patch_train.x))+len(set(patch_val.x))) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
+
 
 def test_prepare_dutchf3_patch_train_and_test_sets():
     """check a complete run for the script in case further changes are needed
     """
     # setting a value to SLICE_STEPS as needed to test the values
-    SLICE_STEPS = 2
+    SLICE_STEPS = 1
 
     # use a temp dir that will be discarded at the end of the execution
     with tempfile.TemporaryDirectory() as tmpdirname:
@@ -223,6 +240,11 @@ def test_prepare_dutchf3_patch_train_and_test_sets():
     assert y_train & y_val == set()
     assert x_train & x_val == set()
 
+    # test validation set is, at least, PER_VAL
+    PER_VAL_CHK = len(set(patch_train.y))/(len(set(patch_train.y))+len(set(patch_val.y))) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
+    PER_VAL_CHK = len(set(patch_train.x))/(len(set(patch_train.x))+len(set(patch_val.x))) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
 
 def test_prepare_dutchf3_section_step_1():
@@ -255,6 +277,10 @@ def test_prepare_dutchf3_section_step_1():
     assert (float(section_train.section[1]) - float(section_train.section[0])) % float(SLICE_STEPS) == 0.0
     assert (float(section_val.section[1]) - float(section_val.section[0])) % float(SLICE_STEPS) == 0.0
 
+    # test validation set is, at least, PER_VAL
+    PER_VAL_CHK = len(section_val)/(len(section_val)+len(section_train)) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
+
 
 def test_prepare_dutchf3_section_step_2():
     """check a complete run for the script in case further changes are needed
@@ -286,3 +312,7 @@ def test_prepare_dutchf3_section_step_2():
     # test
     assert (float(section_train.section[1]) - float(section_train.section[0])) % float(SLICE_STEPS) == 0.0
     assert (float(section_val.section[1]) - float(section_val.section[0])) % float(SLICE_STEPS) > 0.0
+
+    # test validation set is, at least, PER_VAL
+    PER_VAL_CHK = len(section_val)/(len(section_val)+len(section_train)) * 100
+    assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
\ No newline at end of file

From de2b86b520bd5d244d5c9948987c16d190a1d8b9 Mon Sep 17 00:00:00 2001
From: risquass
Date: Mon, 17 Feb 2020 07:52:54 -0300
Subject: [PATCH 20/21] adjust to consider the whole horz_lines

---
 scripts/prepare_dutchf3.py    | 10 ++++++----
 tests/test_prepare_dutchf3.py | 30 +++++++++++++++---------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/scripts/prepare_dutchf3.py b/scripts/prepare_dutchf3.py
index 12a12096..5c5a4e08 100644
--- a/scripts/prepare_dutchf3.py
+++ b/scripts/prepare_dutchf3.py
@@ -175,10 +175,11 @@ def _i_extract_patches(iline_range, horz_locations, vert_locations):
     val_iline_range = list(val_iline_range)
 
     # Process inlines - validation
-    horz_locations_val_end = range(xline - val_iline, xline, max(1,patch_size))
+    horz_locations_val = itertools.chain(range(0, val_iline, max(1,patch_size)),
+                                         range(xline - val_iline, xline, max(1,patch_size)))
     val_iline_range = list(val_iline_range)
     val_i_list = list(_i_extract_patches(val_iline_range,
-                                         horz_locations_val_end,
+                                         horz_locations_val,
                                          vert_locations))
 
     logger.debug(train_iline_range)
@@ -209,10 +210,11 @@ def _x_extract_patches(xline_range, horz_locations, vert_locations):
     val_xline_range = list(val_xline_range)
 
     # Process xlines - validation
-    horz_locations_val_end = range(iline - val_xline, iline, max(1,patch_size))
+    horz_locations_val = itertools.chain(range(0, val_xline, max(1,patch_size)),
+                                         range(iline - val_xline, iline, max(1,patch_size)))
     val_xline_range = list(val_xline_range)
     val_x_list = list(_x_extract_patches(val_xline_range,
-                                         horz_locations_val_end,
+                                         horz_locations_val,
                                          vert_locations))
 
     logger.debug(train_xline_range)
diff --git a/tests/test_prepare_dutchf3.py b/tests/test_prepare_dutchf3.py
index 90542f77..1484ee49 100644
--- a/tests/test_prepare_dutchf3.py
+++ b/tests/test_prepare_dutchf3.py
@@ -131,29 +131,29 @@ def test_prepare_dutchf3_patch_step_1():
     patch_train = pd.read_csv(output + '/patch_train.txt', header=None, names=['row', 'a', 'b'])
     patch_train = pd.DataFrame(patch_train.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z'])
 
-    # test patch_train and slice_steps=1
+    # test patch_train and slice_steps=2
     y = list(sorted(set(patch_train.y.astype(int))))
-    x = list(sorted(set(patch_train.x.astype(int)),reverse=True))
-    assert (int(y[2]) - int(y[1])) == SLICE_STEPS
-    assert (int(x[1]) - int(x[2])) == SLICE_STEPS
+    x = list(sorted(set(patch_train.x.astype(int))))
+    assert (int(y[1]) - int(y[0])) == SLICE_STEPS
+    assert (int(x[1]) - int(x[0])) == SLICE_STEPS
 
     # reading the file and splitting the data
     patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b'])
     patch_val = pd.DataFrame(patch_val.row.str.split('_').tolist(), columns=['aline', 'x', 'y', 'z'])
 
-    # test patch_val and slice_steps=1
+    # test patch_val and slice_steps=2
     y = list(sorted(set(patch_val.y.astype(int))))
-    x = list(sorted(set(patch_val.x.astype(int)),reverse=True))
-    assert (int(y[2]) - int(y[1])) == SLICE_STEPS
-    assert (int(x[1]) - int(x[2])) == SLICE_STEPS
+    x = list(sorted(set(patch_val.x.astype(int))))
+    assert (int(y[1]) - int(y[0])) != SLICE_STEPS
+    assert (int(x[1]) - int(x[0])) != SLICE_STEPS
 
     # test validation set is, at least, PER_VAL
-    # print(len(set(patch_train.y)), len(set(patch_val.y)))
     PER_VAL_CHK = len(set(patch_train.y))/(len(set(patch_train.y))+len(set(patch_val.y))) * 100
     assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
     PER_VAL_CHK = len(set(patch_train.x))/(len(set(patch_train.x))+len(set(patch_val.x))) * 100
     assert round(PER_VAL_CHK,0) >= int(PER_VAL * 100)
 
+
 def test_prepare_dutchf3_patch_step_2():
     """check a complete run for the script in case further changes are needed
@@ -181,9 +181,9 @@ def test_prepare_dutchf3_patch_step_2():
 
     # test patch_train and slice_steps=2
     y = list(sorted(set(patch_train.y.astype(int))))
-    x = list(sorted(set(patch_train.x.astype(int)),reverse=True))
-    assert (int(y[2]) - int(y[1])) == SLICE_STEPS
-    assert (int(x[1]) - int(x[2])) == SLICE_STEPS
+    x = list(sorted(set(patch_train.x.astype(int))))
+    assert (int(y[1]) - int(y[0])) == SLICE_STEPS
+    assert (int(x[1]) - int(x[0])) == SLICE_STEPS
 
     # reading the file and splitting the data
     patch_val = pd.read_csv(output + '/patch_val.txt', header=None, names=['row', 'a', 'b'])
@@ -191,9 +191,9 @@ def test_prepare_dutchf3_patch_step_2():
 
     # test patch_val and slice_steps=2
     y = list(sorted(set(patch_val.y.astype(int))))
-    x = list(sorted(set(patch_val.x.astype(int)),reverse=True))
-    assert (int(y[2]) - int(y[1])) != SLICE_STEPS
-    assert (int(x[1]) - int(x[2])) != SLICE_STEPS
+    x = list(sorted(set(patch_val.x.astype(int))))
+    assert (int(y[1]) - int(y[0])) != SLICE_STEPS
+    assert (int(x[1]) - int(x[0])) != SLICE_STEPS
 
     # test validation set is, at least, PER_VAL
     PER_VAL_CHK = len(set(patch_train.y))/(len(set(patch_train.y))+len(set(patch_val.y))) * 100
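Aside summarizing where PATCH 19 and PATCH 20 land: validation slices are taken contiguously from both ends of each axis, and training slices come from the interior with an optional `slice_steps` stride. A standalone sketch of that final split logic (illustrative only; it mirrors `_get_aline_range` but is not the shipped module):

import itertools
import math

def aline_range(aline, per_val=0.2, slice_steps=1):
    if slice_steps < 1:
        raise ValueError("slice_steps cannot be zero or a negative number")
    val_aline = math.floor(aline * per_val / 2)
    # validation: a contiguous block at each end of the axis
    val_range = itertools.chain(range(0, val_aline),
                                range(aline - val_aline, aline))
    # train: the interior, skipping slice_steps - 1 slices between picks
    train_range = range(val_aline, aline - val_aline, slice_steps)
    return train_range, val_range

train, val = map(list, aline_range(551, per_val=0.2, slice_steps=2))
assert set(train).isdisjoint(val)          # no leakage between splits
assert len(val) / 551 >= 0.2 - 1 / 551     # holdout stays close to per_val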
From 056585643bafd4fce26739ec0f39e1a750435fa1 Mon Sep 17 00:00:00 2001
From: risquass
Date: Mon, 17 Feb 2020 19:54:40 -0300
Subject: [PATCH 21/21] update environment - gitpython version

---
 environment/anaconda/local/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment/anaconda/local/environment.yml b/environment/anaconda/local/environment.yml
index a8079d76..e3d38115 100644
--- a/environment/anaconda/local/environment.yml
+++ b/environment/anaconda/local/environment.yml
@@ -26,7 +26,7 @@ dependencies:
     - toolz==0.10.0
     - tabulate==0.8.2
     - Jinja2==2.10.3
-    - gitpython==3.0.5
+    - gitpython==3.0.6
     - tensorboard==2.0.1
     - tensorboardx==1.9
     - invoke==1.3.0