forked from molecularsets/moses
Integrate lbann moses #1
Merged
Changes from all commits (5 commits):
161e892 adds preprocessing script to featurize arbitrary smiles datasets. mak… (wderekjones)
1527153 adding support for TF 2.0 to moses/metrics/utils_fcd.py (wderekjones)
42bc7a7 adding utilities to preprocess arbitrary datasets, both python and sl… (wderekjones)
9a4f560 adding scripts to prepare datasets (wderekjones)
28932ce addressing issues related to structure of example slurm data processi… (wderekjones)
char_vocab_utils.py (new file, 80 lines): utilities for building character vocabularies and integer featurizations of smiles datasets.
import multiprocessing as mp
from functools import partial

import numpy as np
import torch
from tqdm import tqdm

from moses.utils import CharVocab


def compute_vocab_job(smiles):
    # unique characters of a single smiles string
    return set(smiles)


def compute_vocab(smiles_list, n_jobs=mp.cpu_count()):
    """
    Simple function that can be used to create a vocabulary for an arbitrary set of smiles strings.

    smiles_list: list of smiles strings
    n_jobs: number of processes to use for parallel computation
    """
    # extract all unique characters in smiles_list, one string per worker
    # (serial equivalent: char_set = set.union(*[set(x) for x in smiles_list]))
    with mp.Pool(n_jobs) as pool:
        result = list(
            tqdm(
                pool.imap_unordered(compute_vocab_job, smiles_list),
                total=len(smiles_list),
            )
        )
    char_set = set.union(*result)

    # create the vocab
    vocab = CharVocab(char_set)

    return vocab


def compute_string_to_int(
    smiles_list, vocab, n_jobs=mp.cpu_count(), add_bos=False, add_eos=False
):
    """
    Simple function that extracts the integer representation of a smiles dataset given
    the provided vocab. Can compute in parallel by using n_jobs > 1.

    smiles_list: list of smiles strings
    vocab: CharVocab used to map characters to integer ids
    n_jobs: number of processes to use for parallel computation
    add_bos: add the begin-of-string integer
    add_eos: add the end-of-string integer
    """
    string2ids = partial(vocab.string2ids, add_bos=add_bos, add_eos=add_eos)
    # note: imap_unordered returns results in completion order, not input order
    with mp.Pool(n_jobs) as pool:
        result = list(
            tqdm(pool.imap_unordered(string2ids, smiles_list), total=len(smiles_list))
        )
    # sequences have different lengths, so store them as an object array of int arrays
    data = np.asarray([np.asarray(x, dtype=int) for x in result], dtype=object)

    return data


def merge_vocab(*args):
    """
    Helper function to merge multiple saved vocab objects. Helpful for cases that require
    processing more data than can be held in memory, or for building a common vocab to use
    across multiple disjoint datasets.

    *args: paths to an arbitrary number of saved vocab objects
    """
    # the special tokens are re-added by CharVocab, so filter them out of the merged character set
    ignore_char_list = ["<bos>", "<eos>", "<pad>", "<unk>"]
    merged_char_set = set()

    for vocab_path in args:
        vocab = torch.load(vocab_path)
        vocab_chars_set = set(
            x for x in vocab.c2i.keys() if x not in ignore_char_list
        )
        merged_char_set.update(vocab_chars_set)

    return CharVocab(merged_char_set)
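Taken together, a minimal usage sketch of these utilities on a small in-memory list of smiles strings (the toy list and the commented merge paths are illustrative assumptions, not part of this PR):

import torch
from char_vocab_utils import compute_vocab, compute_string_to_int, merge_vocab

if __name__ == "__main__":
    # toy smiles list for illustration only
    smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]

    # build a CharVocab from the unique characters in the dataset
    vocab = compute_vocab(smiles_list, n_jobs=2)
    torch.save(vocab, "vocab.pt")

    # featurize the strings as integer id sequences, with begin/end-of-string tokens added
    data = compute_string_to_int(smiles_list, vocab, n_jobs=2, add_bos=True, add_eos=True)
    print(data[0])

    # vocabularies built on separate chunks can later be combined by path (hypothetical paths):
    # full_vocab = merge_vocab("part1/vocab.pt", "part2/vocab.pt")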
compute_vocab_main.py (new file, 43 lines): command-line entry point that builds a vocabulary from a csv of smiles strings and saves it as vocab.pt.
from char_vocab_utils import compute_vocab


def main():
    import os
    import torch
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--smiles-path", help="path to csv of smiles strings")
    parser.add_argument("--smiles-col", help="column name that contains smiles strings", default=None)
    parser.add_argument("--smiles-sep", help="delimiter used to separate smiles strings, default is set to pandas default for csv", default=",")
    parser.add_argument("--n-jobs", type=int, help="number of processes to use for parallel computations")
    parser.add_argument("--output-dir", help="path to output directory to store vocab and numpy arrays")
    args = parser.parse_args()

    # read the smiles strings from the csv path; modin uses multiprocessing to do this more quickly
    import modin.pandas as pd

    if args.smiles_col is None:
        smiles_df = pd.read_csv(args.smiles_path, header=None, sep=args.smiles_sep)
        smiles_list = smiles_df[0].values
    else:
        smiles_df = pd.read_csv(args.smiles_path, sep=args.smiles_sep)
        smiles_list = smiles_df[args.smiles_col].values

    # if the output directory does not exist, create it
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # extract the vocab and save it for later featurization runs
    print("extracting the vocab...")
    vocab = compute_vocab(smiles_list, n_jobs=args.n_jobs)
    torch.save(vocab, os.path.join(args.output_dir, "vocab.pt"))


if __name__ == "__main__":
    main()
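A quick sanity check of the saved vocabulary, assuming the script above was run with --output-dir vocab_out (the directory name is an assumption; c2i is the character-to-id mapping also used by merge_vocab):

import torch

# load the vocabulary written by compute_vocab_main.py (path depends on --output-dir)
vocab = torch.load("vocab_out/vocab.pt")
print(len(vocab.c2i), "tokens")
print(sorted(vocab.c2i.keys()))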
New SLURM script (22 lines): job array that builds a per-chunk vocabulary for each of the 11 Enamine REAL smiles files.
#!/usr/bin/bash

#SBATCH --time 1-00:00:00
#SBATCH --partition pbatch
#SBATCH --nodes=1
#SBATCH --cpus-per-task=36
#SBATCH --array=1-11

# define the variables for the location of the moses repo (MOSES_HOME), the directory containing
# the smiles chunks (SMILES_DIR), and the directory where output is to be stored (OUTPUT_DIR)
# it is assumed the code will be run from $MOSES_HOME/scripts

MOSES_HOME=/g/g13/jones289/workspace/lbann/applications/ATOM/moses
SMILES_DIR=/p/lustre2/jones289/data/enamine
OUTPUT_DIR=/g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine

# activate the python environment of your choice below... if using spack then just comment this out
source activate lbann

cd $MOSES_HOME/scripts

python compute_vocab_main.py --smiles-path ${SMILES_DIR}/2018q1-2_Enamine_REAL_680M_SMILES_part${SLURM_ARRAY_TASK_ID}.smiles --smiles-col smiles --smiles-sep='\t' --n-jobs ${SLURM_CPUS_PER_TASK} --output-dir ${OUTPUT_DIR}/part${SLURM_ARRAY_TASK_ID}
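Note that the featurization job below reads a single merged vocabulary, full_vocab.pt, which this job array does not produce. A minimal sketch of how the per-part vocabularies could be combined with merge_vocab, assuming the part${i}/vocab.pt layout written above (the merge step itself is not part of this PR):

import os
import torch
from char_vocab_utils import merge_vocab

# matches OUTPUT_DIR in the job array above; each task writes part{i}/vocab.pt
DATA_DIR = "/g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine"
vocab_paths = [os.path.join(DATA_DIR, f"part{i}", "vocab.pt") for i in range(1, 12)]

# merge_vocab takes the saved vocab paths as separate arguments
full_vocab = merge_vocab(*vocab_paths)
torch.save(full_vocab, os.path.join(DATA_DIR, "full_vocab.pt"))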
New SLURM script (10 lines): job array that featurizes each smiles chunk with the merged vocabulary via preprocess_data.py.
#!/usr/bin/bash

#SBATCH --time 1-00:00:00
#SBATCH --partition pbatch
#SBATCH --array=1-11

source activate lbann
cd /g/g13/jones289/workspace/lbann/applications/ATOM/moses/scripts

python preprocess_data.py --vocab-path /g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine/full_vocab.pt --smiles-path ~/data/enamine/2018q1-2_Enamine_REAL_680M_SMILES_part${SLURM_ARRAY_TASK_ID}.smiles --smiles-col smiles --smiles-sep '\t' --add-bos --add-eos --n-jobs 72 --test-size 0.2 --val-size 0.1 --split-dataset --output-dir /g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine/part${SLURM_ARRAY_TASK_ID}

Review comment on this script: Same comments as above.
preprocess_data.py (new file, 111 lines): featurizes a smiles csv with one or more saved vocab files (merging them when several are given) and optionally writes train/val/test splits.
from char_vocab_utils import compute_string_to_int, merge_vocab


def main():
    import os
    import numpy as np
    import torch
    from argparse import ArgumentParser
    from sklearn.model_selection import train_test_split

    parser = ArgumentParser()
    parser.add_argument(
        "--vocab-path",
        nargs="+",
        dest="vocab_path_list",
        help="path to the vocab(s) to use to featurize the smiles data. if more than one vocab path is given, "
        "the vocabs are merged and the result is used as the vocab to featurize with",
    )
    parser.add_argument("--smiles-path", help="path to csv of smiles strings")
    parser.add_argument(
        "--smiles-col", help="column name that contains smiles strings", default=None
    )
    parser.add_argument(
        "--smiles-sep",
        help="delimiter used to separate smiles strings, default is set to pandas default for csv",
        default=",",
    )
    parser.add_argument(
        "--add-bos",
        help="add the begin-of-string character to smiles data",
        action="store_true",
    )
    parser.add_argument(
        "--add-eos",
        help="add the end-of-string character to smiles data",
        action="store_true",
    )
    parser.add_argument(
        "--n-jobs",
        type=int,
        help="number of processes to use for parallel computations",
    )
    parser.add_argument(
        "--test-size",
        type=float,
        default=0.2,
        help="if specified, saves the data into a separate train/val/test split, where the "
        "test set will be test-size %% of the full data; val is then selected from the remaining "
        "train data using val-size %% of the train data",
    )
    parser.add_argument(
        "--val-size",
        type=float,
        default=0.1,
        help="%% of the training data to hold out as a validation or dev set",
    )
    parser.add_argument("--split-dataset", action="store_true")
    parser.add_argument(
        "--output-dir", help="path to output directory to store vocab and numpy arrays"
    )
    args = parser.parse_args()

    # read the smiles strings from the csv path; modin uses multiprocessing to do this more quickly
    import modin.pandas as pd

    if args.smiles_col is None:
        smiles_df = pd.read_csv(args.smiles_path, header=None, sep=args.smiles_sep)
        smiles_list = smiles_df[0].values
    else:
        smiles_df = pd.read_csv(args.smiles_path, sep=args.smiles_sep)
        smiles_list = smiles_df[args.smiles_col].values

    # if the output directory does not exist, create it
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # load the vocab
    print("reading vocab...")
    if len(args.vocab_path_list) > 1:
        print("more than one vocab was specified...merging vocabs...")
        # merge_vocab expects the paths as separate arguments, so unpack the list
        vocab = merge_vocab(*args.vocab_path_list)
    else:
        vocab = torch.load(args.vocab_path_list[0])

    # compute the integer representation of the smiles data
    print("extracting dataset...")
    data = compute_string_to_int(
        smiles_list,
        vocab,
        n_jobs=args.n_jobs,
        add_bos=args.add_bos,
        add_eos=args.add_eos,
    )
    np.save(os.path.join(args.output_dir, "full_data.npy"), data)

    if args.split_dataset:
        # compute the splits for train/test using the full data
        train_data, test_data = train_test_split(data, test_size=args.test_size)
        # compute the splits for train/val using the remaining train data
        train_data, val_data = train_test_split(train_data, test_size=args.val_size)

        np.save(os.path.join(args.output_dir, "train.npy"), train_data)
        np.save(os.path.join(args.output_dir, "val.npy"), val_data)
        np.save(os.path.join(args.output_dir, "test.npy"), test_data)


if __name__ == "__main__":
    main()
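Because the featurized sequences have different lengths, the saved .npy files hold object arrays of per-molecule int arrays. A minimal sketch of loading one split for downstream use (the path assumes the per-part output layout from the SLURM job above; allow_pickle is required for object arrays):

import numpy as np

# load one split written by preprocess_data.py
train = np.load("part1/train.npy", allow_pickle=True)

print(len(train), "sequences")
print(train[0])  # 1-D array of vocab ids, including bos/eos ids if --add-bos/--add-eos were set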
Review comment: Hardcoded to your home directory, please fix.