Integrate lbann moses #1

Merged 5 commits on Mar 30, 2020
Changes from all commits
12 changes: 8 additions & 4 deletions moses/metrics/utils_fcd.py
@@ -174,12 +174,16 @@ def get_predictions(smiles, gpu=-1, batch_size=128):
device = "/gpu:{}".format(gpu)
else:
device = "/cpu"
config = tf.ConfigProto(allow_soft_placement=True)
#config = tf.ConfigProto(allow_soft_placement=True)
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
with tf.device(device):
sess = tf.Session(config=config)
set_session(sess)
K.clear_session()
#sess = tf.Session(config=config)
sess = tf.compat.v1.Session(config=config)
#set_session(sess)
tf.compat.v1.keras.backend.set_session(sess)
#K.clear_session()
tf.keras.backend.clear_session()
model = load_ref_model(model_path)
smiles_act = model.predict_generator(
myGenerator_predict(smiles, batch_size=batch_size),
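For context, the change above replaces the TF1-only session calls with their `tf.compat.v1` equivalents so the FCD code also runs under TensorFlow 2.x. A minimal sketch of the same setup pattern in isolation (assuming a TensorFlow 2.x install):

```python
import tensorflow as tf

# TF2 removed tf.ConfigProto and tf.Session from the top-level namespace;
# the graph-mode equivalents live under tf.compat.v1.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True  # allocate GPU memory on demand rather than all at once

sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)  # point Keras at this session
tf.keras.backend.clear_session()              # drop any stale graph state, as in the patched code
```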
80 changes: 80 additions & 0 deletions scripts/char_vocab_utils.py
@@ -0,0 +1,80 @@
import torch
import numpy as np
import multiprocessing as mp
from tqdm import tqdm
from moses.utils import CharVocab
from functools import partial


def compute_vocab_job(smiles):
return set(smiles)


def compute_vocab(smiles_list, n_jobs=mp.cpu_count()):
"""
simple function that can be used to create a vocabulary for an arbitrary set of smiles strings

smiles_list: list of smiles strings
"""
# extract all unique characters in smiles_list
# char_set = set.union(*[set(x) for x in smiles_list])

with mp.Pool(n_jobs) as pool:
result = list(
tqdm(
pool.imap_unordered(compute_vocab_job, smiles_list),
total=len(smiles_list),
)
)
char_set = set.union(*result)

# create the vocab
vocab = CharVocab(char_set)

return vocab


def compute_string_to_int(
smiles_list, vocab, n_jobs=mp.cpu_count(), add_bos=False, add_eos=False
):

    """
    Convert a list of smiles strings into arrays of integer ids using the provided vocab.
    Can compute in parallel by using n_jobs > 1.

    smiles_list: list of smiles strings
    n_jobs: number of processes to use for parallel computation
    add_bos: add the begin of string integer
    add_eos: add the end of string integer
    """
string2ids = partial(vocab.string2ids, add_bos=add_bos, add_eos=add_eos)
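    # note: imap_unordered yields results in completion order, not in the order of smiles_list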
with mp.Pool(n_jobs) as pool:
result = list(
tqdm(pool.imap_unordered(string2ids, smiles_list), total=len(smiles_list))
)
data = np.asarray([np.asarray(x, dtype=int) for x in result])

return data


def merge_vocab(*args):
"""
helper function to merge multiple vocab objects...helpful for cases that may require the processing of more data than
is able to held in memory or for getting a common vocab to use to merge multiple disjoint datasets, etc..

*args: a list of an arbitrary number of vocab objects
"""

# use this list to filter out 'characters' that we don't need to make the new dataset
ignore_char_list = ["<bos>", "<eos>", "<pad>", "<unk>"]
merged_char_set = set()

for vocab_path in args:
vocab = torch.load(vocab_path)
vocab_chars_set = set(
[x for x in vocab.c2i.keys() if x not in ignore_char_list]
)
merged_char_set.update(vocab_chars_set)

return CharVocab(merged_char_set)
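For illustration, a minimal usage sketch of these helpers (the SMILES list and the vocab paths are hypothetical):

```python
import torch
from char_vocab_utils import compute_vocab, compute_string_to_int, merge_vocab

smiles = ["CCO", "c1ccccc1", "CC(=O)O"]  # toy list of smiles strings

vocab = compute_vocab(smiles, n_jobs=2)                   # build a CharVocab
data = compute_string_to_int(smiles, vocab, n_jobs=2,
                             add_bos=True, add_eos=True)  # integer-encoded smiles
torch.save(vocab, "vocab_a.pt")

# merge_vocab expects paths to previously saved vocabs and returns a combined CharVocab
merged = merge_vocab("vocab_a.pt", "vocab_b.pt")          # "vocab_b.pt" assumed to exist
```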
43 changes: 43 additions & 0 deletions scripts/compute_vocab_main.py
@@ -0,0 +1,43 @@
from char_vocab_utils import compute_vocab

def main():
import os
import torch
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--smiles-path", help="path to csv of smiles strings")
parser.add_argument("--smiles-col", help="column name that contains smiles strings", default=None)
parser.add_argument("--smiles-sep", help="delimiter used to seperate smiles strings, default is set to pandas default for csv", default=",")
parser.add_argument("--n-jobs", type=int, help="number of processes to use for parallel computations")

parser.add_argument("--output-dir", help="path to output directory to store vocab and numpy arrays")
args = parser.parse_args()

# read the smiles strings from the csv path
import modin.pandas as pd


if args.smiles_col is None:
smiles_df = pd.read_csv(args.smiles_path, header=None, sep=args.smiles_sep)
smiles_list = smiles_df[0].values

else:
smiles_df = pd.read_csv(args.smiles_path, sep=args.smiles_sep)
smiles_list = smiles_df[args.smiles_col].values


# if output directory does not exist, create it
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

# extract the vocab
print("extracting the vocab...")
vocab = compute_vocab(smiles_list, n_jobs=args.n_jobs)
torch.save(vocab, args.output_dir+"/vocab.pt")


if __name__ == "__main__":
main()
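The saved vocab can later be reloaded and inspected; a small sketch (the output path is a placeholder):

```python
import torch

vocab = torch.load("/path/to/output/vocab.pt")  # CharVocab written by this script
print(sorted(vocab.c2i.keys()))                 # characters plus the special tokens
```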

22 changes: 22 additions & 0 deletions scripts/extract_enamine_vocab.sl
@@ -0,0 +1,22 @@
#!/usr/bin/bash

#SBATCH --time 1-00:00:00
#SBATCH --partition pbatch
#SBATCH --nodes=1
#SBATCH --cpus-per-task=36
#SBATCH --array=1-11

# define the variables for the location of the moses repo (MOSES_HOME), the directory containing smiles chunks (SMILES_DIR), and the directory where output is to be stored (OUTPUT_DIR)
# it is assumed the code will be run from $MOSES_HOME/scripts

MOSES_HOME=/g/g13/jones289/workspace/lbann/applications/ATOM/moses
SMILES_DIR=/p/lustre2/jones289/data/enamine
OUTPUT_DIR=/g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine

# activate the python environment of your choice below... if using spack then just comment this out
source activate lbann

Owner: Hardcoded to your home directory, please fix

cd $MOSES_HOME/scripts

python compute_vocab_main.py --smiles-path ${SMILES_DIR}/2018q1-2_Enamine_REAL_680M_SMILES_part${SLURM_ARRAY_TASK_ID}.smiles --smiles-col smiles --smiles-sep='\t' --n-jobs ${SLURM_CPUS_PER_TASK} --output-dir ${OUTPUT_DIR}/part${SLURM_ARRAY_TASK_ID}
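One way to address the hardcoded-path comment above would be to take the locations from the environment with placeholder fallbacks; a sketch in the same shell style as the script:

```bash
MOSES_HOME=${MOSES_HOME:-/path/to/moses}
SMILES_DIR=${SMILES_DIR:-/path/to/enamine/smiles}
OUTPUT_DIR=${OUTPUT_DIR:-/path/to/vocab/output}
```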

10 changes: 10 additions & 0 deletions scripts/featurize_enamine.sl
@@ -0,0 +1,10 @@
#!/usr/bin/bash

#SBATCH --time 1-00:00:00
#SBATCH --partition pbatch
#SBATCH --array=1-11

source activate lbann
cd /g/g13/jones289/workspace/lbann/applications/ATOM/moses/scripts

Owner: Same comments as above

python preprocess_data.py --vocab-path /g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine/full_vocab.pt --smiles-path ~/data/enamine/2018q1-2_Enamine_REAL_680M_SMILES_part${SLURM_ARRAY_TASK_ID}.smiles --smiles-col smiles --smiles-sep '\t' --add-bos --add-eos --n-jobs 72 --test-size 0.2 --val-size 0.1 --split-dataset --output-dir /g/g13/jones289/workspace/lbann/applications/ATOM/data/enamine/part${SLURM_ARRAY_TASK_ID}
111 changes: 111 additions & 0 deletions scripts/preprocess_data.py
@@ -0,0 +1,111 @@
from char_vocab_utils import compute_string_to_int, merge_vocab


def main():
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
"--vocab-path",
nargs="+",
dest="vocab_path_list",
help="path to the vocab(s) to use to featurize the smiles data. if more than one vocab path is given, the vocabs"
"are merged and the result is used as the vocab to featurize with",
)
parser.add_argument("--smiles-path", help="path to csv of smiles strings")
parser.add_argument(
"--smiles-col", help="column name that contains smiles strings", default=None
)
parser.add_argument(
"--smiles-sep",
help="delimiter used to seperate smiles strings, default is set to pandas default for csv",
default=",",
)
parser.add_argument(
"--add-bos",
help="add the begin of string character to smiles data",
action="store_true",
)
parser.add_argument(
"--add-eos",
help="add the end of string character to smiles data",
action="store_true",
)
parser.add_argument(
"--n-jobs",
type=int,
help="number of processes to use for parallel computations",
)
parser.add_argument(
"--test-size",
type=float,
default=0.2,
help="if specified, saves the data into a seperate train/val/test split, where"
"test set will be test-size %% of the full data, val is then selected from remaining train data"
"using val-size %% of the train data",
)
parser.add_argument(
"--val-size",
type=float,
default=0.1,
help="%% of the training data to hold out as validation or dev set",
)
parser.add_argument("--split-dataset", action="store_true")
parser.add_argument(
"--output-dir", help="path to output directory to store vocab and numpy arrays"
)
args = parser.parse_args()

# read the smiles strings from the csv path, modin uses multiprocessing to do this more quickly
import modin.pandas as pd

if args.smiles_col is None:
smiles_df = pd.read_csv(args.smiles_path, header=None, sep=args.smiles_sep)
smiles_list = smiles_df[0].values

else:
smiles_df = pd.read_csv(args.smiles_path, sep=args.smiles_sep)
smiles_list = smiles_df[args.smiles_col].values

# if output directory does not exist, create it
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

# extract the vocab
print("reading vocab...")

if len(args.vocab_path_list) > 1:
print("more than one vocab was specified...merging vocabs...")
        vocab = merge_vocab(*args.vocab_path_list)

else:
vocab = torch.load(args.vocab_path_list[0])

# compute the integer representation of the smiles data
print("extracting dataset...")
data = compute_string_to_int(
smiles_list,
vocab,
n_jobs=args.n_jobs,
add_bos=args.add_bos,
add_eos=args.add_eos,
)
np.save(args.output_dir + "/full_data.npy", data)

if args.split_dataset:
# compute the splits for train/test using the full data
train_data, test_data = train_test_split(data, test_size=args.test_size)
# compute the splits for train/val using the remaining data
train_data, val_data = train_test_split(train_data, test_size=args.val_size)

np.save(args.output_dir + "/train.npy", train_data)
np.save(args.output_dir + "/val.npy", val_data)
np.save(args.output_dir + "/test.npy", test_data)


if __name__ == "__main__":
main()
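For reference, a small sketch of reading the arrays this script writes (paths are placeholders; allow_pickle is needed because the encoded rows typically have different lengths and are stored as an object array):

```python
import numpy as np

train = np.load("/path/to/output/train.npy", allow_pickle=True)
val = np.load("/path/to/output/val.npy", allow_pickle=True)
test = np.load("/path/to/output/test.npy", allow_pickle=True)

print(len(train), len(val), len(test))
print(train[0])  # integer ids for the first training string
```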