From 8a917e5ddb1fca71f3ede20d487966086448e99c Mon Sep 17 00:00:00 2001 From: Carlo Fisicaro Date: Mon, 7 Mar 2022 18:16:12 +0100 Subject: [PATCH] struct: create proteonemo package --- ...rt_inference_from_preprocessed_config.yaml | 0 ...t_pretrained_from_preprocessed_config.yaml | 0 proteonemo/__init__.py | 15 +++++++- proteonemo/{version.py => models/__init__.py} | 2 +- proteonemo/{ => models}/bert_prot_model.py | 0 proteonemo/package_info.py | 35 +++++++++++++++++++ .../preprocessing}/__init__.py | 10 +++++- .../preprocessing}/create_pretraining_data.py | 4 +-- .../preprocessing/downloader.py | 6 ++-- .../preprocessing/protein_sharding.py | 0 .../preprocessing}/tokenization.py | 0 .../preprocessing/uniparc_downloader.py | 0 .../preprocessing/uniprotkb_downloader.py | 0 .../preprocessing/uniref_downloader.py | 0 {proteonemo => scripts}/bert_pred.py | 2 +- .../bertPrep.py => scripts/bert_prep.py | 8 ++--- {proteonemo => scripts}/bert_pretraining.py | 4 +-- .../create_datasets_from_start.sh | 18 +++++----- {preprocessing => static}/vocab.txt | 0 {preprocessing => static}/vocab_small.txt | 0 20 files changed, 80 insertions(+), 24 deletions(-) rename {proteonemo/conf => conf}/bert_inference_from_preprocessed_config.yaml (100%) rename {proteonemo/conf => conf}/bert_pretrained_from_preprocessed_config.yaml (100%) rename proteonemo/{version.py => models/__init__.py} (90%) rename proteonemo/{ => models}/bert_prot_model.py (100%) create mode 100644 proteonemo/package_info.py rename {preprocessing => proteonemo/preprocessing}/__init__.py (52%) rename {preprocessing => proteonemo/preprocessing}/create_pretraining_data.py (98%) rename preprocessing/Downloader.py => proteonemo/preprocessing/downloader.py (91%) rename preprocessing/ProteinSharding.py => proteonemo/preprocessing/protein_sharding.py (100%) rename {preprocessing => proteonemo/preprocessing}/tokenization.py (100%) rename preprocessing/UniParcDownloader.py => proteonemo/preprocessing/uniparc_downloader.py (100%) 
rename preprocessing/UniProtKBDownloader.py => proteonemo/preprocessing/uniprotkb_downloader.py (100%) rename preprocessing/UniRefDownloader.py => proteonemo/preprocessing/uniref_downloader.py (100%) rename {proteonemo => scripts}/bert_pred.py (97%) rename preprocessing/bertPrep.py => scripts/bert_prep.py (96%) rename {proteonemo => scripts}/bert_pretraining.py (89%) rename {preprocessing => scripts}/create_datasets_from_start.sh (63%) rename {preprocessing => static}/vocab.txt (100%) rename {preprocessing => static}/vocab_small.txt (100%) diff --git a/proteonemo/conf/bert_inference_from_preprocessed_config.yaml b/conf/bert_inference_from_preprocessed_config.yaml similarity index 100% rename from proteonemo/conf/bert_inference_from_preprocessed_config.yaml rename to conf/bert_inference_from_preprocessed_config.yaml diff --git a/proteonemo/conf/bert_pretrained_from_preprocessed_config.yaml b/conf/bert_pretrained_from_preprocessed_config.yaml similarity index 100% rename from proteonemo/conf/bert_pretrained_from_preprocessed_config.yaml rename to conf/bert_pretrained_from_preprocessed_config.yaml diff --git a/proteonemo/__init__.py b/proteonemo/__init__.py index 9521972..247fe04 100644 --- a/proteonemo/__init__.py +++ b/proteonemo/__init__.py @@ -12,4 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .version import version as __version__ + +from proteonemo.package_info import ( + __contact_emails__, + __contact_names__, + __description__, + __download_url__, + __homepage__, + __keywords__, + __license__, + __package_name__, + __repository_url__, + __shortversion__, + __version__, +) \ No newline at end of file diff --git a/proteonemo/version.py b/proteonemo/models/__init__.py similarity index 90% rename from proteonemo/version.py rename to proteonemo/models/__init__.py index f4ae6d4..b5c9c31 100644 --- a/proteonemo/version.py +++ b/proteonemo/models/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -version = "0.1.0" +from proteonemo.models.bert_prot_model import BERTPROTModel \ No newline at end of file diff --git a/proteonemo/bert_prot_model.py b/proteonemo/models/bert_prot_model.py similarity index 100% rename from proteonemo/bert_prot_model.py rename to proteonemo/models/bert_prot_model.py diff --git a/proteonemo/package_info.py b/proteonemo/package_info.py new file mode 100644 index 0000000..81ae3e0 --- /dev/null +++ b/proteonemo/package_info.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 Peptone. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +MAJOR = 0 +MINOR = 1 +PATCH = 0 +PRE_RELEASE = '' + +# Use the following formatting: (major, minor, patch, pre-release) +VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) + +__shortversion__ = '.'.join(map(str, VERSION[:3])) +__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) + +__package_name__ = 'proteonemo' +__contact_names__ = 'Peptone' +__contact_emails__ = 'carlo@peptone.io' +__homepage__ = 'https://peptone.io/' +__repository_url__ = 'https://github.com/PeptoneInc/ProteoNeMo.git' +__download_url__ = 'https://github.com/PeptoneInc/ProteoNeMo/archive/refs/heads/main.zip' +__description__ = 'ProteoNeMo - protein embeddings at scale' +__license__ = 'Apache2' +__keywords__ = 'protein, embedding, deep learning, machine learning, gpu, NeMo, peptone, pytorch, torch, tts' \ No newline at end of file diff --git a/preprocessing/__init__.py b/proteonemo/preprocessing/__init__.py similarity index 52% rename from preprocessing/__init__.py rename to proteonemo/preprocessing/__init__.py index cff8137..4314dae 100644 --- a/preprocessing/__init__.py +++ b/proteonemo/preprocessing/__init__.py @@ -10,4 +10,12 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
+ +from proteonemo.preprocessing.tokenization import ProteoNeMoTokenizer +from proteonemo.preprocessing import tokenization +from proteonemo.preprocessing.uniref_downloader import UniRefDownloader +from proteonemo.preprocessing.uniprotkb_downloader import UniProtKBDownloader +from proteonemo.preprocessing.uniparc_downloader import UniParcDownloader +from proteonemo.preprocessing.downloader import Downloader +from proteonemo.preprocessing.protein_sharding import Sharding \ No newline at end of file diff --git a/preprocessing/create_pretraining_data.py b/proteonemo/preprocessing/create_pretraining_data.py similarity index 98% rename from preprocessing/create_pretraining_data.py rename to proteonemo/preprocessing/create_pretraining_data.py index a935215..59d8028 100644 --- a/preprocessing/create_pretraining_data.py +++ b/proteonemo/preprocessing/create_pretraining_data.py @@ -24,8 +24,8 @@ import numpy as np from tqdm import tqdm, trange -from tokenization import ProteoNeMoTokenizer -import tokenization as tokenization +from proteonemo.preprocessing.tokenization import ProteoNeMoTokenizer +from proteonemo.preprocessing import tokenization as tokenization import random import collections diff --git a/preprocessing/Downloader.py b/proteonemo/preprocessing/downloader.py similarity index 91% rename from preprocessing/Downloader.py rename to proteonemo/preprocessing/downloader.py index 4d75b5b..dce860c 100644 --- a/preprocessing/Downloader.py +++ b/proteonemo/preprocessing/downloader.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from UniRefDownloader import UniRefDownloader -from UniProtKBDownloader import UniProtKBDownloader -from UniParcDownloader import UniParcDownloader +from proteonemo.preprocessing.uniref_downloader import UniRefDownloader +from proteonemo.preprocessing.uniprotkb_downloader import UniProtKBDownloader +from proteonemo.preprocessing.uniparc_downloader import UniParcDownloader class Downloader: diff --git a/preprocessing/ProteinSharding.py b/proteonemo/preprocessing/protein_sharding.py similarity index 100% rename from preprocessing/ProteinSharding.py rename to proteonemo/preprocessing/protein_sharding.py diff --git a/preprocessing/tokenization.py b/proteonemo/preprocessing/tokenization.py similarity index 100% rename from preprocessing/tokenization.py rename to proteonemo/preprocessing/tokenization.py diff --git a/preprocessing/UniParcDownloader.py b/proteonemo/preprocessing/uniparc_downloader.py similarity index 100% rename from preprocessing/UniParcDownloader.py rename to proteonemo/preprocessing/uniparc_downloader.py diff --git a/preprocessing/UniProtKBDownloader.py b/proteonemo/preprocessing/uniprotkb_downloader.py similarity index 100% rename from preprocessing/UniProtKBDownloader.py rename to proteonemo/preprocessing/uniprotkb_downloader.py diff --git a/preprocessing/UniRefDownloader.py b/proteonemo/preprocessing/uniref_downloader.py similarity index 100% rename from preprocessing/UniRefDownloader.py rename to proteonemo/preprocessing/uniref_downloader.py diff --git a/proteonemo/bert_pred.py b/scripts/bert_pred.py similarity index 97% rename from proteonemo/bert_pred.py rename to scripts/bert_pred.py index f77a197..ad79555 100644 --- a/proteonemo/bert_pred.py +++ b/scripts/bert_pred.py @@ -18,7 +18,7 @@ from nemo.core.config import hydra_runner from pytorch_lightning.plugins import DDPPlugin from nemo.utils.app_state import AppState -from bert_prot_model import BERTPROTModel +from proteonemo.models.bert_prot_model import BERTPROTModel from 
nemo.collections.nlp.data.language_modeling.lm_bert_dataset import BertPretrainingPreprocessedDataset from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank from torch.utils.data import DataLoader diff --git a/preprocessing/bertPrep.py b/scripts/bert_prep.py similarity index 96% rename from preprocessing/bertPrep.py rename to scripts/bert_prep.py index 625aad9..9ac5554 100644 --- a/preprocessing/bertPrep.py +++ b/scripts/bert_prep.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import Downloader -import ProteinSharding +from proteonemo.preprocessing.downloader import Downloader +from proteonemo.preprocessing.protein_sharding import Sharding import argparse import os @@ -51,7 +51,7 @@ def main(args): if not os.path.exists(directory_structure['download']): os.makedirs(directory_structure['download']) - downloader = Downloader.Downloader(args.dataset, directory_structure['download']) + downloader = Downloader(args.dataset, directory_structure['download']) downloader.download() elif args.action == 'sharding': @@ -90,7 +90,7 @@ def main(args): os.makedirs(directory_structure['sharded'] + '/' + args.dataset) rng = random.Random(args.random_seed) - sharding = ProteinSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set, rng) + sharding = Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set, rng) sharding.load_fastas() sharding.write_shards_to_disk() diff --git a/proteonemo/bert_pretraining.py b/scripts/bert_pretraining.py similarity index 89% rename from proteonemo/bert_pretraining.py rename to scripts/bert_pretraining.py index e79774b..5a66c47 100644 --- a/proteonemo/bert_pretraining.py +++ b/scripts/bert_pretraining.py @@ -18,13 +18,13 @@ from pytorch_lightning.plugins import DDPPlugin #from 
nemo.collections.nlp.models.language_modeling import BERTLMModel -from bert_prot_model import BERTPROTModel +from proteonemo.models.bert_prot_model import BERTPROTModel from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -@hydra_runner(config_path="conf", config_name="bert_pretraining_from_preprocessed_config") +@hydra_runner(config_path="../conf", config_name="bert_pretraining_from_preprocessed_config") def main(cfg: DictConfig) -> None: logging.info(f'Config:\n {OmegaConf.to_yaml(cfg)}') trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], **cfg.trainer) diff --git a/preprocessing/create_datasets_from_start.sh b/scripts/create_datasets_from_start.sh similarity index 63% rename from preprocessing/create_datasets_from_start.sh rename to scripts/create_datasets_from_start.sh index aaa4186..14988e2 100644 --- a/preprocessing/create_datasets_from_start.sh +++ b/scripts/create_datasets_from_start.sh @@ -16,17 +16,17 @@ to_download=${1:-"uniref_50_only"} #Download if [ "$to_download" = "uniref_all" ] ; then - python3 bertPrep.py --action download --dataset uniref_90 - python3 bertPrep.py --action download --dataset uniref_100 + python3 bert_prep.py --action download --dataset uniref_90 + python3 bert_prep.py --action download --dataset uniref_100 elif [ "$to_download" = "uniparc" ] ; then - python3 /proteonemo/preprocessing/bertPrep.py --action download --dataset uniparc + python3 bert_prep.py --action download --dataset uniparc elif [ "$to_download" = "uniprotkb_all" ] ; then - python3 bertPrep.py --action download --dataset uniprotkb_swissprot - python3 bertPrep.py --action download --dataset uniprotkb_trembl - python3 bertPrep.py --action download --dataset uniprotkb_isoformseqs + python3 bert_prep.py --action download --dataset uniprotkb_swissprot + python3 bert_prep.py --action download --dataset uniprotkb_trembl + python3 bert_prep.py --action download --dataset uniprotkb_isoformseqs fi -python3 /proteonemo/preprocessing/bertPrep.py 
--action download --dataset uniref_50 +python3 bert_prep.py --action download --dataset uniref_50 if [ "$to_download" = "uniref_all" ] ; then DATASET="uniref_all" @@ -40,9 +40,9 @@ else fi # Shard the text files -python3 bertPrep.py --action sharding --dataset $DATASET +python3 bert_prep.py --action sharding --dataset $DATASET # Create HDF5 files -python3 bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 1024 \ ---max_predictions_per_seq 160 --vocab_file vocab.txt --small_vocab_file vocab_small.txt --do_upper_case +python3 bert_prep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 1024 \ +--max_predictions_per_seq 160 --vocab_file ../static/vocab.txt --small_vocab_file ../static/vocab_small.txt --do_upper_case diff --git a/preprocessing/vocab.txt b/static/vocab.txt similarity index 100% rename from preprocessing/vocab.txt rename to static/vocab.txt diff --git a/preprocessing/vocab_small.txt b/static/vocab_small.txt similarity index 100% rename from preprocessing/vocab_small.txt rename to static/vocab_small.txt