struct: create proteonemo package
CFisicaro committed Mar 7, 2022
1 parent 058b07a commit 8a917e5
Showing 20 changed files with 80 additions and 24 deletions.
15 changes: 14 additions & 1 deletion proteonemo/__init__.py
@@ -12,4 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .version import version as __version__

from proteonemo.package_info import (
__contact_emails__,
__contact_names__,
__description__,
__download_url__,
__homepage__,
__keywords__,
__license__,
__package_name__,
__repository_url__,
__shortversion__,
__version__,
)
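For illustration, a minimal sketch of how the re-exported metadata becomes reachable from the top-level package after this change (assumes the package is installed; the values come from the new package_info.py shown further down):

import proteonemo

print(proteonemo.__version__)       # '0.1.0' at this commit
print(proteonemo.__package_name__)  # 'proteonemo'
print(proteonemo.__license__)       # 'Apache2'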
2 changes: 1 addition & 1 deletion proteonemo/version.py → proteonemo/models/__init__.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

version = "0.1.0"
from proteonemo.models.bert_prot_model import BERTPROTModel
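With this re-export in place, callers no longer need the full module path; an illustrative one-liner:

from proteonemo.models import BERTPROTModel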
File renamed without changes.
35 changes: 35 additions & 0 deletions proteonemo/package_info.py
@@ -0,0 +1,35 @@
# Copyright (c) 2021 Peptone.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


MAJOR = 0
MINOR = 1
PATCH = 0
PRE_RELEASE = ''

# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)

__shortversion__ = '.'.join(map(str, VERSION[:3]))
__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])

__package_name__ = 'proteonemo'
__contact_names__ = 'Peptone'
__contact_emails__ = 'carlo@peptone.io'
__homepage__ = 'https://peptone.io/'
__repository_url__ = 'https://github.com/PeptoneInc/ProteoNeMo.git'
__download_url__ = 'https://github.com/PeptoneInc/ProteoNeMo/archive/refs/heads/main.zip'
__description__ = 'ProteoNeMo - protein embeddings at scale'
__license__ = 'Apache2'
__keywords__ = 'protein, embedding, deep learning, machine learning, gpu, NeMo, peptone, pytorch, torch, tts'
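As a quick check of the version-string construction above (the 'rc1' value below is hypothetical, not part of this commit):

# With the values in this commit:
#   VERSION          == (0, 1, 0, '')
#   __shortversion__ == '0.1.0'
#   __version__      == '0.1.0'
# A hypothetical pre-release, e.g. PRE_RELEASE = 'rc1', would give:
#   __version__      == '0.1.0rc1'   # '0.1.0' + 'rc1'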
@@ -10,4 +10,12 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from proteonemo.preprocessing.tokenization import ProteoNeMoTokenizer
from proteonemo.preprocessing import tokenization
from proteonemo.preprocessing.uniref_downloader import UniRefDownloader
from proteonemo.preprocessing.uniprotkb_downloader import UniProtKBDownloader
from proteonemo.preprocessing.uniparc_downloader import UniParcDownloader
from proteonemo.preprocessing.downloader import Downloader
from proteonemo.preprocessing.protein_sharding import Sharding
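These re-exports also allow the flatter subpackage imports sketched below (illustrative only; constructor arguments appear later in scripts/bert_prep.py):

from proteonemo.preprocessing import ProteoNeMoTokenizer, Downloader, Sharding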
@@ -24,8 +24,8 @@
import numpy as np
from tqdm import tqdm, trange

from tokenization import ProteoNeMoTokenizer
import tokenization as tokenization
from proteonemo.preprocessing.tokenization import ProteoNeMoTokenizer
from proteonemo.preprocessing import tokenization as tokenization

import random
import collections
@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from UniRefDownloader import UniRefDownloader
from UniProtKBDownloader import UniProtKBDownloader
from UniParcDownloader import UniParcDownloader
from proteonemo.preprocessing.uniref_downloader import UniRefDownloader
from proteonemo.preprocessing.uniprotkb_downloader import UniProtKBDownloader
from proteonemo.preprocessing.uniparc_downloader import UniParcDownloader


class Downloader:
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion proteonemo/bert_pred.py → scripts/bert_pred.py
@@ -18,7 +18,7 @@
from nemo.core.config import hydra_runner
from pytorch_lightning.plugins import DDPPlugin
from nemo.utils.app_state import AppState
from bert_prot_model import BERTPROTModel
from proteonemo.models.bert_prot_model import BERTPROTModel
from nemo.collections.nlp.data.language_modeling.lm_bert_dataset import BertPretrainingPreprocessedDataset
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
from torch.utils.data import DataLoader
8 changes: 4 additions & 4 deletions preprocessing/bertPrep.py → scripts/bert_prep.py
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import Downloader
import ProteinSharding
from proteonemo.preprocessing.downloader import Downloader
from proteonemo.preprocessing.protein_sharding import Sharding

import argparse
import os
@@ -51,7 +51,7 @@ def main(args):
if not os.path.exists(directory_structure['download']):
os.makedirs(directory_structure['download'])

downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
downloader = Downloader(args.dataset, directory_structure['download'])
downloader.download()

elif args.action == 'sharding':
@@ -90,7 +90,7 @@ def main(args):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset)

rng = random.Random(args.random_seed)
sharding = ProteinSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set, rng)
sharding = Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set, rng)
sharding.load_fastas()
sharding.write_shards_to_disk()

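A self-contained sketch of the same download-then-shard flow outside the script, using the constructor and method calls visible in this diff; the dataset name, paths, and shard counts are placeholder values, not taken from the commit:

import random

from proteonemo.preprocessing.downloader import Downloader
from proteonemo.preprocessing.protein_sharding import Sharding

# Placeholder inputs -- adjust to your environment.
downloader = Downloader('uniref_50', '/workspace/download')   # dataset name, download directory
downloader.download()

rng = random.Random(42)
sharding = Sharding(
    ['/workspace/download/uniref_50.fasta'],   # input FASTA files (placeholder path)
    '/workspace/sharded/uniref_50/shard',      # output file prefix (placeholder)
    256,                                       # n_training_shards (placeholder)
    1,                                         # n_test_shards (placeholder)
    0.1,                                       # fraction_test_set (placeholder)
    rng,
)
sharding.load_fastas()
sharding.write_shards_to_disk()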
@@ -18,13 +18,13 @@
from pytorch_lightning.plugins import DDPPlugin

#from nemo.collections.nlp.models.language_modeling import BERTLMModel
from bert_prot_model import BERTPROTModel
from proteonemo.models.bert_prot_model import BERTPROTModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="bert_pretraining_from_preprocessed_config")
@hydra_runner(config_path="../conf", config_name="bert_pretraining_from_preprocessed_config")
def main(cfg: DictConfig) -> None:
logging.info(f'Config:\n {OmegaConf.to_yaml(cfg)}')
trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], **cfg.trainer)
@@ -16,17 +16,17 @@ to_download=${1:-"uniref_50_only"}

#Download
if [ "$to_download" = "uniref_all" ] ; then
python3 bertPrep.py --action download --dataset uniref_90
python3 bertPrep.py --action download --dataset uniref_100
python3 bert_prep.py --action download --dataset uniref_90
python3 bert_prep.py --action download --dataset uniref_100
elif [ "$to_download" = "uniparc" ] ; then
python3 /proteonemo/preprocessing/bertPrep.py --action download --dataset uniparc
elif [ "$to_download" = "uniprotkb_all" ] ; then
python3 bertPrep.py --action download --dataset uniprotkb_swissprot
python3 bertPrep.py --action download --dataset uniprotkb_trembl
python3 bertPrep.py --action download --dataset uniprotkb_isoformseqs
python3 bert_prep.py --action download --dataset uniprotkb_swissprot
python3 bert_prep.py --action download --dataset uniprotkb_trembl
python3 bert_prep.py --action download --dataset uniprotkb_isoformseqs
fi

python3 /proteonemo/preprocessing/bertPrep.py --action download --dataset uniref_50
python3 bert_prep.py --action download --dataset uniref_50

if [ "$to_download" = "uniref_all" ] ; then
DATASET="uniref_all"
@@ -40,9 +40,9 @@ else
fi

# Shard the text files
python3 bertPrep.py --action sharding --dataset $DATASET
python3 bert_prep.py --action sharding --dataset $DATASET

# Create HDF5 files
python3 bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 1024 \
--max_predictions_per_seq 160 --vocab_file vocab.txt --small_vocab_file vocab_small.txt --do_upper_case
python3 bert_prep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 1024 \
--max_predictions_per_seq 160 --vocab_file ../static/vocab.txt --small_vocab_file ../static/vocab_small.txt --do_upper_case

File renamed without changes.
File renamed without changes.
