Commit
lowercase
zheyuye committed Jul 15, 2020
1 parent f5c94a6 commit 8dabfd6
Showing 10 changed files with 66 additions and 39 deletions.
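At the API level, this commit renames the SentencepieceTokenizer argument and property do_lower to lowercase, matching the HuggingFace-style tokenizers, and threads a per-model 'lowercase' flag from each PRETRAINED_URL entry into the pretrained-model getters. A minimal sketch of the renamed surface, assuming a locally available SentencePiece model at the hypothetical path 'spm.model':

from gluonnlp.data.tokenizers import SentencepieceTokenizer

# 'spm.model' is a placeholder; any trained SentencePiece model file works here.
tokenizer = SentencepieceTokenizer('spm.model', lowercase=True)  # formerly do_lower=True
assert tokenizer.lowercase                 # property renamed from do_lower
print(tokenizer.encode('Hello World'))     # input is lower-cased before segmentation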
20 changes: 10 additions & 10 deletions src/gluonnlp/data/tokenizers.py
@@ -783,7 +783,7 @@ def __setstate__(self, state):
from subword_nmt.apply_bpe import BPE
with open(self._codec_path, 'r', encoding='utf-8') as merge_codes:
self._bpe = BPE(codes=merge_codes, separator=self._separator)

class HuggingFaceTokenizer(BaseTokenizerWithVocab):
def encode(self, sentences, output_type=str):
is_multi_sentences = isinstance(sentences, list)
@@ -1252,7 +1252,7 @@ class SentencepieceTokenizer(BaseTokenizerWithVocab):
algorithm.
alpha
A scalar for a smoothing parameter for probability rescaling.
- do_lower
+ lowercase
Whether to convert the input strings to lower case
**kwargs
@@ -1273,7 +1273,7 @@ class SentencepieceTokenizer(BaseTokenizerWithVocab):
"""
def __init__(self, model_path: Optional[str] = None,
vocab: Optional[Union[str, Vocab]] = None,
- nbest: int = 0, alpha: float = 0.0, do_lower=False,
+ nbest: int = 0, alpha: float = 0.0, lowercase=False,
**kwargs):
self._model_path = model_path
sentencepiece = try_import_sentencepiece()
@@ -1283,7 +1283,7 @@ def __init__(self, model_path: Optional[str] = None,
self._sp_model.load(model_path)
self._nbest = nbest
self._alpha = alpha
- self._do_lower = do_lower
+ self._lowercase = lowercase
self._meta_symbol = u'▁'
sp_model_all_tokens = [self._sp_model.id_to_piece(i) for i in range(len(self._sp_model))]
special_tokens_kv = dict()
@@ -1364,7 +1364,7 @@ def encode(self, sentences, output_type=str):
is_multi_sentences = isinstance(sentences, list)
if not is_multi_sentences:
sentences = [sentences]
- if self._do_lower:
+ if self._lowercase:
sentences = [sentence.lower() for sentence in sentences]
if output_type is str:
ret = [self._sp_model.sample_encode_as_pieces(sentence, self._nbest, self._alpha)
@@ -1403,7 +1403,7 @@ def encode_with_offsets(self, sentences, output_type=str):
token_ids = []
offsets = []
for sentence in sentences:
- if self._do_lower:
+ if self._lowercase:
sentence = sentence.lower()
spt = self._spt_cls()
spt.ParseFromString(self._sp_model.SampleEncodeAsSerializedProto(
@@ -1464,8 +1464,8 @@ def set_vocab(self, vocab):
'SentencepieceTokenizer.')

@property
- def do_lower(self):
- return self._do_lower
+ def lowercase(self):
+ return self._lowercase

def set_subword_regularization(self, nbest, alpha):
self._nbest = nbest
@@ -1474,11 +1474,11 @@ def set_subword_regularization(self, nbest, alpha):
def __repr__(self):
ret = '{}(\n' \
' model_path = {}\n' \
- ' do_lower = {}, nbest = {}, alpha = {}\n' \
+ ' lowercase = {}, nbest = {}, alpha = {}\n' \
' vocab = {}\n' \
')'.format(self.__class__.__name__,
os.path.realpath(self._model_path),
- self._do_lower, self._nbest, self._alpha,
+ self._lowercase, self._nbest, self._alpha,
self._vocab)
return ret

10 changes: 8 additions & 2 deletions src/gluonnlp/models/albert.py
@@ -52,27 +52,31 @@
'vocab': 'google_albert_base_v2/vocab-2ee53ae7.json',
'params': 'google_albert_base_v2/model-125be477.params',
'mlm_params': 'google_albert_base_v2/model_mlm-fe20650e.params',
+ 'lowercase': True,
},
'google_albert_large_v2': {
'cfg': 'google_albert_large_v2/model-e2e9b974.yml',
'spm_model': 'google_albert_large_v2/spm-65999e5d.model',
'vocab': 'google_albert_large_v2/vocab-2ee53ae7.json',
'params': 'google_albert_large_v2/model-ad60bcd5.params',
'mlm_params': 'google_albert_large_v2/model_mlm-6a5015ee.params',
+ 'lowercase': True,
},
'google_albert_xlarge_v2': {
'cfg': 'google_albert_xlarge_v2/model-8123bffd.yml',
'spm_model': 'google_albert_xlarge_v2/spm-65999e5d.model',
'vocab': 'google_albert_xlarge_v2/vocab-2ee53ae7.json',
'params': 'google_albert_xlarge_v2/model-4149c9e2.params',
'mlm_params': 'google_albert_xlarge_v2/model_mlm-ee184d38.params',
+ 'lowercase': True,
},
'google_albert_xxlarge_v2': {
'cfg': 'google_albert_xxlarge_v2/model-07fbeebc.yml',
'spm_model': 'google_albert_xxlarge_v2/spm-65999e5d.model',
'vocab': 'google_albert_xxlarge_v2/vocab-2ee53ae7.json',
'params': 'google_albert_xxlarge_v2/model-5601a0ed.params',
'mlm_params': 'google_albert_xxlarge_v2/model_mlm-d2e2b06f.params',
+ 'lowercase': True,
},
}

@@ -658,10 +662,12 @@ def get_pretrained_albert(model_name: str = 'google_albert_base_v2',
sha1_hash=FILE_STATS[mlm_params_path])
else:
local_mlm_params_path = None
- # TODO(sxjscience) Move do_lower to assets.
+
+ do_lower = True if 'lowercase' in PRETRAINED_URL[model_name]\
+ and PRETRAINED_URL[model_name]['lowercase'] else False
tokenizer = SentencepieceTokenizer(local_paths['spm_model'],
vocab=local_paths['vocab'],
- do_lower=True)
+ lowercase=do_lower)
cfg = AlbertModel.get_cfg().clone_merge(local_paths['cfg'])
return cfg, tokenizer, local_params_path, local_mlm_params_path
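The do_lower lookup above, repeated verbatim in the getters below, reads an optional 'lowercase' key with a False default. A shorter equivalent form, shown only as a sketch rather than what the commit actually uses:

do_lower = PRETRAINED_URL[model_name].get('lowercase', False)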

1 change: 0 additions & 1 deletion src/gluonnlp/models/bert.py
@@ -650,7 +650,6 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base',
local_mlm_params_path = None
do_lower = True if 'lowercase' in PRETRAINED_URL[model_name]\
and PRETRAINED_URL[model_name]['lowercase'] else False
- # TODO(sxjscience) Move do_lower to assets.
tokenizer = HuggingFaceWordPieceTokenizer(
vocab_file=local_paths['vocab'],
unk_token='[UNK]',
9 changes: 7 additions & 2 deletions src/gluonnlp/models/electra.py
@@ -73,27 +73,31 @@ def get_generator_cfg(model_config):
'params': 'google_electra_small/model-2654c8b4.params',
'disc_model': 'google_electra_small/disc_model-137714b6.params',
'gen_model': 'google_electra_small/gen_model-d11fd0b1.params',
+ 'lowercase': True,
},
'google_electra_base': {
'cfg': 'google_electra_base/model-5b35ca0b.yml',
'vocab': 'google_electra_base/vocab-e6d2b21d.json',
'params': 'google_electra_base/model-31c235cc.params',
'disc_model': 'google_electra_base/disc_model-514bd353.params',
'gen_model': 'google_electra_base/gen_model-665ce594.params',
+ 'lowercase': True,
},
'google_electra_large': {
'cfg': 'google_electra_large/model-31b7dfdd.yml',
'vocab': 'google_electra_large/vocab-e6d2b21d.json',
'params': 'google_electra_large/model-9baf9ff5.params',
'disc_model': 'google_electra_large/disc_model-5b820c02.params',
'gen_model': 'google_electra_large/gen_model-667121df.params',
+ 'lowercase': True,
},
'gluon_electra_small_owt':{
'cfg': 'gluon_electra_small_owt/model-6e276d98.yml',
'vocab': 'gluon_electra_small_owt/vocab-e6d2b21d.json',
'params': 'gluon_electra_small_owt/model-e9636891.params',
'disc_model': 'gluon_electra_small_owt/disc_model-87836017.params',
'gen_model': 'gluon_electra_small_owt/gen_model-45a6fb67.params',
+ 'lowercase': True,
}
}

@@ -951,15 +955,16 @@ def get_pretrained_electra(model_name: str = 'google_electra_small',
sha1_hash=FILE_STATS[gen_params_path])
else:
local_gen_params_path = None
- # TODO(sxjscience) Move do_lower to assets.
+ do_lower = True if 'lowercase' in PRETRAINED_URL[model_name]\
+ and PRETRAINED_URL[model_name]['lowercase'] else False
tokenizer = HuggingFaceWordPieceTokenizer(
vocab_file=local_paths['vocab'],
unk_token='[UNK]',
pad_token='[PAD]',
cls_token='[CLS]',
sep_token='[SEP]',
mask_token='[MASK]',
- lowercase=True)
+ lowercase=do_lower)
cfg = ElectraModel.get_cfg().clone_merge(local_paths['cfg'])
return cfg, tokenizer, local_params_path, (local_disc_params_path, local_gen_params_path)

9 changes: 6 additions & 3 deletions src/gluonnlp/models/mobilebert.py
@@ -54,6 +54,7 @@
'vocab': 'google_uncased_mobilebert/vocab-e6d2b21d.json',
'params': 'google_uncased_mobilebert/model-c8346cf2.params',
'mlm_params': 'google_uncased_mobilebert/model_mlm-53948e82.params',
+ 'lowercase': True,
}
}

@@ -64,7 +65,7 @@
@use_np
class MobileBertEncoderLayer(HybridBlock):
"""The Transformer Encoder Layer in Mobile Bert"""
- # TODO(zheyuye), use stacked groups for single ffn layer in transformer.TransformerEncoderLayer
+ # TODO(zheyuye), use stacked groups for single ffn layer in TransformerEncoderLayer
# and revise the other models and scripts, making sure they are compatible.

def __init__(self,
@@ -959,15 +960,17 @@ def get_pretrained_mobilebert(model_name: str = 'google_uncased_mobilebert',
sha1_hash=FILE_STATS[mlm_params_path])
else:
local_mlm_params_path = None
- # TODO(sxjscience) Move do_lower to assets.
+
+ do_lower = True if 'lowercase' in PRETRAINED_URL[model_name]\
+ and PRETRAINED_URL[model_name]['lowercase'] else False
tokenizer = HuggingFaceWordPieceTokenizer(
vocab_file=local_paths['vocab'],
unk_token='[UNK]',
pad_token='[PAD]',
cls_token='[CLS]',
sep_token='[SEP]',
mask_token='[MASK]',
- lowercase=True)
+ lowercase=do_lower)
cfg = MobileBertModel.get_cfg().clone_merge(local_paths['cfg'])
return cfg, tokenizer, local_params_path, local_mlm_params_path

9 changes: 8 additions & 1 deletion src/gluonnlp/models/roberta.py
@@ -56,13 +56,15 @@
'vocab': 'fairseq_roberta_base/gpt2-f1335494.vocab',
'params': 'fairseq_roberta_base/model-09a1520a.params',
'mlm_params': 'fairseq_roberta_base/model_mlm-29889e2b.params',
+ 'lowercase': False,
},
'fairseq_roberta_large': {
'cfg': 'fairseq_roberta_large/model-6e66dc4a.yml',
'merges': 'fairseq_roberta_large/gpt2-396d4d8e.merges',
'vocab': 'fairseq_roberta_large/gpt2-f1335494.vocab',
'params': 'fairseq_roberta_large/model-6b043b91.params',
'mlm_params': 'fairseq_roberta_large/model_mlm-119f38e1.params',
+ 'lowercase': False,
}
}

@@ -549,7 +551,12 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base',
sha1_hash=FILE_STATS[mlm_params_path])
else:
local_mlm_params_path = None
- tokenizer = HuggingFaceByteBPETokenizer(local_paths['merges'], local_paths['vocab'])
+ do_lower = True if 'lowercase' in PRETRAINED_URL[model_name]\
+ and PRETRAINED_URL[model_name]['lowercase'] else False
+ tokenizer = HuggingFaceByteBPETokenizer(
+ merges_file=local_paths['merges'],
+ vocab_file=local_paths['vocab'],
+ lowercase=do_lower)
cfg = RobertaModel.get_cfg().clone_merge(local_paths['cfg'])
return cfg, tokenizer, local_params_path, local_mlm_params_path

9 changes: 7 additions & 2 deletions src/gluonnlp/models/xlmr.py
@@ -45,13 +45,14 @@
'sentencepiece.model': 'fairseq_xlmr_base/sentencepiece-18e17bae.model',
'params': 'fairseq_xlmr_base/model-3fa134e9.params',
'mlm_params': 'model_mlm-86e37954.params',
+ 'lowercase': False,
},
'fairseq_xlmr_large': {
'cfg': 'fairseq_xlmr_large/model-01fc59fb.yml',
'sentencepiece.model': 'fairseq_xlmr_large/sentencepiece-18e17bae.model',
'params': 'fairseq_xlmr_large/model-b62b074c.params',
'mlm_params': 'model_mlm-887506c2.params',

+ 'lowercase': False,
}
}

@@ -146,7 +147,11 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base',
else:
local_mlm_params_path = None

- tokenizer = SentencepieceTokenizer(local_paths['sentencepiece.model'])
+ do_lower = True if 'lowercase' in PRETRAINED_URL[model_name]\
+ and PRETRAINED_URL[model_name]['lowercase'] else False
+ tokenizer = SentencepieceTokenizer(
+ model_path=local_paths['sentencepiece.model'],
+ lowercase=do_lower)
cfg = XLMRModel.get_cfg().clone_merge(local_paths['cfg'])
return cfg, tokenizer, local_params_path, local_mlm_params_path

8 changes: 4 additions & 4 deletions tests/test_data_tokenizers.py
@@ -117,14 +117,14 @@ def verify_decode_spm(tokenizer, all_sentences, gt_int_decode_sentences):
(all_sentences, gt_int_decode_sentences)]:
if isinstance(sentences, str):
gt_str_decode_sentences = sentences
- if tokenizer.do_lower:
+ if tokenizer.lowercase:
gt_str_decode_sentences = gt_str_decode_sentences.lower()
gt_str_decode_sentences = unicodedata.normalize('NFKC', gt_str_decode_sentences)
elif isinstance(sentences, list):
gt_str_decode_sentences = []
for ele in sentences:
ele_gt_decode = ele
- if tokenizer.do_lower:
+ if tokenizer.lowercase:
ele_gt_decode = ele_gt_decode.lower()
ele_gt_decode = unicodedata.normalize('NFKC', ele_gt_decode)
gt_str_decode_sentences.append(ele_gt_decode)
@@ -379,11 +379,11 @@ def test_sentencepiece_tokenizer():
gt_lower_case_int_decode = ['hello, y ⁇ all! how are you viii ⁇ ⁇ ⁇ ?',
'gluonnlp is great!!!!!!',
'gluonnlp-amazon-haibin-leonard-sheng-shuai-xingjian...../:! ⁇ # ⁇ abc ⁇ ']
- tokenizer = SentencepieceTokenizer(model_path, do_lower=True)
+ tokenizer = SentencepieceTokenizer(model_path, lowercase=True)
verify_decode_spm(tokenizer, SUBWORD_TEST_SAMPLES, gt_lower_case_int_decode)

# Case3, Use the sentencepiece regularization commands, we test whether we can obtain different encoding results
- tokenizer = SentencepieceTokenizer(model_path, do_lower=True, nbest=-1, alpha=1.0)
+ tokenizer = SentencepieceTokenizer(model_path, lowercase=True, nbest=-1, alpha=1.0)
has_different_encode_out = False
encode_out = None
for _ in range(10):
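For context on the truncated loop above: with nbest=-1 and alpha=1.0, SentencePiece samples a segmentation from the candidate lattice instead of always taking the single best one, so repeated calls can tokenize the same string differently. A sketch of the property being tested, assuming the same test model_path:

tokenizer = SentencepieceTokenizer(model_path, lowercase=True, nbest=-1, alpha=1.0)
outs = [tokenizer.encode('GluonNLP is great!!!!!!') for _ in range(10)]
# Sampling should yield at least two distinct segmentations across ten draws.
assert any(out != outs[0] for out in outs)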
14 changes: 7 additions & 7 deletions tests/test_models_roberta.py
@@ -22,8 +22,15 @@ def test_roberta(model_name):
cfg, tokenizer, params_path, mlm_params_path =\
get_pretrained_roberta(model_name, load_backbone=True, load_mlm=True, root=root)
assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
+ # test backbone
roberta_model = RobertaModel.from_cfg(cfg)
roberta_model.load_parameters(params_path)
+ # test mlm model
+ roberta_mlm_model = RobertaForMLM(cfg)
+ if mlm_params_path is not None:
+ roberta_mlm_model.load_parameters(mlm_params_path)
+ roberta_mlm_model = RobertaForMLM(cfg)
+ roberta_mlm_model.backbone_model.load_parameters(params_path)

# test forward
batch_size = 3
@@ -54,10 +61,3 @@ def test_roberta(model_name):
loss = label_smooth_loss(contextual_embeddings, input_ids)
loss.backward()
mx.npx.waitall()

- # test for mlm model
- roberta_mlm_model = RobertaForMLM(cfg)
- if mlm_params_path is not None:
- roberta_mlm_model.load_parameters(mlm_params_path)
- roberta_mlm_model = RobertaForMLM(cfg)
- roberta_mlm_model.backbone_model.load_parameters(params_path)
16 changes: 9 additions & 7 deletions tests/test_models_xlmr.py
@@ -22,8 +22,17 @@ def test_xlmr():
cfg, tokenizer, params_path, mlm_params_path =\
get_pretrained_xlmr(model_name, load_backbone=True, load_mlm=True, root=root)
assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
+ # test backbone
xlmr_model = XLMRModel.from_cfg(cfg)
xlmr_model.load_parameters(params_path)
+ # test mlm model
+ xlmr = XLMRForMLM(cfg)
+ if mlm_params_path is not None:
+ xlmr.load_parameters(mlm_params_path)
+ xlmr = XLMRForMLM(cfg)
+ xlmr.backbone_model.load_parameters(params_path)


# test forward
batch_size = 1
seq_length = 8
@@ -53,10 +62,3 @@ def test_xlmr():
loss = label_smooth_loss(contextual_embeddings, input_ids)
loss.backward()
mx.npx.waitall()

- # test for mlm model
- xlmr = XLMRForMLM(cfg)
- if mlm_params_path is not None:
- xlmr.load_parameters(mlm_params_path)
- xlmr = XLMRForMLM(cfg)
- xlmr.backbone_model.load_parameters(params_path)
