fix docker and orangeci
hanhxiao committed Apr 12, 2019
1 parent fa2b312 commit 5998558
Showing 7 changed files with 25 additions and 17 deletions.

.dockerignore: 10 changes (8 additions, 2 deletions)
@@ -1,7 +1,13 @@
 .pyre/
 .idea/
-.orange-ci.yml
 docker-push.sh
+download-models.sh
 docker-up.sh
+.orange-ci.yml
 docker-compose.yml
+docker-compose-simple.yml
+README.ms
+MAINTAINERS
+test*.yml
-toy*
+toy*
+release.sh

download_model.sh → download-models.sh: 8 changes (5 additions, 3 deletions)
@@ -10,11 +10,13 @@ URL_TRANSFORMER_XL="https://gnes-1252847528.cos.ap-guangzhou.myqcloud.com/transf
 URL_WORD2VEC="https://gnes-1252847528.cos.ap-guangzhou.myqcloud.com/sgns.wiki.bigram-char.bz2"

 wget ${URL_CHINESE_BERT} -qO temp.zip; unzip temp.zip; rm temp.zip
+wget ${URL_WORD2VEC} -qO tmp.bz2; bzip2 -d tmp.bz2; rm tmp.bz2

-bz2array=($URL_CHINESE_ELMO $URL_GPT $URL_GPT2 $URL_TRANSFORMER_XL $URL_WORD2VEC)
+tarbz2array=($URL_CHINESE_ELMO $URL_GPT $URL_GPT2 $URL_TRANSFORMER_XL)

-for url in "${bz2array[@]}"
+for url in "${tarbz2array[@]}"
 do
 printf "downloading ${url}\n"
 wget ${url} -qO tmp.tar.bz2; tar -xjf tmp.tar.bz2; rm tmp.tar.bz2
-done
+done
+
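The script change separates the two download formats: the word2vec vectors arrive as a single bzip2-compressed text file, while the ELMo/GPT/GPT-2/Transformer-XL models remain .tar.bz2 archives that need unpacking. Below is a minimal Python sketch of that distinction, for illustration only; the function names, paths, and the use of urllib/bz2/tarfile are assumptions, not part of this repository.

    import bz2
    import shutil
    import tarfile
    import urllib.request

    def fetch(url: str, dest: str) -> str:
        # Download url to dest and return the local path.
        urllib.request.urlretrieve(url, dest)
        return dest

    def extract_bz2(archive: str, out_path: str) -> None:
        # A plain .bz2 file (e.g. the word2vec vectors) holds one compressed file:
        # decompress it byte for byte, mirroring `bzip2 -d`.
        with bz2.open(archive, 'rb') as src, open(out_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)

    def extract_tar_bz2(archive: str, out_dir: str = '.') -> None:
        # A .tar.bz2 archive (the pretrained transformer checkpoints) unpacks into
        # a directory tree, mirroring `tar -xjf`.
        with tarfile.open(archive, 'r:bz2') as tar:
            tar.extractall(out_dir)

Under those assumptions, the URL stored in URL_WORD2VEC would go through extract_bz2, while each URL in tarbz2array would go through extract_tar_bz2, matching the two branches of the updated script.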

gnes/encoder/gpt.py: 8 changes (4 additions, 4 deletions)
@@ -11,15 +11,15 @@
 class GPTEncoder(BaseEncoder):

     def __init__(self,
-                 model_path: str,
+                 model_dir: str,
                  batch_size: int = 64,
                  use_cuda: bool = False,
                  pooling_strategy: str = 'REDUCE_MEAN',
                  *args,
                  **kwargs):
         super().__init__(*args, **kwargs)

-        self.model_path = model_path
+        self.model_dir = model_dir
         self.batch_size = batch_size

         # Load pre-trained model tokenizer (vocabulary)
@@ -41,8 +41,8 @@ def _get_output_tensor(self, x):
         return self._model(x)

     def _init_model_tokenizer(self):
-        self._tokenizer = OpenAIGPTTokenizer.from_pretrained(self.model_path)
-        self._model = OpenAIGPTModel.from_pretrained(self.model_path)
+        self._tokenizer = OpenAIGPTTokenizer.from_pretrained(self.model_dir)
+        self._model = OpenAIGPTModel.from_pretrained(self.model_dir)
         self._model.eval()

     @batching
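After this hunk, the encoder is constructed with model_dir instead of model_path, and the directory is handed straight to from_pretrained. A hypothetical usage sketch under that assumption; the directory path and sample sentences are placeholders, and encode() accepting a list of sentences is inferred from the test suite.

    from gnes.encoder.gpt import GPTEncoder

    # '/openai_gpt' is only the fallback used by tests/test_gpt_encoder.py;
    # point model_dir at wherever download-models.sh unpacked the OpenAI GPT model.
    encoder = GPTEncoder(model_dir='/openai_gpt',
                         batch_size=64,
                         pooling_strategy='REDUCE_MEAN')

    # One pooled vector per input sentence, as exercised by the tests.
    vectors = encoder.encode(['a sample sentence', 'another sample sentence'])
    print(vectors.shape)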

gnes/encoder/gpt2.py: 4 changes (2 additions, 2 deletions)
@@ -11,6 +11,6 @@ def _get_output_tensor(self, x):
         return self._model(x)[0]

     def _init_model_tokenizer(self):
-        self._tokenizer = GPT2Tokenizer.from_pretrained(self.model_path)
-        self._model = GPT2Model.from_pretrained(self.model_path)
+        self._tokenizer = GPT2Tokenizer.from_pretrained(self.model_dir)
+        self._model = GPT2Model.from_pretrained(self.model_dir)
         self._model.eval()

gnes/encoder/w2v.py: 6 changes (3 additions, 3 deletions)
@@ -8,20 +8,20 @@


 class Word2VecEncoder(BaseEncoder):
-    def __init__(self, model_path,
+    def __init__(self, model_dir,
                  skiprows: int = 1,
                  batch_size: int = 64,
                  pooling_strategy: str = 'REDUCE_MEAN', *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.model_path = model_path
+        self.model_dir = model_dir
         self.skiprows = skiprows
         self.batch_size = batch_size
         self.pooling_strategy = pooling_strategy
         self.is_trained = True
         self._init_word_embedding()

     def _init_word_embedding(self):
-        self.word2vec_df = pd.read_table(self.model_path, sep=' ', quoting=3,
+        self.word2vec_df = pd.read_table(self.model_dir, sep=' ', quoting=3,
                                          header=None, skiprows=self.skiprows,
                                          index_col=0)
         self.word2vec_df = self.word2vec_df.astype(np.float32).dropna(axis=1).dropna(axis=0)
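Note that Word2VecEncoder reads model_dir with pd.read_table, so the value should point at a whitespace-separated embedding text file (for example the decompressed sgns.wiki.bigram-char vectors), with the first skiprows line(s) skipped. A small stand-alone sketch of the same loading step; the file path is a placeholder.

    import numpy as np
    import pandas as pd

    # Placeholder path to word2vec vectors in text format: an optional header line,
    # then one "<token> <v1> ... <vN>" row per word.
    MODEL_FILE = '/models/sgns.wiki.bigram-char'

    word2vec_df = pd.read_table(MODEL_FILE, sep=' ', quoting=3,
                                header=None, skiprows=1, index_col=0)
    # Cast to float32 and drop incomplete rows/columns, as the encoder does.
    word2vec_df = word2vec_df.astype(np.float32).dropna(axis=1).dropna(axis=0)

    print(word2vec_df.shape)  # (vocabulary size, embedding dimension), e.g. (..., 300)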

tests/test_gpt_encoder.py: 2 changes (1 addition, 1 deletion)
@@ -25,7 +25,7 @@ def setUp(self):

     def test_encoding(self):
         _encoder = GPTEncoder(
-            model_path=os.environ.get(
+            model_dir=os.environ.get(
                 'GPT_CI_MODEL',
                 '/openai_gpt'
             ),

tests/test_w2v_encoder.py: 4 changes (2 additions, 2 deletions)
@@ -19,15 +19,15 @@ def setUp(self):

     def test_encoding(self):
         w2v_encoder = Word2VecEncoder(
-            model_path=os.environ['WORD2VEC_MODEL'],
+            model_dir=os.environ['WORD2VEC_MODEL'],
             pooling_strategy="REDUCE_MEAN")
         vec = w2v_encoder.encode(self.test_str)
         self.assertEqual(vec.shape[0], len(self.test_str))
         self.assertEqual(vec.shape[1], 300)

     def test_dump_load(self):
         w2v_encoder = Word2VecEncoder(
-            model_path=os.environ['WORD2VEC_MODEL'],
+            model_dir=os.environ['WORD2VEC_MODEL'],
             pooling_strategy="REDUCE_MEAN")
         w2v_encoder.dump(self.dump_path)
         w2v_encoder2 = Word2VecEncoder.load(self.dump_path)
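Both test modules resolve the model location through environment variables (WORD2VEC_MODEL, and GPT_CI_MODEL with a '/openai_gpt' fallback), so the keyword rename does not change how they are invoked. A rough sketch of running them locally; the paths are placeholders, and discovering by the test_*_encoder.py pattern may also pick up other encoder tests that need their own model files.

    import os
    import unittest

    # Placeholder locations; point these at the files fetched by download-models.sh.
    os.environ.setdefault('WORD2VEC_MODEL', '/models/sgns.wiki.bigram-char')
    os.environ.setdefault('GPT_CI_MODEL', '/openai_gpt')

    # Discover and run the encoder tests touched by this commit.
    suite = unittest.defaultTestLoader.discover('tests', pattern='test_*_encoder.py')
    unittest.TextTestRunner(verbosity=2).run(suite)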
