
Commit

Add QA example and minor fix
rixwew committed Jan 24, 2019
1 parent 6e21874 commit 7d96f84
Showing 13 changed files with 540 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "examples/question-answering/dataset"]
path = examples/question-answering/dataset
url = https://github.com/shuzi/insuranceQA.git
2 changes: 1 addition & 1 deletion README.md
@@ -95,7 +95,7 @@ The ivfpq_query within the search request body could be used with other elastics
{
    "query": {
        "ivfpq_query": {
-            "query": "",
+            "query": "0.02125333994626999,0.000217707478441298,...,0.001304438104853034",
            "fields": ["feature"]
        }
    },
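The `query` value filled in above is simply the feature vector serialized as a comma-separated string. A minimal sketch of building such a request body from a numpy vector (the `feat` variable and the `size` value are illustrative, not part of the plugin):

```python
import numpy as np

feat = np.random.random(64).astype('float32')  # stand-in for a real feature vector
body = {
    "query": {
        "ivfpq_query": {
            "query": ",".join(str(x) for x in feat),
            "fields": ["feature"]
        }
    },
    "size": 10
}
```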
3 changes: 2 additions & 1 deletion examples/image-search/README.md
@@ -14,9 +14,10 @@
bash prepare.sh
```

-## Search answers using elasticsearch plugin
+## Search similar images using elasticsearch plugin

```bash
export PYTHONPATH=$PATH_TO_SCRIPT_DIR/lib:$PYTHONPATH
python search_example.py --query "./images/143700.jpg" \
    --result_size 5
```
6 changes: 4 additions & 2 deletions examples/image-search/search_example.py
@@ -34,14 +34,14 @@ def get_features(image_encoder, image_path_iter, use_cuda, batch_size=64):
    return names, np.concatenate(feats, axis=0)


-def main(query, result_size, dataset_path, m, use_cuda):
+def main(query, result_size, dataset_path, nlist, m, use_cuda):
    image_encoder = ImageEncoder().eval()
    if use_cuda:
        image_encoder = image_encoder.cuda()
    es = elasticsearch.Elasticsearch()
    client = SearchClient(es, index_name='images', type_name='image')
    names, feats = get_features(image_encoder, Path(dataset_path).iterdir(), use_cuda)
-    coarse_centroids, pq_centroids, ksub, dsub = fit_pq_params(feats, feats.shape[1], m)
+    coarse_centroids, pq_centroids, ksub, dsub = fit_pq_params(feats, feats.shape[1], nlist, m)
    client.create_mapping(feats.shape[1], m, ksub, coarse_centroids, pq_centroids)
    client.add_vectors(names, feats)
    _, encoded_query = get_features(image_encoder, [query], use_cuda, batch_size=1)
@@ -56,12 +56,14 @@ def main(query, result_size, dataset_path, m, use_cuda):
    parser.add_argument('--query', required=True)
    parser.add_argument('--result_size', type=int, default=5)
    parser.add_argument('--dataset', default='dataset/jpg')
+    parser.add_argument('--nlist', type=int, default=8)
    parser.add_argument('--m', type=int, default=64)
    parser.add_argument('--use_cuda', type=bool, default=True)
    args = parser.parse_args()

    main(query=args.query,
         result_size=args.result_size,
         dataset_path=args.dataset,
+         nlist=args.nlist,
         m=args.m,
         use_cuda=args.use_cuda)
8 changes: 3 additions & 5 deletions examples/lib/common.py
@@ -1,9 +1,9 @@
import faiss


-def fit_pq_params(xb, d, m):
+def fit_pq_params(xb, d, nlist, m):
    quantizer = faiss.IndexFlatL2(d)
-    index = faiss.IndexIVFPQ(quantizer, d, 8, m, 8)
+    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 4)
    index.train(xb)
    coarse_centroids = [quantizer.xb.at(i) for i in range(quantizer.xb.size())]
    pq_centroids = [index.pq.centroids.at(i) for i in range(index.pq.centroids.size())]
@@ -37,9 +37,7 @@ def query(self, feat, result_size=10):
                    'fields': ['vector']
                }
            },
-            "sort": [
-                {"_score": {"order": "asc"}},
-            ],
+            'sort': {'_score': {'order': 'asc'}},
            'size': result_size
        }
        response = self.client.search(self.index_name, body=query)
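In faiss, the `IndexIVFPQ` arguments are `(quantizer, d, nlist, m, nbits)`: `nlist` coarse IVF clusters, `m` product-quantizer sub-vectors (so `d` must be divisible by `m`), and `nbits` bits per code, giving `ksub = 2**nbits` centroids per sub-quantizer — 16 with the new value of 4. A small sketch of calling the updated helper on random data (sizes are illustrative):

```python
import numpy as np

# illustrative call of the updated fit_pq_params on random vectors
xb = np.random.random((2000, 64)).astype('float32')
coarse, pq, ksub, dsub = fit_pq_params(xb, d=64, nlist=8, m=16)
print(ksub, dsub)  # 16 codes per sub-quantizer (2**4), sub-vector dim 64 / 16 = 4
```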
113 changes: 113 additions & 0 deletions examples/question-answering/README.md
@@ -0,0 +1,113 @@
# Question Answering Example

This question answering example is based on the paper "LSTM-based Deep Learning Models
for Non-factoid Answer Selection" by Tan, dos Santos, Xiang, and Zhou.

## Requirements

* pytorch 1.0
* numpy
* gensim
* elasticsearch

## Download the InsuranceQA data and train the model

```bash
bash prepare.sh
python train.py
```

InsuranceQA Version 1 top-1 precision results

| Model | Validation | Test1 | Test2 |
|:---------------------------------|-----------:|------:|------:|
| QA-LSTM basic-model, max pooling (100 epochs) | 62.2 | 63.8 | 58.8 |
| QA-LSTM basic-model, max pooling (paper) | 64.3 | 63.1 | 58.0 |


## Search answers using elasticsearch plugin

```bash
export PYTHONPATH=$PATH_TO_SCRIPT_DIR/lib:$PYTHONPATH
python search_example.py --question "Can a Non us citizen get Life Insurance" \
    --result_size 5
```

```json
{
  "took": 36,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 870,
    "max_score": null,
    "hits": [
      {
        "_index": "answers",
        "_type": "answer",
        "_id": "o1i1f2gBaJEWlukYG7sK",
        "_score": 0.5443098,
        "_source": {
          "description": "a non citizen can get life insurance with most company if they have a green card or an H-1b work visa some company do require the applicant be a US citizen before allow them get a life insurance policy and some will only allow green card but not work visa contact an agent find out which company will work for your situation"
        },
        "sort": [
          0.5443098
        ]
      },
      {
        "_index": "answers",
        "_type": "answer",
        "_id": "81i2f2gBaJEWlukYacAb",
        "_score": 0.7198508,
        "_source": {
          "description": "yes there be absolutely no requirement a person be a citizen buy life insurance each company make its own decision on requirement but citizenship be not 1 them so long as you be in the country legally you can buy life insurance different ID be require different carrier but rest assure if your age and health warrant it you can buy life insurance on yourself here in the USA love help thank you Gary Lane"
        },
        "sort": [
          0.7198508
        ]
      },
      {
        "_index": "answers",
        "_type": "answer",
        "_id": "0Fiyf2gBaJEWlukYdLDC",
        "_score": 0.75013983,
        "_source": {
          "description": "you do not have be a citizen obtain life insurance US life insurer require the propose insured must be a permanent resident of the US that mean a US citizen or a non US citizen who be a lawful permanent US resident ( green card or on certain visa type the applicant will also need have the means pay premium and have a demonstrable life insurance need i.e. generate earn income or asset protect here some insurer have develop foreign national program that can also work in situation where established US interest and tie exist plus meet some additional criterion citizen of some country may not be eligible it can be a complex area of field underwriting so much so that our firm have develop a special questionnaire help shop for coverage be sure work with a life insurance professional with experience in this area"
        },
        "sort": [
          0.75013983
        ]
      },
      {
        "_index": "answers",
        "_type": "answer",
        "_id": "9Fi2f2gBaJEWlukYacBr",
        "_score": 0.75358534,
        "_source": {
          "description": "yes a non US citizen can get life insurance with many American company it be up to the discretion of each company as to what type of citizenship or residency they will accept a green card be usually ok and many company will accept a work visa as qualification for apply for life insurance in the US get life insurance in the US as a non US citizen however almost always require have a residence in the United States"
        },
        "sort": [
          0.75358534
        ]
      },
      {
        "_index": "answers",
        "_type": "answer",
        "_id": "Nliwf2gBaJEWlukYIKcx",
        "_score": 0.7816857,
        "_source": {
          "description": "almost anyone can get life insurance the only people who can not get life insurance those who have serious health problem who fall outside the age guideline guarantee issue those who do not have any income at all even they may able to get a policy with a cap on the face amount in the us those who do not have citizenship a green card work visa"
        },
        "sort": [
          0.7816857
        ]
      }
    ]
  }
}
```
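The hits come back sorted ascending on `_score`, so the best answer is listed first (the plugin's score behaves like a distance: smaller is better). A short sketch of pulling the ranked answers out of a response like the one above, assuming it has already been parsed into a `response` dict:

```python
# print score and a preview of each ranked answer
for hit in response["hits"]["hits"]:
    print(f'{hit["_score"]:.4f}  {hit["_source"]["description"][:60]}...')
```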
1 change: 1 addition & 0 deletions examples/question-answering/dataset
Submodule dataset added at 5c380d
114 changes: 114 additions & 0 deletions examples/question-answering/dataset.py
@@ -0,0 +1,114 @@
import collections

import numpy
import torch.utils.data


class Vocab(object):

    def __init__(self, vocab_path, lexicon, unk_surf='<UNK>', thresh=5):
        self.vid2surf = dict()
        lexicon = {vocab_id for vocab_id, count in lexicon.items() if count >= thresh}
        with open(vocab_path, encoding='utf-8') as f:
            for _line in f:
                vocab_id, surf = _line.rstrip().split('\t')
                if vocab_id in lexicon:
                    self.vid2surf[vocab_id] = surf
        self.vid2wid = {vocab_id: i + 1 for i, vocab_id in enumerate(self.vid2surf)}
        self.wid2surf = {self.vid2wid.get(vid): surf for vid, surf in self.vid2surf.items()}
        self.unk_surf = unk_surf
        self.unk_word_id = len(self.vid2wid) + 1

    def surfaces(self, vocab_ids):
        return [self.vid2surf.get(vocab_id, self.unk_surf) for vocab_id in vocab_ids]

    def word_ids(self, vocab_ids):
        return [self.vid2wid.get(vocab_id, self.unk_word_id) for vocab_id in vocab_ids]

    def __len__(self):
        return len(self.vid2surf) + 1


class AnswerData(object):

    def __init__(self, data_path):
        self.answers = dict()
        self.lexicon = list()
        with open(data_path, encoding='utf-8') as f:
            for _line in f:
                answer_id, answer = _line.rstrip().split('\t')
                vocab_ids = answer.split(' ')
                self.answers[int(answer_id)] = vocab_ids
                self.lexicon.extend(vocab_ids)
        self.lexicon = collections.Counter(self.lexicon)


class QaData(object):

    def __init__(self, data_path):
        self.questions = list()
        self.positive = list()
        self.negative = list()
        self.lexicon = list()
        with open(data_path, encoding='utf-8') as f:
            for _line in f:
                values = _line.rstrip().split('\t')
                if len(values) == 2:
                    question, answer_ids = values
                    positive_ids = list(map(int, answer_ids.split(' ')))
                    negative_ids = list()
                elif len(values) == 3:
                    answer_ids, question, pool = values
                    positive_ids = list(map(int, answer_ids.split(' ')))
                    negative_ids = list(filter(lambda x: x not in set(positive_ids),
                                               map(int, pool.split(' '))))
                else:
                    continue
                vocab_ids = question.split(' ')
                self.questions.append(vocab_ids)
                self.lexicon.extend(vocab_ids)
                self.positive.append(positive_ids)
                self.negative.append(negative_ids)
        self.lexicon = collections.Counter(self.lexicon)


class InsuranceQaDataset(torch.utils.data.Dataset):

    def __init__(self, question_data, answer_data, vocab, max_length=200):
        self.vocab = vocab
        self.positive = question_data.positive
        self.negative = question_data.negative
        self.questions = list(map(self.vocab.word_ids, question_data.questions))
        self.answer_map = dict()
        for answer_id, vids in answer_data.answers.items():
            self.answer_map[answer_id] = self.vocab.word_ids(vids[:max_length])
        self.answers = list(self.answer_map.values())

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question, positive_ids, negative_ids = \
            self.questions[index], self.positive[index], self.negative[index]
        positive = self.answer_map[positive_ids[numpy.random.randint(len(positive_ids))]]
        if len(negative_ids) > 0:
            negative = self.answer_map[negative_ids[numpy.random.randint(len(negative_ids))]]
        else:
            negative = self.answers[numpy.random.randint(len(self.answers))]
        return torch.LongTensor(question), \
            torch.LongTensor(positive), \
            torch.LongTensor(negative)

    def get_qa_entry(self, index):
        question, positive_ids, negative_ids = \
            self.questions[index], self.positive[index], self.negative[index]
        positives = [self.answer_map[positive_id] for positive_id in positive_ids]
        negatives = [self.answer_map[negative_id] for negative_id in negative_ids]
        return question, positives, negatives

    @classmethod
    def collate(cls, batch):
        qs, ps, ns = zip(*batch)
        return torch.nn.utils.rnn.pad_sequence(qs, batch_first=True), \
            torch.nn.utils.rnn.pad_sequence(ps, batch_first=True), \
            torch.nn.utils.rnn.pad_sequence(ns, batch_first=True)
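A sketch of wiring these classes into a training loader. The file paths are assumptions based on the InsuranceQA V1 layout fetched by `prepare.sh`; adjust them to the actual submodule contents:

```python
import torch.utils.data

# build the vocabulary from both the answer and question lexicons (Counters add)
answers = AnswerData('dataset/V1/answers.label.token_idx')
qa_train = QaData('dataset/V1/question.train.token_idx.label')
vocab = Vocab('dataset/V1/vocabulary', answers.lexicon + qa_train.lexicon)

train_set = InsuranceQaDataset(qa_train, answers, vocab)
loader = torch.utils.data.DataLoader(
    train_set, batch_size=16, shuffle=True,
    collate_fn=InsuranceQaDataset.collate)

# each batch is a (question, positive, negative) triple of padded LongTensors
question, positive, negative = next(iter(loader))
```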
19 changes: 19 additions & 0 deletions examples/question-answering/loss.py
@@ -0,0 +1,19 @@
import torch


class QaLoss(torch.nn.Module):

    def __init__(self, margin):
        super().__init__()
        self.margin = margin

    def forward(self, question, positive, negative):
        """
        max {0, margin - cosine(q, a+) + cosine(q, a-)}
        """
        positive_sim = (question * positive).sum(1, keepdim=True)
        negative_sim = (question * negative).sum(1, keepdim=True)
        zeros = positive_sim.data.new_zeros(*positive_sim.shape)
        loss = torch.cat((zeros, negative_sim - positive_sim + self.margin), dim=1)
        loss, _ = torch.max(loss, dim=1)
        return torch.mean(loss)
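A quick sanity check on random, unit-normalized stand-ins for encoder outputs (`SentenceEncoder` in models.py L2-normalizes its outputs, so the element-wise product sums above are cosine similarities; the margin value here is illustrative):

```python
import torch

q = torch.nn.functional.normalize(torch.randn(4, 128), dim=1)
a_pos = torch.nn.functional.normalize(torch.randn(4, 128), dim=1)
a_neg = torch.nn.functional.normalize(torch.randn(4, 128), dim=1)

criterion = QaLoss(margin=0.2)  # illustrative margin
loss = criterion(q, a_pos, a_neg)
print(loss.item())  # 0 only when every positive beats its negative by the margin
```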
28 changes: 28 additions & 0 deletions examples/question-answering/models.py
@@ -0,0 +1,28 @@
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class SentenceEncoder(torch.nn.Module):

    def __init__(self, embedding_weights, hidden_size):
        super().__init__()
        embedding_weights = torch.FloatTensor(embedding_weights)
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_weights)
        self.rnn = torch.nn.LSTM(embedding_weights.shape[-1], hidden_size,
                                 batch_first=True, bidirectional=True)

    def forward(self, x):
        lengths = (-x.data.eq(0).long() + 1).sum(1)
        _, idx_sort = torch.sort(lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        x = x.index_select(0, idx_sort)
        lengths = lengths.index_select(0, idx_sort)
        x = self.embedding(x)
        x = pack_padded_sequence(x, lengths, batch_first=True)
        x, *_ = self.rnn(x)
        x, _ = pad_packed_sequence(x, batch_first=True, padding_value=float('-inf'))
        x, _ = torch.max(x, dim=1)
        norm = x.norm(p=2, dim=1, keepdim=True)
        x = x.div(norm)
        x = x.index_select(0, idx_unsort)
        return x
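A minimal smoke test, assuming random embedding weights and pad id 0 (the id layout produced by `Vocab` in dataset.py): the encoder packs each sequence by its true length, max-pools the bidirectional LSTM states, and returns unit-length vectors:

```python
import numpy as np
import torch

vocab_size, embed_dim, hidden_size = 100, 50, 64  # illustrative sizes
weights = np.random.randn(vocab_size, embed_dim).astype('float32')
weights[0] = 0.0  # row 0 plays the role of the padding id

encoder = SentenceEncoder(weights, hidden_size).eval()
batch = torch.LongTensor([[5, 7, 9, 0, 0],   # length 3, zero-padded
                          [3, 2, 8, 4, 1]])  # length 5
with torch.no_grad():
    out = encoder(batch)
print(out.shape)        # torch.Size([2, 128]): 2 * hidden_size (bidirectional)
print(out.norm(dim=1))  # ~1.0 everywhere, since outputs are L2-normalized
```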
8 changes: 8 additions & 0 deletions examples/question-answering/prepare.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# download and unzip insurance qa dataset
git submodule update --init --recursive

# download pretrained word2vec model
curl -O https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
gzip -d GoogleNews-vectors-negative300.bin.gz
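`prepare.sh` leaves `GoogleNews-vectors-negative300.bin` in the working directory. A minimal sketch of loading it with gensim (listed in the example's requirements) to look up pretrained word vectors:

```python
from gensim.models import KeyedVectors

# load the pretrained vectors downloaded by prepare.sh (takes a few minutes)
w2v = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)
print(w2v.vector_size)       # 300
print(w2v['insurance'][:5])  # first few dimensions of one word vector
```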