-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQA_System.py
53 lines (41 loc) · 3.3 KB
/
QA_System.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch
import numpy as np
# Pretrained BERT-large (whole-word masking) fine-tuned on SQuAD for
# extractive question answering, plus its matching WordPiece tokenizer.
# NOTE(review): first run downloads the weights from the Hugging Face hub
# (a large download) — requires network access.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer_for_bert = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
def answering_machine(question, passage, max_len=512):
    """Extract an answer to `question` from `passage` with the SQuAD BERT model.

    Parameters
    ----------
    question : str
        The question to answer.
    passage : str
        The text to search for an answer.
    max_len : int, default 512
        Maximum number of tokens kept after tokenization (512 is BERT's
        sequence-length limit); longer inputs are truncated.

    Returns
    -------
    tuple
        (answer_start_index, answer_end_index, start_token_score,
         end_token_score, answer) — indices/scores of the best span and the
        reconstructed answer string, or an apology message when no answer
        is found.
    """
    # Tokenize question + passage as a sentence pair; this adds the special
    # [CLS] and [SEP] tokens and truncates to max_len.
    input_ids = tokenizer_for_bert.encode(question, passage,
                                          max_length=max_len, truncation=True)

    # The first [SEP] marks the end of the question segment.
    # (Original hard-coded token id 102 and misnamed this `cls_index`.)
    sep_index = input_ids.index(tokenizer_for_bert.sep_token_id)
    len_question = sep_index + 1                    # [CLS] + question + [SEP]
    len_answer = len(input_ids) - len_question      # passage segment length

    # Segment ids: 0 for the question segment, 1 for the passage segment.
    segment_ids = [0] * len_question + [1] * len_answer

    # Tokens are needed later to reconstruct the answer text.
    tokens = tokenizer_for_bert.convert_ids_to_tokens(input_ids)

    # Single forward pass (the original ran the model twice for the same
    # output). no_grad() skips autograd bookkeeping since this is inference.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([segment_ids]),
                        return_dict=True)

    # Use the documented named fields instead of fragile positional indexing.
    start_token_scores = outputs.start_logits.numpy().flatten()
    end_token_scores = outputs.end_logits.numpy().flatten()

    # Best start/end positions and their scores (rounded to 2 decimals).
    answer_start_index = np.argmax(start_token_scores)
    answer_end_index = np.argmax(end_token_scores)
    start_token_score = np.round(start_token_scores[answer_start_index], 2)
    end_token_score = np.round(end_token_scores[answer_end_index], 2)

    # Re-join WordPiece sub-tokens: a token starting with "##" continues the
    # previous word, so glue it on without a space.
    answer = tokens[answer_start_index]
    for i in range(answer_start_index + 1, answer_end_index + 1):
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    # Heuristics signalling that no answer was found in the passage:
    # span starting at [CLS] (index 0), negative start score, bare [SEP],
    # or an end position before the start position.
    if (answer_start_index == 0) or (start_token_score < 0) \
            or (answer == '[SEP]') or (answer_end_index < answer_start_index):
        answer = "Sorry!, I could not find an answer in the passage."

    return (answer_start_index, answer_end_index,
            start_token_score, end_token_score, answer)