# classify.py
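# Usage (assumed invocation, based on the flags defined below):
#   python classify.py --file input.txt --openai_key sk-...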
import argparse

import dill as pickle
import numpy as np
import openai
import tiktoken
from sklearn.linear_model import LogisticRegression

from utils.featurize import normalize, t_featurize_logprobs, score_ngram
from utils.symbolic import train_trigram, get_words, vec_functions, scalar_functions
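# Command-line interface: the file to classify and an optional OpenAI API key.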
parser = argparse.ArgumentParser()
parser.add_argument("--file", type=str, default="input.txt")
parser.add_argument("--openai_key", type=str, default="")
args = parser.parse_args()
if args.openai_key != "":
    openai.api_key = args.openai_key
file = args.file
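# Truncate documents to 2047 tokens: one short of davinci's 2048-token
# context window, leaving room for the prepended <|endoftext|> token.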
MAX_TOKENS = 2047
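# Symbolic feature expressions selected during training, one per line.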
best_features = open("model/features.txt").read().strip().split("\n")
# Load davinci tokenizer
enc = tiktoken.encoding_for_model("davinci")
# Load the classifier and the feature standardization statistics
# (per-feature mean and standard deviation computed at training time)
model = pickle.load(open("model/model", "rb"))
mu = pickle.load(open("model/mu", "rb"))
sigma = pickle.load(open("model/sigma", "rb"))
# Load data and featurize
with open(file) as f:
    doc = f.read().strip()
# Strip data to first MAX_TOKENS tokens
tokens = enc.encode(doc)[:MAX_TOKENS]
doc = enc.decode(tokens).strip()
print(f"Input: {doc}")
# Train trigram
print("Loading Trigram...")
trigram_model = train_trigram()
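# Score each token of the document under the trigram model and its
# unigram base model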
trigram = np.array(score_ngram(doc, trigram_model, enc.encode, n=3, strip_first=False))
unigram = np.array(score_ngram(doc, trigram_model.base, enc.encode, n=1, strip_first=False))
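# Request per-token logprobs from ada without generating any new text:
# echo=True returns logprobs for the prompt itself, max_tokens=0
# suppresses completion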
response = openai.Completion.create(
    model="ada",
    prompt="<|endoftext|>" + doc,
    max_tokens=0,
    echo=True,
    logprobs=1,
)
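# Convert logprobs to probabilities; [1:] drops the first prompt token,
# the prepended <|endoftext|>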
ada = np.exp(np.array(response["choices"][0]["logprobs"]["token_logprobs"][1:]))
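# Repeat the same echo query against davinci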
response = openai.Completion.create(
    model="davinci",
    prompt="<|endoftext|>" + doc,
    max_tokens=0,
    echo=True,
    logprobs=1,
)
davinci = np.exp(np.array(response["choices"][0]["logprobs"]["token_logprobs"][1:]))
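# The token strings themselves, aligned with the probability vectors above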
subwords = response["choices"][0]["logprobs"]["tokens"][1:]
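# Re-encode whitespace characters with GPT-2 byte-pair symbols so the token
# strings match the representation the featurizer expects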
gpt2_map = {"\n": "Ċ", "\t": "ĉ", " ": "Ġ"}
for i in range(len(subwords)):
    for k, v in gpt2_map.items():
        subwords[i] = subwords[i].replace(k, v)
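# Hand-crafted features computed from the two probability streams and the
# token strings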
t_features = t_featurize_logprobs(davinci, ada, subwords)
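# Named vectors that the symbolic feature expressions may reference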
vector_map = {
    "davinci-logprobs": davinci,
    "ada-logprobs": ada,
    "trigram-logprobs": trigram,
    "unigram-logprobs": unigram,
}
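# Evaluate each symbolic feature expression: start from a named vector,
# apply each vector-valued function to the running vector and the next
# named vector, and stop at the first scalar-valued function, which
# reduces the result to a single feature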
exp_features = []
for exp in best_features:
    exp_tokens = get_words(exp)
    curr = vector_map[exp_tokens[0]]

    for i in range(1, len(exp_tokens)):
        if exp_tokens[i] in vec_functions:
            next_vec = vector_map[exp_tokens[i + 1]]
            curr = vec_functions[exp_tokens[i]](curr, next_vec)
        elif exp_tokens[i] in scalar_functions:
            exp_features.append(scalar_functions[exp_tokens[i]](curr))
            break
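# Standardize the combined feature vector with the training-time statistics,
# then report the probability the classifier assigns to class 1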
data = (np.array(t_features + exp_features) - mu) / sigma
preds = model.predict_proba(data.reshape(1, -1))[:, 1]
print(f"Prediction: {preds}")