-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrouge.py
115 lines (94 loc) · 3.91 KB
/
rouge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>
import numpy as np
import pdb
from ctypes import *
# Load the native shared library that exposes the LCS routine used by go_lcs.
# The path is relative, so it is resolved against the process's current working
# directory rather than against the location of this file.
lib = cdll.LoadLibrary("./libs/lcs.so")
class GoString(Structure):
    """
    Two-field C struct passed by value to ``lib.LCS``.

    The layout (char pointer followed by a 64-bit integer) presumably mirrors
    the string header that cgo exposes for Go strings -- confirm against the
    header generated for the shared library.
    """
    _fields_ = [
        ("p", c_char_p),    # pointer to the encoded bytes
        ("n", c_longlong),  # length value handed to the native side
    ]
# Declare that LCS takes two GoString structs by value so ctypes marshals the
# arguments correctly.
# NOTE(review): restype is not set, so ctypes assumes a C int return value --
# confirm this matches the return type of the exported function.
lib.LCS.argtypes = [GoString, GoString]
def my_lcs(string, sub):
    """
    Return the length of the longest common subsequence of two token sequences.

    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : tokens of the (nominally shorter) second string;
                 the two arguments are swapped internally if it is the longer one
    :returns: int : length of the longest common subsequence
    Note: only the length is computed, the subsequence itself is not recovered
    """
    if len(string) < len(sub):
        longer, shorter = sub, string
    else:
        longer, shorter = string, sub

    n_cols = len(shorter) + 1
    # table[r][c] holds the LCS length of longer[:r] and shorter[:c];
    # row 0 and column 0 stay at zero (empty prefix).
    table = [[0] * n_cols for _ in range(len(longer) + 1)]

    for r, tok_long in enumerate(longer, 1):
        above = table[r - 1]
        current = table[r]
        for c, tok_short in enumerate(shorter, 1):
            if tok_long == tok_short:
                # Matching tokens extend the LCS of both shorter prefixes.
                current[c] = above[c - 1] + 1
            else:
                # Otherwise carry over the best result that drops one token.
                current[c] = max(above[c], current[c - 1])

    return table[-1][-1]
def go_lcs(string, sub):
    """
    Compute the LCS value for two token lists via the native ``lib.LCS`` routine.

    :param string : list of str : tokens of the first sentence
    :param sub : list of str : tokens of the second sentence
    :returns: whatever ``lib.LCS`` returns for the two packed arguments

    NOTE(review): the tokens are concatenated without a separator before being
    encoded, and the length field is set to the number of tokens rather than
    the byte length of the encoded string -- confirm this is what the native
    LCS implementation expects.
    """
    first = GoString("".join(string).encode(), len(string))
    second = GoString("".join(sub).encode(), len(sub))
    return lib.LCS(first, second)
class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    '''

    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        # beta weights recall against precision in the F-measure of calc_score.
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: list of str : single-element list holding the candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated against references)
        :raises AssertionError: if candidate does not hold exactly one sentence or refs is empty
        """
        assert(len(candidate) == 1)
        assert(len(refs) > 0)

        # split the candidate into tokens once; it is reused for every reference
        token_c = candidate[0].split(" ")

        prec = []
        rec = []
        for reference in refs:
            # split the reference into tokens
            token_r = reference.split(" ")
            # longest common subsequence, computed by the native go_lcs helper
            # (used here in place of the pure-Python my_lcs)
            lcs = go_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        # the best precision and the best recall are taken independently
        # over all references
        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            # beta-weighted F-measure of the best precision and recall
            score = ((1 + self.beta**2) * prec_max * rec_max) / float(rec_max + self.beta**2 * prec_max)
        else:
            score = 0.0
        return score

    def compute_score(self, gts, res):
        """
        Computes Rouge-L score given a set of reference and candidate sentences for the dataset
        Invoked by evaluate_captions.py
        :param gts: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
        :param res: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
        :returns: (average_score, scores): float mean ROUGE-L score over all the images, and a
                  numpy array with the per-image scores in the iteration order of gts
        :raises AssertionError: if gts and res have different keys, or an entry is not a list
                  of the expected size
        """
        assert(set(gts.keys()) == set(res.keys()))
        imgIds = list(gts.keys())

        score = []
        for img_id in imgIds:
            hypo = res[img_id]
            ref = gts[img_id]

            # Sanity check, done before scoring so malformed entries are
            # rejected before they reach the native LCS call.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

            score.append(self.calc_score(hypo, ref))

        average_score = np.mean(np.array(score))
        return average_score, np.array(score)

    def method(self):
        """Return the display name of this metric."""
        return "Rouge"