5
5
Normalize entity linking by remapping links according to an external file
6
6
7
7
Usage:
8
- normalize_linking.py -i=<fpath> -o=<fpath> [--norm-time (--norm-histo --map=<fpath>) --union-meto-lit]
8
+ normalize_linking.py -i=<fpath> -o=<fpath> [--norm-time (--norm-histo --map=<fpath>) --union-meto-lit] [--hipe_edition=<str>]
9
9
normalize_linking.py -h | --help
10
10
11
11
Options:
16
16
--norm-time Normalize NEL for time mentions by linking to NIL.
17
17
--norm-histo Normalize NEL for historical entities
18
18
--union-meto-lit Unionize literal and metonymic columns (apply on both columns).
19
+ -e --hipe_edition=<str> Specify the HIPE edition. Ignores METO columns if set to hipe-2022. Possible values: hipe-2020, hipe-2022 [default: hipe-2020]
20
+
19
21
20
22
All file paths can be local or remote URLs.
21
23
26
28
import pandas as pd
27
29
from docopt import docopt
28
30
31
+ HIPE_EDITIONS = ["HIPE-2020" , "HIPE-2022" ]
29
32
30
33
def get_mappings (f_map ):
31
34
df_mapping = pd .read_csv (f_map , delimiter = "\t " )
@@ -120,10 +123,11 @@ def union(list1, list2):
120
123
return df
121
124
122
125
123
- def remove_time_linking (df , replacement = "NIL" ):
126
+ def remove_time_linking (df , replacement = "NIL" , map_meto = True ):
124
127
try :
125
128
df .loc [df ["NE-COARSE-LIT" ].str .contains ("time" ), "NEL-LIT" ] = replacement
126
- df .loc [df ["NE-COARSE-LIT" ].str .contains ("time" ), "NEL-METO" ] = replacement
129
+ if map_meto :
130
+ df .loc [df ["NE-COARSE-LIT" ].str .contains ("time" ), "NEL-METO" ] = replacement
127
131
except KeyError :
128
132
pass
129
133
@@ -138,6 +142,12 @@ def main(args):
138
142
norm_time = args ["--norm-time" ]
139
143
norm_histo = args ["--norm-histo" ]
140
144
unionize = args ["--union-meto-lit" ]
145
+ hipe_edition = args ["--hipe_edition" ].upper () # mandatory option
146
+
147
+ if hipe_edition not in HIPE_EDITIONS :
148
+ msg = f"Hipe edition was not or incorrectly set. Use --hipe_edition=hipe-2020 or --hipe_edition=hipe-2022."
149
+ logging .error (msg )
150
+ sys .exit (1 )
141
151
142
152
df = pd .read_csv (f_in , sep = "\t " , quoting = csv .QUOTE_NONE , quotechar = "" , skip_blank_lines = False )
143
153
df = df .fillna (value = {"NE-COARSE-LIT" : "" , "NEL-LIT" : "" , "NEL-METO" : "" })
@@ -147,7 +157,7 @@ def main(args):
147
157
df = normalize_n_to_n (df , mappings )
148
158
149
159
if norm_time :
150
- df = remove_time_linking (df )
160
+ df = remove_time_linking (df , map_meto = hipe_edition == 'HIPE-2020' )
151
161
152
162
if unionize :
153
163
df = unionize_meto_lit (df )
0 commit comments