Skip to content

Commit 6605770

Browse files
add hipe-2022 edition option (to prevent spurious normalization on METO cols)
1 parent dffa502 commit 6605770

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

normalize_linking.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
Normalize entity linking by remapping links according to an external file
66
77
Usage:
8-
normalize_linking.py -i=<fpath> -o=<fpath> [--norm-time (--norm-histo --map=<fpath>) --union-meto-lit]
8+
normalize_linking.py -i=<fpath> -o=<fpath> [--norm-time (--norm-histo --map=<fpath>) --union-meto-lit] [--hipe_edition=<str>]
99
normalize_linking.py -h | --help
1010
1111
Options:
@@ -16,6 +16,8 @@
1616
--norm-time Normalize NEL for time mentions by linking to NIL.
1717
--norm-histo Normalize NEL for historical entities
1818
--union-meto-lit Unionize literal and metonymic columns (apply on both columns).
19+
-e --hipe_edition=<str> Specify the HIPE edition. Ignores METO columns if set to hipe-2022. Possible values: hipe-2020, hipe-2022 [default: hipe-2020]
20+
1921
2022
All file paths can be local or remote URLs.
2123
@@ -26,6 +28,7 @@
2628
import pandas as pd
2729
from docopt import docopt
2830

31+
HIPE_EDITIONS = ["HIPE-2020", "HIPE-2022"]
2932

3033
def get_mappings(f_map):
3134
df_mapping = pd.read_csv(f_map, delimiter="\t")
@@ -120,10 +123,11 @@ def union(list1, list2):
120123
return df
121124

122125

123-
def remove_time_linking(df, replacement="NIL"):
126+
def remove_time_linking(df, replacement="NIL",map_meto=True):
124127
try:
125128
df.loc[df["NE-COARSE-LIT"].str.contains("time"), "NEL-LIT"] = replacement
126-
df.loc[df["NE-COARSE-LIT"].str.contains("time"), "NEL-METO"] = replacement
129+
if map_meto:
130+
df.loc[df["NE-COARSE-LIT"].str.contains("time"), "NEL-METO"] = replacement
127131
except KeyError:
128132
pass
129133

@@ -138,6 +142,12 @@ def main(args):
138142
norm_time = args["--norm-time"]
139143
norm_histo = args["--norm-histo"]
140144
unionize = args["--union-meto-lit"]
145+
hipe_edition = args["--hipe_edition"].upper() # optional; the usage text declares [default: hipe-2020]
146+
147+
if hipe_edition not in HIPE_EDITIONS:
148+
msg = f"Hipe edition was not or incorrectly set. Use --hipe_edition=hipe-2020 or --hipe_edition=hipe-2022."
149+
logging.error(msg)
150+
sys.exit(1)
141151

142152
df = pd.read_csv(f_in, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", skip_blank_lines=False)
143153
df = df.fillna(value={"NE-COARSE-LIT": "", "NEL-LIT": "", "NEL-METO": ""})
@@ -147,7 +157,7 @@ def main(args):
147157
df = normalize_n_to_n(df, mappings)
148158

149159
if norm_time:
150-
df = remove_time_linking(df)
160+
df = remove_time_linking(df,map_meto=hipe_edition == 'HIPE-2020')
151161

152162
if unionize:
153163
df = unionize_meto_lit(df)

0 commit comments

Comments
 (0)