-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #38 from georgetown-cset/135-remove-fp
Add ability to exclude pairs of ids for matching
- Loading branch information
Showing
13 changed files
with
223 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,3 +3,4 @@ metadata_match | |
all_match_pairs_with_um | ||
simhash_input | ||
lid_input | ||
ids_to_drop |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
-- Distinct merged ids of every literature.sources row whose orig_id
-- appears as id1 in staging_literature.unlink.
SELECT DISTINCT s.merged_id
FROM
  literature.sources AS s
WHERE
  EXISTS (
    SELECT 1
    FROM staging_literature.unlink AS u
    WHERE u.id1 = s.orig_id
  )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"merged_id": "carticle_0000000003"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{"id1": "A", "id2": "B"} | ||
{"id1": "B", "id2": "A"} | ||
{"id1": "B", "id2": "C"} | ||
{"id1": "C", "id2": "B"} | ||
{"id1": "D", "id2": "E"} | ||
{"id1": "E", "id2": "D"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"id1": "B", "id2": "C"} | ||
{"id1": "D", "id2": "E"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import json | ||
import os | ||
import shutil | ||
import unittest | ||
|
||
from utils.make_unlink_rows import make_pairs | ||
|
||
static_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static") | ||
|
||
|
||
class TestMakeUnlinkRows(unittest.TestCase):
    """Tests for the pair-generation helper in utils.make_unlink_rows."""

    @staticmethod
    def gen_sort_key(pair: tuple) -> str:
        """
        Build a string key used to sort id pairs into a stable order
        :param pair: Tuple whose first two elements are ids
        :return: The two ids joined with a hyphen
        """
        return f"{pair[0]}-{pair[1]}"

    def test_make_pairs(self):
        """Every id must be paired, in both directions, with every id from a different group."""
        groups = {"1": {"a", "b"}, "2": {"d", "e"}, "3": {"f"}}
        # One row per left-hand id; ids from the same group never pair up.
        wanted = [
            ("a", "d"), ("a", "e"), ("a", "f"),
            ("b", "d"), ("b", "e"), ("b", "f"),
            ("d", "a"), ("d", "b"), ("d", "f"),
            ("e", "a"), ("e", "b"), ("e", "f"),
            ("f", "a"), ("f", "b"), ("f", "d"), ("f", "e"),
        ]
        wanted.sort(key=self.gen_sort_key)
        # make_pairs iterates over sets, so sort its output before comparing.
        actual = sorted(make_pairs(groups), key=self.gen_sort_key)
        self.assertEqual(wanted, actual)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import argparse | ||
import csv | ||
|
||
|
||
def make_pairs(manual_to_orig: dict) -> list:
    """
    Make all pairs of ids that should be unlinked

    Each original id is paired, in both directions, with every original id that belongs to a
    different manual id. An id is never paired with itself, and each pair is returned once, even
    if the same original id was listed under more than one manual id.
    :param manual_to_orig: Dict mapping manually assigned ids to original ids that we believe to be the same article
    :return: A list of pairs of ids that should not be linked together
    """
    # A dict doubles as an insertion-ordered set, so duplicates collapse
    # while the first-seen order of the pairs is preserved.
    pairs = {}
    for manual1, origs1 in manual_to_orig.items():
        for orig1 in origs1:
            for manual2, origs2 in manual_to_orig.items():
                if manual1 == manual2:
                    continue
                for orig2 in origs2:
                    # The same original id can show up under two manual ids;
                    # "unlinking" an id from itself is meaningless.
                    if orig1 == orig2:
                        continue
                    pairs[(orig1, orig2)] = None
    return list(pairs)
|
||
|
||
def _sql_string_literal(value: str) -> str:
    """Render a value as a double-quoted SQL string literal, escaping characters that would end or break it."""
    escaped = (
        str(value)
        .replace("\\", "\\\\")  # must come first so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def write_unlink_rows(unlinking_file: str, output_file: str) -> None:
    """
    Write a sql file containing a query that adds new rows to the staging_literature.unlink table

    If the input yields no pairs, the query simply recreates the table from its current contents
    rather than ending in a dangling `union all`.
    :param unlinking_file: CSV containing two columns, `manual_id` (a manually assigned id marking articles that are the same),
    and `orig_id`, the id for the article in its source corpus
    :param output_file: SQL file containing a query that adds new rows to staging_literature.unlink
    :return: None
    """
    manual_to_orig = {}
    # newline="" is what the csv module expects; utf-8-sig transparently drops
    # a leading BOM (common in spreadsheet exports) that would otherwise be
    # glued onto the first header name.
    with open(unlinking_file, newline="", encoding="utf-8-sig") as f:
        for line in csv.DictReader(f):
            manual_to_orig.setdefault(line["manual_id"], set()).add(line["orig_id"])
    pairs = make_pairs(manual_to_orig)
    # The existing table contents always come first; each new pair becomes one
    # more select that is unioned onto them.
    selects = ["select id1, id2 from staging_literature.unlink"]
    selects.extend(
        f"select {_sql_string_literal(id1)} as id1, {_sql_string_literal(id2)} as id2"
        for id1, id2 in pairs
    )
    with open(output_file, mode="w", encoding="utf-8") as out:
        out.write("create or replace table staging_literature.unlink as\n")
        out.write("\nunion all\n".join(selects))
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: both arguments are positional and are passed
    # straight through to write_unlink_rows.
    arg_parser = argparse.ArgumentParser()
    positional_args = (
        ("unlinking_file", "csv with two columns: manual_id and orig_id"),
        ("output_file", "file where query adding new rows should be written"),
    )
    for arg_name, arg_help in positional_args:
        arg_parser.add_argument(arg_name, help=arg_help)
    cli_args = arg_parser.parse_args()

    write_unlink_rows(cli_args.unlinking_file, cli_args.output_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters