Skip to content

Commit

Permalink
Add USPTO regression test
Browse files Browse the repository at this point in the history
While at it:
- Make CROSSREF test data directory name more descriptive
- Remove spaces from USPTO test data directory
- Add rule to populate USPTO database to the common directory
  • Loading branch information
dspinellis committed Oct 19, 2023
1 parent 27db8ee commit 77fff33
Show file tree
Hide file tree
Showing 22 changed files with 50 additions and 43 deletions.
3 changes: 2 additions & 1 deletion examples/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
TOP_DIR=$(shell readlink -f ..)

# Use small data samples for all data files
export CROSSREF_DIR?=$(TOP_DIR)/tests/data/sample
export CROSSREF_DIR?=$(TOP_DIR)/tests/data/crossref-sample
export USPTO_DIR?=$(TOP_DIR)/tests/data/uspto-2023-04
export ORCID_SUMMARIES?=$(TOP_DIR)/tests/data/ORCID_2022_10_summaries.tar.gz
export DBLP?=$(TOP_DIR)/tests/data/dblp.xml.gz
export ROR?=$(TOP_DIR)/tests/data/ror.zip
Expand Down
1 change: 1 addition & 0 deletions examples/common/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ rolap
ror-v1.17.1-2022-12-16.zip
simple-rolap
tables
uspto-data
4 changes: 4 additions & 0 deletions examples/common/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export ROLAPDB?=rolap

A3K?=a3k
CROSSREF_DIR?=../common/Crossref-April-2022
USPTO_DIR?=../common/uspto-data
ORCID_SUMMARIES?=../common/ORCID_2022_10_summaries.tar.gz
ROR?=../common/ror-v1.17.1-2022-12-16.zip

Expand All @@ -22,6 +23,9 @@ $(CROSSREF_DIR):
aria2c https://doi.org/10.13003/83b2gq && \
mv 'April 2022 Public Data File from Crossref' Crossref-April-2022

# Obtain the USPTO data when $(USPTO_DIR) does not exist yet, by running
# fetch-uspto.sh from within ../common.
# NOTE(review): the script is presumably expected to create the default
# ../common/uspto-data directory; confirm against fetch-uspto.sh.
$(USPTO_DIR):
cd ../common && ./fetch-uspto.sh

$(ORCID_SUMMARIES):
curl -L https://orcid.figshare.com/ndownloader/files/37635374 >$@

Expand Down
File renamed without changes.
7 changes: 7 additions & 0 deletions examples/uspto/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.depend
.depend.all
populate
reports
rolap
simple-rolap
tables
24 changes: 9 additions & 15 deletions examples/uspto/Makefile
Original file line number Diff line number Diff line change
@@ -1,20 +1,14 @@
#
# CI placeholder for USPTO example
# Run USPTO example queries
#

export RDBMS?=sqlite
export ROLAPDB?=rolap
export MAINDB?=uspto
export DEPENDENCIES=populate

A3K?=a3k
include ../common/Makefile

V?=1
TIME?=time
export SQLITE_TMPDIR?=.

include simple-rolap/Makefile

simple-rolap/Makefile:
git clone https://github.com/dspinellis/simple-rolap

depclean:
rm -f $(DEPENDENCIES)
# Populate the $(MAINDB).db database with the USPTO data found in $(USPTO_DIR).
# The a3k command is prefixed with $(TIME) and run with "--debug progress".
# Once it succeeds, an empty "populate" file is touched to act as the
# target's timestamp, so the step is not repeated while that file is up to date.
populate: $(USPTO_DIR)
$(TIME) $(A3K) --debug progress \
populate "$(MAINDB).db" uspto "$(USPTO_DIR)"
touch $@
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
20 changes: 10 additions & 10 deletions tests/data_sources/test_crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def setUpClass(cls):
FileCache.file_reads = 0
# debug.set_flags(["sql", "dump-matched"])

cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(DATABASE_PATH)
cls.con = sqlite3.connect(DATABASE_PATH)
cls.cursor = cls.con.cursor()
Expand Down Expand Up @@ -237,7 +237,7 @@ def setUpClass(cls):
FileCache.file_reads = 0
# debug.set_flags(["sql", "dump-matched"])

cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(DATABASE_PATH, None, "issn_print = '16191366'")
cls.con = sqlite3.connect(DATABASE_PATH)
cls.cursor = cls.con.cursor()
Expand All @@ -261,7 +261,7 @@ def setUpClass(cls):
FileCache.file_reads = 0
# debug.set_flags(["sql", "dump-matched"])

cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(
DATABASE_PATH, None, "work_authors.orcid = '0000-0002-5878-603X'"
)
Expand Down Expand Up @@ -289,7 +289,7 @@ def setUpClass(cls):
FileCache.file_reads = 0

# debug.set_flags(["sql"])
cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate( DATABASE_PATH, ["works.doi"])
cls.con = sqlite3.connect(DATABASE_PATH)
cls.cursor = cls.con.cursor()
Expand Down Expand Up @@ -320,7 +320,7 @@ def setUpClass(cls):
FileCache.file_reads = 0

# debug.set_flags(["sql"])
cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(
DATABASE_PATH,
["works.doi"],
Expand Down Expand Up @@ -356,7 +356,7 @@ def setUpClass(cls):
FileCache.file_reads = 0

# debug.set_flags(["sql"])
cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(
DATABASE_PATH,
["works.doi", "work_funders.*"],
Expand Down Expand Up @@ -395,7 +395,7 @@ def setUpClass(cls):
ensure_unlinked(DATABASE_PATH)
FileCache.file_reads = 0

cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(
DATABASE_PATH,
["work_updates.label"],
Expand Down Expand Up @@ -432,7 +432,7 @@ def setUpClass(cls):
FileCache.file_reads = 0
populate_attached()
cls.crossref = crossref.Crossref(
td("data/sample"),
td("data/crossref-sample"),
attach_databases=[f"attached:{ATTACHED_DATABASE_PATH}"]
)

Expand Down Expand Up @@ -479,7 +479,7 @@ def setUpClass(cls):
FileCache.file_reads = 0
populate_attached()
cls.crossref = crossref.Crossref(
td("data/sample"),
td("data/crossref-sample"),
attach_databases=[f"attached:{ATTACHED_DATABASE_PATH}"]
)

Expand Down Expand Up @@ -557,7 +557,7 @@ def setUpClass(cls):

# debug.set_flags(["sql"])
cls.crossref = crossref.Crossref(
td("data/sample"),
td("data/crossref-sample"),
attach_databases=[f"attached:{ATTACHED_DATABASE_PATH}"]
)
cls.crossref.populate(
Expand Down
4 changes: 2 additions & 2 deletions tests/data_sources/test_orcid.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def setUpClass(cls):
ensure_unlinked(DATABASE_PATH)

# Add known authors
cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(DATABASE_PATH)

cls.orcid = orcid.Orcid(td("data/ORCID_2022_10_summaries.tar.gz"))
Expand Down Expand Up @@ -176,7 +176,7 @@ def setUpClass(cls):
ensure_unlinked(DATABASE_PATH)

# Add known authors
cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(DATABASE_PATH)

cls.orcid = orcid.Orcid(td("data/ORCID_2022_10_summaries.tar.gz"))
Expand Down
22 changes: 11 additions & 11 deletions tests/data_sources/test_uspto.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def setUpClass(cls):

FileCache.parse_counter = 0
UsptoZipCache.file_reads = 0
cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(DATABASE_PATH)
cls.con = sqlite3.connect(DATABASE_PATH)
cls.cursor = cls.con.cursor()
Expand Down Expand Up @@ -131,7 +131,7 @@ def setUpClass(cls):
# debug.set_flags(["sql", "dump-matched"])

FileCache.parse_counter = 0
cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(DATABASE_PATH, None, "us_patents.type = 'plant'")
cls.con = sqlite3.connect(DATABASE_PATH)
cls.cursor = cls.con.cursor()
Expand All @@ -154,7 +154,7 @@ def setUpClass(cls):
FileCache.parse_counter = 0
# debug.set_flags(["sql", "dump-matched"])

cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(
DATABASE_PATH, None, "usp_icpr_classifications.subclass = 'G'"
)
Expand Down Expand Up @@ -187,7 +187,7 @@ def setUpClass(cls):
FileCache.parse_counter = 0

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(DATABASE_PATH, ["us_patents.type"])
cls.con = sqlite3.connect(DATABASE_PATH)
cls.cursor = cls.con.cursor()
Expand Down Expand Up @@ -219,7 +219,7 @@ def setUpClass(cls):
FileCache.parse_counter = 0

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(
DATABASE_PATH,
["us_patents.drawings_number"],
Expand Down Expand Up @@ -255,7 +255,7 @@ def setUpClass(cls):
FileCache.parse_counter = 0

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(
DATABASE_PATH,
["us_patents.drawings_number", "usp_icpr_classifications.*"],
Expand Down Expand Up @@ -288,7 +288,7 @@ def setUpClass(cls):
FileCache.parse_counter = 0

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(td("data/April 2023 Patent Grant Bibliographic Data"))
cls.uspto = uspto.Uspto(td("data/uspto-2023-04"))
cls.uspto.populate(
DATABASE_PATH,
["us_patents.figures_number"],
Expand Down Expand Up @@ -324,7 +324,7 @@ def setUpClass(cls):

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(
td("data/April 2023 Patent Grant Bibliographic Data"),
td("data/uspto-2023-04"),
attach_databases=[f"attached_uspto:{ATTACHED_DATABASE_PATH}"],
)

Expand Down Expand Up @@ -359,7 +359,7 @@ def setUpClass(cls):

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(
td("data/April 2023 Patent Grant Bibliographic Data"),
td("data/uspto-2023-04"),
attach_databases=[f"attached_uspto:{ATTACHED_DATABASE_PATH}"],
)

Expand Down Expand Up @@ -572,7 +572,7 @@ def setUpClass(cls):

# debug.set_flags(["sql"])
cls.uspto = uspto.Uspto(
td("data/April 2023 Patent Grant Bibliographic Data"),
td("data/uspto-2023-04"),
attach_databases=[f"attached_uspto:{ATTACHED_DATABASE_PATH}"],
)
cls.uspto.populate(
Expand Down Expand Up @@ -603,7 +603,7 @@ def setUpClass(cls):
FileCache.parse_counter = 0
UsptoZipCache.file_reads = 0
cls.uspto = uspto.Uspto(
td("data/April 2023 Patent Grant Bibliographic Data"),
td("data/uspto-2023-04"),
sample=lambda data: True
if (data[0] == "path")
else True
Expand Down
2 changes: 1 addition & 1 deletion tests/processes/test_link_aa_ror.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def setUpClass(cls):
cls.ror.populate(DATABASE_PATH)

# Needed to test author-ror linking
cls.crossref = crossref.Crossref(td("data/sample"))
cls.crossref = crossref.Crossref(td("data/crossref-sample"))
cls.crossref.populate(DATABASE_PATH)

cls.con = apsw.Connection(DATABASE_PATH)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_tsort.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class TestTsort(unittest.TestCase):
def setUp(self):
# debug.set_flags(["sql"])
FileCache.file_reads = 0
self.crossref = crossref.Crossref(td("data/sample"))
self.crossref = crossref.Crossref(td("data/crossref-sample"))

def tsort_add_meta(self, table_names):
"""Add Crossref metadata and call tsort"""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_uspto_zip_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
from alexandria3k.uspto_zip_cache import UsptoZipCache

FILE_PATH_1 = td(
"data/April 2023 Patent Grant Bibliographic Data/2022/ipgb20221025_wk43.zip"
"data/uspto-2023-04/2022/ipgb20221025_wk43.zip"
)
FILE_PATH_2 = td(
"data/April 2023 Patent Grant Bibliographic Data/2023/ipgb20230404_wk14.zip"
"data/uspto-2023-04/2023/ipgb20230404_wk14.zip"
)


Expand Down

0 comments on commit 77fff33

Please sign in to comment.