diff --git a/annif/cli.py b/annif/cli.py index 507b0241f..c6bd0399e 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -19,6 +19,7 @@ from annif.project import Access from annif.suggestion import SuggestionFilter, ListSuggestionResult from annif.exception import ConfigurationException, NotSupportedException +from annif.exception import NotInitializedException from annif.util import metric_code logger = annif.logger @@ -41,17 +42,32 @@ def get_project(project_id): sys.exit(1) -def open_documents(paths, subject_index, language, docs_limit): +def get_vocab(vocab_id): + """ + Helper function to get a vocabulary by ID and bail out if it doesn't + exist""" + try: + return annif.registry.get_vocab(vocab_id, + min_access=Access.private) + except ValueError: + click.echo( + f"No vocabularies found with the id '{vocab_id}'.", + err=True) + sys.exit(1) + + +def open_documents(paths, subject_index, vocab_lang, docs_limit): """Helper function to open a document corpus from a list of pathnames, - each of which is either a TSV file or a directory of TXT files. The - corpus will be returned as an instance of DocumentCorpus or - LimitingDocumentCorpus.""" + each of which is either a TSV file or a directory of TXT files. For + directories with subjects in TSV files, the given vocabulary language + will be used to convert subject labels into URIs. The corpus will be + returned as an instance of DocumentCorpus or LimitingDocumentCorpus.""" def open_doc_path(path, subject_index): """open a single path and return it as a DocumentCorpus""" if os.path.isdir(path): return annif.corpus.DocumentDirectory(path, subject_index, - language, + vocab_lang, require_subjects=True) return annif.corpus.DocumentFile(path, subject_index) @@ -165,6 +181,8 @@ def run_show_project(project_id): click.echo(f'Project ID: {proj.project_id}') click.echo(f'Project Name: {proj.name}') click.echo(f'Language: {proj.language}') + click.echo(f'Vocabulary: {proj.vocab.vocab_id}') + click.echo(f'Vocab language: {proj.vocab_lang}') click.echo(f'Access: {proj.access.name}') click.echo(f'Trained: {proj.is_trained}') click.echo(f'Modification time: {proj.modification_time}') @@ -181,7 +199,34 @@ def run_clear_project(project_id): proj.remove_model_data() -@cli.command('loadvoc') +@cli.command('list-vocabs') +@common_options +@click_log.simple_verbosity_option(logger, default='ERROR') +def run_list_vocabs(): + """ + List available vocabularies. + """ + + template = "{0: <20}{1: <20}{2: >10} {3: <6}" + header = template.format( + "Vocabulary ID", "Languages", "Size", "Loaded") + click.echo(header) + click.echo("-" * len(header)) + for vocab in annif.registry.get_vocabs( + min_access=Access.private).values(): + try: + languages = ','.join(sorted(vocab.languages)) + size = len(vocab) + loaded = True + except NotInitializedException: + languages = '-' + size = '-' + loaded = False + click.echo(template.format( + vocab.vocab_id, languages, size, str(loaded))) + + +@cli.command('loadvoc', deprecated=True) @click.argument('project_id') @click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False)) @click.option('--force', '-f', default=False, is_flag=True, @@ -214,10 +259,42 @@ def run_loadvoc(project_id, force, subjectfile): subjects = annif.corpus.SubjectFileCSV(subjectfile) else: # probably a TSV file - subjects = annif.corpus.SubjectFileTSV(subjectfile, proj.language) + subjects = annif.corpus.SubjectFileTSV(subjectfile, proj.vocab_lang) proj.vocab.load_vocabulary(subjects, force=force) +@cli.command('load-vocab') +@click.argument('vocab_id') +@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False)) +@click.option('--language', '-L', help='Language of subject file') +@click.option('--force', '-f', default=False, is_flag=True, + help='Replace existing vocabulary completely ' + + 'instead of updating it') +@common_options +def run_load_vocab(vocab_id, language, force, subjectfile): + """ + Load a vocabulary from a subject file. + """ + vocab = get_vocab(vocab_id) + if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile): + # SKOS/RDF file supported by rdflib + subjects = annif.corpus.SubjectFileSKOS(subjectfile) + click.echo(f"Loading vocabulary from SKOS file {subjectfile}...") + elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile): + # CSV file + subjects = annif.corpus.SubjectFileCSV(subjectfile) + click.echo(f"Loading vocabulary from CSV file {subjectfile}...") + else: + # probably a TSV file - we need to know its language + if not language: + click.echo("Please use --language option to set the language of " + + "a TSV vocabulary.", err=True) + sys.exit(1) + click.echo(f"Loading vocabulary from TSV file {subjectfile}...") + subjects = annif.corpus.SubjectFileTSV(subjectfile, language) + vocab.load_vocabulary(subjects, force=force) + + @cli.command('train') @click.argument('project_id') @click.argument('paths', type=click.Path(exists=True), nargs=-1) @@ -252,7 +329,7 @@ def run_train(project_id, paths, cached, docs_limit, jobs, backend_param): documents = 'cached' else: documents = open_documents(paths, proj.subjects, - proj.vocab.language, docs_limit) + proj.vocab_lang, docs_limit) proj.train(documents, backend_params, jobs) @@ -275,7 +352,7 @@ def run_learn(project_id, paths, docs_limit, backend_param): proj = get_project(project_id) backend_params = parse_backend_params(backend_param, proj) documents = open_documents(paths, proj.subjects, - proj.vocab.language, docs_limit) + proj.vocab_lang, docs_limit) proj.learn(documents, backend_params) @@ -303,7 +380,7 @@ def run_suggest(project_id, limit, threshold, backend_param): "<{}>\t{}\t{}".format( subj.uri, '\t'.join(filter(None, - (subj.labels[project.vocab.language], + (subj.labels[project.vocab_lang], subj.notation))), hit.score)) @@ -334,7 +411,7 @@ def run_index(project_id, directory, suffix, force, hit_filter = SuggestionFilter(project.subjects, limit, threshold) for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory( - directory, project.subjects, project.language, + directory, project.subjects, project.vocab_lang, require_subjects=False): with open(docfilename, encoding='utf-8-sig') as docfile: text = docfile.read() @@ -350,7 +427,7 @@ def run_index(project_id, directory, suffix, force, subj = project.subjects[hit.subject_id] line = "<{}>\t{}\t{}".format( subj.uri, - '\t'.join(filter(None, (subj.labels[project.language], + '\t'.join(filter(None, (subj.labels[project.vocab_lang], subj.notation))), hit.score) click.echo(line, file=subjfile) @@ -432,7 +509,7 @@ def run_eval( raise NotSupportedException( "cannot open results-file for writing: " + str(e)) docs = open_documents(paths, project.subjects, - project.vocab.language, docs_limit) + project.vocab_lang, docs_limit) jobs, pool_class = annif.parallel.get_pool(jobs) @@ -449,7 +526,7 @@ def run_eval( template = "{0:<30}\t{1}" metrics = eval_batch.results(metrics=metric, results_file=results_file, - language=project.vocab.language) + language=project.vocab_lang) for metric, score in metrics.items(): click.echo(template.format(metric + ":", score)) if metrics_file: @@ -484,7 +561,7 @@ def run_optimize(project_id, paths, docs_limit, backend_param): ndocs = 0 docs = open_documents(paths, project.subjects, - project.vocab.language, docs_limit) + project.vocab_lang, docs_limit) for doc in docs.documents: raw_hits = project.suggest(doc.text, backend_params) hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT) @@ -567,7 +644,7 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, """ proj = get_project(project_id) documents = open_documents(paths, proj.subjects, - proj.vocab.language, docs_limit) + proj.vocab_lang, docs_limit) click.echo(f"Looking for optimal hyperparameters using {trials} trials") rec = proj.hyperopt(documents, trials, jobs, metric, results_file) click.echo(f"Got best {metric} score {rec.score:.4f} with:") diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 91855704d..e380d190a 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -118,6 +118,10 @@ def load_subjects(self, corpus): def __len__(self): return len(self._subjects) + @property + def languages(self): + return self._languages + def __getitem__(self, subject_id): return self._subjects[subject_id] diff --git a/annif/project.py b/annif/project.py index 05fa7353f..22e60594d 100644 --- a/annif/project.py +++ b/annif/project.py @@ -31,6 +31,7 @@ class AnnifProject(DatadirMixin): _analyzer = None _backend = None _vocab = None + _vocab_lang = None initialized = False # default values for configuration settings @@ -148,17 +149,25 @@ def backend(self): backend_id) return self._backend + def _initialize_vocab(self): + if self.vocab_spec is None: + raise ConfigurationException("vocab setting is missing", + project_id=self.project_id) + self._vocab, self._vocab_lang = self.registry.get_vocab( + self.vocab_spec, self.language) + @property def vocab(self): if self._vocab is None: - if self.vocab_spec is None: - raise ConfigurationException("vocab setting is missing", - project_id=self.project_id) - self._vocab = self.registry.get_vocab(self.vocab_spec, - self.language) - + self._initialize_vocab() return self._vocab + @property + def vocab_lang(self): + if self._vocab_lang is None: + self._initialize_vocab() + return self._vocab_lang + @property def subjects(self): return self.vocab.subjects diff --git a/annif/registry.py b/annif/registry.py index a56340ab0..0d4648958 100644 --- a/annif/registry.py +++ b/annif/registry.py @@ -5,6 +5,7 @@ from flask import current_app import annif from annif.config import parse_config +from annif.exception import ConfigurationException from annif.project import Access, AnnifProject from annif.vocab import AnnifVocabulary from annif.util import parse_args @@ -71,8 +72,10 @@ def get_project(self, project_id, min_access=Access.private): raise ValueError("No such project {}".format(project_id)) def get_vocab(self, vocab_spec, default_language): - """Return an AnnifVocabulary corresponding to the vocab_spec. If no - language information is specified, use the given default language.""" + """Return an (AnnifVocabulary, language) pair corresponding to the + vocab_spec. If no language information is specified, use the given + default language.""" + match = re.match(r'(\w+)(\((.*)\))?', vocab_spec) if match is None: raise ValueError( @@ -84,8 +87,8 @@ def get_vocab(self, vocab_spec, default_language): if vocab_key not in self._vocabs[self._rid]: self._vocabs[self._rid][vocab_key] = AnnifVocabulary( - vocab_id, self._datadir, language) - return self._vocabs[self._rid][vocab_key] + vocab_id, self._datadir) + return self._vocabs[self._rid][vocab_key], language def initialize_projects(app): @@ -113,4 +116,29 @@ def get_project(project_id, min_access=Access.private): try: return projects[project_id] except KeyError: - raise ValueError("No such project {}".format(project_id)) + raise ValueError(f"No such project '{project_id}'") + + +def get_vocabs(min_access=Access.private): + """Return the available vocabularies as a dict of vocab_id -> + AnnifVocabulary. The min_access parameter may be used to set the minimum + access level required for the returned vocabularies.""" + + vocabs = {} + for proj in get_projects(min_access).values(): + try: + vocabs[proj.vocab.vocab_id] = proj.vocab + except ConfigurationException: + pass + + return vocabs + + +def get_vocab(vocab_id, min_access=Access.private): + """return a single AnnifVocabulary by vocabulary id""" + + vocabs = get_vocabs(min_access) + try: + return vocabs[vocab_id] + except KeyError: + raise ValueError(f"No such vocabulary '{vocab_id}'") diff --git a/annif/rest.py b/annif/rest.py index 4165cc232..24b9aa116 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -75,7 +75,7 @@ def suggest(project_id, text, limit, threshold): return server_error(err) hits = hit_filter(result).as_list() return {'results': [_suggestion_to_dict(hit, project.subjects, - project.vocab.language) + project.vocab_lang) for hit in hits]} diff --git a/annif/vocab.py b/annif/vocab.py index 9605bdf9b..769d67431 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -22,10 +22,9 @@ class AnnifVocabulary(DatadirMixin): INDEX_FILENAME_TTL = "subjects.ttl" INDEX_FILENAME_CSV = "subjects.csv" - def __init__(self, vocab_id, datadir, language): + def __init__(self, vocab_id, datadir): DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id) self.vocab_id = vocab_id - self.language = language self._skos_vocab = None def _create_subject_index(self, subject_corpus): @@ -98,6 +97,13 @@ def skos(self): raise NotInitializedException(f'graph file {path} not found') + def __len__(self): + return len(self.subjects) + + @property + def languages(self): + return self.subjects.languages + def load_vocabulary(self, subject_corpus, force=False): """Load subjects from a subject corpus and save them into one or more subject index files as well as a SKOS/Turtle file for later @@ -105,13 +111,15 @@ def load_vocabulary(self, subject_corpus, force=False): if not force and os.path.exists( os.path.join(self.datadir, self.INDEX_FILENAME_CSV)): - logger.info('updating existing vocabulary') + logger.info('updating existing subject index') self._subjects = self._update_subject_index(subject_corpus) else: + logger.info('creating subject index') self._subjects = self._create_subject_index(subject_corpus) - subject_corpus.save_skos( - os.path.join(self.datadir, self.INDEX_FILENAME_TTL)) + skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL) + logger.info(f'saving vocabulary into SKOS file {skosfile}') + subject_corpus.save_skos(skosfile) def as_graph(self): """return the vocabulary as an rdflib graph""" diff --git a/docs/source/commands.rst b/docs/source/commands.rst index 751620f56..2a4876fdd 100644 --- a/docs/source/commands.rst +++ b/docs/source/commands.rst @@ -5,10 +5,10 @@ CLI commands These are the command-line interface commands of Annif, with REST API equivalents when applicable. -To reference a project most of the commands take a ``PROJECT_ID`` parameter, -which is an alphanumeric string ``(A-Za-z0-9_-)``. Common options of the -commands are ``--projects`` for setting a (non-default) path to a `project -configuration file +To reference a vocabulary or a project, most of the commands take either a +``VOCAB_ID`` or a ``PROJECT_ID`` parameter, which are alphanumeric strings +``(A-Za-z0-9_-)``. Common options of the commands are ``--projects`` for +setting a (non-default) path to a `project configuration file `_ and ``--verbosity`` for selecting logging level. @@ -16,17 +16,28 @@ configuration file :local: :backlinks: none -********************** -Project administration -********************** +************************* +Vocabulary administration +************************* + +.. click:: annif.cli:run_load_vocab + :prog: annif load-vocab + +**REST equivalent** -.. click:: annif.cli:run_loadvoc - :prog: annif loadvoc + N/A + +.. click:: annif.cli:run_list_vocabs + :prog: annif list-vocabs **REST equivalent** N/A +********************** +Project administration +********************** + .. click:: annif.cli:run_list_projects :prog: annif list-projects diff --git a/tests/conftest.py b/tests/conftest.py index 25c196022..2d5251cb1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -76,13 +76,13 @@ def subject_file(): @pytest.fixture(scope='module') def dummy_subject_index(testdatadir): """a fixture to access the subject index of the dummy vocabulary""" - vocab = annif.vocab.AnnifVocabulary('dummy', testdatadir, 'en') + vocab = annif.vocab.AnnifVocabulary('dummy', testdatadir) return vocab.subjects @pytest.fixture(scope='module') def vocabulary(datadir): - vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir, 'fi') + vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir) subjfile = os.path.join( os.path.dirname(__file__), 'corpora', diff --git a/tests/test_cli.py b/tests/test_cli.py index 24bde15c3..c1c91d3dd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,6 +4,7 @@ import random import re import os.path +import shutil import importlib import json from click.testing import CliRunner @@ -112,11 +113,21 @@ def test_clear_project_nonexistent_data(testdatadir, caplog): assert expected_msg == caplog.records[0].message +def test_list_vocabs_before_load(testdatadir): + with contextlib.suppress(FileNotFoundError): + shutil.rmtree(str(testdatadir.join('vocabs/yso/'))) + result = runner.invoke(annif.cli.cli, ["list-vocabs"]) + assert not result.exception + assert result.exit_code == 0 + assert re.search(r'^yso\s+-\s+-\s+False', + result.output, re.MULTILINE) + + def test_loadvoc_csv(testdatadir): with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.csv'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.ttl'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) subjectfile = os.path.join( os.path.dirname(__file__), 'corpora', @@ -135,9 +146,9 @@ def test_loadvoc_csv(testdatadir): def test_loadvoc_tsv(testdatadir): with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.csv'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.ttl'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) subjectfile = os.path.join( os.path.dirname(__file__), 'corpora', @@ -156,9 +167,9 @@ def test_loadvoc_tsv(testdatadir): def test_loadvoc_tsv_with_bom(testdatadir): with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.csv'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.ttl'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) subjectfile = os.path.join( os.path.dirname(__file__), 'corpora', @@ -177,9 +188,9 @@ def test_loadvoc_tsv_with_bom(testdatadir): def test_loadvoc_rdf(testdatadir): with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.csv'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.ttl'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) subjectfile = os.path.join( os.path.dirname(__file__), 'corpora', @@ -198,9 +209,9 @@ def test_loadvoc_rdf(testdatadir): def test_loadvoc_ttl(testdatadir): with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.csv'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) with contextlib.suppress(FileNotFoundError): - os.remove(str(testdatadir.join('projects/tfidf-fi/subjects.ttl'))) + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) subjectfile = os.path.join( os.path.dirname(__file__), 'corpora', @@ -227,6 +238,167 @@ def test_loadvoc_nonexistent_path(): "File 'nonexistent_path' does not exist." in failed_result.output +def test_load_vocab_csv(testdatadir): + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'subjects.csv') + result = runner.invoke(annif.cli.cli, + ['load-vocab', 'yso', subjectfile]) + assert not result.exception + assert result.exit_code == 0 + assert testdatadir.join('vocabs/yso/subjects.csv').exists() + assert testdatadir.join('vocabs/yso/subjects.csv').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.ttl').exists() + assert testdatadir.join('vocabs/yso/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.dump.gz').exists() + assert testdatadir.join('vocabs/yso/subjects.dump.gz').size() > 0 + + +def test_load_vocab_tsv(testdatadir): + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'subjects.tsv') + result = runner.invoke(annif.cli.cli, + ['load-vocab', '--language', 'fi', + 'yso', subjectfile]) + assert not result.exception + assert result.exit_code == 0 + assert testdatadir.join('vocabs/yso/subjects.csv').exists() + assert testdatadir.join('vocabs/yso/subjects.csv').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.ttl').exists() + assert testdatadir.join('vocabs/yso/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.dump.gz').exists() + assert testdatadir.join('vocabs/yso/subjects.dump.gz').size() > 0 + + +def test_load_vocab_tsv_no_lang(testdatadir): + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'subjects.tsv') + failed_result = runner.invoke(annif.cli.cli, + ['load-vocab', 'yso', subjectfile]) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "Please use --language option to set the language " \ + "of a TSV vocabulary." in failed_result.output + + +def test_load_vocab_tsv_with_bom(testdatadir): + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'subjects-bom.tsv') + result = runner.invoke(annif.cli.cli, + ['load-vocab', '--language', 'fi', + 'yso', subjectfile]) + assert not result.exception + assert result.exit_code == 0 + assert testdatadir.join('vocabs/yso/subjects.csv').exists() + assert testdatadir.join('vocabs/yso/subjects.csv').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.ttl').exists() + assert testdatadir.join('vocabs/yso/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.dump.gz').exists() + assert testdatadir.join('vocabs/yso/subjects.dump.gz').size() > 0 + + +def test_load_vocab_rdf(testdatadir): + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.rdf') + result = runner.invoke(annif.cli.cli, + ['load-vocab', 'yso', subjectfile]) + assert not result.exception + assert result.exit_code == 0 + assert testdatadir.join('vocabs/yso/subjects.csv').exists() + assert testdatadir.join('vocabs/yso/subjects.csv').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.ttl').exists() + assert testdatadir.join('vocabs/yso/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.dump.gz').exists() + assert testdatadir.join('vocabs/yso/subjects.dump.gz').size() > 0 + + +def test_load_vocab_ttl(testdatadir): + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.csv'))) + with contextlib.suppress(FileNotFoundError): + os.remove(str(testdatadir.join('vocabs/yso/subjects.ttl'))) + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.ttl') + result = runner.invoke(annif.cli.cli, + ['load-vocab', 'yso', subjectfile]) + assert not result.exception + assert result.exit_code == 0 + assert testdatadir.join('vocabs/yso/subjects.csv').exists() + assert testdatadir.join('vocabs/yso/subjects.csv').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.ttl').exists() + assert testdatadir.join('vocabs/yso/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso/subjects.dump.gz').exists() + assert testdatadir.join('vocabs/yso/subjects.dump.gz').size() > 0 + + +def test_load_vocab_nonexistent_vocab(): + subjectfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.ttl') + failed_result = runner.invoke( + annif.cli.cli, [ + 'load-vocab', 'notfound', subjectfile]) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "No vocabularies found with the id 'notfound'." \ + in failed_result.output + + +def test_load_vocab_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'load-vocab', 'dummy', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "Invalid value for 'SUBJECTFILE': " \ + "File 'nonexistent_path' does not exist." in failed_result.output + + +def test_list_vocabs_after_load(): + result = runner.invoke(annif.cli.cli, ["list-vocabs"]) + assert not result.exception + assert result.exit_code == 0 + assert re.search(r'^dummy\s+en,fi\s+2\s+True', + result.output, re.MULTILINE) + assert re.search(r'^yso\s+en,fi,sv\s+130\s+True', + result.output, re.MULTILINE) + + def test_train(testdatadir): docfile = os.path.join( os.path.dirname(__file__), diff --git a/tests/test_project.py b/tests/test_project.py index c4c708408..e5901ec8a 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -34,6 +34,7 @@ def test_get_project_fi(registry): assert project.language == 'fi' assert project.analyzer.name == 'snowball' assert project.analyzer.param == 'finnish' + assert project.vocab_lang == 'fi' assert project.access == Access.public assert isinstance(project.backend, annif.backend.dummy.DummyBackend) @@ -56,7 +57,7 @@ def test_get_project_dummy_vocablang(registry): assert project.analyzer.param == 'english' # project uses the dummy vocab, with language overridden to Finnish assert project.vocab.vocab_id == 'dummy' - assert project.vocab.language == 'fi' + assert project.vocab_lang == 'fi' assert project.access == Access.public assert isinstance(project.backend, annif.backend.dummy.DummyBackend) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 02225ce16..cc79dd6b3 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -9,7 +9,7 @@ def load_dummy_vocab(tmpdir): - vocab = annif.vocab.AnnifVocabulary('vocab-id', str(tmpdir), 'en') + vocab = annif.vocab.AnnifVocabulary('vocab-id', str(tmpdir)) subjfile = os.path.join( os.path.dirname(__file__), 'corpora', diff --git a/tests/test_vocab_skos.py b/tests/test_vocab_skos.py index 22d0d583f..49359eb86 100644 --- a/tests/test_vocab_skos.py +++ b/tests/test_vocab_skos.py @@ -50,7 +50,6 @@ def test_load_turtle(tmpdir): subjects = list(corpus.subjects) assert len(subjects) == 1 # one of the concepts was deprecated assert subjects[0].uri == 'http://www.yso.fi/onto/yso/p8993' - print(subjects[0].labels) assert subjects[0].labels['fi'] == 'hylyt' assert subjects[0].notation is None