From a415a832bd15e5986f8a24f73723fc4fbe097c3a Mon Sep 17 00:00:00 2001 From: Patrick Greene Date: Mon, 17 Apr 2023 23:16:41 -0400 Subject: [PATCH 1/3] Lack of last name doesn't raise error. --- indra/literature/pubmed_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py index 9502e01b47..2ec74c9725 100644 --- a/indra/literature/pubmed_client.py +++ b/indra/literature/pubmed_client.py @@ -367,7 +367,10 @@ def _find_date(element): def _parse_author(author_info, include_details=False): if not include_details: - return author_info.find("LastName").text + last_name = author_info.find("LastName") + if last_name is None: + return None + return last_name.text parsed_info = { "last_name": None, From 7fa538810dde5545f7c6056c40bb190dbde27cfb Mon Sep 17 00:00:00 2001 From: Patrick Greene Date: Mon, 17 Apr 2023 23:16:57 -0400 Subject: [PATCH 2/3] Get references for the article. --- indra/literature/pubmed_client.py | 32 ++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py index 2ec74c9725..37dc8a5876 100644 --- a/indra/literature/pubmed_client.py +++ b/indra/literature/pubmed_client.py @@ -400,6 +400,27 @@ def _parse_author(author_info, include_details=False): return parsed_info +def _get_references(reference_list, only_pmid=True): + """Return a list of references for an article.""" + if reference_list is None: + return None + + references = [] + for reference in reference_list.findall('Reference'): + pmid = _find_elem_text(reference, '*/ArticleId[@IdType="pubmed"]') + if only_pmid: + references.append(pmid) + else: + ref_dict = { + 'pmid': pmid, + 'doi': _find_elem_text(reference, '*/ArticleId[@IdType="doi"]'), + 'pmcid': _find_elem_text(reference, '*/ArticleId[@IdType="pmcid"]'), + 'citation': _find_elem_text(reference, 'Citation'), + } + references.append(ref_dict) + return references + + def _get_article_info(medline_citation, pubmed_data, detailed_authors=False): article = medline_citation.find('Article') pmid = _find_elem_text(medline_citation, './PMID') @@ -434,7 +455,8 @@ def _get_article_info(medline_citation, pubmed_data, detailed_authors=False): def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False, get_abstracts=False, prepend_title=False, - mesh_annotations=True, detailed_authors=False): + mesh_annotations=True, detailed_authors=False, + citations_included=None): """Get metadata for an XML tree containing PubmedArticle elements. Documentation on the XML structure can be found at: @@ -462,6 +484,9 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False, If True, extract as many of the author details as possible, such as first name, identifiers, and institutions. If false, only last names are returned. Default: False + citations_included : Optional[str] + If 'detailed', include detailed citations in the results. If 'pmid', only include + the PMID of the citation. If None, don't include citations. Default: None Returns ------- @@ -486,6 +511,11 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False, if mesh_annotations: context_info = _get_annotations(medline_citation) result.update(context_info) + if citations_included: + citations = _get_references(pubmed_data.find('ReferenceList'), + only_pmid=(citations_included == 'pmid')) + result['citations'] = citations + publication_date = _get_pubmed_publication_date(pubmed_data) result['publication_date'] = publication_date From ac477d149cdaf14af4b9956095596100b1ce40f2 Mon Sep 17 00:00:00 2001 From: Patrick Greene Date: Tue, 18 Apr 2023 00:05:33 -0400 Subject: [PATCH 3/3] Add test for references. --- indra/literature/pubmed_client.py | 24 ++++++++++++++---------- indra/tests/test_pubmed_client.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py index 37dc8a5876..9ae0eba7ec 100644 --- a/indra/literature/pubmed_client.py +++ b/indra/literature/pubmed_client.py @@ -456,7 +456,7 @@ def _get_article_info(medline_citation, pubmed_data, detailed_authors=False): def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False, get_abstracts=False, prepend_title=False, mesh_annotations=True, detailed_authors=False, - citations_included=None): + references_included=None): """Get metadata for an XML tree containing PubmedArticle elements. Documentation on the XML structure can be found at: @@ -484,9 +484,9 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False, If True, extract as many of the author details as possible, such as first name, identifiers, and institutions. If false, only last names are returned. Default: False - citations_included : Optional[str] - If 'detailed', include detailed citations in the results. If 'pmid', only include - the PMID of the citation. If None, don't include citations. Default: None + references_included : Optional[str] + If 'detailed', include detailed references in the results. If 'pmid', only include + the PMID of the reference. If None, don't include references. Default: None Returns ------- @@ -511,10 +511,10 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False, if mesh_annotations: context_info = _get_annotations(medline_citation) result.update(context_info) - if citations_included: - citations = _get_references(pubmed_data.find('ReferenceList'), - only_pmid=(citations_included == 'pmid')) - result['citations'] = citations + if references_included: + references = _get_references(pubmed_data.find('ReferenceList'), + only_pmid=(references_included == 'pmid')) + result['references'] = references publication_date = _get_pubmed_publication_date(pubmed_data) result['publication_date'] = publication_date @@ -599,7 +599,7 @@ def _major_topic(e): def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False, get_abstracts=False, prepend_title=False, - detailed_authors=False): + detailed_authors=False, references_included=None): """Get article metadata for up to 200 PMIDs from the Pubmed database. Parameters @@ -619,6 +619,9 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False, If True, extract as many of the author details as possible, such as first name, identifiers, and institutions. If false, only last names are returned. Default: False + references_included : Optional[str] + If 'detailed', include detailed references in the results. If 'pmid', only include + the PMID of the reference. If None, don't include references. Default: None Returns ------- @@ -637,7 +640,8 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False, return None return get_metadata_from_xml_tree(tree, get_issns_from_nlm, get_abstracts, prepend_title, - detailed_authors=detailed_authors) + detailed_authors=detailed_authors, + references_included=references_included) @lru_cache(maxsize=1000) diff --git a/indra/tests/test_pubmed_client.py b/indra/tests/test_pubmed_client.py index a3757cd5e3..4c839c8f30 100644 --- a/indra/tests/test_pubmed_client.py +++ b/indra/tests/test_pubmed_client.py @@ -82,6 +82,7 @@ def test_get_complex_title(): assert title.lower().startswith('atomic structures') assert title.lower().endswith('vascular plants.') + @pytest.mark.webservice def test_expand_pagination(): time.sleep(0.5) @@ -144,6 +145,21 @@ def test_get_metadata_for_ids(): metadata2[pmids1[0]]['authors'][0]['affiliations'][0]['name'] +@pytest.mark.webservice +def test_get_paper_references(): + time.sleep(0.5) + pmids = ['27123883', '27121204', '27115606'] + test_pmid = '27121204' + referenced_pmid = '25439075' + metadata_1 = pubmed_client.get_metadata_for_ids(pmids, references_included='pmid') + assert len(metadata_1[test_pmid]['references']) != 0 + assert metadata_1[test_pmid]['references'][0] == referenced_pmid + + metadata_2 = pubmed_client.get_metadata_for_ids(pmids, references_included='detailed') + assert len(metadata_2[test_pmid]['references']) != 0 + assert metadata_2[test_pmid]['references'][0]['pmid'] == referenced_pmid + + @pytest.mark.webservice def test_get_pub_date(): time.sleep(0.5)