From a415a832bd15e5986f8a24f73723fc4fbe097c3a Mon Sep 17 00:00:00 2001
From: Patrick Greene <patrick.anton.greene@gmail.com>
Date: Mon, 17 Apr 2023 23:16:41 -0400
Subject: [PATCH 1/3] Return None instead of raising when an author has no LastName.

---
 indra/literature/pubmed_client.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py
index 9502e01b47..2ec74c9725 100644
--- a/indra/literature/pubmed_client.py
+++ b/indra/literature/pubmed_client.py
@@ -367,7 +367,10 @@ def _find_date(element):
 
 def _parse_author(author_info, include_details=False):
     if not include_details:
-        return author_info.find("LastName").text
+        last_name = author_info.find("LastName")
+        if last_name is None:
+            return None
+        return last_name.text
 
     parsed_info = {
         "last_name": None,

From 7fa538810dde5545f7c6056c40bb190dbde27cfb Mon Sep 17 00:00:00 2001
From: Patrick Greene <patrick.anton.greene@gmail.com>
Date: Mon, 17 Apr 2023 23:16:57 -0400
Subject: [PATCH 2/3] Get references for the article.

---
 indra/literature/pubmed_client.py | 32 ++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py
index 2ec74c9725..37dc8a5876 100644
--- a/indra/literature/pubmed_client.py
+++ b/indra/literature/pubmed_client.py
@@ -400,6 +400,27 @@ def _parse_author(author_info, include_details=False):
     return parsed_info
 
 
+def _get_references(reference_list, only_pmid=True):
+    """Return a list of references for an article."""
+    if reference_list is None:
+        return None  # no reference list given: None, not an empty list
+
+    references = []
+    for reference in reference_list.findall('Reference'):
+        pmid = _find_elem_text(reference, '*/ArticleId[@IdType="pubmed"]')
+        if only_pmid:
+            references.append(pmid)  # one entry per Reference element
+        else:
+            ref_dict = {  # every value is whatever _find_elem_text returns
+                'pmid': pmid,
+                'doi': _find_elem_text(reference, '*/ArticleId[@IdType="doi"]'),
+                'pmcid': _find_elem_text(reference, '*/ArticleId[@IdType="pmcid"]'),
+                'citation': _find_elem_text(reference, 'Citation'),
+            }
+            references.append(ref_dict)
+    return references  # pmid values if only_pmid, else one dict per entry
+
+
 def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):
     article = medline_citation.find('Article')
     pmid = _find_elem_text(medline_citation, './PMID')
@@ -434,7 +455,8 @@ def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):
 
 def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
                                get_abstracts=False, prepend_title=False,
-                               mesh_annotations=True, detailed_authors=False):
+                               mesh_annotations=True, detailed_authors=False,
+                               citations_included=None):
     """Get metadata for an XML tree containing PubmedArticle elements.
 
     Documentation on the XML structure can be found at:
@@ -462,6 +484,9 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
         If True, extract as many of the author details as possible, such as
         first name, identifiers, and institutions. If false, only last names
         are returned. Default: False
+    citations_included : Optional[str]
+        If 'detailed', include detailed citations in the results. If 'pmid', only include
+        the PMID of the citation. If None, don't include citations. Default: None
 
     Returns
     -------
@@ -486,6 +511,11 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
         if mesh_annotations:
             context_info = _get_annotations(medline_citation)
             result.update(context_info)
+        if citations_included:
+            citations = _get_references(pubmed_data.find('ReferenceList'),
+                                        only_pmid=(citations_included == 'pmid'))
+            result['citations'] = citations
+
         publication_date = _get_pubmed_publication_date(pubmed_data)
         result['publication_date'] = publication_date
 

From ac477d149cdaf14af4b9956095596100b1ce40f2 Mon Sep 17 00:00:00 2001
From: Patrick Greene <patrick.anton.greene@gmail.com>
Date: Tue, 18 Apr 2023 00:05:33 -0400
Subject: [PATCH 3/3] Rename citations to references, expose via get_metadata_for_ids, add test.

---
 indra/literature/pubmed_client.py | 24 ++++++++++++++----------
 indra/tests/test_pubmed_client.py | 16 ++++++++++++++++
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py
index 37dc8a5876..9ae0eba7ec 100644
--- a/indra/literature/pubmed_client.py
+++ b/indra/literature/pubmed_client.py
@@ -456,7 +456,7 @@ def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):
 def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
                                get_abstracts=False, prepend_title=False,
                                mesh_annotations=True, detailed_authors=False,
-                               citations_included=None):
+                               references_included=None):
     """Get metadata for an XML tree containing PubmedArticle elements.
 
     Documentation on the XML structure can be found at:
@@ -484,9 +484,9 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
         If True, extract as many of the author details as possible, such as
         first name, identifiers, and institutions. If false, only last names
         are returned. Default: False
-    citations_included : Optional[str]
-        If 'detailed', include detailed citations in the results. If 'pmid', only include
-        the PMID of the citation. If None, don't include citations. Default: None
+    references_included : Optional[str]
+        If 'detailed', include detailed references in the results. If 'pmid', only include
+        the PMID of the reference. If None, don't include references. Default: None
 
     Returns
     -------
@@ -511,10 +511,10 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
         if mesh_annotations:
             context_info = _get_annotations(medline_citation)
             result.update(context_info)
-        if citations_included:
-            citations = _get_references(pubmed_data.find('ReferenceList'),
-                                        only_pmid=(citations_included == 'pmid'))
-            result['citations'] = citations
+        if references_included:
+            references = _get_references(pubmed_data.find('ReferenceList'),
+                                         only_pmid=(references_included == 'pmid'))
+            result['references'] = references
 
         publication_date = _get_pubmed_publication_date(pubmed_data)
         result['publication_date'] = publication_date
@@ -599,7 +599,7 @@ def _major_topic(e):
 
 def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
                          get_abstracts=False, prepend_title=False,
-                         detailed_authors=False):
+                         detailed_authors=False, references_included=None):
     """Get article metadata for up to 200 PMIDs from the Pubmed database.
 
     Parameters
@@ -619,6 +619,9 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
         If True, extract as many of the author details as possible, such as
         first name, identifiers, and institutions. If false, only last names
         are returned. Default: False
+    references_included : Optional[str]
+        If 'detailed', include detailed references in the results. If 'pmid', only include
+        the PMID of the reference. If None, don't include references. Default: None
 
     Returns
     -------
@@ -637,7 +640,8 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
         return None
     return get_metadata_from_xml_tree(tree, get_issns_from_nlm, get_abstracts,
                                       prepend_title,
-                                      detailed_authors=detailed_authors)
+                                      detailed_authors=detailed_authors,
+                                      references_included=references_included)
 
 
 @lru_cache(maxsize=1000)
diff --git a/indra/tests/test_pubmed_client.py b/indra/tests/test_pubmed_client.py
index a3757cd5e3..4c839c8f30 100644
--- a/indra/tests/test_pubmed_client.py
+++ b/indra/tests/test_pubmed_client.py
@@ -82,6 +82,7 @@ def test_get_complex_title():
     assert title.lower().startswith('atomic structures')
     assert title.lower().endswith('vascular plants.')
 
+
 @pytest.mark.webservice
 def test_expand_pagination():
     time.sleep(0.5)
@@ -144,6 +145,21 @@ def test_get_metadata_for_ids():
         metadata2[pmids1[0]]['authors'][0]['affiliations'][0]['name']
 
 
+@pytest.mark.webservice
+def test_get_paper_references():
+    time.sleep(0.5)  # presumably throttles web-service calls -- confirm
+    pmids = ['27123883', '27121204', '27115606']
+    test_pmid = '27121204'  # the article whose references are checked
+    referenced_pmid = '25439075'  # expected first reference of test_pmid
+    metadata_1 = pubmed_client.get_metadata_for_ids(pmids, references_included='pmid')
+    assert len(metadata_1[test_pmid]['references']) != 0
+    assert metadata_1[test_pmid]['references'][0] == referenced_pmid
+
+    metadata_2 = pubmed_client.get_metadata_for_ids(pmids, references_included='detailed')
+    assert len(metadata_2[test_pmid]['references']) != 0
+    assert metadata_2[test_pmid]['references'][0]['pmid'] == referenced_pmid
+
+
 @pytest.mark.webservice
 def test_get_pub_date():
     time.sleep(0.5)