Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Propagate section information from new Reach implementation #1399

Merged
merged 7 commits into from
Nov 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.9]
python-version: [3.7, 3.9]
steps:
- uses: actions/checkout@v2
- uses: actions/cache@v2
Expand Down
7 changes: 4 additions & 3 deletions indra/literature/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def get_ids(search_term, **kwargs):

PubMed, by default, limits returned PMIDs to a small number, and this
number can be controlled by the "retmax" parameter. This function
uses a retmax value of 100,000 by default that can be changed via the
corresponding keyword argument.
uses a retmax value of 10,000 by default (the maximum supported by PubMed)
that can be changed via the corresponding keyword argument. Note also
the retstart argument along with retmax to page across batches of IDs.

Parameters
----------
Expand All @@ -82,7 +83,7 @@ def get_ids(search_term, **kwargs):
if use_text_word:
search_term += '[tw]'
params = {'term': search_term,
'retmax': 100000,
'retmax': 10000,
'retstart': 0,
'db': 'pubmed',
'sort': 'pub+date'}
Expand Down
12 changes: 11 additions & 1 deletion indra/sources/medscan/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import tempfile
import logging
from math import floor

import gilda
import lxml.etree
import collections

Expand Down Expand Up @@ -1040,7 +1042,15 @@ def _urn_to_db_refs(urn):
db_refs['MESH'] = mesh_id
db_name = mesh_name
else:
db_name = urn_mesh_name
matches = gilda.ground(urn_mesh_name, namespaces=['MESH'])
if matches:
for match_ns, match_id in matches[0].get_groundings():
if match_ns == 'MESH':
db_refs['MESH'] = match_id
db_name = matches[0].term.entry_name
break
else:
db_name = urn_mesh_name
elif urn_type == 'agi-gocomplex':
# Identifier is GO
db_refs['GO'] = 'GO:%s' % urn_id
Expand Down
1 change: 1 addition & 0 deletions indra/sources/reach/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,5 +169,6 @@
process_nxml_file,
process_json_str,
process_json_file,
process_fries_json_group,
reach_text_url, reach_nxml_url,
local_text_url, local_nxml_url)
41 changes: 41 additions & 0 deletions indra/sources/reach/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,47 @@ def process_json_file(file_name, citation=None, organism_priority=None):
logger.error('Could not read file %s.' % file_name)


def process_fries_json_group(group_prefix, citation=None,
                             organism_priority=None):
    """Return a ReachProcessor by processing a REACH fries output file group.

    When running REACH through its CLI, for each input file, it produces
    three output JSON files when using the fries output format. These three
    files jointly constitute the output, so they have to be combined to be
    processed. For instance, one might have PMC9582577.uaz.entities.json,
    PMC9582577.uaz.events.json, PMC9582577.uaz.sentences.json.

    Parameters
    ----------
    group_prefix : str
        The prefix for the group of output files, e.g., PMC9582577.uaz
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # The three JSON files that jointly constitute one fries output group;
    # each becomes a top-level key of the combined JSON document.
    file_types = ['entities', 'events', 'sentences']
    combined_json = {}
    for file_type in file_types:
        fname = '%s.%s.json' % (group_prefix, file_type)
        # JSON is UTF-8 by specification; don't rely on the locale default.
        with open(fname, 'r', encoding='utf-8') as fh:
            combined_json[file_type] = json.load(fh)
    # Note that we serialize back to a JSON string here to make use of the
    # replacements done in process_json_str below
    return process_json_str(json.dumps(combined_json), citation=citation,
                            organism_priority=organism_priority)


def process_json_str(json_str, citation=None, organism_priority=None):
"""Return a ReachProcessor by processing the given REACH json string.

Expand Down
81 changes: 44 additions & 37 deletions indra/sources/reach/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,50 +699,23 @@ def _get_epistemics(self, event):
if 'is_direct' in event:
direct = event['is_direct']
epistemics['direct'] = direct
# Get the section of the paper it comes from
section = self._get_section(event)
epistemics['section_type'] = section
return epistemics

_section_list = ['title', 'abstract', 'introduction', 'background',
'results', 'methods', 'discussion', 'conclusion',
'supplementary', 'figure']

def _get_section(self, event):
"""Get the section of the paper that the event is from."""
# Get the section of the paper it comes from
sentence_id = event.get('sentence')
section = None
sections = []
if sentence_id:
qstr = "$.sentences.frames[(@.frame_id is \'%s\')]" % sentence_id
res = self.tree.execute(qstr)
if res:
sentence_frame = list(res)[0]
passage_id = sentence_frame.get('passage')
if passage_id:
qstr = "$.sentences.frames[(@.frame_id is \'%s\')]" % \
passage_id
res = self.tree.execute(qstr)
if res:
passage_frame = list(res)[0]
section = passage_frame.get('section-id')
# If the section is in the standard list, return as is
if section in self._section_list:
return section
# Next, handle a few special cases that come up in practice
elif section.startswith('fig'):
return 'figure'
elif section.startswith('supm'):
return 'supplementary'
elif section == 'article-title':
return 'title'
elif section in ['subjects|methods', 'methods|subjects']:
return 'methods'
elif section == 'conclusions':
return 'conclusion'
elif section == 'intro':
return 'introduction'
else:
return None
sections = sentence_frame.get('sections', [])
epistemics['raw_sections'] = sections
for section in sections:
norm_section = normalize_section(section)
if norm_section:
epistemics['section_type'] = norm_section
break
return epistemics

def _get_controller_agent(self, arg):
"""Return a single or a complex controller agent."""
Expand Down Expand Up @@ -826,6 +799,40 @@ def _parse_site_text(s):
return sites


# Canonical section types that downstream consumers recognize.
_section_list = ['title', 'abstract', 'introduction', 'background',
                 'results', 'methods', 'discussion', 'conclusion',
                 'supplementary', 'figure']


def normalize_section(section):
    """Map a raw section name onto a canonical section type, if possible.

    Parameters
    ----------
    section : str
        A raw section name as it appears in the REACH output, e.g.,
        "3. results" or "discussion.".

    Returns
    -------
    str or None
        One of the canonical names in ``_section_list``, or None if the
        given section could not be normalized.
    """
    # Drop surrounding whitespace/newlines and a common "3." / "3 " style
    # numeric prefix, e.g. "3. results" -> "results".
    cleaned = re.sub(r'^\d+[.]?[ ]?', '', section.strip())
    # Section titles often end with a period, e.g. "discussion."
    if cleaned.endswith('.'):
        cleaned = cleaned[:-1]
    # Already canonical: return unchanged.
    if cleaned in _section_list:
        return cleaned
    # Prefix-based special cases that come up in practice.
    if cleaned.startswith('fig'):
        return 'figure'
    if cleaned.startswith('supm'):
        return 'supplementary'
    # Exact-match aliases for non-standard section names; None if unknown.
    aliases = {
        'article-title': 'title',
        'subjects|methods': 'methods',
        'methods|subjects': 'methods',
        'star*methods': 'methods',
        'conclusions': 'conclusion',
        'intro': 'introduction',
    }
    return aliases.get(cleaned)


def parse_amino_acid_string(s):
s = s.strip()
for p in (_site_pattern1, _site_pattern2, _site_pattern3):
Expand Down
13 changes: 7 additions & 6 deletions indra/tests/test_pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def test_get_no_ids():
@attr('webservice')
def test_get_ids2():
time.sleep(0.5)
ids1 = pubmed_client.get_ids('JUN', use_text_word=False)
ids2 = pubmed_client.get_ids('JUN', use_text_word=True)
ids1 = pubmed_client.get_ids('JUN', use_text_word=False, reldate=365)
ids2 = pubmed_client.get_ids('JUN', use_text_word=True, reldate=365)
assert len(ids1) > len(ids2)


Expand All @@ -37,17 +37,18 @@ def test_get_id_count():
@attr('webservice')
def test_get_id_mesh():
time.sleep(0.5)
ids = pubmed_client.get_ids_for_mesh('D009101')
assert len(ids) > 35000
ids_maj = pubmed_client.get_ids_for_mesh('D009101', major_topic=True)
ids = pubmed_client.get_ids_for_mesh('D009101', reldate=365)
assert len(ids) > 100, len(ids)
ids_maj = pubmed_client.get_ids_for_mesh('D009101', major_topic=True,
reldate=365)
assert len(ids_maj) < len(ids)


@attr('webservice')
def test_get_id_mesh_supc():
time.sleep(0.5)
ids = pubmed_client.get_ids_for_mesh('D000086382')
assert len(ids) > 15000, len(ids)
assert len(ids) > 100, len(ids)


@attr('webservice')
Expand Down
11 changes: 10 additions & 1 deletion indra/tests/test_reach.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from nose.plugins.attrib import attr
from indra.sources import reach
from indra.sources.reach.processor import ReachProcessor
from indra.sources.reach.processor import ReachProcessor, normalize_section
from indra.util import unicode_strs
from indra.statements import IncreaseAmount, DecreaseAmount, \
Dephosphorylation, Complex, Phosphorylation, Translocation
Expand Down Expand Up @@ -530,3 +530,12 @@ def process(organism_priority, expected_up_id):
process(['1513314'], 'PRO_0000006688')
process(['1513314', '9606'], 'PRO_0000006688')
process(['1513314', '161274'], 'PRO_0000003427')


def test_normalize_section():
    # Each pair is (raw section name, expected canonical section type).
    cases = [
        ('results', 'results'),
        ('3. results', 'results'),
        ('3 results', 'results'),
        ('results.', 'results'),
        ('star*methods', 'methods'),
        ('some random section', None),
    ]
    for raw, expected in cases:
        assert normalize_section(raw) == expected