Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Propagate section information from new Reach implementation #1399

Merged
merged 7 commits into from
Nov 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.9]
python-version: [3.7, 3.9]
steps:
- uses: actions/checkout@v2
- uses: actions/cache@v2
Expand Down
7 changes: 4 additions & 3 deletions indra/literature/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def get_ids(search_term, **kwargs):

PubMed, by default, limits returned PMIDs to a small number, and this
number can be controlled by the "retmax" parameter. This function
uses a retmax value of 100,000 by default that can be changed via the
corresponding keyword argument.
uses a retmax value of 10,000 by default (the maximum supported by PubMed)
that can be changed via the corresponding keyword argument. Note also
the retstart argument along with retmax to page across batches of IDs.

Parameters
----------
Expand All @@ -82,7 +83,7 @@ def get_ids(search_term, **kwargs):
if use_text_word:
search_term += '[tw]'
params = {'term': search_term,
'retmax': 100000,
'retmax': 10000,
'retstart': 0,
'db': 'pubmed',
'sort': 'pub+date'}
Expand Down
12 changes: 11 additions & 1 deletion indra/sources/medscan/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import tempfile
import logging
from math import floor

import gilda
import lxml.etree
import collections

Expand Down Expand Up @@ -1040,7 +1042,15 @@ def _urn_to_db_refs(urn):
db_refs['MESH'] = mesh_id
db_name = mesh_name
else:
db_name = urn_mesh_name
matches = gilda.ground(urn_mesh_name, namespaces=['MESH'])
if matches:
for match_ns, match_id in matches[0].get_groundings():
if match_ns == 'MESH':
db_refs['MESH'] = match_id
db_name = matches[0].term.entry_name
break
else:
db_name = urn_mesh_name
elif urn_type == 'agi-gocomplex':
# Identifier is GO
db_refs['GO'] = 'GO:%s' % urn_id
Expand Down
1 change: 1 addition & 0 deletions indra/sources/reach/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,5 +169,6 @@
process_nxml_file,
process_json_str,
process_json_file,
process_fries_json_group,
reach_text_url, reach_nxml_url,
local_text_url, local_nxml_url)
41 changes: 41 additions & 0 deletions indra/sources/reach/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,47 @@ def process_json_file(file_name, citation=None, organism_priority=None):
logger.error('Could not read file %s.' % file_name)


def process_fries_json_group(group_prefix, citation=None,
                             organism_priority=None):
    """Return a ReachProcessor by processing a REACH fries output file group.

    When running REACH through its CLI, for each input file, it produces
    three output JSON files when using the fries output format. These three
    files jointly constitute the output, so they have to be combined to be
    processed. For instance, one might have PMC9582577.uaz.entities.json,
    PMC9582577.uaz.events.json, PMC9582577.uaz.sentences.json.

    Parameters
    ----------
    group_prefix : str
        The prefix for the group of output files, e.g., PMC9582577.uaz
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # The three JSON files that jointly constitute one fries output group;
    # each becomes a top-level key of the combined JSON document.
    file_types = ['entities', 'events', 'sentences']
    combined_json = {}
    for file_type in file_types:
        fname = '%s.%s.json' % (group_prefix, file_type)
        # JSON is UTF-8 by specification; don't rely on the locale default.
        with open(fname, 'r', encoding='utf-8') as fh:
            combined_json[file_type] = json.load(fh)
    # Note that we serialize back to a JSON string here to make use of the
    # replacements done in process_json_str below
    return process_json_str(json.dumps(combined_json), citation=citation,
                            organism_priority=organism_priority)


def process_json_str(json_str, citation=None, organism_priority=None):
"""Return a ReachProcessor by processing the given REACH json string.

Expand Down
81 changes: 44 additions & 37 deletions indra/sources/reach/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,50 +699,23 @@ def _get_epistemics(self, event):
if 'is_direct' in event:
direct = event['is_direct']
epistemics['direct'] = direct
# Get the section of the paper it comes from
section = self._get_section(event)
epistemics['section_type'] = section
return epistemics

_section_list = ['title', 'abstract', 'introduction', 'background',
'results', 'methods', 'discussion', 'conclusion',
'supplementary', 'figure']

def _get_section(self, event):
"""Get the section of the paper that the event is from."""
# Get the section of the paper it comes from
sentence_id = event.get('sentence')
section = None
sections = []
if sentence_id:
qstr = "$.sentences.frames[(@.frame_id is \'%s\')]" % sentence_id
res = self.tree.execute(qstr)
if res:
sentence_frame = list(res)[0]
passage_id = sentence_frame.get('passage')
if passage_id:
qstr = "$.sentences.frames[(@.frame_id is \'%s\')]" % \
passage_id
res = self.tree.execute(qstr)
if res:
passage_frame = list(res)[0]
section = passage_frame.get('section-id')
# If the section is in the standard list, return as is
if section in self._section_list:
return section
# Next, handle a few special cases that come up in practice
elif section.startswith('fig'):
return 'figure'
elif section.startswith('supm'):
return 'supplementary'
elif section == 'article-title':
return 'title'
elif section in ['subjects|methods', 'methods|subjects']:
return 'methods'
elif section == 'conclusions':
return 'conclusion'
elif section == 'intro':
return 'introduction'
else:
return None
sections = sentence_frame.get('sections', [])
epistemics['raw_sections'] = sections
for section in sections:
norm_section = normalize_section(section)
if norm_section:
epistemics['section_type'] = norm_section
break
return epistemics

def _get_controller_agent(self, arg):
"""Return a single or a complex controller agent."""
Expand Down Expand Up @@ -826,6 +799,40 @@ def _parse_site_text(s):
return sites


# Canonical section types that downstream consumers recognize.
_section_list = ['title', 'abstract', 'introduction', 'background',
                 'results', 'methods', 'discussion', 'conclusion',
                 'supplementary', 'figure']


def normalize_section(section):
    """Map a raw section name onto a canonical section type, if possible.

    Parameters
    ----------
    section : str
        A raw section name as it appears in the REACH output, e.g.,
        "3. results" or "discussion.".

    Returns
    -------
    str or None
        One of the canonical names in ``_section_list``, or None if the
        given section could not be normalized.
    """
    # Drop surrounding whitespace/newlines and a common "3." / "3 " style
    # numeric prefix, e.g. "3. results" -> "results".
    cleaned = re.sub(r'^\d+[.]?[ ]?', '', section.strip())
    # Section titles often end with a period, e.g. "discussion."
    if cleaned.endswith('.'):
        cleaned = cleaned[:-1]
    # Already canonical: return unchanged.
    if cleaned in _section_list:
        return cleaned
    # Prefix-based special cases that come up in practice.
    if cleaned.startswith('fig'):
        return 'figure'
    if cleaned.startswith('supm'):
        return 'supplementary'
    # Exact-match aliases for non-standard section names; None if unknown.
    aliases = {
        'article-title': 'title',
        'subjects|methods': 'methods',
        'methods|subjects': 'methods',
        'star*methods': 'methods',
        'conclusions': 'conclusion',
        'intro': 'introduction',
    }
    return aliases.get(cleaned)


def parse_amino_acid_string(s):
s = s.strip()
for p in (_site_pattern1, _site_pattern2, _site_pattern3):
Expand Down
13 changes: 7 additions & 6 deletions indra/tests/test_pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def test_get_no_ids():
@attr('webservice')
def test_get_ids2():
time.sleep(0.5)
ids1 = pubmed_client.get_ids('JUN', use_text_word=False)
ids2 = pubmed_client.get_ids('JUN', use_text_word=True)
ids1 = pubmed_client.get_ids('JUN', use_text_word=False, reldate=365)
ids2 = pubmed_client.get_ids('JUN', use_text_word=True, reldate=365)
assert len(ids1) > len(ids2)


Expand All @@ -37,17 +37,18 @@ def test_get_id_count():
@attr('webservice')
def test_get_id_mesh():
time.sleep(0.5)
ids = pubmed_client.get_ids_for_mesh('D009101')
assert len(ids) > 35000
ids_maj = pubmed_client.get_ids_for_mesh('D009101', major_topic=True)
ids = pubmed_client.get_ids_for_mesh('D009101', reldate=365)
assert len(ids) > 100, len(ids)
ids_maj = pubmed_client.get_ids_for_mesh('D009101', major_topic=True,
reldate=365)
assert len(ids_maj) < len(ids)


@attr('webservice')
def test_get_id_mesh_supc():
time.sleep(0.5)
ids = pubmed_client.get_ids_for_mesh('D000086382')
assert len(ids) > 15000, len(ids)
assert len(ids) > 100, len(ids)


@attr('webservice')
Expand Down
11 changes: 10 additions & 1 deletion indra/tests/test_reach.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from nose.plugins.attrib import attr
from indra.sources import reach
from indra.sources.reach.processor import ReachProcessor
from indra.sources.reach.processor import ReachProcessor, normalize_section
from indra.util import unicode_strs
from indra.statements import IncreaseAmount, DecreaseAmount, \
Dephosphorylation, Complex, Phosphorylation, Translocation
Expand Down Expand Up @@ -530,3 +530,12 @@ def process(organism_priority, expected_up_id):
process(['1513314'], 'PRO_0000006688')
process(['1513314', '9606'], 'PRO_0000006688')
process(['1513314', '161274'], 'PRO_0000003427')


def test_normalize_section():
    # Each pair is (raw section name, expected canonical section type).
    cases = [
        ('results', 'results'),
        ('3. results', 'results'),
        ('3 results', 'results'),
        ('results.', 'results'),
        ('star*methods', 'methods'),
        ('some random section', None),
    ]
    for raw, expected in cases:
        assert normalize_section(raw) == expected