Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Auto-populate Sample, Container on Biospecimen create/update #645

Closed
wants to merge 7 commits into from
314 changes: 314 additions & 0 deletions dataservice/api/biospecimen/manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
"""
Module to help manage the create/update lifecycle of Samples and Containers

The main function in this module is `manage_sample_containers`, which is
responsible for creating/updating Samples and Containers every time a
Biospecimen is created or updated. It is called from
dataservice.api.biospecimen.resources.py

Background:
The current Biospecimen table does not adequately model the hierarchical
relationship between specimen groups and specimens. The Sample and
Container tables have been created to fill in this gap.

A Sample is a biologically equivalent group of specimens. A Sample has
one or more Containers and a Container essentially mirrors the Biospecimen.

The Sample and Container tables were created in order to minimize any
changes to the existing Biospecimen table.
"""

from marshmallow import ValidationError
from flask import abort
from dataservice.extensions import db
from dataservice.api.sample.models import Sample
from dataservice.api.container.models import Container
from dataservice.api.sample.schemas import (
SampleSchema,
)
from dataservice.api.container.schemas import (
ContainerSchema,
)


def _create_sample_event_key(biospecimen):
    """
    Create a sample event identifier from specific fields on the Biospecimen

    Use:
        participant_id
        external_sample_id
        age_at_event_days

    Key format: <participant_id>-<external_sample_id>-<age_at_event_days>

    Any component that is null (or an empty string) is replaced with the
    value "Not Reported". A value of 0 is a real value (for example an
    age_at_event_days of 0) and is kept in the key as "0".

    :param biospecimen: object exposing the three attributes listed above
    :returns: the sample event key as a string
    """
    components = (
        biospecimen.participant_id,
        biospecimen.external_sample_id,
        biospecimen.age_at_event_days,
    )

    # Test explicitly for "missing" instead of relying on truthiness, so
    # that a numeric 0 is not mistaken for an unreported value
    return "-".join(
        "Not Reported" if c is None or c == "" else str(c)
        for c in components
    )


def _create_concentration(biospecimen):
    """
    Derive the Sample concentration from the Biospecimen

    The Biospecimen's concentration_mg_per_ml is returned only when its
    analyte_type is "DNA" or "RNA". For every other analyte type the
    result is None.
    """
    is_dna_or_rna = biospecimen.analyte_type in ("DNA", "RNA")
    return biospecimen.concentration_mg_per_ml if is_dna_or_rna else None


def _get_sample_identifier(biospecimen):
    """
    Collect the Biospecimen-derived values that together uniquely identify
    a Sample

    Returns a dict keyed by Sample field name. The sample event key and
    the concentration are computed by their helper functions; the other
    values are copied from the Biospecimen attributes.
    """
    return dict(
        sample_event_key=_create_sample_event_key(biospecimen),
        composition=biospecimen.composition,
        tissue_type=biospecimen.source_text_tissue_type,
        analyte_type=biospecimen.analyte_type,
        anatomical_location=biospecimen.source_text_anatomical_site,
        method_of_sample_procurement=(
            biospecimen.method_of_sample_procurement
        ),
        preservation_method=biospecimen.preservation_method,
        concentration_mg_per_ml=_create_concentration(biospecimen),
    )


def _get_container_identifier(biospecimen):
    """
    Collect the Biospecimen-derived values that uniquely identify a
    Container

    A Container is identified solely by the kf_id of its Biospecimen,
    returned under the key "biospecimen_id".
    """
    return dict(biospecimen_id=biospecimen.kf_id)


def _get_visibility_params(biospecimen):
    """
    Copy the visibility fields of the Biospecimen into a dict

    The returned dict has the keys "visible", "visibility_reason" and
    "visibility_comment", each holding the Biospecimen attribute of the
    same name.
    """
    visibility_fields = ("visible", "visibility_reason", "visibility_comment")
    return {
        field: getattr(biospecimen, field) for field in visibility_fields
    }


def _create_sample(biospecimen):
    """
    Build and validate a new Sample from the Biospecimen's attributes

    The Sample parameters are the identifying attributes, plus the
    participant id, external id and volume, plus the visibility settings
    of the Biospecimen. They are loaded through SampleSchema; if
    validation fails the request is aborted with a 400 response.
    """
    # Identifying attributes first, then the remaining sample attributes
    sample_params = dict(
        _get_sample_identifier(biospecimen),
        participant_id=biospecimen.participant_id,
        external_id=biospecimen.external_sample_id,
        volume_ul=biospecimen.volume_ul,
    )
    # The Biospecimen represents both the sample and its containers, so
    # its visibility settings are applied to the Sample as well
    sample_params.update(_get_visibility_params(biospecimen))

    try:
        return SampleSchema(strict=True).load(sample_params).data
    except ValidationError as e:
        # Parameters are not valid for a Sample
        abort(400, 'could not create sample: {}'.format(e.messages))


def _update_sample(current_sample, biospecimen):
    """
    Update and validate an existing Sample from the Biospecimen's
    attributes

    The same parameters used to create a Sample are loaded onto
    `current_sample` through SampleSchema as a partial update; if
    validation fails the request is aborted with a 400 response.
    """
    # Identifying attributes first, then the remaining sample attributes.
    #
    # Review discussion: Sample.participant_id is the foreign key to
    # participant.kf_id. It and external_id are already captured in the
    # sample_event_key, but they are stored on the Sample as well. The
    # redundancy is deliberate: it keeps the source participant and the
    # original external sample ID readable, and leaves room to build the
    # sample event key from something else later.
    sample_params = dict(
        _get_sample_identifier(biospecimen),
        participant_id=biospecimen.participant_id,
        external_id=biospecimen.external_sample_id,
        volume_ul=biospecimen.volume_ul,
    )
    # The Biospecimen represents both the sample and its containers, so
    # its visibility settings are applied to the Sample as well
    sample_params.update(_get_visibility_params(biospecimen))

    try:
        loaded = SampleSchema(strict=True).load(
            sample_params, instance=current_sample, partial=True
        )
        return loaded.data
    except ValidationError as e:
        abort(400, 'could not update sample: {}'.format(e.messages))


def _create_container(biospecimen, sample):
    """
    Build and validate a new Container from the Biospecimen's attributes

    The Container is linked to the Biospecimen (biospecimen_id) and to
    the given Sample (sample_id). Parameters are loaded through
    ContainerSchema; if validation fails the request is aborted with a
    400 response.
    """
    # Identifying attributes first, then the remaining container
    # attributes (biospecimen_id is repeated here with the same value)
    container_params = dict(
        _get_container_identifier(biospecimen),
        biospecimen_id=biospecimen.kf_id,
        sample_id=sample.kf_id,
        volume_ul=biospecimen.volume_ul,
        external_id=biospecimen.external_aliquot_id,
    )
    # The Biospecimen represents both the sample and its containers, so
    # its visibility settings are applied to the Container as well
    container_params.update(_get_visibility_params(biospecimen))

    try:
        return ContainerSchema(strict=True).load(container_params).data
    except ValidationError as e:
        abort(400, 'could not create container: {}'.format(e.messages))


def _update_container(current_container, biospecimen, sample):
    """
    Update and validate an existing Container from the Biospecimen's
    attributes

    The Container is linked to the Biospecimen (biospecimen_id) and to
    the given Sample (sample_id). Parameters are loaded onto
    `current_container` through ContainerSchema as a partial update; if
    validation fails the request is aborted with a 400 response.
    """
    # Identifying attributes first, then the remaining container
    # attributes (biospecimen_id is repeated here with the same value)
    container_params = dict(
        _get_container_identifier(biospecimen),
        biospecimen_id=biospecimen.kf_id,
        sample_id=sample.kf_id,
        volume_ul=biospecimen.volume_ul,
        external_id=biospecimen.external_aliquot_id,
    )
    # The Biospecimen represents both the sample and its containers, so
    # its visibility settings are applied to the Container as well
    container_params.update(_get_visibility_params(biospecimen))

    try:
        loaded = ContainerSchema(strict=True).load(
            container_params, instance=current_container, partial=True
        )
        return loaded.data
    except ValidationError as e:
        abort(400, 'could not update container: {}'.format(e.messages))


def _upsert_sample(biospecimen):
    """
    Upsert Sample from specific Biospecimen attributes

    Look for an existing Sample whose identifying attributes match the
    Biospecimen.
    If one exists, update it using the Biospecimen attributes
    If none exists, create a Sample using the Biospecimen attributes
    The resulting Sample is added to the session and committed.
    """
    # NOTE(review): flagged in code review - this read, modify, write
    # sequence is an anti-pattern that does not work with concurrent
    # requests. The suggested replacement is PostgreSQL's native upsert
    # (insert ... on conflict do update).
    identifier = _get_sample_identifier(biospecimen)
    existing = Sample.query.filter_by(**identifier).first()

    if existing:
        # Sample exists, update it
        sample = _update_sample(existing, biospecimen)
    else:
        # Sample does not exist, create it
        sample = _create_sample(biospecimen)

    db.session.add(sample)
    db.session.commit()

    return sample


def _upsert_container(biospecimen, sample):
    """
    Upsert Container from specific Biospecimen attributes and link the
    Container to its associated Sample

    Look for an existing Container that belongs to the Biospecimen.
    If one exists, update it using the Biospecimen attributes
    If none exists, create a Container using the Biospecimen attributes
    The resulting Container is added to the session and committed.
    """
    identifier = _get_container_identifier(biospecimen)
    existing = Container.query.filter_by(**identifier).first()

    if existing:
        # Container exists - update it
        container = _update_container(existing, biospecimen, sample)
    else:
        # Container does not exist - create it
        container = _create_container(biospecimen, sample)

    db.session.add(container)
    db.session.commit()

    return container


def _update_sample_volume(sample_id):
    """
    Set the Sample's volume to the sum of all of its container volumes

    Containers without a volume are ignored. If no container has a
    volume, the Sample's volume is set to None. The Sample is added to
    the session, committed and returned.
    """
    sample = Sample.query.get(sample_id)

    # Only containers that report a volume contribute to the total
    known_volumes = [
        container.volume_ul
        for container in sample.containers
        if container.volume_ul is not None
    ]
    sample.volume_ul = sum(known_volumes) if known_volumes else None

    db.session.add(sample)
    db.session.commit()

    return sample


def manage_sample_containers(biospecimen):
    """
    Keep the Sample and Container of the input Biospecimen up to date

    Upserts the Sample, upserts the Container linked to that Sample, then
    recomputes the Sample's volume as the sum of its container volumes.
    Returns the Sample after the volume update.
    """
    upserted_sample = _upsert_sample(biospecimen)
    _upsert_container(biospecimen, upserted_sample)

    return _update_sample_volume(upserted_sample.kf_id)
31 changes: 20 additions & 11 deletions dataservice/api/biospecimen/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)
from dataservice.api.common.views import CRUDView
from dataservice.api.common.schemas import filter_schema_factory
from dataservice.api.biospecimen import manager


class BiospecimenListAPI(CRUDView):
Expand Down Expand Up @@ -92,17 +93,21 @@ def post(self):

# Deserialize
try:
s = BiospecimenSchema(strict=True).load(body).data
biospecimen = BiospecimenSchema(strict=True).load(body).data
# Request body not valid
except ValidationError as e:
abort(400, 'could not create biospecimen: {}'.format(e.messages))

# Add to and save in database
db.session.add(s)
db.session.add(biospecimen)
db.session.commit()

return BiospecimenSchema(201, 'biospecimen {} created'
.format(s.kf_id)).jsonify(s), 201
# Create the Biospecimen's associated Sample and Container
manager.manage_sample_containers(biospecimen)

return BiospecimenSchema(
201, 'biospecimen {} created'.format(biospecimen.kf_id)
).jsonify(biospecimen), 201


class BiospecimenAPI(CRUDView):
Expand Down Expand Up @@ -141,25 +146,29 @@ def patch(self, kf_id):
resource:
Biospecimen
"""
sa = Biospecimen.query.get(kf_id)
if sa is None:
biospecimen = Biospecimen.query.get(kf_id)
if biospecimen is None:
abort(404, 'could not find {} `{}`'
.format('biospecimen', kf_id))

# Partial update - validate but allow missing required fields
body = request.get_json(force=True) or {}
try:
sa = BiospecimenSchema(strict=True).load(body, instance=sa,
partial=True).data
biospecimen = BiospecimenSchema(strict=True).load(
body, instance=biospecimen, partial=True
).data
except ValidationError as err:
abort(400, 'could not update biospecimen: {}'.format(err.messages))

db.session.add(sa)
db.session.add(biospecimen)
db.session.commit()

# Create the Biospecimen's associated Sample and Container
manager.manage_sample_containers(biospecimen)

return BiospecimenSchema(
200, 'biospecimen {} updated'.format(sa.kf_id)
).jsonify(sa), 200
200, 'biospecimen {} updated'.format(biospecimen.kf_id)
).jsonify(biospecimen), 200

def delete(self, kf_id):
"""
Expand Down
4 changes: 4 additions & 0 deletions dataservice/api/sample/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,11 @@ class Sample(db.Model, Base):


@event.listens_for(Container, 'after_delete')
@event.listens_for(Container, 'after_update')
def delete_orphans(mapper, connection, state):
    """
    Delete samples with 0 child containers

    Registered for both the after_delete and after_update events of
    Container. The mapper, connection and state arguments supplied by
    the event are not used; the delete runs through db.session.
    """
    # NOTE(review): SQLAlchemy's mapper-level flush events are documented
    # as intended for SQL on the given connection rather than Session
    # operations - confirm that using db.session here is safe.
    has_no_containers = ~Sample.containers.any()
    orphaned_samples = db.session.query(Sample).filter(has_no_containers)
    orphaned_samples.delete(synchronize_session='fetch')
Loading