-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #34 from TRON-Bioinformatics/develop
Separate the computation of the cooccurrence matrix in a separate operation
- Loading branch information
Showing
12 changed files
with
195 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,21 @@ | ||
from itertools import combinations | ||
from typing import Union | ||
|
||
from sqlalchemy.orm import Session | ||
from covigator.database.model import VariantCooccurrence, DataSource | ||
from logzero import logger | ||
from covigator.database.queries import Queries | ||
from sqlalchemy.exc import IntegrityError | ||
|
||
|
||
class CooccurrenceMatrixException(Exception): | ||
pass | ||
|
||
|
||
class CooccurrenceMatrix: | ||
|
||
def compute(self, run_accession: str, source: DataSource, session: Session): | ||
def compute(self, run_accession: str, source: str, session: Session, maximum_length: int = 10): | ||
|
||
assert run_accession is not None or run_accession == "", "Missing sample identifier" | ||
assert session is not None, "Missing DB session" | ||
|
||
queries = Queries(session=session) | ||
sample_id = run_accession | ||
logger.info("Processing cooccurrent variants for sample {}".format(sample_id)) | ||
|
||
# ordering by position is important: it ensures we store only one half of the matrix, and always the same half | ||
variants = queries.get_variants_by_sample(sample_id, source=source.name) | ||
failed_variants = [] | ||
variant_ids = queries.get_variant_ids_by_sample(sample_id, source=source, maximum_length=maximum_length) | ||
|
||
# process all pairwise combinations without repetitions, including the diagonal | ||
for (variant_one, variant_two) in list(combinations(variants, 2)) + list(zip(variants, variants)): | ||
try: | ||
variant_cooccurrence = queries.get_variant_cooccurrence(variant_one, variant_two) | ||
if variant_cooccurrence is None: | ||
variant_cooccurrence = VariantCooccurrence( | ||
variant_id_one=variant_one.variant_id, | ||
variant_id_two=variant_two.variant_id, | ||
count=1 | ||
) | ||
session.add(variant_cooccurrence) | ||
session.commit() | ||
else: | ||
# NOTE: it is important to increase the counter like this to avoid race conditions | ||
# the increase happens in the database server and not in python | ||
# see https://stackoverflow.com/questions/2334824/how-to-increase-a-counter-in-sqlalchemy | ||
variant_cooccurrence.count = VariantCooccurrence.count + 1 | ||
except IntegrityError: | ||
session.rollback() | ||
failed_variants.append((variant_one, variant_two)) | ||
|
||
# retries the failed variants, as these are expected to exist in the database now | ||
for (variant_one, variant_two) in failed_variants: | ||
variant_cooccurrence = queries.get_variant_cooccurrence(variant_one, variant_two) | ||
if variant_cooccurrence is None: | ||
raise CooccurrenceMatrixException("Some cooccurrent variants failed to be persisted twice") | ||
variant_cooccurrence.count = VariantCooccurrence.count + 1 | ||
for (variant_id_one, variant_id_two) in list(combinations(variant_ids, 2)) + list(zip(variant_ids, variant_ids)): | ||
queries.increment_variant_cooccurrence(variant_id_one, variant_id_two, source) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from sqlalchemy.orm import Session | ||
from covigator.database.model import DataSource, JobStatus | ||
from covigator.database.queries import Queries | ||
from covigator.pipeline.cooccurrence_matrix import CooccurrenceMatrix | ||
from logzero import logger | ||
|
||
|
||
class CooccurrenceMatrixLoader: | ||
|
||
def __init__(self, session: Session): | ||
self.session = session | ||
self.queries = Queries(session=self.session) | ||
self.cooccurrence_matrix = CooccurrenceMatrix() | ||
|
||
def load(self, data_source: str, maximum_length: int): | ||
|
||
# deletes all existing variant cooccurrence records for this data source before loading | ||
self.session.query(self.queries.get_variant_cooccurrence_klass(data_source)).delete() | ||
|
||
# iterates over every sample in FINISHED status and computes the cooccurrence matrix | ||
sample_klass = self.queries.get_sample_klass(data_source) | ||
count_samples = self.queries.count_samples(source=data_source, cache=False) | ||
computed = 0 | ||
query = self.session.query(sample_klass).filter(sample_klass.status == JobStatus.FINISHED) | ||
for sample in self.queries.windowed_query(query=query, column=sample_klass.run_accession, windowsize=1000): | ||
self.cooccurrence_matrix.compute(sample.run_accession, data_source, self.session, | ||
maximum_length=maximum_length) | ||
computed += 1 | ||
if computed % 1000 == 0: | ||
logger.info('Processed cooccurrence over {}/{} ({}) samples'.format( | ||
computed, count_samples, round(float(computed) / count_samples, 3))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.