Skip to content

Commit

Permalink
Merge pull request #49 from TRON-Bioinformatics/develop
Browse files Browse the repository at this point in the history
Upgrade pangolin + remove GISAID from data model
  • Loading branch information
priesgo authored Sep 13, 2022
2 parents 8118281 + 3eb4188 commit f475f19
Show file tree
Hide file tree
Showing 16 changed files with 59 additions and 317 deletions.
4 changes: 2 additions & 2 deletions covigator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
VERSION = "v0.8.2"
ANALYSIS_PIPELINE_VERSION = "v0.12.0"
VERSION = "v0.9.0"
ANALYSIS_PIPELINE_VERSION = "v0.13.0"

MISSENSE_VARIANT = "missense_variant"
SYNONYMOUS_VARIANT = "synonymous_variant"
Expand Down
2 changes: 1 addition & 1 deletion covigator/accessor/ena_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _get_ena_runs_page(self):
try:
json = response.json()
except JSONDecodeError as e:
logger.error("Response content: {}".format(response.content))
logger.exception(e)
raise e
return json

Expand Down
2 changes: 1 addition & 1 deletion covigator/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def log_configuration(self):

def initialise_logs(logfile, sample_id: str = None):
if logfile is not None:
logzero.logfile(logfile)
logzero.logfile(logfile, maxBytes=1048576, backupCount=10)
logzero.loglevel(logging.INFO)
if sample_id is not None:
logzero.formatter(logging.Formatter(
Expand Down
201 changes: 4 additions & 197 deletions covigator/database/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,81 +117,6 @@ class VariantType(enum.Enum):
MNV = 4


class SampleGisaid(Base):
"""
The table that holds all metadata for a GISAID sample
"""
__tablename__ = SAMPLE_GISAID_TABLE_NAME

run_accession = Column(String, primary_key=True)
# DEPRECATED
finished = Column(Boolean)
collection_date = Column(Date)
# Host information
host_tax_id = Column(String)
host = Column(String)
# geographical data
country_raw = Column(String)
region = Column(String)
country = Column(String)
country_alpha_2 = Column(String)
country_alpha_3 = Column(String)
continent = Column(String)
continent_alpha_2 = Column(String)
site = Column(String)
site2 = Column(String)

# sequence information
sequence = Column(JSON)
sequence_length = Column(Integer)
count_n_bases = Column(Integer)
count_ambiguous_bases = Column(Integer)

# counts of variants
count_snvs = Column(Integer)
count_insertions = Column(Integer)
count_deletions = Column(Integer)

# job status
status = Column(Enum(JobStatus, name=JobStatus.__constraint_name__), default=JobStatus.PENDING)
created_at = Column(DateTime(timezone=True), nullable=False, default=datetime.now())
queued_at = Column(DateTime(timezone=True))
analysed_at = Column(DateTime(timezone=True))
loaded_at = Column(DateTime(timezone=True))
failed_at = Column(DateTime(timezone=True))
error_message = Column(String)

# output files
sample_folder = Column(String)
vcf_path = Column(String)
fasta_path = Column(String)
pangolin_path = Column(String)

# pango output (corresponding only to LoFreq)
pangolin_lineage = Column(String)
pangolin_conflict = Column(Float)
pangolin_ambiguity_score = Column(Float)
pangolin_scorpio_call = Column(String)
pangolin_scorpio_support = Column(Float)
pangolin_scorpio_conflict = Column(Float)
pangolin_version = Column(String)
pangolin_pangolin_version = Column(String)
# TODO: remove upper case from here, it confuses SQL for some operations
pangolin_pangoLEARN_version = Column(String)
pangolin_pango_version = Column(String)
pangolin_status = Column(String)
pangolin_note = Column(String)

covigator_accessor_version = Column(String)
covigator_processor_version = Column(String)

def get_sample_folder(self, base_folder):
return os.path.join(
base_folder,
self.collection_date.strftime("%Y%m%d") if self.collection_date is not None else "nodate",
self.run_accession)


class SampleEna(Base):
"""
The table that holds all metadata for a ENA sample
Expand Down Expand Up @@ -306,10 +231,10 @@ class SampleEna(Base):
pangolin_scorpio_conflict = Column(Float)
pangolin_version = Column(String)
pangolin_pangolin_version = Column(String)
# TODO: remove upper case from here, it confuses SQL for some operations
pangolin_pangoLEARN_version = Column(String)
pangolin_pango_version = Column(String)
pangolin_status = Column(String)
pangolin_scorpio_version = Column(String)
pangolin_constellation_version = Column(String)
pangolin_qc_status = Column(String)
pangolin_qc_notes = Column(String)
pangolin_note = Column(String)

# Picard deduplicatio output
Expand Down Expand Up @@ -500,48 +425,6 @@ def get_variant_id(self):
return "{}:{}>{}".format(self.position, self.reference, self.alternate)


class GisaidVariant(Base):
__tablename__ = GISAID_VARIANT_TABLE_NAME

variant_id = Column(String, primary_key=True)
chromosome = Column(String)
position = Column(Integer, index=True)
reference = Column(String)
alternate = Column(String)
overlaps_multiple_genes = Column(Boolean, default=False)

annotation = Column(String, index=True)
annotation_highest_impact = Column(String, index=True)
annotation_impact = Column(String, index=True)
gene_name = Column(String, index=True)
gene_id = Column(String)
biotype = Column(String)
hgvs_c = Column(String)
hgvs_p = Column(String)
cdna_pos_length = Column(String)
cds_pos_length = Column(String)
aa_pos_length = Column(String)

# derived annotations
variant_type = Column(Enum(VariantType, name=VariantType.__constraint_name__))
length = Column(Integer)
reference_amino_acid = Column(String)
alternate_amino_acid = Column(String)
position_amino_acid = Column(Integer)

# ConsHMM conservation annotations
cons_hmm_sars_cov_2 = Column(Float)
cons_hmm_sarbecovirus = Column(Float)
cons_hmm_vertebrate_cov = Column(Float)

# Pfam protein domains
pfam_name = Column(String)
pfam_description = Column(String)

def get_variant_id(self):
return "{}:{}>{}".format(self.position, self.reference, self.alternate)


class VariantObservation(Base):
"""
A variant observation in a particular sample. This contains all annotations of a specific observation of a variant.
Expand Down Expand Up @@ -736,70 +619,6 @@ class LowFrequencyVariantObservation(Base):
)


class GisaidVariantObservation(Base):
"""
A variant observation in a particular sample. This contains all annotations of a specific observation of a variant.
"""
__tablename__ = GISAID_VARIANT_OBSERVATION_TABLE_NAME

sample = Column(String, primary_key=True)
variant_id = Column(String, primary_key=True)
chromosome = Column(String)
position = Column(Integer)
reference = Column(String)
alternate = Column(String)
quality = Column(Float)
filter = Column(String)
dp = Column(Integer)
ac = Column(Integer)
dp4_ref_forward = Column(Integer)
dp4_ref_reverse = Column(Integer)
dp4_alt_forward = Column(Integer)
dp4_alt_reverse = Column(Integer)
vaf = Column(Float)
strand_bias = Column(Integer)

# fields replicated from Variant for performance reasons
annotation = Column(String)
annotation_impact = Column(String)
biotype = Column(String)
annotation_highest_impact = Column(String, index=True)
gene_name = Column(String)
hgvs_c = Column(String)
hgvs_p = Column(String)

# fields replicated from sample for performance reasons
date = Column(Date)

# derived annotations
variant_type = Column(Enum(VariantType, name=VariantType.__constraint_name__))
length = Column(Integer)
reference_amino_acid = Column(String)
alternate_amino_acid = Column(String)
position_amino_acid = Column(Integer)

# ConsHMM conservation annotations
cons_hmm_sars_cov_2 = Column(Float)
cons_hmm_sarbecovirus = Column(Float)
cons_hmm_vertebrate_cov = Column(Float)

# Pfam protein domains
pfam_name = Column(String)
pfam_description = Column(String)

ForeignKeyConstraint([sample], [SampleGisaid.run_accession])
ForeignKeyConstraint([variant_id], [GisaidVariant.variant_id])

__table_args__ = (
Index("{}_index_annotation_position".format(GISAID_VARIANT_OBSERVATION_TABLE_NAME),
"annotation_highest_impact", "position"),
Index("{}_index_sample".format(GISAID_VARIANT_OBSERVATION_TABLE_NAME), "sample"),
Index("{}_index_position".format(GISAID_VARIANT_OBSERVATION_TABLE_NAME), "position"),
Index("{}_index_annotation".format(GISAID_VARIANT_OBSERVATION_TABLE_NAME), "annotation_highest_impact"),
Index("{}_index_variant_id".format(GISAID_VARIANT_OBSERVATION_TABLE_NAME), "variant_id")
)


class VariantCooccurrence(Base):

__tablename__ = VARIANT_COOCCURRENCE_TABLE_NAME
Expand All @@ -812,18 +631,6 @@ class VariantCooccurrence(Base):
ForeignKeyConstraint([variant_id_two], [Variant.variant_id])


class GisaidVariantCooccurrence(Base):

__tablename__ = GISAID_VARIANT_COOCCURRENCE_TABLE_NAME

variant_id_one = Column(String, primary_key=True)
variant_id_two = Column(String, primary_key=True)
count = Column(Integer, default=0)

ForeignKeyConstraint([variant_id_one], [GisaidVariant.variant_id])
ForeignKeyConstraint([variant_id_two], [GisaidVariant.variant_id])


class CovigatorModule(enum.Enum):

__constraint_name__ = COVIGATOR_MODULE_CONSTRAINT_NAME
Expand Down
29 changes: 8 additions & 21 deletions covigator/database/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from sqlalchemy.sql.sqltypes import NullType
from covigator import SYNONYMOUS_VARIANT
from covigator.database.model import DataSource, SampleEna, JobStatus, \
VariantObservation, Gene, Variant, VariantCooccurrence, Conservation, SampleGisaid, \
VariantObservation, Gene, Variant, VariantCooccurrence, Conservation, \
SubclonalVariantObservation, PrecomputedVariantsPerSample, PrecomputedSubstitutionsCounts, PrecomputedIndelLength, \
VariantType, PrecomputedAnnotation, PrecomputedOccurrence, PrecomputedTableCounts, \
PrecomputedVariantAbundanceHistogram, PrecomputedSynonymousNonSynonymousCounts, RegionType, Domain, \
GisaidVariantObservation, GisaidVariant, LastUpdate, GisaidVariantCooccurrence
LastUpdate
from covigator.exceptions import CovigatorQueryException, CovigatorDashboardMissingPrecomputedData


Expand All @@ -26,8 +26,6 @@ def __init__(self, session: Session):
def get_variant_observation_klass(source: str):
if source == DataSource.ENA.name:
klass = VariantObservation
elif source == DataSource.GISAID.name:
klass = GisaidVariantObservation
else:
raise CovigatorQueryException("Bad data source: {}".format(source))
return klass
Expand All @@ -36,8 +34,6 @@ def get_variant_observation_klass(source: str):
def get_variant_klass(source: str):
if source == DataSource.ENA.name:
klass = Variant
elif source == DataSource.GISAID.name:
klass = GisaidVariant
else:
raise CovigatorQueryException("Bad data source: {}".format(source))
return klass
Expand All @@ -46,8 +42,6 @@ def get_variant_klass(source: str):
def get_sample_klass(source: str):
if source == DataSource.ENA.name:
klass = SampleEna
elif source == DataSource.GISAID.name:
klass = SampleGisaid
else:
raise CovigatorQueryException("Bad data source: {}".format(source))
return klass
Expand All @@ -56,23 +50,21 @@ def get_sample_klass(source: str):
def get_variant_cooccurrence_klass(source: str):
if source == DataSource.ENA.name:
klass = VariantCooccurrence
elif source == DataSource.GISAID.name:
klass = GisaidVariantCooccurrence
else:
raise CovigatorQueryException("Bad data source: {}".format(source))
return klass

def find_job_by_accession_and_status(
self, run_accession: str, status: JobStatus, data_source: DataSource) -> Union[SampleEna, SampleGisaid]:
self, run_accession: str, status: JobStatus, data_source: DataSource) -> Union[SampleEna]:
klass = self.get_sample_klass(source=data_source.name)
return self.session.query(klass)\
.filter(and_(klass.run_accession == run_accession, klass.status == status)).first()

def find_job_by_accession(self, run_accession: str, data_source: DataSource) -> Union[SampleEna, SampleGisaid]:
def find_job_by_accession(self, run_accession: str, data_source: DataSource) -> Union[SampleEna]:
klass = self.get_sample_klass(source=data_source.name)
return self.session.query(klass).filter(klass.run_accession == run_accession).first()

def find_first_by_status(self, data_source: DataSource, status, n=100) -> List[Union[SampleEna, SampleGisaid]]:
def find_first_by_status(self, data_source: DataSource, status, n=100) -> List[Union[SampleEna]]:
klass = self.get_sample_klass(source=data_source.name)
return self.session.query(klass) \
.filter(klass.status.in_(status)) \
Expand All @@ -81,10 +73,10 @@ def find_first_by_status(self, data_source: DataSource, status, n=100) -> List[U
.all()

def find_first_pending_jobs(
self, data_source: DataSource, n=100, status: List = [JobStatus.DOWNLOADED]) -> List[Union[SampleEna, SampleGisaid]]:
self, data_source: DataSource, n=100, status: List = [JobStatus.DOWNLOADED]) -> List[Union[SampleEna]]:
return self.find_first_by_status(data_source=data_source, status=status, n=n)

def find_first_jobs_to_download(self, data_source: DataSource, n=100) -> List[Union[SampleEna, SampleGisaid]]:
def find_first_jobs_to_download(self, data_source: DataSource, n=100) -> List[Union[SampleEna]]:
return self.find_first_by_status(data_source=data_source, status=[JobStatus.PENDING], n=n)

def count_jobs_in_queue(self, data_source):
Expand All @@ -94,7 +86,7 @@ def count_jobs_by_status(self, data_source: DataSource, status: JobStatus):
klass = self.get_sample_klass(source=data_source.name)
return self.session.query(klass).filter(klass.status == status).count()

def find_sample_by_accession(self, run_accession: str, source: DataSource) -> Union[SampleEna, SampleGisaid]:
def find_sample_by_accession(self, run_accession: str, source: DataSource) -> Union[SampleEna]:
klass = self.get_sample_klass(source=source.name)
return self.session.query(klass).filter(klass.run_accession == run_accession).first()

Expand Down Expand Up @@ -344,11 +336,6 @@ def count_samples(self, source: str, cache=True) -> int:
PrecomputedTableCounts.table == SampleEna.__name__,
PrecomputedTableCounts.factor == PrecomputedTableCounts.FACTOR_SOURCE
))
elif source == DataSource.GISAID.name:
query = query.filter(and_(
PrecomputedTableCounts.table == SampleGisaid.__name__,
PrecomputedTableCounts.factor == PrecomputedTableCounts.FACTOR_SOURCE
))
result = query.first()
if result is None:
raise CovigatorDashboardMissingPrecomputedData
Expand Down
4 changes: 2 additions & 2 deletions covigator/pipeline/vcf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

from covigator import MISSENSE_VARIANT
from covigator.database.model import Variant as CovigatorVariant, VariantObservation, \
SubclonalVariantObservation, SampleEna, SampleGisaid, DataSource, VariantType, GisaidVariantObservation, \
LowFrequencyVariantObservation, GisaidVariant, SubclonalVariant, LowFrequencyVariant
SubclonalVariantObservation, SampleEna, DataSource, VariantType, \
LowFrequencyVariantObservation, SubclonalVariant, LowFrequencyVariant
from covigator.database.queries import Queries
from covigator.exceptions import CovigatorNotSupportedVariant

Expand Down
4 changes: 2 additions & 2 deletions covigator/precomputations/load_cooccurrences.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sqlalchemy.orm import Session

from covigator import SYNONYMOUS_VARIANT
from covigator.database.model import JobStatus, VariantCooccurrence, GisaidVariantCooccurrence
from covigator.database.model import JobStatus, VariantCooccurrence
from covigator.database.queries import Queries
from logzero import logger

Expand Down Expand Up @@ -72,7 +72,7 @@ def _get_from_cache(self, variant_id_one: str, variant_id_two: str):
return self.cache.get(self._unique_id(variant_id_one, variant_id_two), None)

def _store_in_cache(self, variant_id_one: str, variant_id_two: str,
entry: Union[VariantCooccurrence, GisaidVariantCooccurrence]):
entry: Union[VariantCooccurrence]):
self.cache[self._unique_id(variant_id_one, variant_id_two)] = entry

def _commit_cache(self):
Expand Down
Loading

0 comments on commit f475f19

Please sign in to comment.