-
Notifications
You must be signed in to change notification settings - Fork 0
/
source.py
87 lines (67 loc) · 2.26 KB
/
source.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from Bio import SeqIO
import pandas as pd
from utils.constants import DHS_DATA_COLUMNS, REFERENCE_GENOME_FILETYPE
from utils.exceptions import UnimplementedMethodException
class DataSource:
    """Base container pairing a loaded payload with the path it came from.

    The ``from_path`` and ``from_dict`` constructors defined here always
    raise ``UnimplementedMethodException``; concrete sources override them.
    """

    def __init__(self, data, filepath):
        # The payload is stored as given; the ``data`` property below hands
        # back this same object.
        self.filepath = filepath
        self.raw_data = data

    @property
    def data(self):
        """Return the stored payload exactly as it was passed in."""
        return self.raw_data

    @classmethod
    def from_dict(cls, data_dict):
        """Alternate constructor from an in-memory dict (not implemented here)."""
        raise UnimplementedMethodException()

    @classmethod
    def from_path(cls, path):
        """Alternate constructor from a file path (not implemented here)."""
        raise UnimplementedMethodException()
class DHSAnnotations(DataSource):
    """Wrapper around a DHS annotation table.

    The complete table is kept in ``raw_data``; the ``data`` property narrows
    it to the columns listed in ``DHS_DATA_COLUMNS``.
    """

    @classmethod
    def from_path(cls, path):
        """Read a tab-separated file, forcing the ``identifier`` column to str."""
        table = pd.read_csv(path, dtype={'identifier': str}, sep='\t')
        return cls(table, path)

    @classmethod
    def from_dict(cls, data_dict):
        """Build the table from a dict; no file path is recorded."""
        table = pd.DataFrame.from_dict(data_dict)
        return cls(table, filepath=None)

    @property
    def data(self):
        """Return the ``DHS_DATA_COLUMNS`` selection of the raw table."""
        selected = self.raw_data[DHS_DATA_COLUMNS]
        return selected
class NMFLoadings(DataSource):
    """Wrapper around an NMF loadings table indexed by its first column."""

    @classmethod
    def from_path(cls, path):
        """Read a tab-separated file and promote its first column to the index.

        The index is left unnamed, so the original header of that first column
        is dropped.
        """
        loadings = pd.read_csv(path, sep='\t')
        first_column = loadings.columns.values[0]
        loadings = loadings.set_index([first_column])
        loadings.index.name = None
        return cls(loadings, path)

    @classmethod
    def from_dict(cls, data_dict):
        """Build the loadings table from a dict; no file path is recorded."""
        loadings = pd.DataFrame.from_dict(data_dict)
        return cls(loadings, filepath=None)
class ReferenceGenome(DataSource):
    """Reference genome stored as a mapping of record id to sequence.

    Sequences are looked up by chromosome name and sliced with Python's
    half-open ``[start:end]`` convention.
    """

    @classmethod
    def from_path(cls, path):
        """Parse ``path`` with ``SeqIO`` using ``REFERENCE_GENOME_FILETYPE``.

        Every record is loaded into a dict keyed by ``record.id`` with
        ``record.seq`` as the value.
        """
        genome_dict = {
            record.id: record.seq
            for record in SeqIO.parse(path, REFERENCE_GENOME_FILETYPE)
        }
        return cls(genome_dict, path)

    @classmethod
    def from_dict(cls, data_dict):
        """Wrap an existing id-to-sequence mapping; no file path is recorded."""
        return cls(data_dict, filepath=None)

    @property
    def genome(self):
        """Return the underlying id-to-sequence mapping."""
        return self.data

    def sequence(self, chrom, start, end):
        """Return the slice ``[start:end]`` of the sequence stored for ``chrom``.

        Args:
            chrom: Key of the chromosome in the genome mapping.
            start: Slice start (inclusive).
            end: Slice end (exclusive). May equal the chromosome length, which
                selects up to and including the final base.

        Returns:
            The sliced sequence, of whatever type the mapping stores.

        Raises:
            AssertionError: If ``end`` is greater than the chromosome length.
            Lookup errors from the mapping propagate when ``chrom`` is absent.
        """
        chrom_sequence = self.genome[chrom]
        chrom_length = len(chrom_sequence)
        # The slice end is exclusive, so ``end == chrom_length`` is in range;
        # only values beyond the length are rejected. The check is an explicit
        # raise (not ``assert``) so it still runs under ``python -O``, and it
        # keeps AssertionError so existing callers see the same exception type.
        if end > chrom_length:
            raise AssertionError(
                f"Sequence position bound out of range for chromosome {chrom}. "
                f"{chrom} length {chrom_length}, requested position {end}."
            )
        return chrom_sequence[start:end]