Skip to content

Commit

Permalink
Quantile scaling
Browse files Browse the repository at this point in the history
  • Loading branch information
iiSeymour committed Sep 2, 2022
1 parent 54eeab9 commit f2a3a8e
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 74 deletions.
30 changes: 12 additions & 18 deletions bonito/fast5.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(self, read, filename, meta=False):
channel_info = read.handle[read.global_key + 'channel_id'].attrs

self.offset = int(channel_info['offset'])
self.sampling_rate = channel_info['sampling_rate']
self.sample_rate = channel_info['sampling_rate']
self.scaling = channel_info['range'] / channel_info['digitisation']

self.mux = read_attrs['start_mux']
Expand All @@ -63,29 +63,23 @@ def __init__(self, read, filename, meta=False):
if type(self.channel) in (bytes, np.bytes_):
self.channel = self.channel.decode()

self.start = read_attrs['start_time'] / self.sampling_rate
self.duration = read_attrs['duration'] / self.sampling_rate
self.start = read_attrs['start_time'] / self.sample_rate
self.duration = read_attrs['duration'] / self.sample_rate

exp_start_dt = parser.parse(self.exp_start_time)
start_time = exp_start_dt + timedelta(seconds=self.start)
self.start_time = start_time.replace(microsecond=0).isoformat()

raw = read.handle[read.raw_dataset_name][:]
scaled = np.array(self.scaling * (raw + self.offset), dtype=np.float32)
self.num_samples = len(scaled)

trim_start, _ = bonito.reader.trim(scaled[:8000])
scaled = scaled[trim_start:]
self.trimmed_samples = trim_start
self.template_start = self.start + (1 / self.sampling_rate) * trim_start
self.template_duration = self.duration - (1 / self.sampling_rate) * trim_start

if len(scaled) > 8000:
self.med, self.mad = bonito.reader.med_mad(scaled)
self.mad = max(1.0, self.mad)
self.signal = (scaled - self.med) / self.mad
else:
self.signal, (self.med, self.mad) = bonito.reader.norm_by_noisiest_section(scaled, return_medmad=True)
self.scaled = np.array(self.scaling * (raw + self.offset), dtype=np.float32)
self.num_samples = len(self.scaled)

self.shift, self.scale = bonito.reader.normalisation(self.scaled)
self.trimmed_samples, _ = bonito.reader.trim(self.scaled, self.shift, self.scale)
self.template_start = self.start + (self.trimmed_samples / self.sample_rate)
self.template_duration = self.duration - (self.trimmed_samples / self.sample_rate)

self.signal = (self.scaled[self.trimmed_samples:] - self.shift) / self.scale


def get_meta_data(filename, read_ids=None, skip=False):
Expand Down
20 changes: 6 additions & 14 deletions bonito/pod5.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,15 @@ def __init__(self, read, filename, meta=False):
self.calibration = read.calibration
self.scaling = self.calibration.scale
self.offset = self.calibration.offset
self.scaled = self.scaling * (self.raw.astype(np.float32) + self.offset)

scaled = self.scaling * (self.raw.astype(np.float32) + self.offset)
trim_start, _ = bonito.reader.trim(scaled[:8000])
scaled = scaled[trim_start:]
self.trimmed_samples = trim_start
self.shift, self.scale = bonito.reader.normalisation(self.scaled)
self.trimmed_samples, _ = bonito.reader.trim(self.scaled, self.shift, self.scale)

self.template_start = self.start + (trim_start / self.sample_rate)
self.template_duration = self.duration - (trim_start / self.sample_rate)
self.template_start = self.start + (self.trimmed_samples / self.sample_rate)
self.template_duration = self.duration - (self.trimmed_samples / self.sample_rate)

self.signal = scaled

if len(scaled) > 8000:
self.med, self.mad = bonito.reader.med_mad(scaled)
self.mad = max(1.0, self.mad)
self.signal = (scaled - self.med) / self.mad
else:
self.signal, (self.med, self.mad) = bonito.reader.norm_by_noisiest_section(scaled, return_medmad=True)
self.signal = (scaled[self.trimmed_samples:] - self.shift) / self.scale


def pod5_reads(pod5_file, read_ids, skip=False):
Expand Down
53 changes: 11 additions & 42 deletions bonito/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def tagdata(self):
f"st:Z:{self.start_time}",
f"rn:i:{self.read_number}",
f"f5:Z:{self.filename}",
f"sm:f:{self.med}",
f"sd:f:{self.mad}",
f"sv:Z:med_mad",
f"sm:f:{self.shift}",
f"sd:f:{self.scale}",
f"sv:Z:quantile",
]


Expand Down Expand Up @@ -108,14 +108,12 @@ def read_chunks(read, chunksize=4000, overlap=400):
yield ReadChunk(read, block.numpy(), i+1, blocks.shape[0])


def trim(signal, window_size=40, threshold_factor=2.4, min_elements=3):
def trim(signal, shift, scale, window_size=40, threshold_factor=2.4, min_elements=3):

min_trim = 10
signal = signal[min_trim:]

med, mad = med_mad(signal[-(window_size*100):])

threshold = med + mad * threshold_factor
threshold = shift + scale * threshold_factor
num_windows = len(signal) // window_size

seen_peak = False
Expand All @@ -133,40 +131,11 @@ def trim(signal, window_size=40, threshold_factor=2.4, min_elements=3):
return min_trim, len(signal)


def med_mad(x, factor=1.4826):
    """
    Return the median of *x* and its scaled median absolute deviation.

    The MAD is multiplied by *factor* (1.4826 makes it a consistent
    estimator of the standard deviation for normal data) and padded with
    float32 epsilon so the result is never exactly zero.
    """
    centre = np.median(x)
    spread = np.median(np.abs(x - centre)) * factor + np.finfo(np.float32).eps
    return centre, spread


def norm_by_noisiest_section(signal, samples=100, threshold=6.0, return_medmad=False):
def normalisation(sig):
    """
    Calculate signal shift and scale factors for quantile-based normalisation.

    The 20th and 90th percentiles of *sig* are used to estimate the bulk of
    the signal range: the shift is placed between them and the scale spans
    them. Floors of 10 (shift) and 1.0 (scale) guard against degenerate or
    near-constant signals producing a tiny divisor.

    :param sig: array of scaled signal samples.
    :returns: ``(shift, scale)`` tuple.
    """
    q20, q90 = np.quantile(sig, [0.2, 0.9])
    # 0.51 / 0.53 are empirical weighting constants — presumably tuned
    # against med/mad normalisation on real reads; confirm upstream.
    shift = max(10, 0.51 * (q20 + q90))
    scale = max(1.0, 0.53 * (q90 - q20))
    return shift, scale

0 comments on commit f2a3a8e

Please sign in to comment.