Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nanodrop eight support #230

2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Increase test coverage of calculated data documents on Perkin Elmer Envision.
- Add calculated data documents to Moldev Softmax Pro
- Add multi-analyte-profiling BENCHLING/2024/01 schema
- Parser for Nanodrop Eight
- Add context manager to handle backups to schema generation script
- Add --regex argument to schema generation script

### Fixed
- Perkin Elmer Envision: calculated data name now captures string to the left - rather than right - of the ‘=’ in the Formula cell.

Expand Down
5 changes: 5 additions & 0 deletions src/allotropy/parser_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
from allotropy.parsers.roche_cedex_bioht.roche_cedex_bioht_parser import (
RocheCedexBiohtParser,
)
from allotropy.parsers.thermo_fisher_nanodrop_eight.nanodrop_eight_parser import (
NanodropEightParser,
)
from allotropy.parsers.unchained_labs_lunatic.unchained_labs_lunatic_parser import (
UnchainedLabsLunaticParser,
)
Expand All @@ -45,6 +48,7 @@ class Vendor(Enum):
NOVABIO_FLEX2 = "NOVABIO_FLEX2"
PERKIN_ELMER_ENVISION = "PERKIN_ELMER_ENVISION"
ROCHE_CEDEX_BIOHT = "ROCHE_CEDEX_BIOHT"
THERMO_FISHER_NANODROP_EIGHT = "THERMO_FISHER_NANODROP_EIGHT"
UNCHAINED_LABS_LUNATIC = "UNCHAINED_LABS_LUNATIC"


Expand All @@ -63,6 +67,7 @@ class Vendor(Enum):
Vendor.NOVABIO_FLEX2: NovaBioFlexParser,
Vendor.PERKIN_ELMER_ENVISION: PerkinElmerEnvisionParser,
Vendor.ROCHE_CEDEX_BIOHT: RocheCedexBiohtParser,
Vendor.THERMO_FISHER_NANODROP_EIGHT: NanodropEightParser,
Vendor.UNCHAINED_LABS_LUNATIC: UnchainedLabsLunaticParser,
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
from collections.abc import Mapping
from typing import Optional, Union
import uuid

import pandas as pd

from allotropy.allotrope.models.shared.definitions.custom import (
TQuantityValueMicrogramPerMicroliter,
TQuantityValueMicrogramPerMilliliter,
TQuantityValueMilliAbsorbanceUnit,
TQuantityValueMilligramPerMilliliter,
TQuantityValueNanogramPerMicroliter,
TQuantityValueNanogramPerMilliliter,
TQuantityValueNanometer,
TQuantityValuePicogramPerMilliliter,
)
from allotropy.allotrope.models.shared.definitions.definitions import (
InvalidJsonFloat,
JsonFloat,
TQuantityValue,
)
from allotropy.allotrope.models.shared.definitions.units import UNITLESS
from allotropy.allotrope.models.spectrophotometry_benchling_2023_12_spectrophotometry import (
CalculatedDataAggregateDocument,
CalculatedDataDocumentItem,
DataSourceAggregateDocument1,
DataSourceDocumentItem,
DataSystemDocument,
DeviceSystemDocument,
MeasurementAggregateDocument,
Model,
SampleDocument,
SpectrophotometryAggregateDocument,
SpectrophotometryDocumentItem,
UltravioletAbsorbancePointDetectionDeviceControlAggregateDocument,
UltravioletAbsorbancePointDetectionDeviceControlDocumentItem,
UltravioletAbsorbancePointDetectionMeasurementDocumentItems,
)
from allotropy.constants import ASM_CONVERTER_NAME, ASM_CONVERTER_VERSION
from allotropy.named_file_contents import NamedFileContents
from allotropy.parsers.thermo_fisher_nanodrop_eight.nanodrop_eight_reader import (
NanoDropEightReader,
)
from allotropy.parsers.utils.values import assert_not_none
from allotropy.parsers.vendor_parser import VendorParser

ConcentrationType = Union[
TQuantityValueMicrogramPerMicroliter,
TQuantityValueMicrogramPerMilliliter,
TQuantityValueMilligramPerMilliliter,
TQuantityValueNanogramPerMicroliter,
TQuantityValueNanogramPerMilliliter,
TQuantityValuePicogramPerMilliliter,
]
ConcentrationClassType = Union[
type[TQuantityValueMicrogramPerMicroliter],
type[TQuantityValueMicrogramPerMilliliter],
type[TQuantityValueMilligramPerMilliliter],
type[TQuantityValueNanogramPerMicroliter],
type[TQuantityValueNanogramPerMilliliter],
type[TQuantityValuePicogramPerMilliliter],
]

CONCENTRATION_UNIT_TO_TQUANTITY: Mapping[str, ConcentrationClassType] = {
"ug/ul": TQuantityValueMicrogramPerMicroliter,
"ug/ml": TQuantityValueMicrogramPerMilliliter,
"mg/ml": TQuantityValueMilligramPerMilliliter,
"ng/ul": TQuantityValueNanogramPerMicroliter,
"ng/ml": TQuantityValueNanogramPerMilliliter,
"pg/ul": TQuantityValuePicogramPerMilliliter,
}


def _get_str_or_none(data_frame: pd.DataFrame, row: int, column: str) -> Optional[str]:
if column not in data_frame.columns:
return None

val = data_frame.iloc[row][column]
if pd.isna(val):
return None

return str(val)


def _get_str(data_frame: pd.DataFrame, row: int, column: str) -> str:
val = _get_str_or_none(data_frame=data_frame, row=row, column=column)

assert_not_none(val)

return str(val)


def _get_float(data_frame: pd.DataFrame, row: int, column: str) -> JsonFloat:
try:
return float(data_frame.iloc[row][column])
except (ValueError, TypeError):
return InvalidJsonFloat.NaN


def _get_concentration(
conc: JsonFloat, unit: Optional[str]
) -> Optional[ConcentrationType]:
if unit and unit in CONCENTRATION_UNIT_TO_TQUANTITY and isinstance(conc, float):
cls = CONCENTRATION_UNIT_TO_TQUANTITY[unit]
return cls(value=conc)

return None


class NanodropEightParser(VendorParser):
def to_allotrope(self, named_file_contents: NamedFileContents) -> Model:
contents, filename = named_file_contents
data = NanoDropEightReader.read(contents)
data = self._add_measurement_uuids(data)
return self._get_model(data, filename)

def _get_model(self, data: pd.DataFrame, filename: str) -> Model:
return Model(
field_asm_manifest="http://purl.allotrope.org/manifests/spectrophotometry/BENCHLING/2023/12/spectrophotometry.manifest",
spectrophotometry_aggregate_document=SpectrophotometryAggregateDocument(
spectrophotometry_document=self._get_spectrophotometry_document(data),
calculated_data_aggregate_document=CalculatedDataAggregateDocument(
calculated_data_document=self._get_calculated_data_document(data),
),
data_system_document=DataSystemDocument(
file_name=filename,
ASM_converter_name=ASM_CONVERTER_NAME,
ASM_converter_version=ASM_CONVERTER_VERSION,
),
device_system_document=DeviceSystemDocument(
model_number="Nanodrop Eight",
device_identifier="Nanodrop",
),
),
)

def _add_measurement_uuids(self, data: pd.DataFrame) -> pd.DataFrame:
bdworth marked this conversation as resolved.
Show resolved Hide resolved
data["A260 uuid"] = [str(uuid.uuid4()) for _ in range(len(data.index))]
data["A280 uuid"] = [str(uuid.uuid4()) for _ in range(len(data.index))]
return data

def _get_spectrophotometry_document(
self, data: pd.DataFrame
) -> list[SpectrophotometryDocumentItem]:
return [
self._get_spectrophotometry_document_item(data, i)
for i in range(len(data.index))
]

def _get_calculated_data_document(
self, data: pd.DataFrame
) -> list[CalculatedDataDocumentItem]:
calculated_data_documents = []
for i in range(len(data.index)):
if _get_str_or_none(data, i, "260/280"):
calculated_data_documents.append(self._get_260_280(data, i))

if _get_str_or_none(data, i, "260/230"):
calculated_data_documents.append(self._get_260_230(data, i))

return calculated_data_documents

def _get_260_280(self, data: pd.DataFrame, row: int) -> CalculatedDataDocumentItem:

return CalculatedDataDocumentItem(
calculated_data_name="A260/280",
calculated_result=TQuantityValue(
value=_get_float(data, row, "260/280"), unit=UNITLESS
),
calculated_data_identifier=str(uuid.uuid4()),
data_source_aggregate_document=DataSourceAggregateDocument1(
[
DataSourceDocumentItem(
data_source_feature="absorbance",
data_source_identifier=_get_str(data, row, "A260 uuid"),
),
DataSourceDocumentItem(
data_source_feature="absorbance",
data_source_identifier=_get_str(data, row, "A280 uuid"),
),
]
),
)

def _get_260_230(self, data: pd.DataFrame, row: int) -> CalculatedDataDocumentItem:
return CalculatedDataDocumentItem(
calculated_data_name="A260/230",
calculated_result=TQuantityValue(
value=_get_float(data, row, "260/230"), unit=UNITLESS
),
calculated_data_identifier=str(uuid.uuid4()),
data_source_aggregate_document=DataSourceAggregateDocument1(
[
DataSourceDocumentItem(
data_source_feature="absorbance",
data_source_identifier=_get_str(data, row, "A260 uuid"),
)
]
),
)

def _get_spectrophotometry_document_item(
self, data: pd.DataFrame, row: int
) -> SpectrophotometryDocumentItem:
return SpectrophotometryDocumentItem(
analyst=_get_str_or_none(data, row, "User ID"),
measurement_aggregate_document=MeasurementAggregateDocument(
measurement_time=self._get_date_time(
_get_str(data, row, "Date") + " " + _get_str(data, row, "Time")
),
experiment_type=_get_str_or_none(data, row, "NA Type"),
measurement_document=self._get_measurement_document(data=data, row=row),
),
)

def _get_measurement_document(
self, data: pd.DataFrame, row: int
) -> list[UltravioletAbsorbancePointDetectionMeasurementDocumentItems]:
measurement_docs = []
if _get_str_or_none(data, row, "A260"):
measurement_docs.append(
UltravioletAbsorbancePointDetectionMeasurementDocumentItems(
measurement_identifier=_get_str(data, row, "A260 uuid"),
sample_document=SampleDocument(
sample_identifier=_get_str(data, row, "Sample ID")
if _get_str_or_none(data, row, "Sample ID")
else "NA",
well_plate_identifier=_get_str_or_none(data, row, "Plate ID"),
location_identifier=_get_str_or_none(data, row, "Well"),
),
# capture concentration on the A260 measurement document if the experiment type is
# DNA or RNA, protein and other concentration is captured on A280 measurment
mass_concentration=_get_concentration(
_get_float(data, row, str(self._get_concentration_col(data))),
_get_str_or_none(data, row, "Units"),
)
if _get_str_or_none(data, row, "NA Type")
and "NA" in _get_str(data, row, "NA Type")
and self._get_concentration_col(data)
else None,
device_control_aggregate_document=UltravioletAbsorbancePointDetectionDeviceControlAggregateDocument(
device_control_document=[
UltravioletAbsorbancePointDetectionDeviceControlDocumentItem(
device_type="absorbance detector",
detector_wavelength_setting=TQuantityValueNanometer(
value=260
),
)
]
),
absorbance=TQuantityValueMilliAbsorbanceUnit(
_get_float(data, row, "A260")
),
)
)
a280_col = "A280"
if a280_col not in data.columns and "A280 10mm" in data.columns:
a280_col = "A280 10mm"
if _get_str_or_none(data, row, a280_col):
measurement_docs.append(
UltravioletAbsorbancePointDetectionMeasurementDocumentItems(
measurement_identifier=_get_str(data, row, "A280 uuid"),
sample_document=SampleDocument(
sample_identifier=_get_str(data, row, "Sample ID")
if _get_str_or_none(data, row, "Sample ID")
else "NA",
well_plate_identifier=_get_str_or_none(data, row, "Plate ID"),
location_identifier=_get_str_or_none(data, row, "Well"),
),
# capture concentration on the A280 measurement document if the experiment type is
# something other than DNA or RNA
mass_concentration=_get_concentration(
_get_float(data, row, str(self._get_concentration_col(data))),
_get_str_or_none(data, row, "Units"),
)
if _get_str_or_none(data, row, "NA Type")
and "NA" not in _get_str(data, row, "NA Type")
and self._get_concentration_col(data)
else None,
device_control_aggregate_document=UltravioletAbsorbancePointDetectionDeviceControlAggregateDocument(
device_control_document=[
UltravioletAbsorbancePointDetectionDeviceControlDocumentItem(
device_type="absorbance detector",
detector_wavelength_setting=TQuantityValueNanometer(
value=280
),
)
]
),
absorbance=TQuantityValueMilliAbsorbanceUnit(
_get_float(data, row, a280_col)
),
)
)

return measurement_docs

def _get_concentration_col(self, data: pd.DataFrame) -> Optional[str]:
for col in data.columns:
if col.lower() in ["conc.", "conc", "concentration"]:
nathan-stender marked this conversation as resolved.
Show resolved Hide resolved
return col
return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from io import StringIO

import pandas as pd

from allotropy.parsers import lines_reader
from allotropy.parsers.lines_reader import CsvReader
from allotropy.types import IOType


class NanoDropEightReader:
@classmethod
def read(cls, contents: IOType) -> pd.DataFrame:
all_lines = lines_reader.read_to_lines(contents)
reader = CsvReader(all_lines)
lines = reader.pop_csv_block_as_lines()
raw_data = pd.read_csv(
StringIO("\n".join(lines)),
sep="\t",
dtype={"Plate ID": str, "Sample ID": str},
bdworth marked this conversation as resolved.
Show resolved Hide resolved
# Prevent pandas from rounding decimal values, at the cost of some speed.
float_precision="round_trip",
)
raw_data = raw_data.rename(columns=lambda x: x.strip())

return raw_data
Empty file.
Loading