import argparse

import numpy as np
import pandas as pd


def mz_match(marker, peak, ppm):
    """Return whether marker and peak m/z values agree within a ppm window.

    The absolute tolerance is ``ppm`` parts-per-million of the *mean* of the
    two m/z values, so the comparison is symmetric in ``marker`` and ``peak``.
    The bound is inclusive. Inputs may be scalars or NumPy arrays; arrays are
    combined with NumPy broadcasting and a boolean array is returned.
    """
    tolerance = ((peak + marker) / 2) * ppm * 1e-06
    return np.abs(marker - peak) <= tolerance


def rt_match(marker, peak, tol):
    """Return whether marker and peak retention times differ by at most ``tol``.

    The bound is inclusive. ``tol`` is interpreted in whatever unit the two
    retention-time inputs share. Inputs may be scalars or NumPy arrays.
    """
    return np.abs(marker - peak) <= tol


def find_matches(peaks, markers, ppm, rt_tol):
    """Pair every marker with every peak that matches it in both m/z and RT.

    Args:
        peaks: DataFrame with at least the numeric columns ``mz`` and ``rt``.
        markers: DataFrame with the columns ``formula``, ``mz`` and ``rt``.
        ppm: m/z tolerance in parts-per-million (see ``mz_match``).
        rt_tol: absolute retention-time tolerance (see ``rt_match``).

    Returns:
        A DataFrame with one row per matching (marker, peak) pair, ordered by
        marker position and then by peak position. It holds the marker's
        ``formula``, every column of the matched peak, and the absolute
        differences ``mz_diff`` and ``rt_diff``. The frame is empty (but has
        all of these columns) when nothing matches.

    Raises:
        KeyError: if a required column is missing from either input.
    """
    # Markers become a column vector so that comparing against the flat peak
    # vector broadcasts to a (n_markers, n_peaks) boolean matrix.
    marker_mz = markers['mz'].values[:, np.newaxis]
    marker_rt = markers['rt'].values[:, np.newaxis]
    peak_mz = peaks['mz'].values
    peak_rt = peaks['rt'].values

    both_match = mz_match(marker_mz, peak_mz, ppm) & rt_match(marker_rt, peak_rt, rt_tol)

    # Row indices address markers, column indices address peaks.
    marker_idx, peak_idx = np.where(both_match)

    # Positional indices are reset so the two frames align row-by-row in concat.
    matched_markers = markers.iloc[marker_idx].reset_index(drop=True)
    matched_peaks = peaks.iloc[peak_idx].reset_index(drop=True)
    hits = pd.concat([matched_markers[['formula']], matched_peaks], axis=1)

    hits['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values)
    hits['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values)

    return hits


def _build_parser():
    """Create the command-line parser for the target screening tool."""
    parser = argparse.ArgumentParser(description='Find matches between peaks and markers.')
    parser.add_argument('--peaks', required=True, help='Path to the peaks parquet file.')
    parser.add_argument('--markers', required=True, help='Path to the markers CSV file.')
    parser.add_argument('--output', required=True, help='Path to the output TSV file.')
    # Tolerances are real-valued; parsing them as int rejected inputs such as
    # "--ppm 2.5" or "--rt_tol 0.5".
    parser.add_argument('--ppm', type=float, default=5, help='PPM tolerance for mz matching.')
    parser.add_argument('--rt_tol', type=float, default=10, help='RT tolerance for rt matching.')
    return parser


def main(argv=None):
    """Run the screening: read peaks and markers, match them, write the hits.

    Args:
        argv: optional list of command-line arguments. When ``None`` (the
            default) the arguments are taken from ``sys.argv`` by argparse.
    """
    args = _build_parser().parse_args(argv)

    peaks = pd.read_parquet(args.peaks)
    # The markers table is tab-separated despite being described as CSV.
    markers = pd.read_csv(args.markers, sep='\t')

    hits = find_matches(peaks, markers, args.ppm, args.rt_tol)

    hits.to_csv(args.output, sep='\t', index=False)


if __name__ == "__main__":
    main()
+C10H11Cl1O3 213.0324 533.9 +C7H9NO2S 170.0281 363.59 +C12H7Cl3O2 286.9439 830.97 +C18H15Cl3O8 462.976 662.99 +C12H7Cl3O5S 366.9007 700.52 +C9H9N4Cl 207.0443 403.37 diff --git a/tools/misc/test-data/target_screen/out.tsv b/tools/misc/test-data/target_screen/out.tsv new file mode 100644 index 00000000..8ac64ca7 --- /dev/null +++ b/tools/misc/test-data/target_screen/out.tsv @@ -0,0 +1,18 @@ +formula mz rt sd1 sd2 area mz_diff rt_diff +C8H6Cl2O3 218.9619738108278 473.4840709352675 0.6057217022739683 2.7706017478506073 1239147.63695882 0.00012618917219242576 1.1159290647325406 +C9H15N3O1 180.11422341297595 450.9460162486645 0.4692965104502825 4.727634916193644 1100073.3285644436 2.341297593488889e-05 7.2839837513355405 +C5H2Cl3N1O1 195.91267599889463 487.37949630118806 0.8695685392506757 2.8811688054510127 734461.0596300099 0.00022400110538001172 0.7205036988119673 +C13H10O3 213.05556658306853 508.4123751384482 2.9585968043814983 3.226731392934289 1787580.264815322 0.00013341693147594924 0.01237513844824889 +C13H9FO3 231.04576243564085 521.2436784813573 0.9930695671903609 2.469013097815558 1316270.081622402 0.0005375643591492008 0.23632151864273965 +C10H12N2O3S1 239.04945126090132 311.8317362000094 0.5578277726641567 3.57063615115722 3042462.634739455 0.0001487390986767423 0.7182637999906092 +C14H17Cl2NO2 300.0561299922103 685.3731548839577 0.8491884774374224 2.8491999009146074 1021277.4141378121 0.0002700077897088704 4.416845116042282 +C12H4Cl2F6N4OS 434.93037227267905 766.6610671335172 0.6265405149641161 3.55175113250731 43923382.478327975 0.0010277273209453597 1.198932866482778 +C12H4Cl2F6N4O2S 450.9259113906124 789.7479646306683 0.5765707513162325 3.4834377486718897 35843894.74749327 0.00038860938764173625 1.5420353693316429 +C16H22ClN3O2 322.13274143359513 705.9176130811956 0.765497607933695 2.9798451004946203 7686414.229962895 5.8566404845805664e-05 0.5823869188044455 +C16H11ClF6N2O 395.0387483584033 741.1840034426168 0.9150873601266857 2.396923077539685 
692605.613740076 0.0003516415966942077 0.7459965573831369 +C10H11Cl1O3 213.03219616261535 532.8368925687558 0.8335128693984499 2.548404631638127 1231177.7029795102 0.00020383738464602175 1.0631074312441342 +C7H9NO2S 170.0280487596005 363.28514725405876 0.8844811055327363 2.7876246329523737 915161.3987675996 5.124039950032966e-05 0.3048527459412185 +C12H7Cl3O2 286.9434413572324 831.0018611928409 0.32058179843066653 1.7667251294853705 19934.364712896095 0.00045864276756901745 0.03186119284089273 +C18H15Cl3O8 462.97625391610677 662.6552310211961 0.9093786171678189 2.128435471267278 1209160.0005544876 0.00025391610677161225 0.3347689788039361 +C12H7Cl3O5S 366.90097256680355 699.9403505546061 0.8393755187990459 2.354260942300286 9578789.63215569 0.0002725668035736817 0.5796494453938976 +C9H9N4Cl 207.04420254367005 402.95120970553893 1.2647033563807812 2.594410018631832 40475158.16355405 9.74563299394049e-05 0.4187902944610755 diff --git a/tools/misc/test-data/target_screen/peaks.parquet b/tools/misc/test-data/target_screen/peaks.parquet new file mode 100644 index 00000000..b07a2640 Binary files /dev/null and b/tools/misc/test-data/target_screen/peaks.parquet differ