From eae22d1d518e0b6b2ce5e58275dd56c91018b791 Mon Sep 17 00:00:00 2001 From: Charles Shale Date: Mon, 9 Dec 2024 16:05:15 +1100 Subject: [PATCH] Cobalt: removed off-target logic --- .../hmftools/cobalt/ratio/RatioSupplier.java | 2 +- .../targeted/TargetRegionEnrichment.java | 74 -------- .../cobalt/targeted/TargetedRatioMapper.java | 179 +----------------- .../targeted/TargetedRatioMapperTest.java | 4 +- .../genome/gc/GCMedianReadDepthFile.java | 4 - 5 files changed, 4 insertions(+), 259 deletions(-) delete mode 100644 cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetRegionEnrichment.java diff --git a/cobalt/src/main/java/com/hartwig/hmftools/cobalt/ratio/RatioSupplier.java b/cobalt/src/main/java/com/hartwig/hmftools/cobalt/ratio/RatioSupplier.java index 69b1b27c16..a400141e5e 100644 --- a/cobalt/src/main/java/com/hartwig/hmftools/cobalt/ratio/RatioSupplier.java +++ b/cobalt/src/main/java/com/hartwig/hmftools/cobalt/ratio/RatioSupplier.java @@ -98,7 +98,7 @@ static class SampleRatios if(targetRegionEnrichment != null) { CB_LOGGER.info("using targeted ratio"); - readRatios = new TargetedRatioMapper(targetRegionEnrichment, chromosomePosCodec).mapRatios(readRatios); + readRatios = new TargetedRatioMapper(targetRegionEnrichment).mapRatios(readRatios); } gcNormalizedRatioMapper = new GcNormalizedRatioMapper(); diff --git a/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetRegionEnrichment.java b/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetRegionEnrichment.java deleted file mode 100644 index eac6543274..0000000000 --- a/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetRegionEnrichment.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.hartwig.hmftools.cobalt.targeted; - -import com.hartwig.hmftools.common.genome.position.GenomePosition; -import com.hartwig.hmftools.common.genome.position.GenomePositions; -import org.apache.commons.csv.CSVFormat; -import org.apache.commons.csv.CSVRecord; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -public class TargetRegionEnrichment -{ - private final List mTargetedRegions = new ArrayList<>(); - private final Map mTargetRelativeEnrichment = new TreeMap<>(GenomePosition::compare); - - public List getTargetedRegions() - { - return mTargetedRegions; - } - - public Map getTargetRelativeEnrichment() - { - return mTargetRelativeEnrichment; - } - - private static final char DELIMITER = '\t'; - - public static TargetRegionEnrichment fromTsv(String fileName) throws IOException - { - TargetRegionEnrichment targetRegionEnrichment = new TargetRegionEnrichment(); - - try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) - { - CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(DELIMITER) - .setRecordSeparator('\n') - .setHeader().setSkipHeaderRecord(true) - .build(); - Iterable records = format.parse(reader); - - for (CSVRecord record : records) - { - String chromosome = record.get("chromosome").intern(); - int position = (int) Double.parseDouble(record.get("position")); - Double relativeEnrichment = parseDoubleOrNull(record.get("relativeEnrichment")); - GenomePosition genomePosition = GenomePositions.create(chromosome, position); - targetRegionEnrichment.mTargetedRegions.add(genomePosition); - if (relativeEnrichment != null && !Double.isNaN(relativeEnrichment)) - { - targetRegionEnrichment.mTargetRelativeEnrichment.put(genomePosition, relativeEnrichment); - } - } - } - - return targetRegionEnrichment; - } - - private static Double parseDoubleOrNull(String value) - { - try - { - return Double.parseDouble(value); - } - catch (NumberFormatException e) - { - return null; - } - } -} diff --git a/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapper.java b/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapper.java index dacb14f46c..ef8dc6f3fd 100644 --- a/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapper.java +++ b/cobalt/src/main/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapper.java @@ -1,28 +1,13 @@ package com.hartwig.hmftools.cobalt.targeted; -import static java.lang.Math.round; -import static java.lang.String.format; - import static com.hartwig.hmftools.cobalt.CobaltConfig.CB_LOGGER; -import static com.hartwig.hmftools.cobalt.CobaltUtils.replaceColumn; - -import java.util.List; -import java.util.stream.Collectors; import com.hartwig.hmftools.cobalt.ChromosomePositionCodec; import com.hartwig.hmftools.cobalt.CobaltColumns; -import com.hartwig.hmftools.cobalt.CobaltConstants; -import com.hartwig.hmftools.cobalt.lowcov.LowCoverageRatioMapper; -import com.hartwig.hmftools.cobalt.ratio.GcNormalizedRatioMapper; import com.hartwig.hmftools.cobalt.ratio.RatioMapper; -import com.hartwig.hmftools.common.cobalt.ImmutableReadRatio; -import com.hartwig.hmftools.common.cobalt.ReadRatio; -import com.hartwig.hmftools.common.genome.position.GenomePosition; -import com.hartwig.hmftools.common.utils.Doubles; import org.apache.commons.lang3.Validate; import org.apache.logging.log4j.Level; -import org.jetbrains.annotations.Nullable; import tech.tablesaw.api.DoubleColumn; import tech.tablesaw.api.Table; @@ -31,28 +16,15 @@ public class TargetedRatioMapper implements RatioMapper { private final Table mTargetRegionEnrichment; - private final ChromosomePositionCodec mChromosomePosCodec; - public TargetedRatioMapper(final Table targetRegionEnrichment, - ChromosomePositionCodec chromosomePosCodec) + public TargetedRatioMapper(final Table targetRegionEnrichment) { mTargetRegionEnrichment = targetRegionEnrichment; - mChromosomePosCodec = chromosomePosCodec; } // we use on target ratios only for now @Override public Table mapRatios(final Table inputRatios) - { - return onTargetRatios(inputRatios); - } - - private void populateCombinedRatios(final Table ratios1, final Table ratios2) - { - // mCombinedRatios = ratios1.append(ratios2); - } - - Table onTargetRatios(final Table inputRatios) { // find all the ratios that are inside the target enriched regions // we filter out all the regions with 0 gc normalised ratios, as they do not actually @@ -82,153 +54,4 @@ Table onTargetRatios(final Table inputRatios) return onTargetRatios; } - - // we create a pan window ratio by taking the median count of super windows that combine multiple windows - Table offTargetRatios(final Table inputRatios) - { - // merge in the targeted region columns - Table offTargetRatios = inputRatios.joinOn(CobaltColumns.ENCODED_CHROMOSOME_POS).inner(mTargetRegionEnrichment); - - // resort it, the join messes up with the ordering - offTargetRatios = offTargetRatios.sortAscendingOn(CobaltColumns.ENCODED_CHROMOSOME_POS); - - offTargetRatios = offTargetRatios.where( - offTargetRatios.booleanColumn("offTarget").asSelection() - .and(offTargetRatios.doubleColumn("ratio").isNonNegative()) - .and(offTargetRatios.doubleColumn("relativeEnrichment").isNotMissing())); - - // double median = offTargetRatios.doubleColumn("ratio").median(); - - // normalise the ratio by relative enrichment - replaceColumn(offTargetRatios, "ratio", - offTargetRatios.doubleColumn("ratio") - .divide(offTargetRatios.doubleColumn("relativeEnrichment"))); - - CB_LOGGER.info("off target after enrichment normalisation: \n{}", offTargetRatios); - - // next we do low coverage - offTargetRatios = new LowCoverageRatioMapper(1000, mChromosomePosCodec).mapRatios(offTargetRatios); - - CB_LOGGER.info("off target after consolidation: \n{}", offTargetRatios); - - // apply gc normalisation - GcNormalizedRatioMapper gcNormalizedRatioMapper = new GcNormalizedRatioMapper(); - offTargetRatios = gcNormalizedRatioMapper.mapRatios(offTargetRatios); - - CB_LOGGER.info("off target gc normalisation: \n{}", gcNormalizedRatioMapper.gcMedianReadDepthTable()); - CB_LOGGER.info("off target after gc normalisation: \n{}", offTargetRatios); - - return offTargetRatios; - - // remove any with invalid ratios - // mOffTargetRatios = offTargetRatios.where(offTargetRatios.doubleColumn(CobaltColumns.RATIO).) - - /* - Window window = new Window(offTargetWindowSize); - - for(String chromosome : rawRatios.stringColumn("chromosome").unique()) - { - int currentWindowStart = 1; - - List windowGcRatios = new ArrayList<>(); - - // we need this to make sure we get consistent chromosome name (1 vs chr1) - - - for(ReadRatio readRatio : rawRatios.get(chromosome)) - { - // todo: make sure this is sorted - - int windowStart = window.start(readRatio.position()); - - if(windowStart != currentWindowStart) - { - if(currentWindowStart != -1) - { - ReadRatio unnormalizedRatio = unnormalizedOffTargetRatio( - offTargetWindowSize, chromosomeStr, currentWindowStart, windowGcRatios, targetRegions); - - if(unnormalizedRatio != null) - { - unnormalizedRatios.put(chromosome, unnormalizedRatio); - } - } - - currentWindowStart = windowStart; - windowGcRatios.clear(); - } - - if(readRatio.ratio() >= 0) - windowGcRatios.add(readRatio); - } - - ReadRatio unnormalizedRatio = unnormalizedOffTargetRatio( - offTargetWindowSize, chromosomeStr, currentWindowStart, windowGcRatios, targetRegions); - - if(unnormalizedRatio != null) - unnormalizedRatios.put(chromosome, unnormalizedRatio); - } - - // now we want to normalise all of those off target gc ratios by the median - List values = new ArrayList<>(); - unnormalizedRatios.values().forEach(x -> values.add(x.ratio())); - double median = Doubles.median(values); - - CB_LOGGER.debug("normalizing {} off target windows ratio by median: {}", unnormalizedRatios.size(), median); - - mOffTargetRatios.clear(); - - //for ((key, value) in unnormalizedRatios.entries()) - for(Map.Entry entry : unnormalizedRatios.entries()) - { - ReadRatio readRatio = entry.getValue(); - double normalizedRatio = readRatio.ratio() / median; - - mOffTargetRatios.put(entry.getKey(), ImmutableReadRatio.builder().from(readRatio).ratio(normalizedRatio).build()); - } - */ - } - - @Nullable - private static ReadRatio unnormalizedOffTargetRatio( - int offTargetWindowSize, final String chromosome, int windowStart, final List windowGcRatios, - final List targetRegions) - { - int minNumGcRatios = (int)round((double)offTargetWindowSize / CobaltConstants.WINDOW_SIZE * CobaltConstants.MIN_OFF_TARGET_WINDOW_RATIO); - - int windowEnd = windowStart + offTargetWindowSize - 1; - - // the window position is the middle - int windowMid = windowStart + offTargetWindowSize / 2; - - // check for targeted regions, we want to remove them - for (GenomePosition targetRegion : targetRegions) - { - if (targetRegion.chromosome().equals(chromosome) && windowStart <= targetRegion.position() && windowEnd > targetRegion.position()) - { - // this window contains a target region - int removeStart = targetRegion.position() - 2 * CobaltConstants.WINDOW_SIZE; - int removeEnd = targetRegion.position() + 2 * CobaltConstants.WINDOW_SIZE; - CB_LOGGER.trace("off target window: {}:{} ({} - {}), contains target region", - chromosome, windowMid, windowStart, windowEnd); - - windowGcRatios.removeIf(o -> o.position() >= removeStart && o.position() <= removeEnd); - } - } - - if (windowGcRatios.size() < minNumGcRatios) - { - // if we don't have enough sub windows with valid values then we skip this - CB_LOGGER.trace( "off target window: {}:{} ({} - {}), not enough sub window", - chromosome, windowMid, windowStart, windowEnd); - return null; - } - - double median = Doubles.median(windowGcRatios.stream().map(ReadRatio::ratio).collect(Collectors.toList())); - - CB_LOGGER.debug("off target window: {}:{} ({} - {}), num sub windows: {}, median: {}", - chromosome, windowMid, windowStart, windowEnd, windowGcRatios.size(), format("%.4f", median)); - - return !Double.isNaN(median) ? ImmutableReadRatio.builder().chromosome(chromosome).position(windowMid).ratio(median).build() : null; - } } \ No newline at end of file diff --git a/cobalt/src/test/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapperTest.java b/cobalt/src/test/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapperTest.java index 9618494f34..c96a167384 100644 --- a/cobalt/src/test/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapperTest.java +++ b/cobalt/src/test/java/com/hartwig/hmftools/cobalt/targeted/TargetedRatioMapperTest.java @@ -64,9 +64,9 @@ public void testOnTargetRatio() chromosomePositionCodec.addEncodedChrPosColumn(targetEnrichmentRatios, true); - TargetedRatioMapper ratioMapper = new TargetedRatioMapper(targetEnrichmentRatios, chromosomePositionCodec); + TargetedRatioMapper ratioMapper = new TargetedRatioMapper(targetEnrichmentRatios); - Table onTargetRatios = ratioMapper.onTargetRatios(ratios); + Table onTargetRatios = ratioMapper.mapRatios(ratios); assertEquals(2, onTargetRatios.rowCount()); diff --git a/hmf-common/src/main/java/com/hartwig/hmftools/common/genome/gc/GCMedianReadDepthFile.java b/hmf-common/src/main/java/com/hartwig/hmftools/common/genome/gc/GCMedianReadDepthFile.java index 611b062dd7..0ecaedeb6a 100644 --- a/hmf-common/src/main/java/com/hartwig/hmftools/common/genome/gc/GCMedianReadDepthFile.java +++ b/hmf-common/src/main/java/com/hartwig/hmftools/common/genome/gc/GCMedianReadDepthFile.java @@ -18,13 +18,11 @@ public final class GCMedianReadDepthFile private static final String EXTENSION = ".cobalt.gc.median.tsv"; private static final int ASSUMED_READ_LENGTH = 151; - @NotNull public static String generateFilename(final String basePath, final String sample) { return basePath + File.separator + sample + EXTENSION; } - @NotNull public static GCMedianReadDepth read(final String filename) throws IOException { return fromLines(Files.readAllLines(new File(filename).toPath())); @@ -35,7 +33,6 @@ public static void write(final String fileName, final GCMedianReadDepth gcMedian Files.write(new File(fileName).toPath(), toLines(gcMedianReadDepth)); } - @NotNull private static GCMedianReadDepth fromLines(final List lines) { boolean useReadDepth = true; @@ -91,7 +88,6 @@ private static GCMedianReadDepth fromLines(final List lines) return new GCMedianReadDepth(mean, median, medianPerBucket); } - @NotNull private static List toLines(final GCMedianReadDepth gcMedianReadDepth) { final List lines = new ArrayList<>();