From b1178ae2db04c640a9e8cd5c4dda18ad122999c9 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Fri, 15 Apr 2022 13:06:51 -0400 Subject: [PATCH 1/7] Overlapping reads supported in TheoreticalSensitivity function, need to update tool --- .../picard/analysis/CollectWgsMetrics.java | 3 +- .../analysis/TheoreticalSensitivity.java | 52 +++++++++++++++---- .../directed/CollectTargetedMetrics.java | 3 +- .../analysis/TheoreticalSensitivityTest.java | 38 ++++++++++---- 4 files changed, 75 insertions(+), 21 deletions(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 47f681f429..573a7df133 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -243,7 +243,8 @@ protected int doWork() { // Write out theoretical sensitivity results. final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions."); - List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION); + List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, + collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, 0.0); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 08d0b8e9c8..03adf9f6e1 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -49,6 +49,7 @@ public class TheoreticalSensitivity { private static final int LARGE_NUMBER_OF_DRAWS = 10; // The number of draws at which we believe a Gaussian approximation to 
sum random variables. private static final double DEPTH_BIN_WIDTH = 0.01; // Minimal fraction of depth histogram to use when integrating theoretical sensitivity. This ensures we don't calculate theoretical sensitivity at every depth, which would be computationally expensive. private static final int RANDOM_SEED = 51; + private static final int PCR_ERROR_RATE = 45; /** * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 @@ -243,7 +244,10 @@ public TheoreticalSensitivity() { * @param randomSeed random number seed to use for random number generator * @return Theoretical sensitivity for the given arguments at a constant depth. */ - public static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final long randomSeed) { + public static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, + final double logOddsThreshold, final int sampleSize, + final double alleleFraction, final long randomSeed, + final double overlapProbability) { // If the depth is 0 at a particular locus, the sensitivity is trivially 0.0. if (depth == 0) { return 0.0; @@ -264,10 +268,21 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram for (int sample = 0; sample < sampleSize; sample++) { final int altDepth = bd.sample(); - final int sumOfQualities; + int sumOfQualities; if (altDepth < LARGE_NUMBER_OF_DRAWS) { // If the number of alt reads is "small" we draw from the actual base quality distribution. 
- sumOfQualities = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); + //sumOfQualities = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); + sumOfQualities = 0; + for(int i = 0;i < altDepth;i++) { + if(rg.nextDouble() > overlapProbability) { + sumOfQualities += qualityRW.draw(); + } else { + // Simulate overlapping reads by sampling from the quality distribution twice. + // Since PCR errors affect overlapping reads, the total contribution to the sum + // of qualities cannot exceed the PCR error rate. + sumOfQualities += Math.min(qualityRW.draw() + qualityRW.draw(), PCR_ERROR_RATE); + } + } } else { // If the number of alt reads is "large" we draw from a Gaussian approximation of the base // quality distribution to speed up the code. @@ -305,8 +320,11 @@ static int drawSumOfQScores(final int altDepth, final double averageQuality, fin * @param alleleFraction the allele fraction to evaluate sensitivity at * @return Theoretical sensitivity for the given arguments at a constant depth. */ - private static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction) { - return sensitivityAtConstantDepth(depth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction, RANDOM_SEED); + private static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, + final double logOddsThreshold, final int sampleSize, + final double alleleFraction, final double overlapProbability) { + return sensitivityAtConstantDepth(depth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction, + RANDOM_SEED, overlapProbability); } /** @@ -319,8 +337,17 @@ private static double sensitivityAtConstantDepth(final int depth, final Histogra * @param alleleFraction the allele fraction to evaluate sensitivity at * @return Theoretical sensitivity for the given arguments over a particular depth distribution. 
*/ - public static double theoreticalSensitivity(final Histogram depthHistogram, final Histogram qualityHistogram, - final int sampleSize, final double logOddsThreshold, final double alleleFraction) { + public static double theoreticalSensitivity(final Histogram depthHistogram, + final Histogram qualityHistogram, final int sampleSize, + final double logOddsThreshold, final double alleleFraction) { + return theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, logOddsThreshold, alleleFraction, + 0.0); + } + + public static double theoreticalSensitivity(final Histogram depthHistogram, + final Histogram qualityHistogram, final int sampleSize, + final double logOddsThreshold, final double alleleFraction, + final double overlapProbability) { if (alleleFraction > 1.0 || alleleFraction < 0.0) { throw new IllegalArgumentException("Allele fractions must be between 0 and 1."); } @@ -346,7 +373,8 @@ public static double theoreticalSensitivity(final Histogram depthHistog } // Calculate sensitivity for a particular depth, and use trapezoid rule to integrate sensitivity. final double left = right; - right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction); + right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, + alleleFraction, overlapProbability); sensitivity += deltaDepthProbability * (left + right) / 2.0; } return sensitivity; @@ -386,7 +414,10 @@ static double[] trimDistribution(final double[] distribution) { * @param alleleFractions List of allele fractions to measure theoretical sensitivity over. 
*/ public static List calculateSensitivities(final int simulationSize, - final Histogram depthHistogram, final Histogram baseQHistogram, final List alleleFractions) { + final Histogram depthHistogram, + final Histogram baseQHistogram, + final List alleleFractions, + final double overlapProbability) { final List metricsOverVariousAlleleFractions = new ArrayList<>(); final double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. @@ -396,7 +427,8 @@ public static List calculateSensitivities(final i for (final double alleleFraction : alleleFractions) { final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); theoreticalSensitivityMetrics.ALLELE_FRACTION = alleleFraction; - theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction); + theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY = TheoreticalSensitivity.theoreticalSensitivity( + depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction, overlapProbability); theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY)); theoreticalSensitivityMetrics.SAMPLE_SIZE = simulationSize; theoreticalSensitivityMetrics.LOG_ODDS_THRESHOLD = logOddsThreshold; diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index 35a3bd5f3c..322e1db542 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -169,7 +169,8 @@ protected int doWork() { // Write out theoretical sensitivity results. 
final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions."); - List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION); + List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, + collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, 0.0); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index 95a80a460a..5358f81c77 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -249,12 +249,16 @@ public void testSensitivityAtConstantDepth(final double expected, final File met metrics.read(metricsFileReader); } + Object hi[] = metrics.getMetrics().toArray(); + final List> histograms = metrics.getAllHistograms(); final Histogram qualityHistogram = histograms.get(1); + List l = metrics.getMetrics(); // We ensure that even using different random seeds we converge to roughly the same value. for (int i = 0; i < 3; i++) { - double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3, sampleSize, alleleFraction, i); + double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3, + sampleSize, alleleFraction, i, 0.0); Assert.assertEquals(result, expected, tolerance); } } @@ -267,15 +271,20 @@ public Object[][] arbFracSensDataProvider() { // are not quite large enough to converge properly, but is used for the purpose of // keeping the compute time of the tests short. 
return new Object[][]{ - {0.90, wgsMetricsFile, 0.5, 400}, - {0.78, wgsMetricsFile, 0.3, 400}, - {0.29, wgsMetricsFile, 0.1, 500}, - {0.08, wgsMetricsFile, 0.05, 500}, + {0.90, wgsMetricsFile, 0.5, 400, false}, + {0.78, wgsMetricsFile, 0.3, 400, false}, + {0.29, wgsMetricsFile, 0.1, 500, false}, + {0.08, wgsMetricsFile, 0.05, 500, false}, + + {0.90, wgsMetricsFile, 0.5, 400, true}, + {0.80, wgsMetricsFile, 0.3, 400, true}, + {0.35, wgsMetricsFile, 0.1, 500, true}, + {0.12, wgsMetricsFile, 0.05, 500, true} }; } @Test(dataProvider = "TheoreticalSensitivityDataProvider") - public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, final int sampleSize) throws Exception { + public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, final int sampleSize, final boolean useOverlapProbability) throws Exception { // This tests Theoretical Sensitivity using distributions on both base quality scores // and the depth histogram. 
@@ -292,7 +301,15 @@ public void testSensitivity(final double expected, final File metricsFile, final final Histogram depthHistogram = histograms.get(0); final Histogram qualityHistogram = histograms.get(1); - final double result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction); + // Get overlap probability from PCT_EXC_OVERLAP + final double overlapProbability = ((WgsMetrics) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP; + final double result; + if (useOverlapProbability) { + result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability); + } + else { + result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction); + } Assert.assertEquals(result, expected, tolerance); } @@ -327,6 +344,7 @@ public void testHetVsArbitrary(final File metricsFile, final double tolerance, f final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + // TS is TheoreticalSensitivity, THS is TheoreticalHetSensitivity final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, 0.5); final double resultFromTHS = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, 3); @@ -382,7 +400,8 @@ public void testDrawSumOfQScores(final File metricsFile, final int altDepth, fin final List> histograms = metrics.getAllHistograms(); final Histogram qualityHistogram = histograms.get(1); - final TheoreticalSensitivity.RouletteWheel qualityRW = new TheoreticalSensitivity.RouletteWheel(TheoreticalSensitivity.trimDistribution(TheoreticalSensitivity.normalizeHistogram(qualityHistogram))); + final TheoreticalSensitivity.RouletteWheel qualityRW = new 
TheoreticalSensitivity.RouletteWheel( + TheoreticalSensitivity.trimDistribution(TheoreticalSensitivity.normalizeHistogram(qualityHistogram))); final Random randomNumberGenerator = new Random(51); @@ -392,7 +411,8 @@ public void testDrawSumOfQScores(final File metricsFile, final int altDepth, fin for (int k = 0; k < 1; k++) { int sumOfQualitiesFull = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); - int sumOfQualities = TheoreticalSensitivity.drawSumOfQScores(altDepth, averageQuality, standardDeviationQuality, randomNumberGenerator.nextGaussian()); + int sumOfQualities = TheoreticalSensitivity.drawSumOfQScores(altDepth, averageQuality, + standardDeviationQuality, randomNumberGenerator.nextGaussian()); Assert.assertEquals(sumOfQualitiesFull, sumOfQualities, sumOfQualitiesFull * tolerance); } From 1090a30871b5940ac7a6387c6f8e050a6bef824a Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Fri, 15 Apr 2022 13:44:33 -0400 Subject: [PATCH 2/7] Adds overlap functionality to CollectWgsMetrics --- src/main/java/picard/analysis/CollectWgsMetrics.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 573a7df133..c5e022a65d 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -242,9 +242,10 @@ protected int doWork() { if (THEORETICAL_SENSITIVITY_OUTPUT != null) { // Write out theoretical sensitivity results. 
final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); - log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions."); + log.info("Calculating theoretical sensitivity at " + ALLELE_FRACTION.size() + " allele fractions."); + List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, - collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, 0.0); + collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, collector.basesExcludedByOverlap); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } From 3d38bf0bef44622eed7c4d143de0acc48d54e4b7 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Tue, 19 Apr 2022 13:39:05 -0400 Subject: [PATCH 3/7] Enable overlap probability for CollectHsMetrics --- .../picard/analysis/directed/CollectTargetedMetrics.java | 6 +++++- .../picard/analysis/directed/TargetMetricsCollector.java | 2 -- src/test/java/picard/IntelInflaterDeflaterLoadTest.java | 3 ++- .../java/picard/analysis/directed/CollectHsMetricsTest.java | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index 322e1db542..955153fbd7 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -18,6 +18,7 @@ import picard.analysis.MetricAccumulationLevel; import picard.analysis.TheoreticalSensitivity; import picard.analysis.TheoreticalSensitivityMetrics; +import picard.analysis.WgsMetrics; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; import picard.metrics.MultilevelMetrics; @@ -169,8 +170,11 @@ protected int doWork() { // Write out theoretical sensitivity results. 
final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); log.info("Calculating theoretical sentitivity at " + ALLELE_FRACTION.size() + " allele fractions."); + + final double overlapProbability = ((TargetedPcrMetrics) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP; + List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, - collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, 0.0); + collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, overlapProbability); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } diff --git a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java index f31d6231c2..f8a5b9b214 100644 --- a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java +++ b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java @@ -134,8 +134,6 @@ public abstract class TargetMetricsCollector Date: Thu, 21 Apr 2022 14:18:57 -0400 Subject: [PATCH 4/7] Fixing a few minor bugs, and adding tests --- .../picard/analysis/CollectWgsMetrics.java | 5 +++- .../analysis/TheoreticalSensitivity.java | 23 ++++++++++--------- .../directed/CollectTargetedMetrics.java | 7 ++++-- .../analysis/TheoreticalSensitivityTest.java | 4 ++-- .../directed/CollectTargetedMetricsTest.java | 17 ++++++++++++-- 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index c5e022a65d..0526e8ae22 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -139,6 +139,9 @@ public class CollectWgsMetrics extends CommandLineProgram { @Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true) public List ALLELE_FRACTION = 
new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); + @Argument(doc="Phred scaled PCR Error Rate for Theoretical Sensitivity model.", optional = true) + public int PCR_ERROR_RATE = 45; + @Argument(doc = "If true, fast algorithm is used.") public boolean USE_FAST_ALGORITHM = false; @@ -245,7 +248,7 @@ protected int doWork() { log.info("Calculating theoretical sensitivity at " + ALLELE_FRACTION.size() + " allele fractions."); List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, - collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, collector.basesExcludedByOverlap); + collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, collector.basesExcludedByOverlap, PCR_ERROR_RATE); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 03adf9f6e1..ce6ea59d66 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -49,7 +49,6 @@ public class TheoreticalSensitivity { private static final int LARGE_NUMBER_OF_DRAWS = 10; // The number of draws at which we believe a Gaussian approximation to sum random variables. private static final double DEPTH_BIN_WIDTH = 0.01; // Minimal fraction of depth histogram to use when integrating theoretical sensitivity. This ensures we don't calculate theoretical sensitivity at every depth, which would be computationally expensive. private static final int RANDOM_SEED = 51; - private static final int PCR_ERROR_RATE = 45; /** * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . 
N - 1 @@ -247,7 +246,7 @@ public TheoreticalSensitivity() { public static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final long randomSeed, - final double overlapProbability) { + final double overlapProbability, final int pcrErrorRate) { // If the depth is 0 at a particular locus, the sensitivity is trivially 0.0. if (depth == 0) { return 0.0; @@ -275,12 +274,12 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram sumOfQualities = 0; for(int i = 0;i < altDepth;i++) { if(rg.nextDouble() > overlapProbability) { - sumOfQualities += qualityRW.draw(); + sumOfQualities += Math.min(qualityRW.draw(), pcrErrorRate); } else { // Simulate overlapping reads by sampling from the quality distribution twice. // Since PCR errors affect overlapping reads, the total contribution to the sum // of qualities cannot exceed the PCR error rate. - sumOfQualities += Math.min(qualityRW.draw() + qualityRW.draw(), PCR_ERROR_RATE); + sumOfQualities += Math.min(qualityRW.draw() + qualityRW.draw(), pcrErrorRate); } } } else { @@ -322,9 +321,10 @@ static int drawSumOfQScores(final int altDepth, final double averageQuality, fin */ private static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, - final double alleleFraction, final double overlapProbability) { + final double alleleFraction, final double overlapProbability, + final int pcrErrorRate) { return sensitivityAtConstantDepth(depth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction, - RANDOM_SEED, overlapProbability); + RANDOM_SEED, overlapProbability, pcrErrorRate); } /** @@ -341,13 +341,13 @@ public static double theoreticalSensitivity(final Histogram depthHistog final Histogram qualityHistogram, final int sampleSize, final double logOddsThreshold, final double alleleFraction) { return 
theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, logOddsThreshold, alleleFraction, - 0.0); + 0.0, 45); } public static double theoreticalSensitivity(final Histogram depthHistogram, final Histogram qualityHistogram, final int sampleSize, final double logOddsThreshold, final double alleleFraction, - final double overlapProbability) { + final double overlapProbability, final int pcrErrorRate) { if (alleleFraction > 1.0 || alleleFraction < 0.0) { throw new IllegalArgumentException("Allele fractions must be between 0 and 1."); } @@ -374,7 +374,7 @@ public static double theoreticalSensitivity(final Histogram depthHistog // Calculate sensitivity for a particular depth, and use trapezoid rule to integrate sensitivity. final double left = right; right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, - alleleFraction, overlapProbability); + alleleFraction, overlapProbability, pcrErrorRate); sensitivity += deltaDepthProbability * (left + right) / 2.0; } return sensitivity; @@ -417,7 +417,8 @@ public static List calculateSensitivities(final i final Histogram depthHistogram, final Histogram baseQHistogram, final List alleleFractions, - final double overlapProbability) { + final double overlapProbability, + final int pcrErrorRate) { final List metricsOverVariousAlleleFractions = new ArrayList<>(); final double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. 
@@ -428,7 +429,7 @@ public static List calculateSensitivities(final i final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); theoreticalSensitivityMetrics.ALLELE_FRACTION = alleleFraction; theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY = TheoreticalSensitivity.theoreticalSensitivity( - depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction, overlapProbability); + depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction, overlapProbability, pcrErrorRate); theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY)); theoreticalSensitivityMetrics.SAMPLE_SIZE = simulationSize; theoreticalSensitivityMetrics.LOG_ODDS_THRESHOLD = logOddsThreshold; diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index 955153fbd7..e83e41f0a7 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -106,6 +106,9 @@ protected abstract COLLECTOR makeCollector(final Set ac @Argument(doc="Output for Theoretical Sensitivity metrics where the allele fractions are provided by the ALLELE_FRACTION argument.", optional = true) public File THEORETICAL_SENSITIVITY_OUTPUT; + @Argument(doc="Phred scaled PCR Error Rate for Theoretical Sensitivity model.", optional = true) + public int PCR_ERROR_RATE = 45; + @Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true) public List ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); /** @@ -171,10 +174,10 @@ protected int doWork() { final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); log.info("Calculating theoretical sentitivity at " + 
ALLELE_FRACTION.size() + " allele fractions."); - final double overlapProbability = ((TargetedPcrMetrics) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP; + final double overlapProbability = ((PanelMetricsBase) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP; List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, - collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, overlapProbability); + collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION, overlapProbability, PCR_ERROR_RATE); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index 5358f81c77..544b4cf1c0 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -258,7 +258,7 @@ public void testSensitivityAtConstantDepth(final double expected, final File met // We ensure that even using different random seeds we converge to roughly the same value. 
for (int i = 0; i < 3; i++) { double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3, - sampleSize, alleleFraction, i, 0.0); + sampleSize, alleleFraction, i, 0.0, 45); Assert.assertEquals(result, expected, tolerance); } } @@ -305,7 +305,7 @@ public void testSensitivity(final double expected, final File metricsFile, final final double overlapProbability = ((WgsMetrics) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP; final double result; if (useOverlapProbability) { - result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability); + result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability, 45); } else { result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction); diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java index 82f47ea3a0..ca232c77cf 100644 --- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java +++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java @@ -166,14 +166,26 @@ public Object[][] theoreticalSensitivityDataProvider() { // the tests from taking too long to run. {tempSamFile, outfile, tsOutfile, perTargetOutfile, referenceFile, singleIntervals, 2000, Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction - Arrays.asList(0.01, 0.54, 0.93, 0.99, 0.99) // Expected sensitivity + Arrays.asList(0.01, 0.54, 0.93, 0.99, 0.99), // Expected sensitivity + 45 // PCR Error Rate + }, + + // This is the same as the previous test but uses a lower value for the pcrErrorRate. + // The important thing captured here is that the expected sensitivities should be + // less than or equal to the values from the previous test. 
+ {tempSamFile, outfile, tsOutfile, perTargetOutfile, referenceFile, singleIntervals, 2000, + Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction + Arrays.asList(0.00, 0.31, 0.87, 0.99, 0.99), // Expected sensitivity + 10 // PCR Error Rate } }; } @Test(dataProvider = "theoreticalSensitivityDataProvider") public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input, final File outfile, final File tsOutfile, final File perTargetOutfile, final String referenceFile, - final String targetIntervals, final int sampleSize, final List alleleFractions, final List expectedSensitivities) throws IOException { + final String targetIntervals, final int sampleSize, + final List alleleFractions, final List expectedSensitivities, + final int pcrErrorRate) throws IOException { final String[] args = new String[] { "TARGET_INTERVALS=" + targetIntervals, @@ -184,6 +196,7 @@ public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input "LEVEL=ALL_READS", "AMPLICON_INTERVALS=" + targetIntervals, "THEORETICAL_SENSITIVITY_OUTPUT=" + tsOutfile.getAbsolutePath(), + "PCR_ERROR_RATE=" + pcrErrorRate, "SAMPLE_SIZE=" + sampleSize }; From 1d72fa7c0528c0d446098bf693eb43d3717a7548 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Thu, 21 Apr 2022 14:49:02 -0400 Subject: [PATCH 5/7] Adding javadoc, and removing unnecessary default function --- .../java/picard/analysis/TheoreticalSensitivity.java | 10 ++-------- .../picard/analysis/TheoreticalSensitivityTest.java | 4 ++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index ce6ea59d66..2b6b2c7c06 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -270,7 +270,6 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram int sumOfQualities; if (altDepth < 
LARGE_NUMBER_OF_DRAWS) { // If the number of alt reads is "small" we draw from the actual base quality distribution. - //sumOfQualities = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); sumOfQualities = 0; for(int i = 0;i < altDepth;i++) { if(rg.nextDouble() > overlapProbability) { @@ -335,15 +334,10 @@ private static double sensitivityAtConstantDepth(final int depth, final Histogra * @param sampleSize the total number of simulations to run * @param logOddsThreshold Log odds threshold necessary for variant to be called * @param alleleFraction the allele fraction to evaluate sensitivity at + * @param overlapProbability the probability two bases from the same fragment are overlapping + * @param pcrErrorRate Upper bound for the effective base quality two bases in an overlapping read * @return Theoretical sensitivity for the given arguments over a particular depth distribution. */ - public static double theoreticalSensitivity(final Histogram depthHistogram, - final Histogram qualityHistogram, final int sampleSize, - final double logOddsThreshold, final double alleleFraction) { - return theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, logOddsThreshold, alleleFraction, - 0.0, 45); - } - public static double theoreticalSensitivity(final Histogram depthHistogram, final Histogram qualityHistogram, final int sampleSize, final double logOddsThreshold, final double alleleFraction, diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index 544b4cf1c0..8bfa8c67aa 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -308,7 +308,7 @@ public void testSensitivity(final double expected, final File metricsFile, final result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability, 45); } else { - 
result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction); + result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, 0, 45); } Assert.assertEquals(result, expected, tolerance); @@ -345,7 +345,7 @@ public void testHetVsArbitrary(final File metricsFile, final double tolerance, f final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); // TS is TheoreticalSensitivity, THS is TheoreticalHetSensitivity - final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, 0.5); + final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, 0.5, 0.0, 45); final double resultFromTHS = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, 3); Assert.assertEquals(resultFromTS, resultFromTHS, tolerance); From fa0c1e1213ffafc0dd8a4aa7a810371b979ca2ea Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Fri, 3 Jun 2022 18:12:55 -0400 Subject: [PATCH 6/7] Incomplete commit for Friday night, but almost finished responding to Chris --- .../picard/analysis/CollectWgsMetrics.java | 7 +- .../analysis/TheoreticalSensitivity.java | 46 ++++------- .../directed/CollectTargetedMetrics.java | 1 - .../picard/IntelInflaterDeflaterLoadTest.java | 3 +- .../analysis/TheoreticalSensitivityTest.java | 78 +++++-------------- 5 files changed, 37 insertions(+), 98 deletions(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 0526e8ae22..f6afc7c69d 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -139,8 +139,8 @@ public class CollectWgsMetrics extends CommandLineProgram { @Argument(doc="Allele fraction for which to calculate 
theoretical sensitivity.", optional = true) public List ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); - @Argument(doc="Phred scaled PCR Error Rate for Theoretical Sensitivity model.", optional = true) - public int PCR_ERROR_RATE = 45; + @Argument(doc="Maximum value allowed for base quality resulting from overlapping reads. Often this is governed by PCR error rate. (Phred scale)", optional = true) + public int MAX_OVERLAP_BASE_QUAL = 45; @Argument(doc = "If true, fast algorithm is used.") public boolean USE_FAST_ALGORITHM = false; @@ -248,7 +248,8 @@ protected int doWork() { log.info("Calculating theoretical sensitivity at " + ALLELE_FRACTION.size() + " allele fractions."); List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, - collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, collector.basesExcludedByOverlap, PCR_ERROR_RATE); + collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION, + collector.getMetrics(dupeFilter, adapterFilter, mapqFilter, pairFilter).PCT_EXC_OVERLAP, MAX_OVERLAP_BASE_QUAL); theoreticalSensitivityMetrics.addAllMetrics(tsm); theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 2b6b2c7c06..33c3a0c337 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -46,7 +46,6 @@ public class TheoreticalSensitivity { private static final Log log = Log.getInstance(TheoreticalSensitivity.class); private static final int SAMPLING_MAX = 600; //prevent 'infinite' loops private static final int MAX_CONSIDERED_DEPTH_HET_SENS = 1000; // No point in looking any deeper than this, otherwise GC overhead is too high. Only used for HET sensitivity. 
- private static final int LARGE_NUMBER_OF_DRAWS = 10; // The number of draws at which we believe a Gaussian approximation to sum random variables. private static final double DEPTH_BIN_WIDTH = 0.01; // Minimal fraction of depth histogram to use when integrating theoretical sensitivity. This ensures we don't calculate theoretical sensitivity at every depth, which would be computationally expensive. private static final int RANDOM_SEED = 51; @@ -241,12 +240,14 @@ public TheoreticalSensitivity() { * @param sampleSize sampleSize is the total number of simulations to run * @param alleleFraction the allele fraction to evaluate sensitivity at * @param randomSeed random number seed to use for random number generator + * @param overlapProbability Probability a position on a read overlaps with its mate + * @param maxOverlapBaseQuality Maximum base quality that can result from summing overlapping base qualities, often capped by PCR error rate * @return Theoretical sensitivity for the given arguments at a constant depth. */ public static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final long randomSeed, - final double overlapProbability, final int pcrErrorRate) { + final double overlapProbability, final int maxOverlapBaseQuality) { // If the depth is 0 at a particular locus, the sensitivity is trivially 0.0. if (depth == 0) { return 0.0; @@ -268,25 +269,20 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram final int altDepth = bd.sample(); int sumOfQualities; - if (altDepth < LARGE_NUMBER_OF_DRAWS) { - // If the number of alt reads is "small" we draw from the actual base quality distribution. 
- sumOfQualities = 0; - for(int i = 0;i < altDepth;i++) { - if(rg.nextDouble() > overlapProbability) { - sumOfQualities += Math.min(qualityRW.draw(), pcrErrorRate); - } else { - // Simulate overlapping reads by sampling from the quality distribution twice. - // Since PCR errors affect overlapping reads, the total contribution to the sum - // of qualities cannot exceed the PCR error rate. - sumOfQualities += Math.min(qualityRW.draw() + qualityRW.draw(), pcrErrorRate); - } + // If the number of alt reads is "small" we draw from the actual base quality distribution. + sumOfQualities = 0; + for(int i = 0;i < altDepth;i++) { + if(rg.nextDouble() > overlapProbability) { + sumOfQualities += Math.min(qualityRW.draw(), maxOverlapBaseQuality); + } else { + // Simulate overlapping reads by sampling from the quality distribution twice. + // Since PCR errors affect overlapping reads, the total contribution to the sum + // of qualities cannot exceed the PCR error rate. + sumOfQualities += Math.min(qualityRW.draw() + qualityRW.draw(), maxOverlapBaseQuality); } - } else { - // If the number of alt reads is "large" we draw from a Gaussian approximation of the base - // quality distribution to speed up the code. - sumOfQualities = drawSumOfQScores(altDepth, averageQuality, standardDeviationQuality, randomNumberGenerator.nextGaussian()); } + if (isCalled(depth, altDepth, (double) sumOfQualities, alleleFraction, logOddsThreshold)) { calledVariants++; } @@ -294,20 +290,6 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram return (double) calledVariants / sampleSize; } - /** - * Simulates the sum of base qualities taken from reads that support the alternate allele by - * taking advantage of the fact that the sum of draws from a distribution tends towards a - * Gaussian per the Central Limit Theorem. 
- * @param altDepth Number of draws to take from base quality distribution - * @param averageQuality Average quality of alt bases - * @param standardDeviationQuality Sample standard deviation of base quality scores - * @param z number of standard deviation from the mean to take sum over - * @return Simulated sum of base qualities the support the alternate allele - */ - static int drawSumOfQScores(final int altDepth, final double averageQuality, final double standardDeviationQuality, final double z) { - return (int) (altDepth * averageQuality + z * Math.sqrt(altDepth) * standardDeviationQuality); - } - /** * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution at a constant * depth. diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index e83e41f0a7..a4598ca2f0 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -18,7 +18,6 @@ import picard.analysis.MetricAccumulationLevel; import picard.analysis.TheoreticalSensitivity; import picard.analysis.TheoreticalSensitivityMetrics; -import picard.analysis.WgsMetrics; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; import picard.metrics.MultilevelMetrics; diff --git a/src/test/java/picard/IntelInflaterDeflaterLoadTest.java b/src/test/java/picard/IntelInflaterDeflaterLoadTest.java index 9e7f0f9d35..355dcbb166 100644 --- a/src/test/java/picard/IntelInflaterDeflaterLoadTest.java +++ b/src/test/java/picard/IntelInflaterDeflaterLoadTest.java @@ -26,8 +26,7 @@ public void testIntelDeflaterIsAvailable() { } private void checkIntelSupported(final String componentName) { - // Check if on Linux Mac or Apple Silicon (e.g. 
Apple M1) - if ((!SystemUtils.IS_OS_LINUX && !SystemUtils.IS_OS_MAC) || SystemUtils.OS_ARCH.equals("aarch64")) { + if ((!SystemUtils.IS_OS_LINUX && !SystemUtils.IS_OS_MAC)) { throw new SkipException(componentName + " is not available on this platform"); } diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index 8bfa8c67aa..36f7d514a4 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -34,7 +34,6 @@ import java.io.File; import java.io.FileReader; import java.util.*; -import java.util.stream.IntStream; /** * Created by davidben on 5/18/15. @@ -249,12 +248,9 @@ public void testSensitivityAtConstantDepth(final double expected, final File met metrics.read(metricsFileReader); } - Object hi[] = metrics.getMetrics().toArray(); - final List> histograms = metrics.getAllHistograms(); final Histogram qualityHistogram = histograms.get(1); - List l = metrics.getMetrics(); // We ensure that even using different random seeds we converge to roughly the same value. for (int i = 0; i < 3; i++) { double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3, @@ -266,25 +262,31 @@ public void testSensitivityAtConstantDepth(final double expected, final File met @DataProvider(name = "TheoreticalSensitivityDataProvider") public Object[][] arbFracSensDataProvider() { final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + final File wgsMetricsHighDepthFile = new File(TEST_DIR, "ultra_high_depth.wgs_metrics"); // This test acts primarily as an integration test. The sample sizes // are not quite large enough to converge properly, but is used for the purpose of // keeping the compute time of the tests short. 
return new Object[][]{ - {0.90, wgsMetricsFile, 0.5, 400, false}, - {0.78, wgsMetricsFile, 0.3, 400, false}, - {0.29, wgsMetricsFile, 0.1, 500, false}, - {0.08, wgsMetricsFile, 0.05, 500, false}, - - {0.90, wgsMetricsFile, 0.5, 400, true}, - {0.80, wgsMetricsFile, 0.3, 400, true}, - {0.35, wgsMetricsFile, 0.1, 500, true}, - {0.12, wgsMetricsFile, 0.05, 500, true} + {0.90, wgsMetricsFile, 0.5, 400, false, 45}, + {0.78, wgsMetricsFile, 0.3, 400, false, 45}, + {0.29, wgsMetricsFile, 0.1, 500, false, 45}, + {0.08, wgsMetricsFile, 0.05, 500, false, 45}, + + {0.90, wgsMetricsFile, 0.5, 400, true, 45}, + {0.80, wgsMetricsFile, 0.3, 400, true, 45}, + {0.35, wgsMetricsFile, 0.1, 500, true, 45}, + {0.12, wgsMetricsFile, 0.05, 500, true, 45}, + + {0.90, wgsMetricsHighDepthFile, 0.001, 400, true, 45}, + {0.80, wgsMetricsHighDepthFile, 0.001, 400, true, 90} }; } @Test(dataProvider = "TheoreticalSensitivityDataProvider") - public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, final int sampleSize, final boolean useOverlapProbability) throws Exception { + public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, + final int sampleSize, final boolean useOverlapProbability, + final int pcrErrorRate) throws Exception { // This tests Theoretical Sensitivity using distributions on both base quality scores // and the depth histogram. 
@@ -305,10 +307,10 @@ public void testSensitivity(final double expected, final File metricsFile, final final double overlapProbability = ((WgsMetrics) metrics.getMetrics().toArray()[0]).PCT_EXC_OVERLAP; final double result; if (useOverlapProbability) { - result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability, 45); + result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, overlapProbability, pcrErrorRate); } else { - result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, 0, 45); + result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction, 0, pcrErrorRate); } Assert.assertEquals(result, expected, tolerance); @@ -374,50 +376,6 @@ public void testCallingThreshold(final int totalDepth, final int altDepth, final Assert.assertEquals(TheoreticalSensitivity.isCalled(totalDepth, altDepth, sumOfAltQualities, alleleFraction, logOddsThreshold), expectedCall); } - @DataProvider(name = "sumOfGaussiansDataProvider") - public Object[][] sumOfGaussians() { - final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); - final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); - - // When we sum more base qualities from a particular distribution, it should look increasingly Gaussian. 
- return new Object[][]{ - {wgsMetricsFile, 500, 0.03}, - {wgsMetricsFile, 20, 0.05}, - {wgsMetricsFile, 10, 0.10}, - {targetedMetricsFile, 500, 0.03}, - {targetedMetricsFile, 20, 0.05}, - {targetedMetricsFile, 10, 0.10} - }; - } - - @Test(dataProvider = "sumOfGaussiansDataProvider") - public void testDrawSumOfQScores(final File metricsFile, final int altDepth, final double tolerance) throws Exception { - final MetricsFile metrics = new MetricsFile<>(); - try (final FileReader metricsFileReader = new FileReader(metricsFile)) { - metrics.read(metricsFileReader); - } - - final List> histograms = metrics.getAllHistograms(); - - final Histogram qualityHistogram = histograms.get(1); - final TheoreticalSensitivity.RouletteWheel qualityRW = new TheoreticalSensitivity.RouletteWheel( - TheoreticalSensitivity.trimDistribution(TheoreticalSensitivity.normalizeHistogram(qualityHistogram))); - - final Random randomNumberGenerator = new Random(51); - - // Calculate mean and deviation of quality score distribution to enable Gaussian sampling below - final double averageQuality = qualityHistogram.getMean(); - final double standardDeviationQuality = qualityHistogram.getStandardDeviation(); - - for (int k = 0; k < 1; k++) { - int sumOfQualitiesFull = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); - int sumOfQualities = TheoreticalSensitivity.drawSumOfQScores(altDepth, averageQuality, - standardDeviationQuality, randomNumberGenerator.nextGaussian()); - - Assert.assertEquals(sumOfQualitiesFull, sumOfQualities, sumOfQualitiesFull * tolerance); - } - } - @DataProvider(name = "trimDistributionDataProvider") public Object[][] trimDistributions() { return new Object[][]{ From 91588b0bbc70d28bda0aa7789c8d27d13fb66b64 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Wed, 13 Jul 2022 10:57:34 -0400 Subject: [PATCH 7/7] Added tests for monotonicity. These tests fail, and suggest a possible problem with the model. 
--- .../analysis/TheoreticalSensitivity.java | 80 +++++++++---- .../analysis/TheoreticalSensitivityTest.java | 105 +++++++++++++++++- 2 files changed, 159 insertions(+), 26 deletions(-) diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 33c3a0c337..8c75f87531 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -201,13 +201,20 @@ public static double[] normalizeHistogram(final Histogram histogram) { if (histogram == null) throw new PicardException("Histogram is null and cannot be normalized"); final double histogramSumOfValues = histogram.getSumOfValues(); - final double[] normalizedHistogram = new double[histogram.size()]; + final double[] normalizedHistogram = new double[(int) histogram.getMax() + 1]; - for (int i = 0; i < histogram.size(); i++) { - if (histogram.get(i) != null) { - normalizedHistogram[i] = histogram.get(i).getValue() / histogramSumOfValues; - } + final Iterator keySet = histogram.keySet().iterator(); + + for (Iterator it = keySet; it.hasNext(); ) { + Integer key = it.next(); + normalizedHistogram[key] = histogram.get(key).getValue() / histogramSumOfValues; } + + double s = 0; + for(int i = 0;i < (int) histogram.getMax() + 1;i++) { + s += normalizedHistogram[i]; + } + return normalizedHistogram; } @@ -286,6 +293,7 @@ public static double sensitivityAtConstantDepth(final int depth, final Histogram if (isCalled(depth, altDepth, (double) sumOfQualities, alleleFraction, logOddsThreshold)) { calledVariants++; } + //System.out.println("AF = " + alleleFraction + " altDepth = " + altDepth + " " + sumOfQualities + " " + isCalled(depth, altDepth, (double) sumOfQualities, alleleFraction, logOddsThreshold)); } return (double) calledVariants / sampleSize; } @@ -324,6 +332,15 @@ public static double theoreticalSensitivity(final Histogram depthHistog final Histogram qualityHistogram, final int 
sampleSize, final double logOddsThreshold, final double alleleFraction, final double overlapProbability, final int pcrErrorRate) { + return theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, logOddsThreshold, + alleleFraction, overlapProbability, pcrErrorRate, false); + } + + public static double theoreticalSensitivity(final Histogram depthHistogram, + final Histogram qualityHistogram, final int sampleSize, + final double logOddsThreshold, final double alleleFraction, + final double overlapProbability, final int pcrErrorRate, + final boolean useTrapezoidRule) { if (alleleFraction > 1.0 || alleleFraction < 0.0) { throw new IllegalArgumentException("Allele fractions must be between 0 and 1."); } @@ -333,25 +350,42 @@ public static double theoreticalSensitivity(final Histogram depthHistog // Integrate sensitivity over depth distribution double sensitivity = 0.0; int currentDepth = 0; - double right = 0; - while (currentDepth < depthDistribution.length) { - double deltaDepthProbability = 0.0; - // Accumulate a portion of the depth distribution to compute theoretical sensitivity over. - // This helps prevent us from spending lots of compute over coverages - // that occur with low probability and don't contribute much to sensitivity anyway, but - // it complicates things a bit by having a variable deltaDepthProbability which - // amount of the depth distribution to use with the trapezoid rule integration step. - while (deltaDepthProbability == 0 && currentDepth < depthDistribution.length || - deltaDepthProbability < DEPTH_BIN_WIDTH && currentDepth < depthDistribution.length && - depthDistribution[currentDepth] < DEPTH_BIN_WIDTH / 2.0) { - deltaDepthProbability += depthDistribution[currentDepth]; - currentDepth++; + + if(useTrapezoidRule) { + double right = 0; + while (currentDepth < depthDistribution.length) { + double deltaDepthProbability = 0.0; + // Accumulate a portion of the depth distribution to compute theoretical sensitivity over. 
+ // This helps prevent us from spending lots of compute over coverages + // that occur with low probability and don't contribute much to sensitivity anyway, but + // it complicates things a bit by having a variable deltaDepthProbability which + // amount of the depth distribution to use with the trapezoid rule integration step. + while (deltaDepthProbability == 0 && currentDepth < depthDistribution.length || + deltaDepthProbability < DEPTH_BIN_WIDTH && currentDepth < depthDistribution.length && + depthDistribution[currentDepth] < DEPTH_BIN_WIDTH / 2.0) { + deltaDepthProbability += depthDistribution[currentDepth]; + currentDepth++; + } + // I think we overshoot by 1 + + + // Calculate sensitivity for a particular depth, and use trapezoid rule to integrate sensitivity. + final double left = right; + + System.out.println("Current Depth = " + currentDepth); + right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, + alleleFraction, overlapProbability, pcrErrorRate); + sensitivity += deltaDepthProbability * (left + right) / 2.0; + } + } + else { + for(currentDepth = 0;currentDepth < depthDistribution.length;currentDepth++) { + if(depthDistribution[currentDepth] != 0) { + System.out.println("Depth = " + currentDepth); + sensitivity += sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, + alleleFraction, overlapProbability, pcrErrorRate) * depthDistribution[currentDepth]; + } } - // Calculate sensitivity for a particular depth, and use trapezoid rule to integrate sensitivity. 
- final double left = right; - right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, - alleleFraction, overlapProbability, pcrErrorRate); - sensitivity += deltaDepthProbability * (left + right) / 2.0; } return sensitivity; } diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index 36f7d514a4..4c99644bdd 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -262,7 +262,7 @@ public void testSensitivityAtConstantDepth(final double expected, final File met @DataProvider(name = "TheoreticalSensitivityDataProvider") public Object[][] arbFracSensDataProvider() { final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); - final File wgsMetricsHighDepthFile = new File(TEST_DIR, "ultra_high_depth.wgs_metrics"); + final File hsMetricsHighDepthFile = new File(TEST_DIR, "ultra_high_depth.wgs_metrics"); // This test acts primarily as an integration test. The sample sizes // are not quite large enough to converge properly, but is used for the purpose of @@ -278,8 +278,11 @@ public Object[][] arbFracSensDataProvider() { {0.35, wgsMetricsFile, 0.1, 500, true, 45}, {0.12, wgsMetricsFile, 0.05, 500, true, 45}, - {0.90, wgsMetricsHighDepthFile, 0.001, 400, true, 45}, - {0.80, wgsMetricsHighDepthFile, 0.001, 400, true, 90} + // This doesn't seem like a good strategy. I think + // I need to create tests that test a particular distribution of base quals and + // depth dist rather than reading it from a metrics file. I also should use overlap probability. 
+            //{0.50, hsMetricsHighDepthFile, 0.01, 400, true, 45},
+            //{0.50, hsMetricsHighDepthFile, 0.01, 400, true, 90}
         };
     }
 
@@ -316,6 +319,102 @@ public void testSensitivity(final double expected, final File metricsFile, final
         Assert.assertEquals(result, expected, tolerance);
     }
 
+    @DataProvider(name = "fixedDistributionsDataProvider")
+    public Object[][] fixedDistributionsDataProvider() {
+        final Histogram depthHistogram30X = new Histogram<>();
+        depthHistogram30X.increment(30, 100);
+
+        final Histogram qualityHistogramAllQ30 = new Histogram<>();
+        qualityHistogramAllQ30.increment(30, 100);
+
+        final Histogram depthHistogram1000X = new Histogram<>();
+        depthHistogram1000X.increment(1000, 1);
+
+        // Something is strange about how the depth distribution is acting.
+        // The quality distribution seems to be acting fine, but the depth
+        // dist seems to act as if there is an off by 1 error with a bin
+        // set at 0 depth, and reducing the sensitivity.
+
+        // I think there is a bug in the trapz code, seems very off by oneish.
+        // NOTE(review): the model also appears non-monotonic with increasing
+        // fixed depth; the root cause still needs investigation.
+ + final Histogram qualityHistogramAllQ60 = new Histogram<>(); + qualityHistogramAllQ60.increment(60, 100); + + return new Object[][]{ + {0.58, depthHistogram30X, qualityHistogramAllQ30, 400, 0.1, 45}, + {0.58, depthHistogram30X, qualityHistogramAllQ30, 400, 0.1, 45}, + {1.00, depthHistogram1000X, qualityHistogramAllQ60, 400, 0.5, 60}, + {1.00, depthHistogram1000X, qualityHistogramAllQ60, 400, 0.1, 105}, + + }; + } + + @Test(dataProvider = "fixedDistributionsDataProvider") + public void testSensitivityOnFixedDistributions(final double expected, + final Histogram depthHistogram, final Histogram qualityHistogram, + final int sampleSize, final double alleleFraction, + final int pcrErrorRate) { + + double actual = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, + 3, alleleFraction, 0.0, pcrErrorRate); + + Assert.assertEquals(actual, expected, 0.01); + } + + @Test + public void isMonotonicFixedDepth() { + final Histogram qualityHistogram = new Histogram<>(); + qualityHistogram.increment(20, 100); + + final Histogram depthHistogram = new Histogram<>(); + depthHistogram.increment(20, 1); + + int sampleSize = 100000; + int logOddsThreshold = 10; + double overlapProbability = 0.0; + int pcrErrorRate = 45; + + double lastSensitivity = 0.0; + double currentSensitivity; + for(double alleleFraction = 0.49;alleleFraction <= 0.49;alleleFraction += 0.01) { + currentSensitivity = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, + logOddsThreshold, alleleFraction, overlapProbability, pcrErrorRate); + System.out.println("alleleFraction = " + alleleFraction + " currentSensitivity = " + currentSensitivity + " last = " + lastSensitivity); + + Assert.assertTrue(currentSensitivity >= lastSensitivity); + lastSensitivity = currentSensitivity; + } + } + + @Test + public void isMonotonicFixedAF() { + final double alleleFraction = 0.3; + final Histogram qualityHistogram = new Histogram<>(); + 
qualityHistogram.increment(30, 100); + + + int sampleSize = 100000; + int logOddsThreshold = 10; + double overlapProbability = 0.0; + int pcrErrorRate = 45; + + double lastSensitivity = 0.0; + double currentSensitivity; + for(int currentDepth = 20;currentDepth <= 21;currentDepth++) { + final Histogram depthHistogram = new Histogram<>(); + depthHistogram.increment(currentDepth, 1); + + currentSensitivity = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, + logOddsThreshold, alleleFraction, overlapProbability, pcrErrorRate); + System.out.println("alleleFraction = " + alleleFraction + " currentSensitivity = " + currentSensitivity + " last = " + lastSensitivity); + + Assert.assertTrue(currentSensitivity >= lastSensitivity); + lastSensitivity = currentSensitivity; + } + } + @DataProvider(name = "equivalanceHetVsArbitrary") public Object[][] equivalenceHetVsFull() { final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics");