diff --git a/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java b/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java new file mode 100644 index 0000000000..433d366d84 --- /dev/null +++ b/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java @@ -0,0 +1,181 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.*; +import htsjdk.samtools.filter.*; +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup; +import picard.filter.CountingFilterWrapper; +import picard.filter.CountingMapQFilter; +import picard.filter.CountingPairedFilter; + +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.stream.IntStream; + +import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; + +/** + * A CLP that, tallies the number of UMIs in a duplicate set across the entire bam and produces a histogram. + * + * @author Yossi Farjoun + */ +@DocumentedFeature +@ExperimentalFeature +@CommandLineProgramProperties( + summary = CollectUmiPrevalenceMetrics.USAGE_SUMMARY + CollectUmiPrevalenceMetrics.USAGE_DETAILS, + oneLineSummary = CollectUmiPrevalenceMetrics.USAGE_SUMMARY, + programGroup = DiagnosticsAndQCProgramGroup.class +) +public class CollectUmiPrevalenceMetrics extends CommandLineProgram { + + static final String USAGE_SUMMARY = "Tally the counts of UMIs in duplicate sets within a bam. \n"; + static final String USAGE_DETAILS = "

" + + "This tool collects the Histogram of the number of duplicate sets that contain a given number of UMIs. " + + "Understanding this distribution can help understand the role that the UMIs have in the determination of " + + "consensus sets, the risk of UMI collisions, and of spurious reads that result from uncorrected UMIs."; + + + @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input (indexed) BAM/CRAM file.") + public File INPUT; + + @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Write metrics to this file") + public File OUTPUT; + + @Argument(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "minimal value for the mapping quality of the reads to be used in the estimation.", optional = true) + public Integer MINIMUM_MQ = 30; + + @Argument(doc = "Barcode SAM tag.", optional = true) + public String BARCODE_TAG = "RX"; + + @Argument(doc = "Barcode Quality SAM tag.", optional = true) + public String BARCODE_BQ = "QX"; + + @Argument(shortName = "BQ", doc = "minimal value for the base quality of all the bases in a molecular barcode, for it to be used.", optional = true) + public Integer MINIMUM_BARCODE_BQ = 30; + + @Argument(shortName = "FUR", doc = "Whether to filter unpaired reads from the input.", optional = true) + public boolean FILTER_UNPAIRED_READS = true; + + @Argument( + fullName = "PROGRESS_STEP_INTERVAL", + doc = "The interval between which progress will be displayed.", + optional = true + ) + public int PROGRESS_STEP_INTERVAL = 1_000_000; + + + private static final Log log = Log.getInstance(CollectUmiPrevalenceMetrics.class); + + @Override + protected int doWork() { + + IOUtil.assertFileIsReadable(INPUT); + + // get an iterator to reads that overlap the heterozygous sites + final Histogram umiCount = new Histogram<>("numUmis", "duplicateSets"); + final CountingPairedFilter countingPairedFilter = new CountingPairedFilter(); + final CountingFilterWrapper countingAlignedFilter = new CountingFilterWrapper(new AlignedFilter(true)); + final CountingMapQFilter countingMapQFilter = new CountingMapQFilter(MINIMUM_MQ); + final CountingFilterWrapper countingSecondaryOrSupplementaryFilter = + new CountingFilterWrapper(new SecondaryOrSupplementaryFilter()); + final ProgressLogger progress = new ProgressLogger(log, PROGRESS_STEP_INTERVAL, "examined", "duplicate sets"); + + try (SamReader in = SamReaderFactory.makeDefault() + .referenceSequence(REFERENCE_SEQUENCE) + .open(INPUT)) { + + IOUtil.assertFileIsWritable(OUTPUT); + + final SAMRecordIterator samRecordIterator = in.iterator(); + final List samFilters = CollectionUtil.makeList( + countingAlignedFilter, + countingMapQFilter, + countingSecondaryOrSupplementaryFilter + ); + if (FILTER_UNPAIRED_READS) { + samFilters.add(countingPairedFilter); + } + + final FilteringSamIterator filteredSamRecordIterator = new FilteringSamIterator(samRecordIterator, new AggregateFilter(samFilters)); + log.info("Queried BAM, getting duplicate sets."); + + // get duplicate iterator from iterator above + final DuplicateSetIterator duplicateSets = new DuplicateSetIterator(filteredSamRecordIterator, in.getFileHeader(), false, null, log); + + log.info("Starting iteration on duplicate sets"); + + set: + while (duplicateSets.hasNext()) { + + final DuplicateSet set = duplicateSets.next(); + final SAMRecord setRep = set.getRepresentative(); + + progress.record(setRep); + Set barcodes = new HashSet<>(); + for (final SAMRecord read : set.getRecords()) { + if (!read.hasAttribute(BARCODE_TAG)) { + log.warn("no barcode tag!"); + continue; + } + if (read.hasAttribute(BARCODE_BQ)) { + final String barcodeBQ = read.getStringAttribute(BARCODE_BQ).replace(" ", ""); + ; + final byte[] bytes = SAMUtils.fastqToPhred(barcodeBQ); + final boolean badQuality = IntStream.range(0, bytes.length).map(i -> bytes[i]).anyMatch(q -> q < MINIMUM_BARCODE_BQ); + if (badQuality) { + log.warn("bad quality barcode"); + continue; + } + } + barcodes.add(read.getStringAttribute(BARCODE_TAG)); + } + umiCount.increment(barcodes.size(), 1); + } + } catch (IOException e) { + throw new RuntimeException("Problem while reading file: " + INPUT, e); + } + log.info("Iteration done. Emitting metrics."); + log.info(String.format("Processed %d sets", progress.getCount())); + log.info(String.format("Filtered %d unpaired reads", countingPairedFilter.getFilteredRecords())); + log.info(String.format("Filtered %d unaligned reads", countingAlignedFilter.getFilteredRecords())); + log.info(String.format("Filtered %d low mapQ reads", countingMapQFilter.getFilteredRecords())); + log.info(String.format("Filtered %d Secondary or Supplementary reads", countingSecondaryOrSupplementaryFilter.getFilteredRecords())); + // Emit metrics + final MetricsFile metricsFile = getMetricsFile(); + metricsFile.addHistogram(umiCount); + metricsFile.write(OUTPUT); + return 0; + } +} diff --git a/src/main/java/picard/filter/CountingFilterWrapper.java b/src/main/java/picard/filter/CountingFilterWrapper.java new file mode 100644 index 0000000000..58984c1a11 --- /dev/null +++ b/src/main/java/picard/filter/CountingFilterWrapper.java @@ -0,0 +1,19 @@ +package picard.filter; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.filter.SamRecordFilter; + +/** + * A CountingFilter that wraps any SamRecordFilter and provides a count of the reads and bases filtered + */ +public class CountingFilterWrapper extends CountingFilter { + private final SamRecordFilter wrappedFilter; + public CountingFilterWrapper(SamRecordFilter wrappedFilter) { + this.wrappedFilter = wrappedFilter; + } + + @Override + public boolean reallyFilterOut(SAMRecord record) { + return wrappedFilter.filterOut(record); + } +} diff --git a/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java b/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java new file mode 100644 index 0000000000..e06b91d753 --- /dev/null +++ b/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java @@ -0,0 +1,50 @@ +package picard.analysis; + +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.Histogram; +import org.testng.Assert; +import org.testng.annotations.Test; +import picard.cmdline.CommandLineProgramTest; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; + +import static org.testng.Assert.*; + +public class CollectUmiPrevalenceMetricsTest extends CommandLineProgramTest { + + @Override + public String getCommandLineProgramName() { return CollectUmiPrevalenceMetrics.class.getSimpleName();} + + + @Test + public void integrationTest() throws IOException { + final File TEST_DIR = new File("testdata/picard/independent_replicates/"); + + final File samFile = new File(TEST_DIR, "twopairsWithManyUmis.sam"); + final File metricsFile = File.createTempFile("test", ".umi_hist"); + metricsFile.deleteOnExit(); + + final String[] args = new String[]{ + "INPUT=" + samFile.getAbsolutePath(), + "OUTPUT=" + metricsFile.getAbsolutePath(), + "MINIMUM_MQ=20" + }; + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(metricsFile)); + + final Histogram hist = output.getHistogram(); + + assertNull(hist.get(1)); + assertEquals(hist.get(2).getValue(),4); + assertEquals(hist.get(3).getValue(),1); + assertEquals(hist.get(4).getValue(),1); + assertNull(hist.get(5)); + assertNull(hist.get(6)); + assertEquals(hist.getCount(),6); // sets + assertEquals(hist.getSum(),15); // unique barcodes (in sets) + } +} \ No newline at end of file diff --git a/testdata/picard/independent_replicates/twopairsWithManyUmis.sam b/testdata/picard/independent_replicates/twopairsWithManyUmis.sam new file mode 100644 index 0000000000..134b9bc4d1 --- /dev/null +++ b/testdata/picard/independent_replicates/twopairsWithManyUmis.sam @@ -0,0 +1,30 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chr1 LN:501 +@SQ SN:chr2 LN:501 +@SQ SN:chr3 LN:501 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:1AAXX.3 SM:test LB:mylib.yossi PL:ILLUMINA +@PG ID:bwa PN:bwa VN:3 CL:bwa aln +@CO manually created file. second pair is a copy of the first pair except that it's marked as a duplicate, and the 10th base of the +@CO first read is modified C->t (lower case for easy view). this corresponds to position 11 +@CO 123456789 123456789 123456789 123456789 123456789 1234567890 +1AAXX.3.1.1 83 chr1 1 255 101M = 302 0201 CAACAGAAGCCGGTATCTGaGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.1.2 83 chr1 1 255 101M = 302 0201 CAACAGAAGCtGGNATCTGaGcTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1AAXX.3.1.1 163 chr1 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.1.2 163 chr1 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1AAXX.3.2.1 99 chr2 1 255 101M = 302 0201 CAACAGAAGCCGGTATCTGaGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.2.2 163 chr2 1 255 101M = 302 0201 CAACAGAAGCtGGNATCTGaGcTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1AAXX.3.2.1 147 chr2 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.2.2 83 chr2 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1ACXX.3.3.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1ACXX.3.3.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1ACXX.3.4.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1ACXX.3.4.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:GG-GG QX:Z:AA AA +1ACXX.3.3.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1ACXX.3.3.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1ACXX.3.4.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:TT-TT QX:Z:AA AA +1ACXX.3.4.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:GG-GG QX:Z:AA AA