diff --git a/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java b/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java
new file mode 100644
index 0000000000..433d366d84
--- /dev/null
+++ b/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java
@@ -0,0 +1,181 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package picard.analysis;
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.filter.*;
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
+import picard.filter.CountingFilterWrapper;
+import picard.filter.CountingMapQFilter;
+import picard.filter.CountingPairedFilter;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.IntStream;
+
+import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
+
+/**
+ * A CLP that, tallies the number of UMIs in a duplicate set across the entire bam and produces a histogram.
+ *
+ * @author Yossi Farjoun
+ */
+@DocumentedFeature
+@ExperimentalFeature
+@CommandLineProgramProperties(
+ summary = CollectUmiPrevalenceMetrics.USAGE_SUMMARY + CollectUmiPrevalenceMetrics.USAGE_DETAILS,
+ oneLineSummary = CollectUmiPrevalenceMetrics.USAGE_SUMMARY,
+ programGroup = DiagnosticsAndQCProgramGroup.class
+)
+public class CollectUmiPrevalenceMetrics extends CommandLineProgram {
+
+ static final String USAGE_SUMMARY = "Tally the counts of UMIs in duplicate sets within a bam. \n";
+ static final String USAGE_DETAILS = "
" +
+ "This tool collects the Histogram of the number of duplicate sets that contain a given number of UMIs. " +
+ "Understanding this distribution can help understand the role that the UMIs have in the determination of " +
+ "consensus sets, the risk of UMI collisions, and of spurious reads that result from uncorrected UMIs.";
+
+
+ @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input (indexed) BAM/CRAM file.")
+ public File INPUT;
+
+ @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Write metrics to this file")
+ public File OUTPUT;
+
+ @Argument(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "minimal value for the mapping quality of the reads to be used in the estimation.", optional = true)
+ public Integer MINIMUM_MQ = 30;
+
+ @Argument(doc = "Barcode SAM tag.", optional = true)
+ public String BARCODE_TAG = "RX";
+
+ @Argument(doc = "Barcode Quality SAM tag.", optional = true)
+ public String BARCODE_BQ = "QX";
+
+ @Argument(shortName = "BQ", doc = "minimal value for the base quality of all the bases in a molecular barcode, for it to be used.", optional = true)
+ public Integer MINIMUM_BARCODE_BQ = 30;
+
+ @Argument(shortName = "FUR", doc = "Whether to filter unpaired reads from the input.", optional = true)
+ public boolean FILTER_UNPAIRED_READS = true;
+
+ @Argument(
+ fullName = "PROGRESS_STEP_INTERVAL",
+ doc = "The interval between which progress will be displayed.",
+ optional = true
+ )
+ public int PROGRESS_STEP_INTERVAL = 1_000_000;
+
+
+ private static final Log log = Log.getInstance(CollectUmiPrevalenceMetrics.class);
+
+ @Override
+ protected int doWork() {
+
+ IOUtil.assertFileIsReadable(INPUT);
+
+ // get an iterator to reads that overlap the heterozygous sites
+ final Histogram umiCount = new Histogram<>("numUmis", "duplicateSets");
+ final CountingPairedFilter countingPairedFilter = new CountingPairedFilter();
+ final CountingFilterWrapper countingAlignedFilter = new CountingFilterWrapper(new AlignedFilter(true));
+ final CountingMapQFilter countingMapQFilter = new CountingMapQFilter(MINIMUM_MQ);
+ final CountingFilterWrapper countingSecondaryOrSupplementaryFilter =
+ new CountingFilterWrapper(new SecondaryOrSupplementaryFilter());
+ final ProgressLogger progress = new ProgressLogger(log, PROGRESS_STEP_INTERVAL, "examined", "duplicate sets");
+
+ try (SamReader in = SamReaderFactory.makeDefault()
+ .referenceSequence(REFERENCE_SEQUENCE)
+ .open(INPUT)) {
+
+ IOUtil.assertFileIsWritable(OUTPUT);
+
+ final SAMRecordIterator samRecordIterator = in.iterator();
+ final List samFilters = CollectionUtil.makeList(
+ countingAlignedFilter,
+ countingMapQFilter,
+ countingSecondaryOrSupplementaryFilter
+ );
+ if (FILTER_UNPAIRED_READS) {
+ samFilters.add(countingPairedFilter);
+ }
+
+ final FilteringSamIterator filteredSamRecordIterator = new FilteringSamIterator(samRecordIterator, new AggregateFilter(samFilters));
+ log.info("Queried BAM, getting duplicate sets.");
+
+ // get duplicate iterator from iterator above
+ final DuplicateSetIterator duplicateSets = new DuplicateSetIterator(filteredSamRecordIterator, in.getFileHeader(), false, null, log);
+
+ log.info("Starting iteration on duplicate sets");
+
+ set:
+ while (duplicateSets.hasNext()) {
+
+ final DuplicateSet set = duplicateSets.next();
+ final SAMRecord setRep = set.getRepresentative();
+
+ progress.record(setRep);
+ Set barcodes = new HashSet<>();
+ for (final SAMRecord read : set.getRecords()) {
+ if (!read.hasAttribute(BARCODE_TAG)) {
+ log.warn("no barcode tag!");
+ continue;
+ }
+ if (read.hasAttribute(BARCODE_BQ)) {
+ final String barcodeBQ = read.getStringAttribute(BARCODE_BQ).replace(" ", "");
+ ;
+ final byte[] bytes = SAMUtils.fastqToPhred(barcodeBQ);
+ final boolean badQuality = IntStream.range(0, bytes.length).map(i -> bytes[i]).anyMatch(q -> q < MINIMUM_BARCODE_BQ);
+ if (badQuality) {
+ log.warn("bad quality barcode");
+ continue;
+ }
+ }
+ barcodes.add(read.getStringAttribute(BARCODE_TAG));
+ }
+ umiCount.increment(barcodes.size(), 1);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException("Problem while reading file: " + INPUT, e);
+ }
+ log.info("Iteration done. Emitting metrics.");
+ log.info(String.format("Processed %d sets", progress.getCount()));
+ log.info(String.format("Filtered %d unpaired reads", countingPairedFilter.getFilteredRecords()));
+ log.info(String.format("Filtered %d unaligned reads", countingAlignedFilter.getFilteredRecords()));
+ log.info(String.format("Filtered %d low mapQ reads", countingMapQFilter.getFilteredRecords()));
+ log.info(String.format("Filtered %d Secondary or Supplementary reads", countingSecondaryOrSupplementaryFilter.getFilteredRecords()));
+ // Emit metrics
+ final MetricsFile, Integer> metricsFile = getMetricsFile();
+ metricsFile.addHistogram(umiCount);
+ metricsFile.write(OUTPUT);
+ return 0;
+ }
+}
diff --git a/src/main/java/picard/filter/CountingFilterWrapper.java b/src/main/java/picard/filter/CountingFilterWrapper.java
new file mode 100644
index 0000000000..58984c1a11
--- /dev/null
+++ b/src/main/java/picard/filter/CountingFilterWrapper.java
@@ -0,0 +1,19 @@
+package picard.filter;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.filter.SamRecordFilter;
+
+/**
+ * A CountingFilter that wraps any SamRecordFilter and provides a count of the reads and bases filtered
+ */
+public class CountingFilterWrapper extends CountingFilter {
+ private final SamRecordFilter wrappedFilter;
+ public CountingFilterWrapper(SamRecordFilter wrappedFilter) {
+ this.wrappedFilter = wrappedFilter;
+ }
+
+ @Override
+ public boolean reallyFilterOut(SAMRecord record) {
+ return wrappedFilter.filterOut(record);
+ }
+}
diff --git a/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java b/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java
new file mode 100644
index 0000000000..e06b91d753
--- /dev/null
+++ b/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java
@@ -0,0 +1,50 @@
+package picard.analysis;
+
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.Histogram;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.cmdline.CommandLineProgramTest;
+
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+
+import static org.testng.Assert.*;
+
+public class CollectUmiPrevalenceMetricsTest extends CommandLineProgramTest {
+
+ @Override
+ public String getCommandLineProgramName() { return CollectUmiPrevalenceMetrics.class.getSimpleName();}
+
+
+ @Test
+ public void integrationTest() throws IOException {
+ final File TEST_DIR = new File("testdata/picard/independent_replicates/");
+
+ final File samFile = new File(TEST_DIR, "twopairsWithManyUmis.sam");
+ final File metricsFile = File.createTempFile("test", ".umi_hist");
+ metricsFile.deleteOnExit();
+
+ final String[] args = new String[]{
+ "INPUT=" + samFile.getAbsolutePath(),
+ "OUTPUT=" + metricsFile.getAbsolutePath(),
+ "MINIMUM_MQ=20"
+ };
+ Assert.assertEquals(runPicardCommandLine(args), 0);
+
+ final MetricsFile, Integer> output = new MetricsFile<>();
+ output.read(new FileReader(metricsFile));
+
+ final Histogram hist = output.getHistogram();
+
+ assertNull(hist.get(1));
+ assertEquals(hist.get(2).getValue(),4);
+ assertEquals(hist.get(3).getValue(),1);
+ assertEquals(hist.get(4).getValue(),1);
+ assertNull(hist.get(5));
+ assertNull(hist.get(6));
+ assertEquals(hist.getCount(),6); // sets
+ assertEquals(hist.getSum(),15); // unique barcodes (in sets)
+ }
+}
\ No newline at end of file
diff --git a/testdata/picard/independent_replicates/twopairsWithManyUmis.sam b/testdata/picard/independent_replicates/twopairsWithManyUmis.sam
new file mode 100644
index 0000000000..134b9bc4d1
--- /dev/null
+++ b/testdata/picard/independent_replicates/twopairsWithManyUmis.sam
@@ -0,0 +1,30 @@
+@HD VN:1.0 SO:coordinate
+@SQ SN:chr1 LN:501
+@SQ SN:chr2 LN:501
+@SQ SN:chr3 LN:501
+@SQ SN:chr4 LN:101
+@SQ SN:chr5 LN:101
+@SQ SN:chr6 LN:101
+@SQ SN:chr7 LN:404
+@SQ SN:chr8 LN:202
+@RG ID:1AAXX.3 SM:test LB:mylib.yossi PL:ILLUMINA
+@PG ID:bwa PN:bwa VN:3 CL:bwa aln
+@CO manually created file. second pair is a copy of the first pair except that it's marked as a duplicate, and the 10th base of the
+@CO first read is modified C->t (lower case for easy view). this corresponds to position 11
+@CO 123456789 123456789 123456789 123456789 123456789 1234567890
+1AAXX.3.1.1 83 chr1 1 255 101M = 302 0201 CAACAGAAGCCGGTATCTGaGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1AAXX.3.1.2 83 chr1 1 255 101M = 302 0201 CAACAGAAGCtGGNATCTGaGcTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA
+1AAXX.3.1.1 163 chr1 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1AAXX.3.1.2 163 chr1 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA
+1AAXX.3.2.1 99 chr2 1 255 101M = 302 0201 CAACAGAAGCCGGTATCTGaGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1AAXX.3.2.2 163 chr2 1 255 101M = 302 0201 CAACAGAAGCtGGNATCTGaGcTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA
+1AAXX.3.2.1 147 chr2 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1AAXX.3.2.2 83 chr2 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA
+1ACXX.3.3.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1ACXX.3.3.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA
+1ACXX.3.4.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1ACXX.3.4.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:GG-GG QX:Z:AA AA
+1ACXX.3.3.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA
+1ACXX.3.3.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA
+1ACXX.3.4.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:TT-TT QX:Z:AA AA
+1ACXX.3.4.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:GG-GG QX:Z:AA AA