From fad9986e5dc02cb5e7acd00c5ef9acd733095223 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 14 Feb 2025 14:26:47 -0500 Subject: [PATCH 1/4] feat: add CollectUmiPrevelanceMetrics Add a CLP that collects the histogram of the number of unique UMIs found within duplicate sets (constructed _without_ consideration for UMIs) --- .../analysis/CollectUmiPrevelanceMetrics.java | 187 ++++++++++++++++++ .../twopairsWithUMIsMultipleOrientations.sam | 6 +- 2 files changed, 190 insertions(+), 3 deletions(-) create mode 100644 src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java diff --git a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java b/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java new file mode 100644 index 0000000000..caf5f86cfb --- /dev/null +++ b/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java @@ -0,0 +1,187 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.*; +import htsjdk.samtools.filter.*; +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.*; +import htsjdk.utils.ValidationUtils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.filter.*; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.PicardException; +import picard.analysis.replicates.IndependentReplicateMetric; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup; +import picard.filter.CountingMapQFilter; +import picard.filter.CountingPairedFilter; +import picard.util.SequenceDictionaryUtils; + +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; + +/** + * A CLP that, tallys the number of UMIs in a duplicate set across the entire bam and produces a histogram. + * + * @author Yossi Farjoun + */ +@DocumentedFeature +@ExperimentalFeature +@CommandLineProgramProperties( + summary = CollectUmiPrevelanceMetrics.USAGE_SUMMARY + CollectUmiPrevelanceMetrics.USAGE_DETAILS, + oneLineSummary = CollectUmiPrevelanceMetrics.USAGE_SUMMARY, + programGroup = DiagnosticsAndQCProgramGroup.class +) +public class CollectUmiPrevelanceMetrics extends CommandLineProgram { + + static final String USAGE_SUMMARY = "Tally the counts of UMIs in duplicate sets within a bam. \n"; + static final String USAGE_DETAILS = "

" + + "This tool estimates "; + + + @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input (indexed) BAM/CRAM file.") + public File INPUT; + + @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Write metrics to this file") + public File OUTPUT; + + @Argument(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "minimal value for the mapping quality of the reads to be used in the estimation.", optional = true) + public Integer MINIMUM_MQ = 40; + + @Argument(doc = "Number of sets to examine before stopping.", optional = true) + public Integer STOP_AFTER = 0; + + @Argument(doc = "Barcode SAM tag.", optional = true) + public String BARCODE_TAG = "RX"; + + @Argument(doc = "Barcode Quality SAM tag.", optional = true) + public String BARCODE_BQ = "QX"; + + @Argument(shortName = "BQ", doc = "minimal value for the base quality of all the bases in a molecular barcode, for it to be used.", optional = true) + public Integer MINIMUM_BARCODE_BQ = 30; + + @Argument(shortName = "FUR", doc = "Whether to filter unpaired reads from the input.", optional = true) + public boolean FILTER_UNPAIRED_READS = true; + + @Argument( + fullName = "PROGRESS_STEP_INTERVAL", + doc = "The interval between which progress will be displayed.", + optional = true + ) + public int PROGRESS_STEP_INTERVAL = 100000; + + + private static final Log log = Log.getInstance(CollectUmiPrevelanceMetrics.class); + + @Override + protected int doWork() { + + IOUtil.assertFileIsReadable(INPUT); + + // get an iterator to reads that overlap the heterozygous sites + final Histogram umiCount = new Histogram<>("numUmis", "duplicateSets"); + final CountingPairedFilter countingPairedFilter = new CountingPairedFilter(); + final CountingMapQFilter countingMapQFilter = new CountingMapQFilter(MINIMUM_MQ); + final ProgressLogger progress = new ProgressLogger(log, PROGRESS_STEP_INTERVAL, "examined", "duplicate sets"); + + try (SamReader in = SamReaderFactory.makeDefault() + .referenceSequence(REFERENCE_SEQUENCE) + .open(INPUT)) { + + + IOUtil.assertFileIsWritable(OUTPUT); + + final SAMRecordIterator samRecordIterator = in.iterator(); + final List samFilters = CollectionUtil.makeList( + new AlignedFilter(true), + new SecondaryOrSupplementaryFilter(), + countingMapQFilter + ); + if (FILTER_UNPAIRED_READS) { + samFilters.add(countingPairedFilter); + } + + final FilteringSamIterator filteredSamRecordIterator = new FilteringSamIterator(samRecordIterator, new AggregateFilter(samFilters)); + log.info("Queried BAM, getting duplicate sets."); + + // get duplicate iterator from iterator above + final DuplicateSetIterator duplicateSets = new DuplicateSetIterator(filteredSamRecordIterator, in.getFileHeader(), false, null, log); + + log.info("Starting iteration on reads"); + + set: + while (duplicateSets.hasNext()) { + + final DuplicateSet set = duplicateSets.next(); + final SAMRecord setRep = set.getRepresentative(); + log.info(String.format("duplicate set! %s", setRep.getReadName())); + + progress.record(setRep); + Set barcodes = new HashSet<>(); + for (final SAMRecord read : set.getRecords()) { + if (!read.hasAttribute(BARCODE_TAG)) { + log.warn("no barcode tag!"); + continue; + } + if (read.hasAttribute(BARCODE_BQ)) { + final byte[] bytes = SAMUtils.fastqToPhred(read.getStringAttribute(BARCODE_BQ)); + final boolean badQuality = IntStream.range(0, bytes.length).map(i -> bytes[i]).anyMatch(q -> q < MINIMUM_BARCODE_BQ); + if (badQuality) { + log.warn("bad quality barcode"); + continue; + } + } + barcodes.add(read.getStringAttribute(BARCODE_TAG)); + } + umiCount.increment(barcodes.size(), 1); + } + } catch (IOException e) { + throw new RuntimeException("Problem while reading file: " + INPUT, e); + } + log.info("Iteration done. Emitting metrics."); + log.info(String.format("Processed %d sets", progress.getCount())); + log.info(String.format("Filtered %d unpaired reads", countingPairedFilter.getFilteredRecords())); + log.info(String.format("Filtered %d low mapQ reads", countingMapQFilter.getFilteredRecords())); + // Emit metrics + final MetricsFile metricsFile = getMetricsFile(); + metricsFile.addHistogram(umiCount); + metricsFile.write(OUTPUT); + return 0; + } +} diff --git a/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam b/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam index 0523919ec4..40b51b87ad 100644 --- a/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam +++ b/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam @@ -23,8 +23,8 @@ 1ACXX.3.3.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA 1ACXX.3.3.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA 1ACXX.3.4.1 83 chr3 2 20 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA -1ACXX.3.4.2 83 chr3 2 20 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA +1ACXX.3.4.2 83 chr3 2 20 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCT QX:Z:AAAA 1ACXX.3.3.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA 1ACXX.3.3.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA -1ACXX.3.4.1 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA -1ACXX.3.4.2 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA +1ACXX.3.4.1 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAT QX:Z:AAAA +1ACXX.3.4.2 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCG QX:Z:AAAA From fe4867474215a4fcbe3cae928d5dd40e117f34ad Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 14 Feb 2025 17:22:26 -0500 Subject: [PATCH 2/4] fix: remove spaces from the BARCODE_BQ string prior to using --- src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java b/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java index caf5f86cfb..76216107a7 100644 --- a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java +++ b/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java @@ -160,7 +160,8 @@ protected int doWork() { continue; } if (read.hasAttribute(BARCODE_BQ)) { - final byte[] bytes = SAMUtils.fastqToPhred(read.getStringAttribute(BARCODE_BQ)); + final String barcodeBQ = read.getStringAttribute(BARCODE_BQ).replace(" ", "");; + final byte[] bytes = SAMUtils.fastqToPhred(barcodeBQ); final boolean badQuality = IntStream.range(0, bytes.length).map(i -> bytes[i]).anyMatch(q -> q < MINIMUM_BARCODE_BQ); if (badQuality) { log.warn("bad quality barcode"); From fed40cff1dd9d9eda6033679eb175381f2ed8130 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Fri, 14 Feb 2025 14:35:39 -0800 Subject: [PATCH 3/4] chore: do not log every duplicate set --- src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java b/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java index 76216107a7..23821817d5 100644 --- a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java +++ b/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java @@ -150,7 +150,6 @@ protected int doWork() { final DuplicateSet set = duplicateSets.next(); final SAMRecord setRep = set.getRepresentative(); - log.info(String.format("duplicate set! %s", setRep.getReadName())); progress.record(setRep); Set barcodes = new HashSet<>(); From 8159088479b7052ec7c77c3b08dd9ddc74e47c46 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 14 Feb 2025 23:57:05 -0500 Subject: [PATCH 4/4] chore: add test feat: add wrapping counting filter --- ....java => CollectUmiPrevalenceMetrics.java} | 50 ++++++++----------- .../picard/filter/CountingFilterWrapper.java | 19 +++++++ .../CollectUmiPrevalenceMetricsTest.java | 50 +++++++++++++++++++ .../twopairsWithManyUmis.sam | 30 +++++++++++ .../twopairsWithUMIsMultipleOrientations.sam | 6 +-- 5 files changed, 124 insertions(+), 31 deletions(-) rename src/main/java/picard/analysis/{CollectUmiPrevelanceMetrics.java => CollectUmiPrevalenceMetrics.java} (80%) create mode 100644 src/main/java/picard/filter/CountingFilterWrapper.java create mode 100644 src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java create mode 100644 testdata/picard/independent_replicates/twopairsWithManyUmis.sam diff --git a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java b/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java similarity index 80% rename from src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java rename to src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java index 23821817d5..433d366d84 100644 --- a/src/main/java/picard/analysis/CollectUmiPrevelanceMetrics.java +++ b/src/main/java/picard/analysis/CollectUmiPrevalenceMetrics.java @@ -28,51 +28,43 @@ import htsjdk.samtools.filter.*; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.*; -import htsjdk.utils.ValidationUtils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.filter.*; -import htsjdk.variant.vcf.VCFContigHeaderLine; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.argparser.ExperimentalFeature; import org.broadinstitute.barclay.help.DocumentedFeature; -import picard.PicardException; -import picard.analysis.replicates.IndependentReplicateMetric; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup; +import picard.filter.CountingFilterWrapper; import picard.filter.CountingMapQFilter; import picard.filter.CountingPairedFilter; -import picard.util.SequenceDictionaryUtils; import java.io.File; import java.io.IOException; import java.util.*; -import java.util.stream.Collectors; import java.util.stream.IntStream; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; /** - * A CLP that, tallys the number of UMIs in a duplicate set across the entire bam and produces a histogram. + * A CLP that, tallies the number of UMIs in a duplicate set across the entire bam and produces a histogram. * * @author Yossi Farjoun */ @DocumentedFeature @ExperimentalFeature @CommandLineProgramProperties( - summary = CollectUmiPrevelanceMetrics.USAGE_SUMMARY + CollectUmiPrevelanceMetrics.USAGE_DETAILS, - oneLineSummary = CollectUmiPrevelanceMetrics.USAGE_SUMMARY, + summary = CollectUmiPrevalenceMetrics.USAGE_SUMMARY + CollectUmiPrevalenceMetrics.USAGE_DETAILS, + oneLineSummary = CollectUmiPrevalenceMetrics.USAGE_SUMMARY, programGroup = DiagnosticsAndQCProgramGroup.class ) -public class CollectUmiPrevelanceMetrics extends CommandLineProgram { +public class CollectUmiPrevalenceMetrics extends CommandLineProgram { static final String USAGE_SUMMARY = "Tally the counts of UMIs in duplicate sets within a bam. \n"; static final String USAGE_DETAILS = "

" + - "This tool estimates "; + "This tool collects the Histogram of the number of duplicate sets that contain a given number of UMIs. " + + "Understanding this distribution can help understand the role that the UMIs have in the determination of " + + "consensus sets, the risk of UMI collisions, and of spurious reads that result from uncorrected UMIs."; @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input (indexed) BAM/CRAM file.") @@ -82,10 +74,7 @@ public class CollectUmiPrevelanceMetrics extends CommandLineProgram { public File OUTPUT; @Argument(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "minimal value for the mapping quality of the reads to be used in the estimation.", optional = true) - public Integer MINIMUM_MQ = 40; - - @Argument(doc = "Number of sets to examine before stopping.", optional = true) - public Integer STOP_AFTER = 0; + public Integer MINIMUM_MQ = 30; @Argument(doc = "Barcode SAM tag.", optional = true) public String BARCODE_TAG = "RX"; @@ -104,10 +93,10 @@ public class CollectUmiPrevelanceMetrics extends CommandLineProgram { doc = "The interval between which progress will be displayed.", optional = true ) - public int PROGRESS_STEP_INTERVAL = 100000; + public int PROGRESS_STEP_INTERVAL = 1_000_000; - private static final Log log = Log.getInstance(CollectUmiPrevelanceMetrics.class); + private static final Log log = Log.getInstance(CollectUmiPrevalenceMetrics.class); @Override protected int doWork() { @@ -117,21 +106,23 @@ protected int doWork() { // get an iterator to reads that overlap the heterozygous sites final Histogram umiCount = new Histogram<>("numUmis", "duplicateSets"); final CountingPairedFilter countingPairedFilter = new CountingPairedFilter(); + final CountingFilterWrapper countingAlignedFilter = new CountingFilterWrapper(new AlignedFilter(true)); final CountingMapQFilter countingMapQFilter = new CountingMapQFilter(MINIMUM_MQ); + final CountingFilterWrapper countingSecondaryOrSupplementaryFilter = + new CountingFilterWrapper(new SecondaryOrSupplementaryFilter()); final ProgressLogger progress = new ProgressLogger(log, PROGRESS_STEP_INTERVAL, "examined", "duplicate sets"); try (SamReader in = SamReaderFactory.makeDefault() .referenceSequence(REFERENCE_SEQUENCE) .open(INPUT)) { - IOUtil.assertFileIsWritable(OUTPUT); final SAMRecordIterator samRecordIterator = in.iterator(); final List samFilters = CollectionUtil.makeList( - new AlignedFilter(true), - new SecondaryOrSupplementaryFilter(), - countingMapQFilter + countingAlignedFilter, + countingMapQFilter, + countingSecondaryOrSupplementaryFilter ); if (FILTER_UNPAIRED_READS) { samFilters.add(countingPairedFilter); @@ -143,7 +134,7 @@ protected int doWork() { // get duplicate iterator from iterator above final DuplicateSetIterator duplicateSets = new DuplicateSetIterator(filteredSamRecordIterator, in.getFileHeader(), false, null, log); - log.info("Starting iteration on reads"); + log.info("Starting iteration on duplicate sets"); set: while (duplicateSets.hasNext()) { @@ -159,7 +150,8 @@ protected int doWork() { continue; } if (read.hasAttribute(BARCODE_BQ)) { - final String barcodeBQ = read.getStringAttribute(BARCODE_BQ).replace(" ", "");; + final String barcodeBQ = read.getStringAttribute(BARCODE_BQ).replace(" ", ""); + ; final byte[] bytes = SAMUtils.fastqToPhred(barcodeBQ); final boolean badQuality = IntStream.range(0, bytes.length).map(i -> bytes[i]).anyMatch(q -> q < MINIMUM_BARCODE_BQ); if (badQuality) { @@ -177,7 +169,9 @@ protected int doWork() { log.info("Iteration done. Emitting metrics."); log.info(String.format("Processed %d sets", progress.getCount())); log.info(String.format("Filtered %d unpaired reads", countingPairedFilter.getFilteredRecords())); + log.info(String.format("Filtered %d unaligned reads", countingAlignedFilter.getFilteredRecords())); log.info(String.format("Filtered %d low mapQ reads", countingMapQFilter.getFilteredRecords())); + log.info(String.format("Filtered %d Secondary or Supplementary reads", countingSecondaryOrSupplementaryFilter.getFilteredRecords())); // Emit metrics final MetricsFile metricsFile = getMetricsFile(); metricsFile.addHistogram(umiCount); diff --git a/src/main/java/picard/filter/CountingFilterWrapper.java b/src/main/java/picard/filter/CountingFilterWrapper.java new file mode 100644 index 0000000000..58984c1a11 --- /dev/null +++ b/src/main/java/picard/filter/CountingFilterWrapper.java @@ -0,0 +1,19 @@ +package picard.filter; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.filter.SamRecordFilter; + +/** + * A CountingFilter that wraps any SamRecordFilter and provides a count of the reads and bases filtered + */ +public class CountingFilterWrapper extends CountingFilter { + private final SamRecordFilter wrappedFilter; + public CountingFilterWrapper(SamRecordFilter wrappedFilter) { + this.wrappedFilter = wrappedFilter; + } + + @Override + public boolean reallyFilterOut(SAMRecord record) { + return wrappedFilter.filterOut(record); + } +} diff --git a/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java b/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java new file mode 100644 index 0000000000..e06b91d753 --- /dev/null +++ b/src/test/java/picard/analysis/CollectUmiPrevalenceMetricsTest.java @@ -0,0 +1,50 @@ +package picard.analysis; + +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.Histogram; +import org.testng.Assert; +import org.testng.annotations.Test; +import picard.cmdline.CommandLineProgramTest; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; + +import static org.testng.Assert.*; + +public class CollectUmiPrevalenceMetricsTest extends CommandLineProgramTest { + + @Override + public String getCommandLineProgramName() { return CollectUmiPrevalenceMetrics.class.getSimpleName();} + + + @Test + public void integrationTest() throws IOException { + final File TEST_DIR = new File("testdata/picard/independent_replicates/"); + + final File samFile = new File(TEST_DIR, "twopairsWithManyUmis.sam"); + final File metricsFile = File.createTempFile("test", ".umi_hist"); + metricsFile.deleteOnExit(); + + final String[] args = new String[]{ + "INPUT=" + samFile.getAbsolutePath(), + "OUTPUT=" + metricsFile.getAbsolutePath(), + "MINIMUM_MQ=20" + }; + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(metricsFile)); + + final Histogram hist = output.getHistogram(); + + assertNull(hist.get(1)); + assertEquals(hist.get(2).getValue(),4); + assertEquals(hist.get(3).getValue(),1); + assertEquals(hist.get(4).getValue(),1); + assertNull(hist.get(5)); + assertNull(hist.get(6)); + assertEquals(hist.getCount(),6); // sets + assertEquals(hist.getSum(),15); // unique barcodes (in sets) + } +} \ No newline at end of file diff --git a/testdata/picard/independent_replicates/twopairsWithManyUmis.sam b/testdata/picard/independent_replicates/twopairsWithManyUmis.sam new file mode 100644 index 0000000000..134b9bc4d1 --- /dev/null +++ b/testdata/picard/independent_replicates/twopairsWithManyUmis.sam @@ -0,0 +1,30 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chr1 LN:501 +@SQ SN:chr2 LN:501 +@SQ SN:chr3 LN:501 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:1AAXX.3 SM:test LB:mylib.yossi PL:ILLUMINA +@PG ID:bwa PN:bwa VN:3 CL:bwa aln +@CO manually created file. second pair is a copy of the first pair except that it's marked as a duplicate, and the 10th base of the +@CO first read is modified C->t (lower case for easy view). this corresponds to position 11 +@CO 123456789 123456789 123456789 123456789 123456789 1234567890 +1AAXX.3.1.1 83 chr1 1 255 101M = 302 0201 CAACAGAAGCCGGTATCTGaGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.1.2 83 chr1 1 255 101M = 302 0201 CAACAGAAGCtGGNATCTGaGcTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1AAXX.3.1.1 163 chr1 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.1.2 163 chr1 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1AAXX.3.2.1 99 chr2 1 255 101M = 302 0201 CAACAGAAGCCGGTATCTGaGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.2.2 163 chr2 1 255 101M = 302 0201 CAACAGAAGCtGGNATCTGaGcTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1AAXX.3.2.1 147 chr2 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1AAXX.3.2.2 83 chr2 302 255 101M = 1 -201 NCGCGGCATCcCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1ACXX.3.3.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1ACXX.3.3.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1ACXX.3.4.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1ACXX.3.4.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:GG-GG QX:Z:AA AA +1ACXX.3.3.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AA-AA QX:Z:AA AA +1ACXX.3.3.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CC-CC QX:Z:AA AA +1ACXX.3.4.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:TT-TT QX:Z:AA AA +1ACXX.3.4.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:GG-GG QX:Z:AA AA diff --git a/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam b/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam index 40b51b87ad..0523919ec4 100644 --- a/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam +++ b/testdata/picard/independent_replicates/twopairsWithUMIsMultipleOrientations.sam @@ -23,8 +23,8 @@ 1ACXX.3.3.1 83 chr3 2 255 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA 1ACXX.3.3.2 83 chr3 2 255 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA 1ACXX.3.4.1 83 chr3 2 20 101M = 303 +201 AACAGAAGCCGGTATCTGaGTTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA -1ACXX.3.4.2 83 chr3 2 20 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCT QX:Z:AAAA +1ACXX.3.4.2 83 chr3 2 20 101M = 303 +201 AACAGAAGCtGGNATCTGaGcTTGTGTTTCgGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAANN AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA 1ACXX.3.3.1 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA 1ACXX.3.3.2 163 chr3 303 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA -1ACXX.3.4.1 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAT QX:Z:AAAA -1ACXX.3.4.2 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCG QX:Z:AAAA +1ACXX.3.4.1 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:AAAA QX:Z:AAAA +1ACXX.3.4.2 163 chr3 303 20 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAAtCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA RG:Z:1AAXX.3 MC:Z:101M RX:Z:CCCC QX:Z:AAAA