Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing MergeBamAlignment creates invalid BAMs in the presence of secondary alignments #932

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@
/**
* This strategy was designed for TopHat output, but could be of general utility. It picks the alignment with best MAPQ.
* If paired-end, it is the alignment in which the sum of the MAPQs of both ends is the best. In case of ties, one
* is selected arbitrarily. This strategy expects pair-aware alignments, with the corresponding alignment for each
* mate of the pair correlated by HI (hit index) tag. If the aligner has set a pair of alignments as primary, this
* is selected arbitrarily. If the aligner has set a pair of alignments as primary, this
* is used (assuming one of those alignments is not filtered out). Otherwise the alignment pair with best MapQ is
* selected.
*/
Expand All @@ -49,7 +48,7 @@ public class BestMapqPrimaryAlignmentSelectionStrategy implements PrimaryAlignme
public void pickPrimaryAlignment(final HitsForInsert hits) {

if (hits.numHits() == 0) throw new IllegalArgumentException("No alignments to pick from");
hits.coordinateByHitIndex();
hits.coordinateByMate();
// See if primary alignment is not already unambiguously determined.
final NumPrimaryAlignmentState firstEndAlignmentState = hits.tallyPrimaryAlignments(true);
final NumPrimaryAlignmentState secondEndAlignmentState = hits.tallyPrimaryAlignments(false);
Expand Down
117 changes: 80 additions & 37 deletions src/main/java/picard/sam/HitsForInsert.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
import htsjdk.samtools.SAMTag;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

/**
* Holds all the hits (alignments) for a read or read pair. For single-end reads, all the alignments are
Expand All @@ -48,28 +48,26 @@
*/
class HitsForInsert {

private static final HitIndexComparator comparator = new HitIndexComparator();

public enum NumPrimaryAlignmentState {
NONE, ONE, MORE_THAN_ONE
}

// These are package-visible to make life easier for the PrimaryAlignmentSelectionStrategies.
final List<SAMRecord> firstOfPairOrFragment = new ArrayList<SAMRecord>();
final List<SAMRecord> secondOfPair = new ArrayList<SAMRecord>();
final List<SAMRecord> firstOfPairOrFragment = new ArrayList<>();
final List<SAMRecord> secondOfPair = new ArrayList<>();

private final List<SAMRecord> supplementalFirstOfPairOrFragment = new ArrayList<SAMRecord>();
private final List<SAMRecord> supplementalSecondOfPair = new ArrayList<SAMRecord>();
private final List<SAMRecord> supplementalFirstOfPairOrFragment = new ArrayList<>();
private final List<SAMRecord> supplementalSecondOfPair = new ArrayList<>();

/**
* @throws if numHits() == 0
* @throws IllegalStateException if numHits() == 0
*/
public String getReadName() {
return getRepresentativeRead().getReadName();
}

/**
* @throws if numHits() == 0
* @throws IllegalStateException if numHits() == 0
*/
public boolean isPaired() {
return getRepresentativeRead().getReadPairedFlag();
Expand Down Expand Up @@ -152,7 +150,7 @@ public SAMRecord getSecondOfPair(final int i) {
* @return the index, or -1 if no primary was found.
*/
public int getIndexOfEarliestPrimary() {
for (int i = 0; i < numHits(); i++) {
for (int i = 0; i < numHits(); ++i) {
final SAMRecord firstAligned = getFirstOfPair(i);
final SAMRecord secondAligned = getSecondOfPair(i);
final boolean isPrimaryAlignment = (firstAligned != null && !firstAligned.isSecondaryOrSupplementary()) ||
Expand Down Expand Up @@ -195,8 +193,12 @@ public void setPrimaryAlignment(final int primaryAlignmentIndex) {
*/
public void coordinateByHitIndex() {
// Sort by HI value, with reads with no HI going at the end.
Collections.sort(firstOfPairOrFragment, comparator);
Collections.sort(secondOfPair, comparator);
final Comparator<SAMRecord> comparator = Comparator.comparing(
record -> record.getIntegerAttribute(SAMTag.HI.name()),
Comparator.nullsLast(Comparator.naturalOrder())
);
firstOfPairOrFragment.sort(comparator);
secondOfPair.sort(comparator);

// Insert nulls as necessary in the two lists so that correlated alignments have the same index
// and uncorrelated alignments have null in the other list at the corresponding index.
Expand All @@ -217,16 +219,18 @@ public void coordinateByHitIndex() {
secondOfPair.add(i, null);
}
}
}

// Now renumber any correlated alignments, and remove hit index if no correlated read.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for breaking this out and cleaning up hi, and so on.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your feedback=)

int hi = 0;
/**
* Renumber any correlated alignments, and remove hit index if no correlated read.
*/
private void renumberHitIndex() {
for (int i = 0; i < numHits(); ++i) {
final SAMRecord first = getFirstOfPair(i);
final SAMRecord second = getSecondOfPair(i);
if (first != null && second != null) {
first.setAttribute(SAMTag.HI.name(), i);
second.setAttribute(SAMTag.HI.name(), i);
++hi;
} else if (first != null) {
first.setAttribute(SAMTag.HI.name(), null);
} else {
Expand All @@ -235,42 +239,81 @@ public void coordinateByHitIndex() {
}
}

/**
* This method lines up alignments in firstOfPairOrFragment and secondOfPair lists so that the first and the second
* records of each pair have the same index.
*/
void coordinateByMate() {
final List<SAMRecord> newFirstOfPairOrFragment = new ArrayList<>();
final List<SAMRecord> newSecondOfPair = new ArrayList<>();

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this file preferred ++i to i++.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you might see (here for instance), there was a usage of a postfix increment in this file before. So I just followed this example.
All increment operators in this file changed to prefix form.

for (int i = 0; i < firstOfPairOrFragment.size(); ++i) {
final SAMRecord first = firstOfPairOrFragment.get(i);
newFirstOfPairOrFragment.add(i, first);
newSecondOfPair.add(null);

for (int k = 0; k < secondOfPair.size(); ++k) {
final SAMRecord second = secondOfPair.get(k);
if (arePair(first, second)) {
newSecondOfPair.set(i, second);
secondOfPair.set(k, null);
break;
}
}
}

for (int i = 0; i < secondOfPair.size(); ++i) {
if (secondOfPair.get(i) != null) {
newFirstOfPairOrFragment.add(i, null);
newSecondOfPair.add(i, secondOfPair.get(i));
}
}

firstOfPairOrFragment.clear();
firstOfPairOrFragment.addAll(newFirstOfPairOrFragment);
secondOfPair.clear();
secondOfPair.addAll(newSecondOfPair);

renumberHitIndex();
}

/**
* Identifies whether records present pairwise alignment or not.
*
* It is unnecessary to check QNAME here cause an object of {@code HitsForInsert}
* presents only records with the same QNAME.
*
* @return {@code true} if records belong to the same pairwise alignment
*/
private static boolean arePair(final SAMRecord first, final SAMRecord second) {
return first != null && second != null
&& first.getMateAlignmentStart() == second.getAlignmentStart()
&& first.getAlignmentStart() == second.getMateAlignmentStart()
&& !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(first.getMateReferenceName())
&& !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(second.getMateReferenceName())
&& Objects.equals(first.getReferenceName(), second.getMateReferenceName())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why use Object.equals() here? is it possible that one of these is null?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since both SamRecod#mReferenceName and SamRecord#mMateReferenceName are nullable we need to ensure that they are non-null or simply delegate it to Objects.equals to avoid NullPointerException. This is a theoretical case that may hardly ever occur in runtime. So, do we need to prevent this case or can I unwrap this check?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's OK, I was wondering if this was added to fix a known problem or was just precautionary.

Looking at SamRecord I see that it's common to check these fields for null so it seems wise to do it here too.

It would be great if picard had nullable constraints in the code base! That would be a huge PR though.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional can be rather helpful here and it would be just great if we could use Try and Lazy types from vavr to reduce service-code and to clarify Picard logic!=D

&& Objects.equals(first.getMateReferenceName(), second.getReferenceName());
}

/**
* Determine if there is a single primary alignment in a list of alignments.
* @param records
* @return NONE, ONE or MORE_THAN_ONE.
*/
private NumPrimaryAlignmentState tallyPrimaryAlignments(final List<SAMRecord> records) {
boolean seenPrimary = false;
for (int i = 0; i < records.size(); ++i) {
if (records.get(i) != null && !records.get(i).isSecondaryOrSupplementary()) {
for (SAMRecord record : records) {
if (record != null && !record.isSecondaryOrSupplementary()) {
if (seenPrimary) return NumPrimaryAlignmentState.MORE_THAN_ONE;
else seenPrimary = true;
}
}
if (seenPrimary) return NumPrimaryAlignmentState.ONE;
else return NumPrimaryAlignmentState.NONE;
return seenPrimary ? NumPrimaryAlignmentState.ONE : NumPrimaryAlignmentState.NONE;
}

public NumPrimaryAlignmentState tallyPrimaryAlignments(final boolean firstEnd) {
if (firstEnd) return tallyPrimaryAlignments(firstOfPairOrFragment);
else return tallyPrimaryAlignments(secondOfPair);
}

// null HI tag sorts after any non-null.
private static class HitIndexComparator implements Comparator<SAMRecord> {
public int compare(final SAMRecord rec1, final SAMRecord rec2) {
final Integer hi1 = rec1.getIntegerAttribute(SAMTag.HI.name());
final Integer hi2 = rec2.getIntegerAttribute(SAMTag.HI.name());
if (hi1 == null) {
if (hi2 == null) return 0;
else return 1;
} else if (hi2 == null) {
return -1;
} else {
return hi1.compareTo(hi2);
}
}
final List<SAMRecord> records = firstEnd ? firstOfPairOrFragment : secondOfPair;
return tallyPrimaryAlignments(records);
}

List<SAMRecord> getSupplementalFirstOfPairOrFragment() {
Expand Down
83 changes: 81 additions & 2 deletions src/test/java/picard/sam/MergeBamAlignmentTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1931,7 +1931,6 @@ public void testHeaderFromMappedBreaks(final String filename) throws IOException
assertSamValid(mergedSam);
IOUtil.assertFilesEqual(expectedSam, mergedSam);
}

@DataProvider(name = "mappedReadInUnmappedBamDataProvider")
Object[][] mappedReadInUnmappedBamDataProvider() {
return new Object[][] {
Expand Down Expand Up @@ -1999,5 +1998,85 @@ public void testMappedReadInUnmappedBam(final boolean paired, final boolean firs
runPicardCommandLine(args);
}
}
}


@Test
public void testPairedReadsAreProcessedProperlyInAbsenceOfHitIndexTags () throws IOException {
final File unmappedSam = new File(TEST_DATA_DIR, "unmapped1.sam");
final File alignedSam = new File(TEST_DATA_DIR, "aligned.sam");
final File expectedSam = new File(TEST_DATA_DIR, "merged_expected.sam");
final File refFasta = new File(TEST_DATA_DIR, "reference.fasta");
final File mergedSam = File.createTempFile("merged", ".sam");
mergedSam.deleteOnExit();

doMergeAlignment(unmappedSam, Collections.singletonList(alignedSam),
null, null, null, null,
false, true, false, 1,
"0", "1.0", "align!", "myAligner",
true, refFasta, mergedSam,
null, null, null, null, true, SAMFileHeader.SortOrder.queryname);

assertSamValid(mergedSam);
IOUtil.assertFilesEqual(expectedSam, mergedSam);
}

@Test
public void testCoordinateByMateMethodWorksProperly () {
final HitsForInsert hitsForInsert = new HitsForInsert();
final SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(SAMFileHeader.SortOrder.queryname);

{ // Both first and second of pair
final SAMRecord first = new SAMRecord(header);
final SAMRecord second = new SAMRecord(header);

first.setReferenceName("chrm1");
first.setAlignmentStart(20);
first.setMateReferenceName("chrm2");
first.setMateAlignmentStart(1);
second.setReferenceName("chrm2");
second.setAlignmentStart(1);
second.setMateReferenceName("chrm1");
second.setMateAlignmentStart(20);

hitsForInsert.addFirstOfPairOrFragment(first);
hitsForInsert.addSecondOfPair(second);
}

{ // Only first
final SAMRecord first = new SAMRecord(header);

first.setReferenceName("chrm1");
first.setAlignmentStart(8);
first.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
first.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);

hitsForInsert.addFirstOfPairOrFragment(first);
}

{ // Only Second
final SAMRecord second = new SAMRecord(header);

second.setReferenceName("chrm2");
second.setAlignmentStart(15);
second.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
second.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);

hitsForInsert.addFirstOfPairOrFragment(second);
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is 3 some function of HitsForInsert::numHits?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, fixed here.

for (int i = 0; i < hitsForInsert.numHits(); i++) {
final SAMRecord first = hitsForInsert.getFirstOfPair(i);
final SAMRecord second = hitsForInsert.getSecondOfPair(i);

if (first != null && second != null) {
Assert.assertEquals(first.getMateAlignmentStart(), second.getAlignmentStart());
Assert.assertEquals(second.getMateAlignmentStart(), first.getAlignmentStart());
} else if (first != null) {
Assert.assertEquals(first.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START);
} else {
Assert.assertEquals(second.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START);
}
}
}
}
7 changes: 7 additions & 0 deletions testdata/picard/sam/MergeBamAlignment/aligned.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@SQ SN:chr2 LN:243199373
@SQ SN:chr4 LN:191154276
@SQ SN:chr21 LN:48129895
@RG ID:RG1 PL:Illumina SM:some_sample
example1 113 chr21 10856814 0 29S91M31S chr2 89872071 0 GAACCCGAATAGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATCAACGCGAGTGCAGGGGAATGGAATGGAATG 0?/112F4HDBFHHFGF4FHGGHHH4HHFGBGF31GHG@3HHHFFBHHEBF5EADFFHHHHHHGHHHGDFHGFFEHHFFEGGHHHHHGGDFHHHHHHHHHHHHHGHHHHHHHHHHHHHHHGGGGGFHHHGGGGGFGFGGFFFFFFFBBBBB NM:i:4 RG:Z:RG1
example1 177 chr2 89872071 25 79S63M9S chr21 10856814 0 GCATTCCATTCTACTCCAGTTAATCCCATTCCATTCTATTCCATTCCACTCCATTCCATTCCATTGCAATGGAGTGGACTCCATTCCATTCCATTCCATTCCGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGCACTC 1B1111B44433B033443B33?B03GB3FHFF33333333B3BBFBB233B3EFBB3BC3GDDF5FHGE23FFA3AF2CBFFDFFF3G3E33DFHGFEFGGHHFEHHFHHGHHGGE5BE5HHFFEC4GFGGFGFEC4E?@BFFFFAABBB NM:i:2 RG:Z:RG1
example1 417 chr4 49644617 6 86S40M25S chr21 10856814 0 GAGTGCAGGGGAATGGAATGGAATGGAATGCAATGGAATGGAATCATCCGGAATGGAATGGAATGGAATGGAGTCCACTCCATTGCAATGGAATGGAATGGAGTGGAATGGAATAGAATGGAATGGGATTAACTGGAGTAGAATGGAATGC BBBAAFFFFB@?E4CEFGFGGFG4CEFFHH5EB5EGGHHGHHFHHEFHHGGFEFGHFD33E3G3FFFDFFBC2FA3AFF32EGHF5FDDG3CB3BBFE3B332BBFBB3B33333333FFHF3BG30B?33B344330B33444B1111B1 NM:i:0 RG:Z:RG1
9 changes: 9 additions & 0 deletions testdata/picard/sam/MergeBamAlignment/merged_expected.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
@HD VN:1.5 SO:queryname
@SQ SN:chr2 LN:243199373
@SQ SN:chr4 LN:191154276
@SQ SN:chr21 LN:48129895
@RG ID:RG1 PL:Illumina SM:some_sample
@PG ID:0 VN:1.0 CL:align! PN:myAligner
example1 113 chr21 10856814 0 29S91M31S chr2 89872071 0 GAACCCGAATAGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATCAACGCGAGTGCAGGGGAATGGAATGGAATG 0?/112F4HDBFHHFGF4FHGGHHH4HHFGBGF31GHG@3HHHFFBHHEBF5EADFFHHHHHHGHHHGDFHGFFEHHFFEGGHHHHHGGDFHHHHHHHHHHHHHGHHHHHHHHHHHHHHHGGGGGFHHHGGGGGFGFGGFFFFFFFBBBBB MC:Z:79S63M9S PG:Z:0 RG:Z:RG1 HI:i:0 NM:i:4 MQ:i:25
example1 177 chr2 89872071 25 79S63M9S chr21 10856814 0 GCATTCCATTCTACTCCAGTTAATCCCATTCCATTCTATTCCATTCCACTCCATTCCATTCCATTGCAATGGAGTGGACTCCATTCCATTCCATTCCATTCCGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGCACTC 1B1111B44433B033443B33?B03GB3FHFF33333333B3BBFBB233B3EFBB3BC3GDDF5FHGE23FFA3AF2CBFFDFFF3G3E33DFHGFEFGGHHFEHHFHHGHHGGE5BE5HHFFEC4GFGGFGFEC4E?@BFFFFAABBB MC:Z:29S91M31S PG:Z:0 RG:Z:RG1 HI:i:0 NM:i:2 MQ:i:0
example1 393 chr4 49644617 6 86S40M25S = 49644617 0 GAGTGCAGGGGAATGGAATGGAATGGAATGCAATGGAATGGAATCATCCGGAATGGAATGGAATGGAATGGAGTCCACTCCATTGCAATGGAATGGAATGGAGTGGAATGGAATAGAATGGAATGGGATTAACTGGAGTAGAATGGAATGC BBBAAFFFFB@?E4CEFGFGGFG4CEFFHH5EB5EGGHHGHHFHHEFHHGGFEFGHFD33E3G3FFFDFFBC2FA3AFF32EGHF5FDDG3CB3BBFE3B332BBFBB3B33333333FFHF3BG30B?33B344330B33444B1111B1 PG:Z:0 RG:Z:RG1 NM:i:0
4 changes: 4 additions & 0 deletions testdata/picard/sam/MergeBamAlignment/reference.dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.0 SO:unsorted
@SQ SN:chr2 LN:243199373
@SQ SN:chr4 LN:191154276
@SQ SN:chr21 LN:48129895
1 change: 1 addition & 0 deletions testdata/picard/sam/MergeBamAlignment/reference.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
; picard.sam.MergeBamAlignmentTest.testPairedReadsAreProcessedProperlyInAbsenceOfHitIndexTags
4 changes: 4 additions & 0 deletions testdata/picard/sam/MergeBamAlignment/unmapped1.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.5 SO:queryname
@RG ID:RG1 PL:Illumina SM:some_sample
example1 77 * 0 0 * * 0 0 CATTCCATTCCATTCCCCTGCACTCGCGTTGATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCTATTCGGGTTC BBBBBFFFFFFFGGFGFGGGGGHHHFGGGGGHHHHHHHHHHHHHHHGHHHHHHHHHHHHHFDGGHHHHHGGEFFHHEFFGHFDGHHHGHHHHHHFFDAE5FBEHHBFFHHH3@GHG13FGBGFHH4HHHGGHF4FGFHHFBDH4F211/?0 RG:Z:RG1
example1 141 * 0 0 * * 0 0 GAGTGCAGGGGAATGGAATGGAATGGAATGCAATGGAATGGAATCATCCGGAATGGAATGGAATGGAATGGAGTCCACTCCATTGCAATGGAATGGAATGGAGTGGAATGGAATAGAATGGAATGGGATTAACTGGAGTAGAATGGAATGC BBBAAFFFFB@?E4CEFGFGGFG4CEFFHH5EB5EGGHHGHHFHHEFHHGGFEFGHFD33E3G3FFFDFFBC2FA3AFF32EGHF5FDDG3CB3BBFE3B332BBFBB3B33333333FFHF3BG30B?33B344330B33444B1111B1 RG:Z:RG1