-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixing MergeBamAlignment creates invalid BAMs in the presence of secondary alignments
#932
base: master
Are you sure you want to change the base?
Changes from all commits
d9efb8e
340248d
e4c8c0d
90bce0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,9 +27,9 @@ | |
import htsjdk.samtools.SAMTag; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.Comparator; | ||
import java.util.List; | ||
import java.util.Objects; | ||
|
||
/** | ||
* Holds all the hits (alignments) for a read or read pair. For single-end reads, all the alignments are | ||
|
@@ -48,28 +48,26 @@ | |
*/ | ||
class HitsForInsert { | ||
|
||
private static final HitIndexComparator comparator = new HitIndexComparator(); | ||
|
||
public enum NumPrimaryAlignmentState { | ||
NONE, ONE, MORE_THAN_ONE | ||
} | ||
|
||
// These are package-visible to make life easier for the PrimaryAlignmentSelectionStrategies. | ||
final List<SAMRecord> firstOfPairOrFragment = new ArrayList<SAMRecord>(); | ||
final List<SAMRecord> secondOfPair = new ArrayList<SAMRecord>(); | ||
final List<SAMRecord> firstOfPairOrFragment = new ArrayList<>(); | ||
final List<SAMRecord> secondOfPair = new ArrayList<>(); | ||
|
||
private final List<SAMRecord> supplementalFirstOfPairOrFragment = new ArrayList<SAMRecord>(); | ||
private final List<SAMRecord> supplementalSecondOfPair = new ArrayList<SAMRecord>(); | ||
private final List<SAMRecord> supplementalFirstOfPairOrFragment = new ArrayList<>(); | ||
private final List<SAMRecord> supplementalSecondOfPair = new ArrayList<>(); | ||
|
||
/** | ||
* @throws if numHits() == 0 | ||
* @throws IllegalStateException if numHits() == 0 | ||
*/ | ||
public String getReadName() { | ||
return getRepresentativeRead().getReadName(); | ||
} | ||
|
||
/** | ||
* @throws if numHits() == 0 | ||
* @throws IllegalStateException if numHits() == 0 | ||
*/ | ||
public boolean isPaired() { | ||
return getRepresentativeRead().getReadPairedFlag(); | ||
|
@@ -152,7 +150,7 @@ public SAMRecord getSecondOfPair(final int i) { | |
* @return the index, or -1 if no primary was found. | ||
*/ | ||
public int getIndexOfEarliestPrimary() { | ||
for (int i = 0; i < numHits(); i++) { | ||
for (int i = 0; i < numHits(); ++i) { | ||
final SAMRecord firstAligned = getFirstOfPair(i); | ||
final SAMRecord secondAligned = getSecondOfPair(i); | ||
final boolean isPrimaryAlignment = (firstAligned != null && !firstAligned.isSecondaryOrSupplementary()) || | ||
|
@@ -195,8 +193,12 @@ public void setPrimaryAlignment(final int primaryAlignmentIndex) { | |
*/ | ||
public void coordinateByHitIndex() { | ||
// Sort by HI value, with reads with no HI going at the end. | ||
Collections.sort(firstOfPairOrFragment, comparator); | ||
Collections.sort(secondOfPair, comparator); | ||
final Comparator<SAMRecord> comparator = Comparator.comparing( | ||
record -> record.getIntegerAttribute(SAMTag.HI.name()), | ||
Comparator.nullsLast(Comparator.naturalOrder()) | ||
); | ||
firstOfPairOrFragment.sort(comparator); | ||
secondOfPair.sort(comparator); | ||
|
||
// Insert nulls as necessary in the two lists so that correlated alignments have the same index | ||
// and uncorrelated alignments have null in the other list at the corresponding index. | ||
|
@@ -217,16 +219,18 @@ public void coordinateByHitIndex() { | |
secondOfPair.add(i, null); | ||
} | ||
} | ||
} | ||
|
||
// Now renumber any correlated alignments, and remove hit index if no correlated read. | ||
int hi = 0; | ||
/** | ||
* Renumber any correlated alignments, and remove hit index if no correlated read. | ||
*/ | ||
private void renumberHitIndex() { | ||
for (int i = 0; i < numHits(); ++i) { | ||
final SAMRecord first = getFirstOfPair(i); | ||
final SAMRecord second = getSecondOfPair(i); | ||
if (first != null && second != null) { | ||
first.setAttribute(SAMTag.HI.name(), i); | ||
second.setAttribute(SAMTag.HI.name(), i); | ||
++hi; | ||
} else if (first != null) { | ||
first.setAttribute(SAMTag.HI.name(), null); | ||
} else { | ||
|
@@ -235,42 +239,81 @@ public void coordinateByHitIndex() { | |
} | ||
} | ||
|
||
/** | ||
* This method lines up alignments in firstOfPairOrFragment and secondOfPair lists so that the first and the second | ||
* records of each pair have the same index. | ||
*/ | ||
void coordinateByMate() { | ||
final List<SAMRecord> newFirstOfPairOrFragment = new ArrayList<>(); | ||
final List<SAMRecord> newSecondOfPair = new ArrayList<>(); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like this file preferred ++i to i++. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As you might see (here for instance), there was a usage of a postfix increment in this file before. So I just followed this example. |
||
for (int i = 0; i < firstOfPairOrFragment.size(); ++i) { | ||
final SAMRecord first = firstOfPairOrFragment.get(i); | ||
newFirstOfPairOrFragment.add(i, first); | ||
newSecondOfPair.add(null); | ||
|
||
for (int k = 0; k < secondOfPair.size(); ++k) { | ||
final SAMRecord second = secondOfPair.get(k); | ||
if (arePair(first, second)) { | ||
newSecondOfPair.set(i, second); | ||
secondOfPair.set(k, null); | ||
break; | ||
} | ||
} | ||
} | ||
|
||
for (int i = 0; i < secondOfPair.size(); ++i) { | ||
if (secondOfPair.get(i) != null) { | ||
newFirstOfPairOrFragment.add(i, null); | ||
newSecondOfPair.add(i, secondOfPair.get(i)); | ||
} | ||
} | ||
|
||
firstOfPairOrFragment.clear(); | ||
firstOfPairOrFragment.addAll(newFirstOfPairOrFragment); | ||
secondOfPair.clear(); | ||
secondOfPair.addAll(newSecondOfPair); | ||
|
||
renumberHitIndex(); | ||
} | ||
|
||
/** | ||
* Identifies whether records present pairwise alignment or not. | ||
* | ||
* It is unnecessary to check QNAME here cause an object of {@code HitsForInsert} | ||
* presents only records with the same QNAME. | ||
* | ||
* @return {@code true} if records belong to the same pairwise alignment | ||
*/ | ||
private static boolean arePair(final SAMRecord first, final SAMRecord second) { | ||
return first != null && second != null | ||
&& first.getMateAlignmentStart() == second.getAlignmentStart() | ||
&& first.getAlignmentStart() == second.getMateAlignmentStart() | ||
&& !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(first.getMateReferenceName()) | ||
&& !SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(second.getMateReferenceName()) | ||
&& Objects.equals(first.getReferenceName(), second.getMateReferenceName()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's OK, I was wondering if this was added to fix a known problem or was just precautionary. Looking at It would be great if picard had nullable constraints in the code base! That would be a huge PR though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
&& Objects.equals(first.getMateReferenceName(), second.getReferenceName()); | ||
} | ||
|
||
/** | ||
* Determine if there is a single primary alignment in a list of alignments. | ||
* @param records | ||
* @return NONE, ONE or MORE_THAN_ONE. | ||
*/ | ||
private NumPrimaryAlignmentState tallyPrimaryAlignments(final List<SAMRecord> records) { | ||
boolean seenPrimary = false; | ||
for (int i = 0; i < records.size(); ++i) { | ||
if (records.get(i) != null && !records.get(i).isSecondaryOrSupplementary()) { | ||
for (SAMRecord record : records) { | ||
if (record != null && !record.isSecondaryOrSupplementary()) { | ||
if (seenPrimary) return NumPrimaryAlignmentState.MORE_THAN_ONE; | ||
else seenPrimary = true; | ||
} | ||
} | ||
if (seenPrimary) return NumPrimaryAlignmentState.ONE; | ||
else return NumPrimaryAlignmentState.NONE; | ||
return seenPrimary ? NumPrimaryAlignmentState.ONE : NumPrimaryAlignmentState.NONE; | ||
} | ||
|
||
public NumPrimaryAlignmentState tallyPrimaryAlignments(final boolean firstEnd) { | ||
if (firstEnd) return tallyPrimaryAlignments(firstOfPairOrFragment); | ||
else return tallyPrimaryAlignments(secondOfPair); | ||
} | ||
|
||
// null HI tag sorts after any non-null. | ||
private static class HitIndexComparator implements Comparator<SAMRecord> { | ||
public int compare(final SAMRecord rec1, final SAMRecord rec2) { | ||
final Integer hi1 = rec1.getIntegerAttribute(SAMTag.HI.name()); | ||
final Integer hi2 = rec2.getIntegerAttribute(SAMTag.HI.name()); | ||
if (hi1 == null) { | ||
if (hi2 == null) return 0; | ||
else return 1; | ||
} else if (hi2 == null) { | ||
return -1; | ||
} else { | ||
return hi1.compareTo(hi2); | ||
} | ||
} | ||
final List<SAMRecord> records = firstEnd ? firstOfPairOrFragment : secondOfPair; | ||
return tallyPrimaryAlignments(records); | ||
} | ||
|
||
List<SAMRecord> getSupplementalFirstOfPairOrFragment() { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1931,7 +1931,6 @@ public void testHeaderFromMappedBreaks(final String filename) throws IOException | |
assertSamValid(mergedSam); | ||
IOUtil.assertFilesEqual(expectedSam, mergedSam); | ||
} | ||
|
||
@DataProvider(name = "mappedReadInUnmappedBamDataProvider") | ||
Object[][] mappedReadInUnmappedBamDataProvider() { | ||
return new Object[][] { | ||
|
@@ -1999,5 +1998,85 @@ public void testMappedReadInUnmappedBam(final boolean paired, final boolean firs | |
runPicardCommandLine(args); | ||
} | ||
} | ||
} | ||
|
||
|
||
@Test | ||
public void testPairedReadsAreProcessedProperlyInAbsenceOfHitIndexTags () throws IOException { | ||
final File unmappedSam = new File(TEST_DATA_DIR, "unmapped1.sam"); | ||
final File alignedSam = new File(TEST_DATA_DIR, "aligned.sam"); | ||
final File expectedSam = new File(TEST_DATA_DIR, "merged_expected.sam"); | ||
final File refFasta = new File(TEST_DATA_DIR, "reference.fasta"); | ||
final File mergedSam = File.createTempFile("merged", ".sam"); | ||
mergedSam.deleteOnExit(); | ||
|
||
doMergeAlignment(unmappedSam, Collections.singletonList(alignedSam), | ||
null, null, null, null, | ||
false, true, false, 1, | ||
"0", "1.0", "align!", "myAligner", | ||
true, refFasta, mergedSam, | ||
null, null, null, null, true, SAMFileHeader.SortOrder.queryname); | ||
|
||
assertSamValid(mergedSam); | ||
IOUtil.assertFilesEqual(expectedSam, mergedSam); | ||
} | ||
|
||
@Test | ||
public void testCoordinateByMateMethodWorksProperly () { | ||
final HitsForInsert hitsForInsert = new HitsForInsert(); | ||
final SAMFileHeader header = new SAMFileHeader(); | ||
header.setSortOrder(SAMFileHeader.SortOrder.queryname); | ||
|
||
{ // Both first and second of pair | ||
final SAMRecord first = new SAMRecord(header); | ||
final SAMRecord second = new SAMRecord(header); | ||
|
||
first.setReferenceName("chrm1"); | ||
first.setAlignmentStart(20); | ||
first.setMateReferenceName("chrm2"); | ||
first.setMateAlignmentStart(1); | ||
second.setReferenceName("chrm2"); | ||
second.setAlignmentStart(1); | ||
second.setMateReferenceName("chrm1"); | ||
second.setMateAlignmentStart(20); | ||
|
||
hitsForInsert.addFirstOfPairOrFragment(first); | ||
hitsForInsert.addSecondOfPair(second); | ||
} | ||
|
||
{ // Only first | ||
final SAMRecord first = new SAMRecord(header); | ||
|
||
first.setReferenceName("chrm1"); | ||
first.setAlignmentStart(8); | ||
first.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); | ||
first.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); | ||
|
||
hitsForInsert.addFirstOfPairOrFragment(first); | ||
} | ||
|
||
{ // Only Second | ||
final SAMRecord second = new SAMRecord(header); | ||
|
||
second.setReferenceName("chrm2"); | ||
second.setAlignmentStart(15); | ||
second.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); | ||
second.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); | ||
|
||
hitsForInsert.addFirstOfPairOrFragment(second); | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is 3 some function of HitsForInsert::numHits? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you, fixed here. |
||
for (int i = 0; i < hitsForInsert.numHits(); i++) { | ||
final SAMRecord first = hitsForInsert.getFirstOfPair(i); | ||
final SAMRecord second = hitsForInsert.getSecondOfPair(i); | ||
|
||
if (first != null && second != null) { | ||
Assert.assertEquals(first.getMateAlignmentStart(), second.getAlignmentStart()); | ||
Assert.assertEquals(second.getMateAlignmentStart(), first.getAlignmentStart()); | ||
} else if (first != null) { | ||
Assert.assertEquals(first.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); | ||
} else { | ||
Assert.assertEquals(second.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
@SQ SN:chr2 LN:243199373 | ||
@SQ SN:chr4 LN:191154276 | ||
@SQ SN:chr21 LN:48129895 | ||
@RG ID:RG1 PL:Illumina SM:some_sample | ||
example1 113 chr21 10856814 0 29S91M31S chr2 89872071 0 GAACCCGAATAGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATCAACGCGAGTGCAGGGGAATGGAATGGAATG 0?/112F4HDBFHHFGF4FHGGHHH4HHFGBGF31GHG@3HHHFFBHHEBF5EADFFHHHHHHGHHHGDFHGFFEHHFFEGGHHHHHGGDFHHHHHHHHHHHHHGHHHHHHHHHHHHHHHGGGGGFHHHGGGGGFGFGGFFFFFFFBBBBB NM:i:4 RG:Z:RG1 | ||
example1 177 chr2 89872071 25 79S63M9S chr21 10856814 0 GCATTCCATTCTACTCCAGTTAATCCCATTCCATTCTATTCCATTCCACTCCATTCCATTCCATTGCAATGGAGTGGACTCCATTCCATTCCATTCCATTCCGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGCACTC 1B1111B44433B033443B33?B03GB3FHFF33333333B3BBFBB233B3EFBB3BC3GDDF5FHGE23FFA3AF2CBFFDFFF3G3E33DFHGFEFGGHHFEHHFHHGHHGGE5BE5HHFFEC4GFGGFGFEC4E?@BFFFFAABBB NM:i:2 RG:Z:RG1 | ||
example1 417 chr4 49644617 6 86S40M25S chr21 10856814 0 GAGTGCAGGGGAATGGAATGGAATGGAATGCAATGGAATGGAATCATCCGGAATGGAATGGAATGGAATGGAGTCCACTCCATTGCAATGGAATGGAATGGAGTGGAATGGAATAGAATGGAATGGGATTAACTGGAGTAGAATGGAATGC BBBAAFFFFB@?E4CEFGFGGFG4CEFFHH5EB5EGGHHGHHFHHEFHHGGFEFGHFD33E3G3FFFDFFBC2FA3AFF32EGHF5FDDG3CB3BBFE3B332BBFBB3B33333333FFHF3BG30B?33B344330B33444B1111B1 NM:i:0 RG:Z:RG1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
@HD VN:1.5 SO:queryname | ||
@SQ SN:chr2 LN:243199373 | ||
@SQ SN:chr4 LN:191154276 | ||
@SQ SN:chr21 LN:48129895 | ||
@RG ID:RG1 PL:Illumina SM:some_sample | ||
@PG ID:0 VN:1.0 CL:align! PN:myAligner | ||
example1 113 chr21 10856814 0 29S91M31S chr2 89872071 0 GAACCCGAATAGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAACGGAACGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATCAACGCGAGTGCAGGGGAATGGAATGGAATG 0?/112F4HDBFHHFGF4FHGGHHH4HHFGBGF31GHG@3HHHFFBHHEBF5EADFFHHHHHHGHHHGDFHGFFEHHFFEGGHHHHHGGDFHHHHHHHHHHHHHGHHHHHHHHHHHHHHHGGGGGFHHHGGGGGFGFGGFFFFFFFBBBBB MC:Z:79S63M9S PG:Z:0 RG:Z:RG1 HI:i:0 NM:i:4 MQ:i:25 | ||
example1 177 chr2 89872071 25 79S63M9S chr21 10856814 0 GCATTCCATTCTACTCCAGTTAATCCCATTCCATTCTATTCCATTCCACTCCATTCCATTCCATTGCAATGGAGTGGACTCCATTCCATTCCATTCCATTCCGGATGATTCCATTCCATTGCATTCCATTCCATTCCATTCCCCTGCACTC 1B1111B44433B033443B33?B03GB3FHFF33333333B3BBFBB233B3EFBB3BC3GDDF5FHGE23FFA3AF2CBFFDFFF3G3E33DFHGFEFGGHHFEHHFHHGHHGGE5BE5HHFFEC4GFGGFGFEC4E?@BFFFFAABBB MC:Z:29S91M31S PG:Z:0 RG:Z:RG1 HI:i:0 NM:i:2 MQ:i:0 | ||
example1 393 chr4 49644617 6 86S40M25S = 49644617 0 GAGTGCAGGGGAATGGAATGGAATGGAATGCAATGGAATGGAATCATCCGGAATGGAATGGAATGGAATGGAGTCCACTCCATTGCAATGGAATGGAATGGAGTGGAATGGAATAGAATGGAATGGGATTAACTGGAGTAGAATGGAATGC BBBAAFFFFB@?E4CEFGFGGFG4CEFFHH5EB5EGGHHGHHFHHEFHHGGFEFGHFD33E3G3FFFDFFBC2FA3AFF32EGHF5FDDG3CB3BBFE3B332BBFBB3B33333333FFHF3BG30B?33B344330B33444B1111B1 PG:Z:0 RG:Z:RG1 NM:i:0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@HD VN:1.0 SO:unsorted | ||
@SQ SN:chr2 LN:243199373 | ||
@SQ SN:chr4 LN:191154276 | ||
@SQ SN:chr21 LN:48129895 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
; picard.sam.MergeBamAlignmentTest.testPairedReadsAreProcessedProperlyInAbsenceOfHitIndexTags |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@HD VN:1.5 SO:queryname | ||
@RG ID:RG1 PL:Illumina SM:some_sample | ||
example1 77 * 0 0 * * 0 0 CATTCCATTCCATTCCCCTGCACTCGCGTTGATTCCATTCCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCCATTCCATTCCGTTCCGTTCCATTCCATTCCATTCCATTCTATTCGGGTTC BBBBBFFFFFFFGGFGFGGGGGHHHFGGGGGHHHHHHHHHHHHHHHGHHHHHHHHHHHHHFDGGHHHHHGGEFFHHEFFGHFDGHHHGHHHHHHFFDAE5FBEHHBFFHHH3@GHG13FGBGFHH4HHHGGHF4FGFHHFBDH4F211/?0 RG:Z:RG1 | ||
example1 141 * 0 0 * * 0 0 GAGTGCAGGGGAATGGAATGGAATGGAATGCAATGGAATGGAATCATCCGGAATGGAATGGAATGGAATGGAGTCCACTCCATTGCAATGGAATGGAATGGAGTGGAATGGAATAGAATGGAATGGGATTAACTGGAGTAGAATGGAATGC BBBAAFFFFB@?E4CEFGFGGFG4CEFFHH5EB5EGGHHGHHFHHEFHHGGFEFGHFD33E3G3FFFDFFBC2FA3AFF32EGHF5FDDG3CB3BBFE3B332BBFBB3B33333333FFHF3BG30B?33B344330B33444B1111B1 RG:Z:RG1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for breaking this out and cleaning up hi, and so on.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for your feedback=)