-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optionally extract all fingerprint snps #1989
base: master
Are you sure you want to change the base?
Changes from 3 commits
98f7727
7770030
d54015b
0b38737
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,16 +45,10 @@ | |
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.Date; | ||
import java.util.LinkedHashSet; | ||
import java.util.List; | ||
import java.util.Objects; | ||
import java.util.Set; | ||
import java.util.TreeSet; | ||
import java.util.*; | ||
import java.util.function.Function; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
/** | ||
* A set of utilities used in the fingerprinting environment | ||
|
@@ -71,18 +65,20 @@ public class FingerprintUtils { | |
* @param referenceSequenceFileName the reference sequence (file) | ||
* @param sample the sample name to use in the vcf | ||
* @param source a "source" comment to use in the VCF | ||
* @throws IOException | ||
* @param representativeOnly whether to extract only the representative snps | ||
* @throws IOException if two snps in the haplotype map have the same "name" | ||
*/ | ||
public static void writeFingerPrint(final Fingerprint fingerprint, | ||
final File outputFile, | ||
final File referenceSequenceFileName, | ||
final String sample, | ||
final String source) throws IOException { | ||
final String source, | ||
final boolean representativeOnly) throws IOException { | ||
|
||
try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceSequenceFileName); | ||
final VariantContextWriter variantContextWriter = getVariantContextWriter(outputFile, referenceSequenceFileName, sample, source, ref)) { | ||
|
||
createVCSetFromFingerprint(fingerprint, ref, sample).forEach(variantContextWriter::add); | ||
createVCSetFromFingerprint(fingerprint, ref, sample, representativeOnly).forEach(variantContextWriter::add); | ||
} | ||
} | ||
|
||
|
@@ -113,18 +109,21 @@ private static VariantContextWriter getVariantContextWriter(final File outputFil | |
/** | ||
* A utility function that takes a fingerprint and returns a VariantContextSet with variants representing the haplotypes in the fingerprint | ||
* | ||
* @param fingerPrint A fingerprint | ||
* @param reference A reference sequence that will be used to create the VariantContexts | ||
* @param sample A sample name that will be used for the genotype field | ||
* @param fingerPrint A fingerprint | ||
* @param reference A reference sequence that will be used to create the VariantContexts | ||
* @param sample A sample name that will be used for the genotype field | ||
* @param representativeOnly A boolean specifying whether to create a VC only for the representative Snp (or for all of them if false) | ||
* @return VariantContextSet with variants representing the haplotypes in the fingerprint | ||
*/ | ||
public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fingerPrint, final ReferenceSequenceFile reference, final String sample) { | ||
public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fingerPrint, final ReferenceSequenceFile reference, final String sample, boolean representativeOnly) { | ||
|
||
final VariantContextSet variantContexts = new VariantContextSet(reference.getSequenceDictionary()); | ||
|
||
// check that the same snp name isn't twice in the fingerprint. | ||
fingerPrint.values().stream() | ||
.map(hp -> hp.getRepresentativeSnp().getName()) | ||
.flatMap(hp -> representativeOnly ? | ||
Stream.of(hp.getRepresentativeSnp().getName()) : | ||
hp.getHaplotype().getSnps().stream().map(Snp::getName)) | ||
.filter(Objects::nonNull) | ||
.filter(n -> !n.equals("")) | ||
.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())) | ||
|
@@ -138,18 +137,31 @@ public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fin | |
|
||
// convert all the haplotypes to variant contexts and add them to the set. | ||
fingerPrint.values().stream() | ||
.map(hp -> getVariantContext(reference, sample, hp)) | ||
.flatMap(hp -> getVariantContext(reference, sample, hp, representativeOnly)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. flatMap! |
||
.forEach(variantContexts::add); | ||
|
||
return variantContexts; | ||
} | ||
|
||
@VisibleForTesting | ||
static VariantContext getVariantContext(final ReferenceSequenceFile reference, | ||
static Stream<VariantContext> getVariantContext(final ReferenceSequenceFile reference, | ||
final String sample, | ||
final HaplotypeProbabilities haplotypeProbabilities) { | ||
final Snp snp = haplotypeProbabilities.getRepresentativeSnp(); | ||
final byte refAllele = StringUtil.toUpperCase(reference.getSubsequenceAt( | ||
final HaplotypeProbabilities haplotypeProbabilities, | ||
final boolean representative_only) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. representativeOnly is more idiomatic There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. indeed. |
||
if (representative_only) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like flatmaps, it's a bit awkward how you have to repeat the same if/else logic inside the flatmap repeatedly though. You might pull out the HP -> stream<> logic into it's own method that you flatmap over, and then map the resulting stream to get the VariantContext or name as appropriate. It looks like you probably didn't do that because of the way that getVariantContext relies on the outer HP so it can't be decomposed as nicely without nesting logic within the flatmap. Do what you like with it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah...seems like over-complication... |
||
return Stream.of(getVariantContextFromSnp(reference, sample, haplotypeProbabilities, haplotypeProbabilities.getRepresentativeSnp())); | ||
} else { | ||
return haplotypeProbabilities.getHaplotype().getSnps().stream().map(snp -> getVariantContextFromSnp(reference, sample, haplotypeProbabilities, snp)); | ||
} | ||
} | ||
|
||
@VisibleForTesting | ||
static VariantContext getVariantContextFromSnp(final ReferenceSequenceFile reference, | ||
final String sample, | ||
final HaplotypeProbabilities haplotypeProbabilities, | ||
final Snp snp) { | ||
|
||
final byte refAllele = StringUtil.toUpperCase(reference.getSubsequenceAt( | ||
snp.getChrom(), | ||
snp.getPos(), | ||
snp.getPos()).getBases()[0]); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -375,7 +375,7 @@ void testCanHandleDeepData(final HaplotypeProbabilitiesFromSequence hp, final in | |
final File fasta = new File(TEST_DATA_DIR, "reference.fasta"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you want a test to show outputting multiple values? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd love to, but it would involve building a bam...and I need to find time for that... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when I find a project that actually uses this, I'll make time to write a test... OK? |
||
|
||
try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta)) { | ||
final VariantContext vc = FingerprintUtils.getVariantContext(ref, "test", hp); | ||
final VariantContext vc = FingerprintUtils.getVariantContext(ref, "test", hp, true).iterator().next(); | ||
Assert.assertTrue(MathUtil.max(MathUtil.promote(vc.getGenotype(0).getPL())) > 0); | ||
} | ||
} | ||
|
@@ -551,7 +551,7 @@ public void testWriteFingerprint() throws IOException { | |
final Fingerprint fp = fpchecker.fingerprintVcf(vcfInput.toPath()).values().iterator().next(); | ||
|
||
final File vcfOutput = File.createTempFile("fingerprint", ".vcf"); | ||
FingerprintUtils.writeFingerPrint(fp, vcfOutput, fasta, "Dummy", "Testing"); | ||
FingerprintUtils.writeFingerPrint(fp, vcfOutput, fasta, "Dummy", "Testing", true); | ||
|
||
VcfTestUtils.assertVcfFilesAreEqual(vcfOutput, vcfExpected); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
default=true boolean arguments are very strange to me. Why not invert to "EXTRACT_ALL_VARIANTS" or something like that?
Why is it hidden? Would Advanced make more sense?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure.