Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optionally extract all fingerprint snps #1989

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/main/java/picard/fingerprint/ExtractFingerprint.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
Expand Down Expand Up @@ -82,6 +83,11 @@ public class ExtractFingerprint extends CommandLineProgram {
@Argument(doc = "When true code will check for readability on input files (this can be slow on cloud access)")
public boolean TEST_INPUT_READABILITY = true;


@Advanced
@Argument(doc = "When true, code will extract variants for every snp in the haplotype database, not only the representative one.")
public boolean EXTRACT_NON_REPRESENTATIVES_TOO = false;

@Override
protected boolean requiresReference() {
return true;
Expand Down Expand Up @@ -133,7 +139,8 @@ protected int doWork() {
final String sampleToUse = getSampleToUse(soleEntry.getKey());

try {
FingerprintUtils.writeFingerPrint(soleEntry.getValue(), OUTPUT, referenceSequence.getReferenceFile(), sampleToUse, "PLs derived from " + INPUT + " using an assumed contamination of " + this.CONTAMINATION);
FingerprintUtils.writeFingerPrint(soleEntry.getValue(), OUTPUT, referenceSequence.getReferenceFile(),
sampleToUse, "PLs derived from " + INPUT + " using an assumed contamination of " + this.CONTAMINATION, !EXTRACT_NON_REPRESENTATIVES_TOO);
} catch (Exception e) {
log.error(e);
}
Expand Down
54 changes: 33 additions & 21 deletions src/main/java/picard/fingerprint/FingerprintUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,10 @@

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* A set of utilities used in the fingerprinting environment
Expand All @@ -71,18 +65,20 @@ public class FingerprintUtils {
* @param referenceSequenceFileName the reference sequence (file)
* @param sample the sample name to use in the vcf
* @param source a "source" comment to use in the VCF
* @throws IOException
* @param representativeOnly whether to extract only the representative snps
* @throws IOException if two snps in the haplotype map have the same "name"
*/
public static void writeFingerPrint(final Fingerprint fingerprint,
final File outputFile,
final File referenceSequenceFileName,
final String sample,
final String source) throws IOException {
final String source,
final boolean representativeOnly) throws IOException {

try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceSequenceFileName);
final VariantContextWriter variantContextWriter = getVariantContextWriter(outputFile, referenceSequenceFileName, sample, source, ref)) {

createVCSetFromFingerprint(fingerprint, ref, sample).forEach(variantContextWriter::add);
createVCSetFromFingerprint(fingerprint, ref, sample, representativeOnly).forEach(variantContextWriter::add);
}
}

Expand Down Expand Up @@ -113,18 +109,21 @@ private static VariantContextWriter getVariantContextWriter(final File outputFil
/**
* A utility function that takes a fingerprint and returns a VariantContextSet with variants representing the haplotypes in the fingerprint
*
* @param fingerPrint A fingerprint
* @param reference A reference sequence that will be used to create the VariantContexts
* @param sample A sample name that will be used for the genotype field
* @param fingerPrint A fingerprint
* @param reference A reference sequence that will be used to create the VariantContexts
* @param sample A sample name that will be used for the genotype field
* @param representativeOnly A boolean specifying whether to create a VC only for the representative Snp (or for all of them if false)
* @return VariantContextSet with variants representing the haplotypes in the fingerprint
*/
public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fingerPrint, final ReferenceSequenceFile reference, final String sample) {
public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fingerPrint, final ReferenceSequenceFile reference, final String sample, boolean representativeOnly) {

final VariantContextSet variantContexts = new VariantContextSet(reference.getSequenceDictionary());

// check that the same snp name isn't twice in the fingerprint.
fingerPrint.values().stream()
.map(hp -> hp.getRepresentativeSnp().getName())
.flatMap(hp -> representativeOnly ?
Stream.of(hp.getRepresentativeSnp().getName()) :
hp.getHaplotype().getSnps().stream().map(Snp::getName))
.filter(Objects::nonNull)
.filter(n -> !n.equals(""))
.collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
Expand All @@ -138,18 +137,31 @@ public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fin

// convert all the haplotypes to variant contexts and add them to the set.
fingerPrint.values().stream()
.map(hp -> getVariantContext(reference, sample, hp))
.flatMap(hp -> getVariantContext(reference, sample, hp, representativeOnly))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

flatMap!

.forEach(variantContexts::add);

return variantContexts;
}

@VisibleForTesting
static VariantContext getVariantContext(final ReferenceSequenceFile reference,
static Stream<VariantContext> getVariantContext(final ReferenceSequenceFile reference,
final String sample,
final HaplotypeProbabilities haplotypeProbabilities) {
final Snp snp = haplotypeProbabilities.getRepresentativeSnp();
final byte refAllele = StringUtil.toUpperCase(reference.getSubsequenceAt(
final HaplotypeProbabilities haplotypeProbabilities,
final boolean representativeOnly) {
if (representativeOnly) {
return Stream.of(getVariantContextFromSnp(reference, sample, haplotypeProbabilities, haplotypeProbabilities.getRepresentativeSnp()));
} else {
return haplotypeProbabilities.getHaplotype().getSnps().stream().map(snp -> getVariantContextFromSnp(reference, sample, haplotypeProbabilities, snp));
}
}

@VisibleForTesting
static VariantContext getVariantContextFromSnp(final ReferenceSequenceFile reference,
final String sample,
final HaplotypeProbabilities haplotypeProbabilities,
final Snp snp) {

final byte refAllele = StringUtil.toUpperCase(reference.getSubsequenceAt(
snp.getChrom(),
snp.getPos(),
snp.getPos()).getBases()[0]);
Expand Down
18 changes: 17 additions & 1 deletion src/main/java/picard/fingerprint/Snp.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package picard.fingerprint;

import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.Allele;

Expand All @@ -36,7 +37,7 @@
*
* @author Tim Fennell
*/
public class Snp implements Comparable<Snp> {
public class Snp implements Comparable<Snp>, Locatable {
private final String name;
private final String chrom;
private final int pos;
Expand Down Expand Up @@ -124,4 +125,19 @@ public int hashCode() {
public String toString() {
return this.chrom + ":" + this.pos;
}

@Override
public String getContig() {
return chrom;
}

@Override
public int getStart() {
return pos;
}

@Override
public int getEnd() {
return pos;
}
}
3 changes: 1 addition & 2 deletions src/test/java/picard/fingerprint/CheckFingerprintTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -214,7 +213,7 @@ public void testFPToVC() throws IOException {
tempFile.deleteOnExit();

FingerprintUtils.writeFingerPrint(fingerprint, tempFile, SHIFTED_REFERENCE,
"NA12892", null);
"NA12892", null, true);

final Fingerprint NA12892FromVCF = checker.fingerprintFiles(Collections.singleton(tempFile.toPath()), 1, 1, TimeUnit.DAYS)
.values().stream()
Expand Down
5 changes: 3 additions & 2 deletions src/test/java/picard/fingerprint/FingerprintCheckerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -375,11 +375,12 @@ void testCanHandleDeepData(final HaplotypeProbabilitiesFromSequence hp, final in
final File fasta = new File(TEST_DATA_DIR, "reference.fasta");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want a test to show outputting multiple values?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd love to, but it would involve building a bam...and I need to find time for that...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when I find a project that actually uses this, I'll make time to write a test... OK?


try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta)) {
final VariantContext vc = FingerprintUtils.getVariantContext(ref, "test", hp);
final VariantContext vc = FingerprintUtils.getVariantContext(ref, "test", hp, true).iterator().next();
Assert.assertTrue(MathUtil.max(MathUtil.promote(vc.getGenotype(0).getPL())) > 0);
}
}


@DataProvider()
Object[][] mergeIsSafeProvider() {
final HaplotypeProbabilitiesFromSequence hp1 = new HaplotypeProbabilitiesFromSequence(hb);
Expand Down Expand Up @@ -551,7 +552,7 @@ public void testWriteFingerprint() throws IOException {
final Fingerprint fp = fpchecker.fingerprintVcf(vcfInput.toPath()).values().iterator().next();

final File vcfOutput = File.createTempFile("fingerprint", ".vcf");
FingerprintUtils.writeFingerPrint(fp, vcfOutput, fasta, "Dummy", "Testing");
FingerprintUtils.writeFingerPrint(fp, vcfOutput, fasta, "Dummy", "Testing", true);

VcfTestUtils.assertVcfFilesAreEqual(vcfOutput, vcfExpected);
}
Expand Down
Loading