From fc85c751ce3d81a3784d7cef39e9cae872405d08 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Sun, 17 Oct 2021 11:24:09 -0400 Subject: [PATCH 1/3] Changes for the htsjdk VCFHeader and VCFHeaderLine refactoring. --- .../picard/fingerprint/FingerprintUtils.java | 2 +- .../java/picard/fingerprint/HaplotypeMap.java | 7 +- .../java/picard/vcf/GenotypeConcordance.java | 4 +- ...snps_vs_CEUTrio-snps_GtConcordanceDiff.vcf | 116 +++++++++--------- ...letionCallset_vs_spanningDeletionTruth.vcf | 17 ++- 5 files changed, 69 insertions(+), 77 deletions(-) diff --git a/src/main/java/picard/fingerprint/FingerprintUtils.java b/src/main/java/picard/fingerprint/FingerprintUtils.java index d1fd5d9a73..51cb2708ce 100644 --- a/src/main/java/picard/fingerprint/FingerprintUtils.java +++ b/src/main/java/picard/fingerprint/FingerprintUtils.java @@ -95,7 +95,7 @@ private static VariantContextWriter getVariantContextWriter(final File outputFil .setReferenceDictionary(ref.getSequenceDictionary()) .setOutputFile(outputFile).build(); - final Set lines = new LinkedHashSet<>(); + final Set lines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); lines.add(new VCFHeaderLine("reference", referenceSequenceFileName.getAbsolutePath())); lines.add(new VCFHeaderLine("source", source)); lines.add(new VCFHeaderLine("fileDate", new Date().toString())); diff --git a/src/main/java/picard/fingerprint/HaplotypeMap.java b/src/main/java/picard/fingerprint/HaplotypeMap.java index d323168b43..400690fa24 100644 --- a/src/main/java/picard/fingerprint/HaplotypeMap.java +++ b/src/main/java/picard/fingerprint/HaplotypeMap.java @@ -369,12 +369,11 @@ public void writeAsVcf(final File output, final File refFile) throws FileNotFoun .build()) { final VCFHeader vcfHeader = new VCFHeader( - VCFUtils.withUpdatedContigsAsLines(Collections.emptySet(), refFile, header.getSequenceDictionary(), false), + VCFUtils.withUpdatedContigsAsLines( + VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), + refFile, header.getSequenceDictionary(), false), Collections.singleton(HET_GENOTYPE_FOR_PHASING)); - VCFUtils.withUpdatedContigsAsLines(Collections.emptySet(), refFile, header.getSequenceDictionary(), false); - - vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); vcfHeader.addMetaDataLine(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); vcfHeader.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); vcfHeader.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.String, "Phase-set identifier for phased genotypes.")); diff --git a/src/main/java/picard/vcf/GenotypeConcordance.java b/src/main/java/picard/vcf/GenotypeConcordance.java index cdbbbcf797..5ab257e399 100644 --- a/src/main/java/picard/vcf/GenotypeConcordance.java +++ b/src/main/java/picard/vcf/GenotypeConcordance.java @@ -497,9 +497,7 @@ private Optional getVariantContextWriter(final VCFFileRead // create the output header final List sampleNames = Arrays.asList(OUTPUT_VCF_CALL_SAMPLE_NAME, OUTPUT_VCF_TRUTH_SAMPLE_NAME); - final Set headerLines = new HashSet<>(); - headerLines.addAll(callReader.getFileHeader().getMetaDataInInputOrder()); - headerLines.addAll(truthReader.getFileHeader().getMetaDataInInputOrder()); + final Set headerLines = VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(callReader.getFileHeader(), truthReader.getFileHeader()), true); headerLines.add(CONTINGENCY_STATE_HEADER_LINE); writer.writeHeader(new VCFHeader(headerLines, sampleNames)); return Optional.of(writer); diff --git a/testdata/picard/vcf/NIST-truth-snps_vs_CEUTrio-snps_GtConcordanceDiff.vcf b/testdata/picard/vcf/NIST-truth-snps_vs_CEUTrio-snps_GtConcordanceDiff.vcf index 1e80a38ce1..3b677358bf 100644 --- a/testdata/picard/vcf/NIST-truth-snps_vs_CEUTrio-snps_GtConcordanceDiff.vcf +++ b/testdata/picard/vcf/NIST-truth-snps_vs_CEUTrio-snps_GtConcordanceDiff.vcf @@ -10,17 +10,13 @@ ##FILTER= ##FILTER= ##FORMAT= -##FORMAT= ##FORMAT= -##FORMAT= ##FORMAT= -##FORMAT= ##FORMAT= -##FORMAT= ##FORMAT= ##FORMAT= -##GATKCommandLine.SelectVariants= 10000] invertselect=false excludeNonVariants=false excludeFiltered=false preserveAlleles=false removeUnusedAlternates=false restrictAllelesTo=ALL keepOriginalAC=false keepOriginalDP=false mendelianViolation=false invertMendelianViolation=false mendelianViolationQualThreshold=0.0 select_random_fraction=0.0 remove_fraction_genotypes=0.0 selectTypeToInclude=[] selectTypeToExclude=[] keepIDs=null excludeIDs=null fullyDecode=false justRead=false maxIndelSize=2147483647 minIndelSize=0 maxFilteredGenotypes=2147483647 minFilteredGenotypes=0 maxFractionFilteredGenotypes=1.0 minFractionFilteredGenotypes=0.0 setFilteredGtToNocall=false ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"> -##GATKCommandLine= +##GATKCommandLine.SelectVariants= 10000] invertselect=false excludeNonVariants=false excludeFiltered=false preserveAlleles=false removeUnusedAlternates=false restrictAllelesTo=ALL keepOriginalAC=false keepOriginalDP=false mendelianViolation=false invertMendelianViolation=false mendelianViolationQualThreshold=0.0 select_random_fraction=0.0 remove_fraction_genotypes=0.0 selectTypeToInclude=[] selectTypeToExclude=[] keepIDs=null excludeIDs=null fullyDecode=false justRead=false maxIndelSize=2147483647 minIndelSize=0 maxFilteredGenotypes=2147483647 minFilteredGenotypes=0 maxFractionFilteredGenotypes=1.0 minFractionFilteredGenotypes=0.0 setFilteredGtToNocall=false ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"> +##GATKCommandLine= ##INFO= ##INFO= ##INFO= @@ -89,90 +85,90 @@ ##PhaseByTransmission="analysis_type=PhaseByTransmission input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[/broad/hptmp/ami/tmp/CEUTrio.ped] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=/broad/hptmp/ami/tmp/CEUTrio_UG_Both_WG/CEUTrio.HiSeq.WGS.b37.snps_and_indels.recalibrated.filtered.vcf) DeNovoPrior=1.0E-8 FatherAlleleFirst=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub filter_mismatching_base_and_quals=false" ##SelectVariants="analysis_type=SelectVariants input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null tag=NA read_filter=[] intervals=[/projects/scratch-data-backup/justin.zook/NA12878/bed/union13callableMQonlymerged_addcert_nouncert_excludesimplerep_excludesegdups_excludedecoy_excludeRepSeqSTRs_noCNVs_v2.18_2mindatasets_5minYesNoRatio.bed] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/scratch-data-backup/justin.zook/references/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=LENIENT_VCF_PROCESSING disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=(RodBinding name=variant source=/projects/scratch-data-backup/justin.zook/NA12878/Integration131103/AllFDAdatasets_131103_allcall_UGHapMerge_HetHomVarPASS_VQSRv2.18_2mindatasets_5minYesNoRatio_all.primitives_nodup.vcf) discordance=(RodBinding name= source=UNBOUND) concordance=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sample_name=[] sample_expressions=null sample_file=null exclude_sample_name=[] exclude_sample_file=[] select_expressions=[] excludeNonVariants=false excludeFiltered=false restrictAllelesTo=ALL keepOriginalAC=false mendelianViolation=false mendelianViolationQualThreshold=0.0 select_random_fraction=0.0 remove_fraction_genotypes=0.0 selectTypeToInclude=[] keepIDs=null fullyDecode=false forceGenotypesDecode=false justRead=false maxIndelSize=2147483647 ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false" ##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list] read_buffer_size=null phone_home=NO_ET gatk_key=/humgen/gsa-hpprojects/GATK/data/gatk_user_keys/gsamembers_broadinstitute.org.key read_filter=[] intervals=[/broad/hptmp/ami/tmp/queueScatterGather/.qlog/CEUTrio.indelcall-sg/temp_020_of_300/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=2 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false genotype_likelihoods_model=INDEL p_nonref_model=EXACT pcr_error_rate=1.0E-4 noSLOD=false annotateNDA=false min_base_quality_score=17 max_deletion_fraction=0.05 cap_max_alternate_alleles_for_indels=false min_indel_count_for_genotyping=5 min_indel_fraction_per_sample=0.25 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10 indelGapOpenPenalty=45 indelHaplotypeSize=80 noBandedIndel=false indelDebug=false ignoreSNPAlleles=false allReadsSP=false ignoreLaneInfo=false reference_sample_calls=(RodBinding name= source=UNBOUND) reference_sample_name=null sample_ploidy=2 min_quality_score=1 max_quality_score=40 site_quality_prior=20 min_power_threshold_for_calling=0.95 min_reference_depth=100 exclude_filtered_reference_sites=false heterozygosity=0.0010 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=3 dbsnp=(RodBinding name=dbsnp source=/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_135.b37.vcf) comp=[] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false" -##contig= +##contig= ##contig= ##contig= -##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= +##contig= ##contig= -##contig= +##contig= ##contig= -##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= +##contig= ##contig= ##contig= ##contig= -##contig= +##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##contig= -##contig= -##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= +##contig= +##contig= ##contig= -##contig= +##contig= ##contig= -##contig= +##contig= ##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= ##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= ##contig= ##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= ##contig= -##contig= +##contig= ##contig= -##contig= +##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##contig= -##contig= +##contig= ##contig= ##contig= ##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##contig= ##contig= -##contig= -##contig= -##contig= +##contig= +##contig= +##contig= ##contig= -##contig= -##contig= +##contig= +##contig= ##fileDate=20130719 ##phasing=none ##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta diff --git a/testdata/picard/vcf/spanningDeletionCallset_vs_spanningDeletionTruth.vcf b/testdata/picard/vcf/spanningDeletionCallset_vs_spanningDeletionTruth.vcf index 31f9b677ce..54bdc326cb 100644 --- a/testdata/picard/vcf/spanningDeletionCallset_vs_spanningDeletionTruth.vcf +++ b/testdata/picard/vcf/spanningDeletionCallset_vs_spanningDeletionTruth.vcf @@ -24,15 +24,14 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##GATKCommandLine.ApplyRecalibration.2= -##GATKCommandLine.ApplyRecalibration.3= -##GATKCommandLine.ApplyRecalibration.4= -##GATKCommandLine.ApplyRecalibration= -##GATKCommandLine.GenotypeGVCFs= -##GATKCommandLine.SelectVariants.2= -##GATKCommandLine.SelectVariants= -##GATKCommandLine.SelectVariants= -##GATKCommandLine.VariantFiltration= 54.69] filterName=[ExcessHet] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskExtension=0 maskName=Mask filterNotInMask=false missingValuesInExpressionsShouldEvaluateAsFailing=false invalidatePreviousFilters=false invertFilterExpression=false invertGenotypeFilterExpression=false setFilteredGtToNocall=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"> +##GATKCommandLine.ApplyRecalibration.2= +##GATKCommandLine.ApplyRecalibration.3= +##GATKCommandLine.ApplyRecalibration.4= +##GATKCommandLine.ApplyRecalibration= +##GATKCommandLine.GenotypeGVCFs= +##GATKCommandLine.SelectVariants.2= +##GATKCommandLine.SelectVariants= +##GATKCommandLine.VariantFiltration= 54.69] filterName=[ExcessHet] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskExtension=0 maskName=Mask filterNotInMask=false missingValuesInExpressionsShouldEvaluateAsFailing=false invalidatePreviousFilters=false invertFilterExpression=false invertGenotypeFilterExpression=false setFilteredGtToNocall=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"> ##INFO= ##INFO= ##INFO= From b38406afa3c894f5ed6e39ac74c125fb74afb559 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Wed, 17 May 2023 12:45:22 -0400 Subject: [PATCH 2/3] Update header merging. --- src/main/java/picard/vcf/GenotypeConcordance.java | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/main/java/picard/vcf/GenotypeConcordance.java b/src/main/java/picard/vcf/GenotypeConcordance.java index 5ab257e399..4d66851eb3 100644 --- a/src/main/java/picard/vcf/GenotypeConcordance.java +++ b/src/main/java/picard/vcf/GenotypeConcordance.java @@ -40,13 +40,7 @@ import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; From 2c4f23219e640ad0d90d135e3c6af263542ad19a Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Wed, 17 May 2023 12:45:40 -0400 Subject: [PATCH 3/3] Temporarily use htsjdk snapshot. --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 0d7259299b..f4bb26eca4 100644 --- a/build.gradle +++ b/build.gradle @@ -57,7 +57,7 @@ def ensureBuildPrerequisites(buildPrerequisitesMessage) { ensureBuildPrerequisites(buildPrerequisitesMessage) -final htsjdkVersion = System.getProperty('htsjdk.version', '3.0.1') +final htsjdkVersion = System.getProperty('htsjdk.version', '3.0.5-12-g5700958-SNAPSHOT') final googleNio = 'com.google.cloud:google-cloud-nio:0.123.25' configurations {