diff --git a/.github/workflows/cloud_tests.yml b/.github/workflows/cloud_tests.yml index e8049f415f..7526a531ba 100644 --- a/.github/workflows/cloud_tests.yml +++ b/.github/workflows/cloud_tests.yml @@ -87,7 +87,7 @@ jobs: - name: Upload test results if: always() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: cloud-test-results-${{ matrix.Java }}-barclay-${{ matrix.run_barclay_tests}} path: build/reports/tests \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 30fddcd431..af0fa3bae3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,7 +50,7 @@ jobs: java -jar build/libs/picard.jar MarkDuplicates -I testdata/picard/sam/aligned_queryname_sorted.bam -O out.bam --METRICS_FILE out.metrics - name: Upload test results if: always() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: test-results-${{ matrix.Java }}-barclay-${{ matrix.run_barclay_tests}} path: build/reports/tests diff --git a/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java b/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java new file mode 100644 index 0000000000..09b0d58293 --- /dev/null +++ b/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java @@ -0,0 +1,154 @@ +package picard.sam; + +import htsjdk.samtools.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; +import htsjdk.samtools.util.Log; +import picard.sam.util.ReadNameParser; +import picard.sam.util.PhysicalLocation; +import java.io.File; +import java.io.IOException; + +@CommandLineProgramProperties( + summary = "Filters out reads with specific flowcell coordinates", + oneLineSummary = "Removes reads from specific flowcell positions from BAM/CRAM files", + programGroup = ReadDataManipulationProgramGroup.class +) +public class FilterReadsByFlowCellLocation extends CommandLineProgram { + private static final Log logger = Log.getInstance(FilterReadsByFlowCellLocation.class); + + @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, + doc = "Input BAM/CRAM file") + public String INPUT; + + @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, + doc = "Output BAM/CRAM file") + public String OUTPUT; + + @Argument(shortName = "X", + doc = "X coordinate to filter (default: 1000)", + optional = true) + public int X_COORD = 1000; + + @Argument(shortName = "Y", + doc = "Y coordinate to filter (default: 1000)", + optional = true) + public int Y_COORD = 1000; + + private final ReadNameParser readNameParser = new ReadNameParser(ReadNameParser.DEFAULT_READ_NAME_REGEX); + + private boolean hasFlowcellCoordinates(String readName) { + class ReadLocation implements PhysicalLocation { + private short libraryId; + private int x = -1, y = -1; // Default to invalid values + private short tile; + + @Override + public void setLibraryId(short libraryId) { + this.libraryId = libraryId; + } + + @Override + public short getLibraryId() { + return libraryId; + } + + @Override + public void setX(int x) { + this.x = x; + } + + @Override + public int getX() { + return x; + } + + @Override + public void setY(int y) { + this.y = y; + } + + @Override + public int getY() { + return y; + } + + @Override + public void setReadGroup(short readGroup) {} + + @Override + public short getReadGroup() { + return 0; + } + + @Override + public void setTile(short tile) { + this.tile = tile; + } + + @Override + public short getTile() { + return tile; + } + } + + ReadLocation location = new ReadLocation(); + try { + readNameParser.addLocationInformation(readName, location); + } catch (Exception e) { + logger.warn("Failed to parse read name: " + readName, e); + return false; // Keep the read if parsing fails + } + + if (location.getX() == -1 || location.getY() == -1) { + return false; // Keep the read if coordinates are invalid + } + + return location.getX() == X_COORD && location.getY() == Y_COORD; + } + + @Override + protected int doWork() { + final SamReader reader = SamReaderFactory.makeDefault() + .referenceSequence(REFERENCE_SEQUENCE) + .open(new File(INPUT)); + + final SAMFileHeader header = reader.getFileHeader(); + final SAMFileWriter writer = new SAMFileWriterFactory() + .makeWriter(header, true, new File(OUTPUT), REFERENCE_SEQUENCE); + + int totalReads = 0; + int filteredReads = 0; + + try { + for (final SAMRecord read : reader) { + totalReads++; + if (hasFlowcellCoordinates(read.getReadName())) { + filteredReads++; + continue; + } + writer.addAlignment(read); + } + } finally { + try { + reader.close(); + } catch (IOException e) { + logger.error("Error closing input file", e); + } + writer.close(); + } + + logger.info("Processed " + totalReads + " total reads"); + logger.info("Filtered " + filteredReads + " reads at flowcell position " + X_COORD + ":" + Y_COORD); + logger.info("Wrote " + (totalReads - filteredReads) + " reads to output"); + + return 0; + } + + public static void main(String[] args) { + new FilterReadsByFlowCellLocation().instanceMain(args); + } +} diff --git a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java index 6f32f33d8c..69fee46db3 100644 --- a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java +++ b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java @@ -425,7 +425,7 @@ public EstimateLibraryComplexity() { } else { sizeInBytes = PairedReadSequence.getSizeInBytes(); } - MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2; + MAX_RECORDS_IN_RAM = Math.min((int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2, Integer.MAX_VALUE - 32); } /** @@ -673,4 +673,4 @@ boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int see for (int i = 0; i < readLength; i++) total += quals[i]; return total / readLength >= minQuality; } -} \ No newline at end of file +} diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java index 669fb8cbca..7d1b84c051 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java @@ -479,8 +479,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) { } else { sizeInBytes = ReadEndsForMarkDuplicates.getSizeOf(); } - MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2; - final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / sizeInBytes); + MAX_RECORDS_IN_RAM = Math.min((int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2, Integer.MAX_VALUE - 32); + final int maxInMemory = Math.min((int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / sizeInBytes), Integer.MAX_VALUE - 32); log.info("Will retain up to " + maxInMemory + " data points before spilling to disk."); final ReadEndsForMarkDuplicatesCodec fragCodec, pairCodec, diskCodec; @@ -719,7 +719,7 @@ protected void sortIndicesForDuplicates(final boolean indexOpticalDuplicates){ entryOverhead = SortingLongCollection.SIZEOF; } // Keep this number from getting too large even if there is a huge heap. - int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5)); + int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 32)); // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections if (indexOpticalDuplicates) { maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead); diff --git a/src/main/java/picard/util/SequenceDictionaryUtils.java b/src/main/java/picard/util/SequenceDictionaryUtils.java index 178935fb52..972f90a674 100644 --- a/src/main/java/picard/util/SequenceDictionaryUtils.java +++ b/src/main/java/picard/util/SequenceDictionaryUtils.java @@ -190,7 +190,7 @@ public static SortingCollection makeSortingCollection() { String.class, new StringCodec(), String::compareTo, - (int) Math.min(maxNamesInRam, Integer.MAX_VALUE), + (int) Math.min(maxNamesInRam, Integer.MAX_VALUE - 32), tmpDir.toPath() ); } diff --git a/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java b/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java new file mode 100644 index 0000000000..4055d290e9 --- /dev/null +++ b/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java @@ -0,0 +1,163 @@ +package picard.sam; + +import htsjdk.samtools.*; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +/** + * Unit tests for FilterFlowCellEdgeReads using TestNG. + */ +public class FilterReadsByFlowCellLocationTest { + + // Temporary files for input and output. + private File inputSam; + private File outputSam; + + /** + * Helper method to create a temporary SAM file with one record per provided read name. + * Each record is given minimal required fields. + * + * @param readNames an array of read names to include. + * @return the temporary SAM file. + * @throws IOException if an I/O error occurs. + */ + private File createSamFile(String[] readNames) throws IOException { + File tmpSam = File.createTempFile("FilterFlowCellEdgeReadsTest_input", ".sam"); + tmpSam.deleteOnExit(); + + // Create a minimal SAM file header. + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + // Add one sequence record so that records have a reference. + header.addSequence(new SAMSequenceRecord("chr1", 1000000)); + + // Use SAMFileWriterFactory to write a SAM file. + try (SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, false, tmpSam, null)) { + // Create one record for each read name. + for (String readName : readNames) { + SAMRecord rec = new SAMRecord(header); + rec.setReadName(readName); + rec.setReferenceName("chr1"); + rec.setAlignmentStart(1); + rec.setCigarString("50M"); + // Set dummy bases and qualities. + rec.setReadString("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"); + rec.setBaseQualityString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); + writer.addAlignment(rec); + } + } + return tmpSam; + } + + /** + * Helper method to count the number of SAM records in a given SAM file. + * + * @param samFile the SAM file. + * @return the number of records. + * @throws IOException if an I/O error occurs. + */ + private int countRecords(File samFile) throws IOException { + int count = 0; + try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) { + for (SAMRecord rec : reader) { + count++; + } + } + return count; + } + + @AfterMethod + public void tearDown() { + if (inputSam != null && inputSam.exists()) { + inputSam.delete(); + } + if (outputSam != null && outputSam.exists()) { + outputSam.delete(); + } + } + + /** + * Test with a mixed input: + * – One read with a name that matches the default coordinates ("1000:1000") and should be filtered out. + * – One read with non-matching coordinates ("2000:2000") that should be retained. + */ + @Test + public void testMixedReads() throws IOException { + String[] readNames = new String[]{ + "EAS139:136:FC706VJ:2:1000:1000", // should be filtered out (matches default X_COORD and Y_COORD) + "EAS139:136:FC706VJ:2:2000:2000" // should be retained + }; + inputSam = createSamFile(readNames); + outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam"); + outputSam.deleteOnExit(); + + FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation(); + tool.INPUT = inputSam.getAbsolutePath(); + tool.OUTPUT = outputSam.getAbsolutePath(); + // Use default X_COORD=1000, Y_COORD=1000 + + int ret = tool.doWork(); + Assert.assertEquals(ret, 0, "doWork() should return 0"); + + // Only the record that does not match the filter should be written. + int recordCount = countRecords(outputSam); + Assert.assertEquals(recordCount, 1, "Only one record should be written"); + } + + /** + * Test with a read whose name does not contain colon-delimited coordinates. + * The method hasFlowcellCoordinates should catch the exception and return false, + * so the record should be retained. + */ + @Test + public void testNonConformingReadName() throws IOException { + String[] readNames = new String[]{ + "nonconforming_read" // no colon-separated parts → not filtered + }; + inputSam = createSamFile(readNames); + outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam"); + outputSam.deleteOnExit(); + + FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation(); + tool.INPUT = inputSam.getAbsolutePath(); + tool.OUTPUT = outputSam.getAbsolutePath(); + // Defaults are used. + + int ret = tool.doWork(); + Assert.assertEquals(ret, 0); + + // The read should be retained. + int recordCount = countRecords(outputSam); + Assert.assertEquals(recordCount, 1, "The nonconforming read should be kept"); + } + + /** + * Test with an input that has only a read with coordinates matching the filter. + * In this case, the tool should filter out the only record and write an empty output. + */ + @Test + public void testAllReadsFiltered() throws IOException { + String[] readNames = new String[]{ + "EAS139:136:FC706VJ:2:1000:1000" // matches filter → filtered out + }; + inputSam = createSamFile(readNames); + outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam"); + outputSam.deleteOnExit(); + + FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation(); + tool.INPUT = inputSam.getAbsolutePath(); + tool.OUTPUT = outputSam.getAbsolutePath(); + // Defaults: X_COORD=1000, Y_COORD=1000 + + int ret = tool.doWork(); + Assert.assertEquals(ret, 0); + + // Expect zero records in the output. + int recordCount = countRecords(outputSam); + Assert.assertEquals(recordCount, 0, "No records should be written"); + } +}