broadinstitute · fleharty · Feb 5, 2025 · Feb 5, 2025 · Feb 5, 2025 · Feb 5, 2025
diff --git a/.github/workflows/cloud_tests.yml b/.github/workflows/cloud_tests.yml
@@ -87,7 +87,7 @@ jobs:
 
       - name: Upload test results
         if: always()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: cloud-test-results-${{ matrix.Java }}-barclay-${{ matrix.run_barclay_tests}}
           path: build/reports/tests
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -50,7 +50,7 @@ jobs:
           java -jar build/libs/picard.jar MarkDuplicates -I testdata/picard/sam/aligned_queryname_sorted.bam -O out.bam --METRICS_FILE out.metrics
       - name: Upload test results
         if: always()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: test-results-${{ matrix.Java }}-barclay-${{ matrix.run_barclay_tests}}
           path: build/reports/tests

diff --git a/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java b/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java
@@ -0,0 +1,154 @@
+package picard.sam;
+
+import htsjdk.samtools.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+import htsjdk.samtools.util.Log;
+import picard.sam.util.ReadNameParser;
+import picard.sam.util.PhysicalLocation;
+import java.io.File;
+import java.io.IOException;
+
+@CommandLineProgramProperties(
+        summary = "Filters out reads with specific flowcell coordinates",
+        oneLineSummary = "Removes reads from specific flowcell positions from BAM/CRAM files",
+        programGroup = ReadDataManipulationProgramGroup.class
+)
+public class FilterReadsByFlowCellLocation extends CommandLineProgram {
+    private static final Log logger = Log.getInstance(FilterReadsByFlowCellLocation.class);
+
+    @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
+            doc = "Input BAM/CRAM file")
+    public String INPUT;
+
+    @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
+            doc = "Output BAM/CRAM file")
+    public String OUTPUT;
+
+    @Argument(shortName = "X",
+            doc = "X coordinate to filter (default: 1000)",
+            optional = true)
+    public int X_COORD = 1000;
+
+    @Argument(shortName = "Y",
+            doc = "Y coordinate to filter (default: 1000)",
+            optional = true)
+    public int Y_COORD = 1000;
+
+    private final ReadNameParser readNameParser = new ReadNameParser(ReadNameParser.DEFAULT_READ_NAME_REGEX);
+
+    private boolean hasFlowcellCoordinates(String readName) {
+        class ReadLocation implements PhysicalLocation {
+            private short libraryId;
+            private int x = -1, y = -1; // Default to invalid values
+            private short tile;
+
+            @Override
+            public void setLibraryId(short libraryId) {
+                this.libraryId = libraryId;
+            }
+
+            @Override
+            public short getLibraryId() {
+                return libraryId;
+            }
+
+            @Override
+            public void setX(int x) {
+                this.x = x;
+            }
+
+            @Override
+            public int getX() {
+                return x;
+            }
+
+            @Override
+            public void setY(int y) {
+                this.y = y;
+            }
+
+            @Override
+            public int getY() {
+                return y;
+            }
+
+            @Override
+            public void setReadGroup(short readGroup) {}
+
+            @Override
+            public short getReadGroup() {
+                return 0;
+            }
+
+            @Override
+            public void setTile(short tile) {
+                this.tile = tile;
+            }
+
+            @Override
+            public short getTile() {
+                return tile;
+            }
+        }
+
+        ReadLocation location = new ReadLocation();
+        try {
+            readNameParser.addLocationInformation(readName, location);
+        } catch (Exception e) {
+            logger.warn("Failed to parse read name: " + readName, e);
+            return false;  // Keep the read if parsing fails
+        }
+
+        if (location.getX() == -1 || location.getY() == -1) {
+            return false;  // Keep the read if coordinates are invalid
+        }
+
+        return location.getX() == X_COORD && location.getY() == Y_COORD;
+    }
+
+    @Override
+    protected int doWork() {
+        final SamReader reader = SamReaderFactory.makeDefault()
+                .referenceSequence(REFERENCE_SEQUENCE)
+                .open(new File(INPUT));
+
+        final SAMFileHeader header = reader.getFileHeader();
+        final SAMFileWriter writer = new SAMFileWriterFactory()
+                .makeWriter(header, true, new File(OUTPUT), REFERENCE_SEQUENCE);
+
+        int totalReads = 0;
+        int filteredReads = 0;
+
+        try {
+            for (final SAMRecord read : reader) {
+                totalReads++;
+                if (hasFlowcellCoordinates(read.getReadName())) {
+                    filteredReads++;
+                    continue;
+                }
+                writer.addAlignment(read);
+            }
+        } finally {
+            try {
+                reader.close();
+            } catch (IOException e) {
+                logger.error("Error closing input file", e);
+            }
+            writer.close();
+        }
+
+        logger.info("Processed " + totalReads + " total reads");
+        logger.info("Filtered " + filteredReads + " reads at flowcell position " + X_COORD + ":" + Y_COORD);
+        logger.info("Wrote " + (totalReads - filteredReads) + " reads to output");
+
+        return 0;
+    }
+
+    public static void main(String[] args) {
+        new FilterReadsByFlowCellLocation().instanceMain(args);
+    }
+}
diff --git a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java
@@ -425,7 +425,7 @@ public EstimateLibraryComplexity() {
         } else {
             sizeInBytes = PairedReadSequence.getSizeInBytes();
         }
-        MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2;
+        MAX_RECORDS_IN_RAM = Math.min((int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2, Integer.MAX_VALUE - 32);
     }
 
     /**
@@ -673,4 +673,4 @@ boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int see
         for (int i = 0; i < readLength; i++) total += quals[i];
         return total / readLength >= minQuality;
     }
-}
+}
diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
@@ -479,8 +479,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
         } else {
             sizeInBytes = ReadEndsForMarkDuplicates.getSizeOf();
         }
-        MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2;
-        final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / sizeInBytes);
+        MAX_RECORDS_IN_RAM = Math.min((int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2, Integer.MAX_VALUE - 32);
+        final int maxInMemory = Math.min((int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / sizeInBytes), Integer.MAX_VALUE - 32);
         log.info("Will retain up to " + maxInMemory + " data points before spilling to disk.");
 
         final ReadEndsForMarkDuplicatesCodec fragCodec, pairCodec, diskCodec;
@@ -719,7 +719,7 @@ protected void sortIndicesForDuplicates(final boolean indexOpticalDuplicates){
             entryOverhead = SortingLongCollection.SIZEOF;
         }
         // Keep this number from getting too large even if there is a huge heap.
-        int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
+        int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 32));
         // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
         if (indexOpticalDuplicates) {
             maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);

diff --git a/src/main/java/picard/util/SequenceDictionaryUtils.java b/src/main/java/picard/util/SequenceDictionaryUtils.java
@@ -190,7 +190,7 @@ public static SortingCollection<String> makeSortingCollection() {
                 String.class,
                 new StringCodec(),
                 String::compareTo,
-                (int) Math.min(maxNamesInRam, Integer.MAX_VALUE),
+                (int) Math.min(maxNamesInRam, Integer.MAX_VALUE - 32),
                 tmpDir.toPath()
         );
     }

diff --git a/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java b/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java
@@ -0,0 +1,163 @@
+package picard.sam;
+
+import htsjdk.samtools.*;
+import org.testng.Assert;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Unit tests for FilterFlowCellEdgeReads using TestNG.
+ */
+public class FilterReadsByFlowCellLocationTest {
+
+    // Temporary files for input and output.
+    private File inputSam;
+    private File outputSam;
+
+    /**
+     * Helper method to create a temporary SAM file with one record per provided read name.
+     * Each record is given minimal required fields.
+     *
+     * @param readNames an array of read names to include.
+     * @return the temporary SAM file.
+     * @throws IOException if an I/O error occurs.
+     */
+    private File createSamFile(String[] readNames) throws IOException {
+        File tmpSam = File.createTempFile("FilterFlowCellEdgeReadsTest_input", ".sam");
+        tmpSam.deleteOnExit();
+
+        // Create a minimal SAM file header.
+        SAMFileHeader header = new SAMFileHeader();
+        header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
+        // Add one sequence record so that records have a reference.
+        header.addSequence(new SAMSequenceRecord("chr1", 1000000));
+
+        // Use SAMFileWriterFactory to write a SAM file.
+        try (SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, false, tmpSam, null)) {
+            // Create one record for each read name.
+            for (String readName : readNames) {
+                SAMRecord rec = new SAMRecord(header);
+                rec.setReadName(readName);
+                rec.setReferenceName("chr1");
+                rec.setAlignmentStart(1);
+                rec.setCigarString("50M");
+                // Set dummy bases and qualities.
+                rec.setReadString("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
+                rec.setBaseQualityString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF");
+                writer.addAlignment(rec);
+            }
+        }
+        return tmpSam;
+    }
+
+    /**
+     * Helper method to count the number of SAM records in a given SAM file.
+     *
+     * @param samFile the SAM file.
+     * @return the number of records.
+     * @throws IOException if an I/O error occurs.
+     */
+    private int countRecords(File samFile) throws IOException {
+        int count = 0;
+        try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) {
+            for (SAMRecord rec : reader) {
+                count++;
+            }
+        }
+        return count;
+    }
+
+    @AfterMethod
+    public void tearDown() {
+        if (inputSam != null && inputSam.exists()) {
+            inputSam.delete();
+        }
+        if (outputSam != null && outputSam.exists()) {
+            outputSam.delete();
+        }
+    }
+
+    /**
+     * Test with a mixed input:
+     * – One read with a name that matches the default coordinates ("1000:1000") and should be filtered out.
+     * – One read with non-matching coordinates ("2000:2000") that should be retained.
+     */
+    @Test
+    public void testMixedReads() throws IOException {
+        String[] readNames = new String[]{
+                "EAS139:136:FC706VJ:2:1000:1000", // should be filtered out (matches default X_COORD and Y_COORD)
+                "EAS139:136:FC706VJ:2:2000:2000"  // should be retained
+        };
+        inputSam = createSamFile(readNames);
+        outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
+        outputSam.deleteOnExit();
+
+        FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation();
+        tool.INPUT = inputSam.getAbsolutePath();
+        tool.OUTPUT = outputSam.getAbsolutePath();
+        // Use default X_COORD=1000, Y_COORD=1000
+
+        int ret = tool.doWork();
+        Assert.assertEquals(ret, 0, "doWork() should return 0");
+
+        // Only the record that does not match the filter should be written.
+        int recordCount = countRecords(outputSam);
+        Assert.assertEquals(recordCount, 1, "Only one record should be written");
+    }
+
+    /**
+     * Test with a read whose name does not contain colon-delimited coordinates.
+     * The method hasFlowcellCoordinates should catch the exception and return false,
+     * so the record should be retained.
+     */
+    @Test
+    public void testNonConformingReadName() throws IOException {
+        String[] readNames = new String[]{
+                "nonconforming_read"  // no colon-separated parts → not filtered
+        };
+        inputSam = createSamFile(readNames);
+        outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
+        outputSam.deleteOnExit();
+
+        FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation();
+        tool.INPUT = inputSam.getAbsolutePath();
+        tool.OUTPUT = outputSam.getAbsolutePath();
+        // Defaults are used.
+
+        int ret = tool.doWork();
+        Assert.assertEquals(ret, 0);
+
+        // The read should be retained.
+        int recordCount = countRecords(outputSam);
+        Assert.assertEquals(recordCount, 1, "The nonconforming read should be kept");
+    }
+
+    /**
+     * Test with an input that has only a read with coordinates matching the filter.
+     * In this case, the tool should filter out the only record and write an empty output.
+     */
+    @Test
+    public void testAllReadsFiltered() throws IOException {
+        String[] readNames = new String[]{
+                "EAS139:136:FC706VJ:2:1000:1000"  // matches filter → filtered out
+        };
+        inputSam = createSamFile(readNames);
+        outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
+        outputSam.deleteOnExit();
+
+        FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation();
+        tool.INPUT = inputSam.getAbsolutePath();
+        tool.OUTPUT = outputSam.getAbsolutePath();
+        // Defaults: X_COORD=1000, Y_COORD=1000
+
+        int ret = tool.doWork();
+        Assert.assertEquals(ret, 0);
+
+        // Expect zero records in the output.
+        int recordCount = countRecords(outputSam);
+        Assert.assertEquals(recordCount, 0, "No records should be written");
+    }
+}