broadinstitute · fleharty · Feb 5, 2025 · Feb 5, 2025 · Feb 5, 2025 · Feb 5, 2025
diff --git a/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java b/src/main/java/picard/sam/FilterReadsByFlowCellLocation.java
@@ -0,0 +1,106 @@
+package picard.sam;
+
+import htsjdk.samtools.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+import htsjdk.samtools.util.Log;  // Added this import
+import java.io.File;
+import java.io.IOException;
+
+@CommandLineProgramProperties(
+        summary = "Filters out reads with specific flowcell coordinates",
+        oneLineSummary = "Removes reads from specific flowcell positions from BAM/CRAM files",
+        programGroup = ReadDataManipulationProgramGroup.class
+)
+public class FilterReadsByFlowCellLocation extends CommandLineProgram {
+    // Initialize logger
+    private static final Log logger = Log.getInstance(FilterReadsByFlowCellLocation.class);
+
+    @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
+            doc = "Input BAM/CRAM file")
+    public String INPUT;
+
+    @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
+            doc = "Output BAM/CRAM file")
+    public String OUTPUT;
+
+    @Argument(shortName = "X",
+            doc = "X coordinate to filter (default: 1000)",
+            optional = true)
+    public int X_COORD = 1000;
+
+    @Argument(shortName = "Y",
+            doc = "Y coordinate to filter (default: 1000)",
+            optional = true)
+    public int Y_COORD = 1000;
+
+    private boolean hasFlowcellCoordinates(String readName) {
+        // Parse Illumina read name format
+        // Example format: @HWUSI-EAS100R:6:73:941:1973#0/1
+        // or: @EAS139:136:FC706VJ:2:2104:15343:197393
+        try {
+            String[] parts = readName.split(":");
+            if (parts.length >= 6) {  // Ensure we have enough parts
+                // The last two numbers are typically X and Y coordinates
+                int x = Integer.parseInt(parts[parts.length-2]);
+                int y = Integer.parseInt(parts[parts.length-1].split("[#/]")[0]);  // Remove any trailing /1 or #0
+
+                return x == X_COORD && y == Y_COORD;
+            }
+        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
+            // If we can't parse the coordinates, assume it doesn't match
+            return false;
+        }
+        return false;
+    }
+
+    @Override
+    protected int doWork() {
+        final SamReader reader = SamReaderFactory.makeDefault()
+                .referenceSequence(REFERENCE_SEQUENCE)
+                .open(new File(INPUT));
+
+        final SAMFileHeader header = reader.getFileHeader();
+        final SAMFileWriter writer = new SAMFileWriterFactory()
+                .makeWriter(header, true, new File(OUTPUT), REFERENCE_SEQUENCE);
+
+        // Process reads
+        int totalReads = 0;
+        int filteredReads = 0;
+
+        try {
+            for (final SAMRecord read : reader) {
+                totalReads++;
+
+                // Check if read has the specified flowcell coordinates
+                if (hasFlowcellCoordinates(read.getReadName())) {
+                    filteredReads++;
+                    continue; // Skip this read
+                }
+
+                // Write read to output if it doesn't match filter criteria
+                writer.addAlignment(read);
+            }
+        } finally {
+            try {
+                reader.close();
+            } catch (IOException e) {
+                logger.error("Error closing input file", e);
+            }
+            writer.close();
+        }
+
+        logger.info("Processed " + totalReads + " total reads");
+        logger.info("Filtered " + filteredReads + " reads at flowcell position " + X_COORD + ":" + Y_COORD);
+        logger.info("Wrote " + (totalReads - filteredReads) + " reads to output");
+
+        return 0;
+    }
+
+    public static void main(String[] args) {
+        new FilterReadsByFlowCellLocation().instanceMain(args);
+    }
+}
diff --git a/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java b/src/test/java/picard/sam/FilterReadsByFlowCellLocationTest.java
@@ -0,0 +1,163 @@
+package picard.sam;
+
+import htsjdk.samtools.*;
+import org.testng.Assert;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Unit tests for FilterFlowCellEdgeReads using TestNG.
+ */
+public class FilterReadsByFlowCellLocationTest {
+
+    // Temporary files for input and output.
+    private File inputSam;
+    private File outputSam;
+
+    /**
+     * Helper method to create a temporary SAM file with one record per provided read name.
+     * Each record is given minimal required fields.
+     *
+     * @param readNames an array of read names to include.
+     * @return the temporary SAM file.
+     * @throws IOException if an I/O error occurs.
+     */
+    private File createSamFile(String[] readNames) throws IOException {
+        File tmpSam = File.createTempFile("FilterFlowCellEdgeReadsTest_input", ".sam");
+        tmpSam.deleteOnExit();
+
+        // Create a minimal SAM file header.
+        SAMFileHeader header = new SAMFileHeader();
+        header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
+        // Add one sequence record so that records have a reference.
+        header.addSequence(new SAMSequenceRecord("chr1", 1000000));
+
+        // Use SAMFileWriterFactory to write a SAM file.
+        try (SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, false, tmpSam, null)) {
+            // Create one record for each read name.
+            for (String readName : readNames) {
+                SAMRecord rec = new SAMRecord(header);
+                rec.setReadName(readName);
+                rec.setReferenceName("chr1");
+                rec.setAlignmentStart(1);
+                rec.setCigarString("50M");
+                // Set dummy bases and qualities.
+                rec.setReadString("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
+                rec.setBaseQualityString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF");
+                writer.addAlignment(rec);
+            }
+        }
+        return tmpSam;
+    }
+
+    /**
+     * Helper method to count the number of SAM records in a given SAM file.
+     *
+     * @param samFile the SAM file.
+     * @return the number of records.
+     * @throws IOException if an I/O error occurs.
+     */
+    private int countRecords(File samFile) throws IOException {
+        int count = 0;
+        try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) {
+            for (SAMRecord rec : reader) {
+                count++;
+            }
+        }
+        return count;
+    }
+
+    @AfterMethod
+    public void tearDown() {
+        if (inputSam != null && inputSam.exists()) {
+            inputSam.delete();
+        }
+        if (outputSam != null && outputSam.exists()) {
+            outputSam.delete();
+        }
+    }
+
+    /**
+     * Test with a mixed input:
+     * – One read with a name that matches the default coordinates ("1000:1000") and should be filtered out.
+     * – One read with non-matching coordinates ("2000:2000") that should be retained.
+     */
+    @Test
+    public void testMixedReads() throws IOException {
+        String[] readNames = new String[]{
+                "EAS139:136:FC706VJ:2:1000:1000", // should be filtered out (matches default X_COORD and Y_COORD)
+                "EAS139:136:FC706VJ:2:2000:2000"  // should be retained
+        };
+        inputSam = createSamFile(readNames);
+        outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
+        outputSam.deleteOnExit();
+
+        FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation();
+        tool.INPUT = inputSam.getAbsolutePath();
+        tool.OUTPUT = outputSam.getAbsolutePath();
+        // Use default X_COORD=1000, Y_COORD=1000
+
+        int ret = tool.doWork();
+        Assert.assertEquals(ret, 0, "doWork() should return 0");
+
+        // Only the record that does not match the filter should be written.
+        int recordCount = countRecords(outputSam);
+        Assert.assertEquals(recordCount, 1, "Only one record should be written");
+    }
+
+    /**
+     * Test with a read whose name does not contain colon-delimited coordinates.
+     * The method hasFlowcellCoordinates should catch the exception and return false,
+     * so the record should be retained.
+     */
+    @Test
+    public void testNonConformingReadName() throws IOException {
+        String[] readNames = new String[]{
+                "nonconforming_read"  // no colon-separated parts → not filtered
+        };
+        inputSam = createSamFile(readNames);
+        outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
+        outputSam.deleteOnExit();
+
+        FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation();
+        tool.INPUT = inputSam.getAbsolutePath();
+        tool.OUTPUT = outputSam.getAbsolutePath();
+        // Defaults are used.
+
+        int ret = tool.doWork();
+        Assert.assertEquals(ret, 0);
+
+        // The read should be retained.
+        int recordCount = countRecords(outputSam);
+        Assert.assertEquals(recordCount, 1, "The nonconforming read should be kept");
+    }
+
+    /**
+     * Test with an input that has only a read with coordinates matching the filter.
+     * In this case, the tool should filter out the only record and write an empty output.
+     */
+    @Test
+    public void testAllReadsFiltered() throws IOException {
+        String[] readNames = new String[]{
+                "EAS139:136:FC706VJ:2:1000:1000"  // matches filter → filtered out
+        };
+        inputSam = createSamFile(readNames);
+        outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
+        outputSam.deleteOnExit();
+
+        FilterReadsByFlowCellLocation tool = new FilterReadsByFlowCellLocation();
+        tool.INPUT = inputSam.getAbsolutePath();
+        tool.OUTPUT = outputSam.getAbsolutePath();
+        // Defaults: X_COORD=1000, Y_COORD=1000
+
+        int ret = tool.doWork();
+        Assert.assertEquals(ret, 0);
+
+        // Expect zero records in the output.
+        int recordCount = countRecords(outputSam);
+        Assert.assertEquals(recordCount, 0, "No records should be written");
+    }
+}