From 7d6d220e2b435de4d881f7507891f2c2acb6d8af Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Wed, 5 Feb 2025 10:03:54 -0500 Subject: [PATCH] Create a new tool, FilterFlowCellEdgeReadsTest for filtering out reads with specific flow cell locations --- .../picard/sam/FilterFlowCellEdgeReads.java | 106 ++++++++++++ .../sam/FilterFlowCellEdgeReadsTest.java | 163 ++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 src/main/java/picard/sam/FilterFlowCellEdgeReads.java create mode 100644 src/test/java/picard/sam/FilterFlowCellEdgeReadsTest.java diff --git a/src/main/java/picard/sam/FilterFlowCellEdgeReads.java b/src/main/java/picard/sam/FilterFlowCellEdgeReads.java new file mode 100644 index 0000000000..718f8e86a0 --- /dev/null +++ b/src/main/java/picard/sam/FilterFlowCellEdgeReads.java @@ -0,0 +1,106 @@ +package picard.sam; + +import htsjdk.samtools.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; +import htsjdk.samtools.util.Log; // Added this import +import java.io.File; +import java.io.IOException; + +@CommandLineProgramProperties( + summary = "Filters out reads with specific flowcell coordinates", + oneLineSummary = "Removes reads from specific flowcell positions from BAM/CRAM files", + programGroup = ReadDataManipulationProgramGroup.class +) +public class FilterFlowCellEdgeReads extends CommandLineProgram { + // Initialize logger + private static final Log logger = Log.getInstance(FilterFlowCellEdgeReads.class); + + @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, + doc = "Input BAM/CRAM file") + public String INPUT; + + @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, + doc = "Output BAM/CRAM file") + public String OUTPUT; + + @Argument(shortName = "X", + doc = "X coordinate to filter (default: 1000)", + optional = true) + public int X_COORD = 1000; + + @Argument(shortName = "Y", + doc = "Y coordinate to filter (default: 1000)", + optional = true) + public int Y_COORD = 1000; + + private boolean hasFlowcellCoordinates(String readName) { + // Parse Illumina read name format + // Example format: @HWUSI-EAS100R:6:73:941:1973#0/1 + // or: @EAS139:136:FC706VJ:2:2104:15343:197393 + try { + String[] parts = readName.split(":"); + if (parts.length >= 6) { // Ensure we have enough parts + // The last two numbers are typically X and Y coordinates + int x = Integer.parseInt(parts[parts.length-2]); + int y = Integer.parseInt(parts[parts.length-1].split("[#/]")[0]); // Remove any trailing /1 or #0 + + return x == X_COORD && y == Y_COORD; + } + } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) { + // If we can't parse the coordinates, assume it doesn't match + return false; + } + return false; + } + + @Override + protected int doWork() { + final SamReader reader = SamReaderFactory.makeDefault() + .referenceSequence(REFERENCE_SEQUENCE) + .open(new File(INPUT)); + + final SAMFileHeader header = reader.getFileHeader(); + final SAMFileWriter writer = new SAMFileWriterFactory() + .makeWriter(header, true, new File(OUTPUT), REFERENCE_SEQUENCE); + + // Process reads + int totalReads = 0; + int filteredReads = 0; + + try { + for (final SAMRecord read : reader) { + totalReads++; + + // Check if read has the specified flowcell coordinates + if (hasFlowcellCoordinates(read.getReadName())) { + filteredReads++; + continue; // Skip this read + } + + // Write read to output if it doesn't match filter criteria + writer.addAlignment(read); + } + } finally { + try { + reader.close(); + } catch (IOException e) { + logger.error("Error closing input file", e); + } + writer.close(); + } + + logger.info("Processed " + totalReads + " total reads"); + logger.info("Filtered " + filteredReads + " reads at flowcell position " + X_COORD + ":" + Y_COORD); + logger.info("Wrote " + (totalReads - filteredReads) + " reads to output"); + + return 0; + } + + public static void main(String[] args) { + new FilterFlowCellEdgeReads().instanceMain(args); + } +} diff --git a/src/test/java/picard/sam/FilterFlowCellEdgeReadsTest.java b/src/test/java/picard/sam/FilterFlowCellEdgeReadsTest.java new file mode 100644 index 0000000000..e6ffe51d10 --- /dev/null +++ b/src/test/java/picard/sam/FilterFlowCellEdgeReadsTest.java @@ -0,0 +1,163 @@ +package picard.sam; + +import htsjdk.samtools.*; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +/** + * Unit tests for FilterFlowCellEdgeReads using TestNG. + */ +public class FilterFlowCellEdgeReadsTest { + + // Temporary files for input and output. + private File inputSam; + private File outputSam; + + /** + * Helper method to create a temporary SAM file with one record per provided read name. + * Each record is given minimal required fields. + * + * @param readNames an array of read names to include. + * @return the temporary SAM file. + * @throws IOException if an I/O error occurs. + */ + private File createSamFile(String[] readNames) throws IOException { + File tmpSam = File.createTempFile("FilterFlowCellEdgeReadsTest_input", ".sam"); + tmpSam.deleteOnExit(); + + // Create a minimal SAM file header. + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + // Add one sequence record so that records have a reference. + header.addSequence(new SAMSequenceRecord("chr1", 1000000)); + + // Use SAMFileWriterFactory to write a SAM file. + try (SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, false, tmpSam, null)) { + // Create one record for each read name. + for (String readName : readNames) { + SAMRecord rec = new SAMRecord(header); + rec.setReadName(readName); + rec.setReferenceName("chr1"); + rec.setAlignmentStart(1); + rec.setCigarString("50M"); + // Set dummy bases and qualities. + rec.setReadString("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"); + rec.setBaseQualityString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); + writer.addAlignment(rec); + } + } + return tmpSam; + } + + /** + * Helper method to count the number of SAM records in a given SAM file. + * + * @param samFile the SAM file. + * @return the number of records. + * @throws IOException if an I/O error occurs. + */ + private int countRecords(File samFile) throws IOException { + int count = 0; + try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) { + for (SAMRecord rec : reader) { + count++; + } + } + return count; + } + + @AfterMethod + public void tearDown() { + if (inputSam != null && inputSam.exists()) { + inputSam.delete(); + } + if (outputSam != null && outputSam.exists()) { + outputSam.delete(); + } + } + + /** + * Test with a mixed input: + * – One read with a name that matches the default coordinates ("1000:1000") and should be filtered out. + * – One read with non-matching coordinates ("2000:2000") that should be retained. + */ + @Test + public void testMixedReads() throws IOException { + String[] readNames = new String[]{ + "EAS139:136:FC706VJ:2:1000:1000", // should be filtered out (matches default X_COORD and Y_COORD) + "EAS139:136:FC706VJ:2:2000:2000" // should be retained + }; + inputSam = createSamFile(readNames); + outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam"); + outputSam.deleteOnExit(); + + FilterFlowCellEdgeReads tool = new FilterFlowCellEdgeReads(); + tool.INPUT = inputSam.getAbsolutePath(); + tool.OUTPUT = outputSam.getAbsolutePath(); + // Use default X_COORD=1000, Y_COORD=1000 + + int ret = tool.doWork(); + Assert.assertEquals(ret, 0, "doWork() should return 0"); + + // Only the record that does not match the filter should be written. + int recordCount = countRecords(outputSam); + Assert.assertEquals(recordCount, 1, "Only one record should be written"); + } + + /** + * Test with a read whose name does not contain colon-delimited coordinates. + * The method hasFlowcellCoordinates should catch the exception and return false, + * so the record should be retained. + */ + @Test + public void testNonConformingReadName() throws IOException { + String[] readNames = new String[]{ + "nonconforming_read" // no colon-separated parts → not filtered + }; + inputSam = createSamFile(readNames); + outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam"); + outputSam.deleteOnExit(); + + FilterFlowCellEdgeReads tool = new FilterFlowCellEdgeReads(); + tool.INPUT = inputSam.getAbsolutePath(); + tool.OUTPUT = outputSam.getAbsolutePath(); + // Defaults are used. + + int ret = tool.doWork(); + Assert.assertEquals(ret, 0); + + // The read should be retained. + int recordCount = countRecords(outputSam); + Assert.assertEquals(recordCount, 1, "The nonconforming read should be kept"); + } + + /** + * Test with an input that has only a read with coordinates matching the filter. + * In this case, the tool should filter out the only record and write an empty output. + */ + @Test + public void testAllReadsFiltered() throws IOException { + String[] readNames = new String[]{ + "EAS139:136:FC706VJ:2:1000:1000" // matches filter → filtered out + }; + inputSam = createSamFile(readNames); + outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam"); + outputSam.deleteOnExit(); + + FilterFlowCellEdgeReads tool = new FilterFlowCellEdgeReads(); + tool.INPUT = inputSam.getAbsolutePath(); + tool.OUTPUT = outputSam.getAbsolutePath(); + // Defaults: X_COORD=1000, Y_COORD=1000 + + int ret = tool.doWork(); + Assert.assertEquals(ret, 0); + + // Expect zero records in the output. + int recordCount = countRecords(outputSam); + Assert.assertEquals(recordCount, 0, "No records should be written"); + } +}