Skip to content

Commit

Permalink
Create a new tool, FilterFlowCellEdgeReadsTest for filtering out read…
Browse files Browse the repository at this point in the history
…s with specific flow cell locations
  • Loading branch information
fleharty committed Feb 5, 2025
1 parent 5b17c47 commit 7d6d220
Show file tree
Hide file tree
Showing 2 changed files with 269 additions and 0 deletions.
106 changes: 106 additions & 0 deletions src/main/java/picard/sam/FilterFlowCellEdgeReads.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package picard.sam;

import htsjdk.samtools.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import htsjdk.samtools.util.Log; // Added this import
import java.io.File;
import java.io.IOException;

@CommandLineProgramProperties(
summary = "Filters out reads with specific flowcell coordinates",
oneLineSummary = "Removes reads from specific flowcell positions from BAM/CRAM files",
programGroup = ReadDataManipulationProgramGroup.class
)
public class FilterFlowCellEdgeReads extends CommandLineProgram {
// Initialize logger
private static final Log logger = Log.getInstance(FilterFlowCellEdgeReads.class);

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
doc = "Input BAM/CRAM file")
public String INPUT;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
doc = "Output BAM/CRAM file")
public String OUTPUT;

@Argument(shortName = "X",
doc = "X coordinate to filter (default: 1000)",
optional = true)
public int X_COORD = 1000;

@Argument(shortName = "Y",
doc = "Y coordinate to filter (default: 1000)",
optional = true)
public int Y_COORD = 1000;

private boolean hasFlowcellCoordinates(String readName) {
// Parse Illumina read name format
// Example format: @HWUSI-EAS100R:6:73:941:1973#0/1
// or: @EAS139:136:FC706VJ:2:2104:15343:197393
try {
String[] parts = readName.split(":");
if (parts.length >= 6) { // Ensure we have enough parts
// The last two numbers are typically X and Y coordinates
int x = Integer.parseInt(parts[parts.length-2]);
int y = Integer.parseInt(parts[parts.length-1].split("[#/]")[0]); // Remove any trailing /1 or #0

return x == X_COORD && y == Y_COORD;
}
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
// If we can't parse the coordinates, assume it doesn't match
return false;
}
return false;
}

@Override
protected int doWork() {
final SamReader reader = SamReaderFactory.makeDefault()
.referenceSequence(REFERENCE_SEQUENCE)
.open(new File(INPUT));

final SAMFileHeader header = reader.getFileHeader();
final SAMFileWriter writer = new SAMFileWriterFactory()
.makeWriter(header, true, new File(OUTPUT), REFERENCE_SEQUENCE);

// Process reads
int totalReads = 0;
int filteredReads = 0;

try {
for (final SAMRecord read : reader) {
totalReads++;

// Check if read has the specified flowcell coordinates
if (hasFlowcellCoordinates(read.getReadName())) {
filteredReads++;
continue; // Skip this read
}

// Write read to output if it doesn't match filter criteria
writer.addAlignment(read);
}
} finally {
try {
reader.close();
} catch (IOException e) {
logger.error("Error closing input file", e);
}
writer.close();
}

logger.info("Processed " + totalReads + " total reads");
logger.info("Filtered " + filteredReads + " reads at flowcell position " + X_COORD + ":" + Y_COORD);
logger.info("Wrote " + (totalReads - filteredReads) + " reads to output");

return 0;
}

public static void main(String[] args) {
new FilterFlowCellEdgeReads().instanceMain(args);
}
}
163 changes: 163 additions & 0 deletions src/test/java/picard/sam/FilterFlowCellEdgeReadsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
package picard.sam;

import htsjdk.samtools.*;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;

/**
* Unit tests for FilterFlowCellEdgeReads using TestNG.
*/
public class FilterFlowCellEdgeReadsTest {

// Temporary files for input and output.
private File inputSam;
private File outputSam;

/**
* Helper method to create a temporary SAM file with one record per provided read name.
* Each record is given minimal required fields.
*
* @param readNames an array of read names to include.
* @return the temporary SAM file.
* @throws IOException if an I/O error occurs.
*/
private File createSamFile(String[] readNames) throws IOException {
File tmpSam = File.createTempFile("FilterFlowCellEdgeReadsTest_input", ".sam");
tmpSam.deleteOnExit();

// Create a minimal SAM file header.
SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
// Add one sequence record so that records have a reference.
header.addSequence(new SAMSequenceRecord("chr1", 1000000));

// Use SAMFileWriterFactory to write a SAM file.
try (SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, false, tmpSam, null)) {
// Create one record for each read name.
for (String readName : readNames) {
SAMRecord rec = new SAMRecord(header);
rec.setReadName(readName);
rec.setReferenceName("chr1");
rec.setAlignmentStart(1);
rec.setCigarString("50M");
// Set dummy bases and qualities.
rec.setReadString("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
rec.setBaseQualityString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF");
writer.addAlignment(rec);
}
}
return tmpSam;
}

/**
* Helper method to count the number of SAM records in a given SAM file.
*
* @param samFile the SAM file.
* @return the number of records.
* @throws IOException if an I/O error occurs.
*/
private int countRecords(File samFile) throws IOException {
int count = 0;
try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) {
for (SAMRecord rec : reader) {
count++;
}
}
return count;
}

@AfterMethod
public void tearDown() {
if (inputSam != null && inputSam.exists()) {
inputSam.delete();
}
if (outputSam != null && outputSam.exists()) {
outputSam.delete();
}
}

/**
* Test with a mixed input:
* – One read with a name that matches the default coordinates ("1000:1000") and should be filtered out.
* – One read with non-matching coordinates ("2000:2000") that should be retained.
*/
@Test
public void testMixedReads() throws IOException {
String[] readNames = new String[]{
"EAS139:136:FC706VJ:2:1000:1000", // should be filtered out (matches default X_COORD and Y_COORD)
"EAS139:136:FC706VJ:2:2000:2000" // should be retained
};
inputSam = createSamFile(readNames);
outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
outputSam.deleteOnExit();

FilterFlowCellEdgeReads tool = new FilterFlowCellEdgeReads();
tool.INPUT = inputSam.getAbsolutePath();
tool.OUTPUT = outputSam.getAbsolutePath();
// Use default X_COORD=1000, Y_COORD=1000

int ret = tool.doWork();
Assert.assertEquals(ret, 0, "doWork() should return 0");

// Only the record that does not match the filter should be written.
int recordCount = countRecords(outputSam);
Assert.assertEquals(recordCount, 1, "Only one record should be written");
}

/**
* Test with a read whose name does not contain colon-delimited coordinates.
* The method hasFlowcellCoordinates should catch the exception and return false,
* so the record should be retained.
*/
@Test
public void testNonConformingReadName() throws IOException {
String[] readNames = new String[]{
"nonconforming_read" // no colon-separated parts → not filtered
};
inputSam = createSamFile(readNames);
outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
outputSam.deleteOnExit();

FilterFlowCellEdgeReads tool = new FilterFlowCellEdgeReads();
tool.INPUT = inputSam.getAbsolutePath();
tool.OUTPUT = outputSam.getAbsolutePath();
// Defaults are used.

int ret = tool.doWork();
Assert.assertEquals(ret, 0);

// The read should be retained.
int recordCount = countRecords(outputSam);
Assert.assertEquals(recordCount, 1, "The nonconforming read should be kept");
}

/**
* Test with an input that has only a read with coordinates matching the filter.
* In this case, the tool should filter out the only record and write an empty output.
*/
@Test
public void testAllReadsFiltered() throws IOException {
String[] readNames = new String[]{
"EAS139:136:FC706VJ:2:1000:1000" // matches filter → filtered out
};
inputSam = createSamFile(readNames);
outputSam = File.createTempFile("FilterFlowCellEdgeReadsTest_output", ".sam");
outputSam.deleteOnExit();

FilterFlowCellEdgeReads tool = new FilterFlowCellEdgeReads();
tool.INPUT = inputSam.getAbsolutePath();
tool.OUTPUT = outputSam.getAbsolutePath();
// Defaults: X_COORD=1000, Y_COORD=1000

int ret = tool.doWork();
Assert.assertEquals(ret, 0);

// Expect zero records in the output.
int recordCount = countRecords(outputSam);
Assert.assertEquals(recordCount, 0, "No records should be written");
}
}

0 comments on commit 7d6d220

Please sign in to comment.