From 133813ef0e4dab2c8eb485d037be1afa5ca56a7c Mon Sep 17 00:00:00 2001 From: Roman Valls Guimera Date: Thu, 31 Jan 2013 09:33:12 +0100 Subject: [PATCH] Graduating FACS C implementation to SciLifeLab repository --- .gitignore | 14 + README | 31 - README.md | 111 ++++ bin/bloombuild.pl | 268 -------- bin/facs.pl | 516 --------------- bin/facs_batch.pl | 518 --------------- doc/GC_.py | 48 ++ doc/ROC.m | 169 +++++ doc/prop.m | 41 ++ drass/Makefile | 44 ++ drass/big_query.c | 277 ++++++++ drass/big_query.h | 15 + drass/bloom.c | 639 ++++++++++++++++++ drass/bloom.h | 133 ++++ drass/build.h | 12 + drass/check.h | 14 + drass/facs.c | 79 +++ drass/file_dir.c | 164 +++++ drass/file_dir.h | 11 + drass/good_build.c | 265 ++++++++ drass/hashes.h | 10 + drass/lookup8.c | 570 ++++++++++++++++ drass/lookup8.h | 7 + drass/main.c | 60 ++ drass/mpi_bloom.c | 1055 ++++++++++++++++++++++++++++++ drass/remove.h | 11 + drass/remove_l.h | 12 + drass/setup.cfg | 3 + drass/setup.py | 34 + drass/simple_check_1_ge.c | 264 ++++++++ drass/simple_remove.c | 347 ++++++++++ drass/simple_remove_l.c | 392 +++++++++++ drass/suggestions.c | 164 +++++ drass/tool.c | 453 +++++++++++++ drass/tool.h | 14 + tests/test_basic.py | 66 ++ tests/test_thousand_genomes.py | 51 ++ tests/utils/__init__.py | 0 tests/utils/fastq_dummy.py | 36 + tests/utils/helpers.py | 143 ++++ tests/utils/valgrind-python.supp | 391 +++++++++++ 41 files changed, 6119 insertions(+), 1333 deletions(-) create mode 100644 .gitignore delete mode 100644 README create mode 100644 README.md delete mode 100755 bin/bloombuild.pl delete mode 100755 bin/facs.pl delete mode 100755 bin/facs_batch.pl create mode 100644 doc/GC_.py create mode 100644 doc/ROC.m create mode 100644 doc/prop.m create mode 100644 drass/Makefile create mode 100644 drass/big_query.c create mode 100644 drass/big_query.h create mode 100644 drass/bloom.c create mode 100644 drass/bloom.h create mode 100644 drass/build.h create mode 100644 drass/check.h create mode 100644 drass/facs.c create mode 100644 drass/file_dir.c create mode 100644 drass/file_dir.h create mode 100644 drass/good_build.c create mode 100644 drass/hashes.h create mode 100644 drass/lookup8.c create mode 100644 drass/lookup8.h create mode 100644 drass/main.c create mode 100644 drass/mpi_bloom.c create mode 100644 drass/remove.h create mode 100644 drass/remove_l.h create mode 100644 drass/setup.cfg create mode 100755 drass/setup.py create mode 100644 drass/simple_check_1_ge.c create mode 100644 drass/simple_remove.c create mode 100644 drass/simple_remove_l.c create mode 100644 drass/suggestions.c create mode 100644 drass/tool.c create mode 100644 drass/tool.h create mode 100644 tests/test_basic.py create mode 100644 tests/test_thousand_genomes.py create mode 100644 tests/utils/__init__.py create mode 100755 tests/utils/fastq_dummy.py create mode 100644 tests/utils/helpers.py create mode 100644 tests/utils/valgrind-python.supp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e311cd1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +*.o +*.so +*.a +*.pyc +*.swp +*.bloom +*.fasta +*.fastq +*.info +core.* +vgcore.* +*.egg-info +tests/data +build/ diff --git a/README b/README deleted file mode 100644 index f2d8528..0000000 --- a/README +++ /dev/null @@ -1,31 +0,0 @@ -facs README - -For more information, please visit the FACS website: -http://facs.scilifelab.se - -Overview - -New generation sequence technologies and the sequencing of increasingly -complex datasets demand new efficient and specialized sequence analysis -algorithms. Often, it is only the 'novel' sequences in a complex dataset that -are of interest and the superfluous sequences need to be removed. A novel -algorithm, FACS (Fast and Accurate Classification of Sequences), is introduced -that can accurately and rapidly align sequences to a reference sequence. FACS -was first optimized and validated using a synthetic metagenome dataset. An -experimental metagenome dataset was then used to show that FACS is at least -three times faster and more accurate than BLAT and SSAHA2 in classifying -sequences when using references larger than 50Mbp. - -Citation - -Reference: -Henrik Stranneheim, Max Käller, Tobias Allander, Björn Andersson, -Lars Arvestad, Joakim Lundeberg. Classification of DNA sequences using Bloom -filters. Bioinformatics, 2010 July 1; 26(13):1595-1600. - -Published online 2010 May 13. http://dx.doi.org/doi:10.1093/bioinformatics/btq230 - -Contact - -Henrik Stranneheim -Lars Arvestad diff --git a/README.md b/README.md new file mode 100644 index 0000000..6526cf8 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +FACS (Fast and Accurate Classification of Sequences) C implementation +====================================================================== + +[![Build Status](https://travis-ci.org/tzcoolman/DRASS.png?branch=master)](DRASS) + +WARNING: This program is under active development and this documentation might not reflect reality. +Please file a GitHub issue and we will take care of it as soon as we can. + +Overview +-------- + +* 'build' is for building a bloom filter from a reference file. +It supports large genome files (>4GB), human genome, for instance. +* 'query' is for querying a fastq/fasta file against the bloom filter. +* 'remove' is for removing contamination sequences from a fastq/fasta file. + + +Quickstart +---------- + +In order to fetch the source code, compile and run tests: + +``` +$ git clone https://github.com/tzcoolman/DRASS.git && cd drass && make -j8 && make tests +``` + +Please note that python's virtualenv is needed to run the tests. + +Usage +------ + +Facs uses a similar commandline structure to the one found in the popular bwa. +There are three main commands: build, query and remove. Each of them might have slightly different flags but should +behave similarly. + +``` +$ ./facs -h + +Program: facs (Sequence decontamination using bloom filters) +Version: 0.1 +Contact: Enze Liu + +Usage: facs [options] + +Command: build build a bloom filter from a FASTA reference file + query query a bloom filter given a FASTQ/FASTA file + remove remove (contamination) sequences from FASTQ/FASTA file +``` + +For example, to build a bloom filter out of a FASTA reference genome, one should type: + +``` +$ ./facs build -r ecoli.fasta -o ecoli.bloom +``` + +That would generate a ecoli bloom filter that could be used to query a FASTQ file: + +``` +$ ./facs query -b ecoli.bloom -r contaminated_sample.fastq.gz +``` + +Note that both plaintext fastq files and gzip-compressed files are supported transparently +to the user. + +Which would return some metrics indicating how many reads might be contaminated with +ecoli in that particular sample: + +``` +{ + "total_read_count": 201, + "contaminated_reads": 1, + "total_hits": 90, + "contamination_rate": 0.004975, + "bloom_filename":"tests/data/bloom/U00096.2.bloom" +} +``` + +Finally, if one wants to remove those reads from the sample, one should run the following +command: + +``` +$ ./facs remove -b ecoli.bloom -r contaminated_sample.fastq.gz -o discarded_reads.fastq +``` + +Where "discarded_reads.fastq" is the reads that have been filtered out from the original +fastq file. + + +Python interface +---------------- + +A python C-Extension provides a very simple API to build, query and remove sequences, +just as described above with the plain C-based commandline. + +``` +$ python +Python 2.6.6 (r266:84292, Jun 18 2012, 09:57:52) +[GCC 4.4.6 20110731 (Red Hat 4.4.6-3)] on linux2 +Type "help", "copyright", "credits" or "license" for more information. +>>> import facs +>>> facs.build("ecoli.fasta", "ecoli.bloom") +>>> facs.query("contaminated_sample.fastq.gz", "ecoli.bloom") +>>> facs.remove("contaminated_sample.fastq.gz", "ecoli.bloom") +``` + + +Notes +----- + +* All three scripts can be executed on both Linux and Mac system. But they don't support large bloom filter building and loading on MAC system. +* FACS supports fasta and fastq formats. Make sure you use the correct extension name: .fna or .fasta and .fastq, respectively. diff --git a/bin/bloombuild.pl b/bin/bloombuild.pl deleted file mode 100755 index 5dcd9f1..0000000 --- a/bin/bloombuild.pl +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/perl -w - -# Builds Bloom filters with a given K-mer length, false positive rate -# and reference in a FASTA format - -=head1 NAME - - bloombuild - build Bloom filters from reference genomes in FASTA files. - -=head1 SYNOPSIS - - bloombuild [options] -r reference.fasta - -=head1 DESCRIPTION - - Build Bloom filters from reference genomes in fasta files. These - can later be queried using FACS. - - -r/--reference A file containing a list of filenames for reference genome data. - Each line contains a filename. - Each file is a Fasta file. (defaults to STDIN) - - NOTE: Presently the Bloom::FASTER module has problems - with garbage collection which only enables one reference to built each time - the program loads. Supplying references in batch might cause a high false - positive rate. - -=head1 OPTIONS - --os/--outfile Output suffix for the created Bloom filter (defaults to .obj) - --k/--kmer Word length for K-mers inserted into the filter (defaults to '21') - --f/--falseposprob The false positive probaility that you accept (defaults to '0.0005') - --dbpr/--databsepath The unixpath to the references (defaults to STDIN) - --dbpo/--databsepath The unixpath to the output directory (defaults to STDIN) - -=head1 I/O -Input format (FASTA/Pearson) - -Output format (Bloom filter) - -=head1 SEE ALSO - -FACS - -=cut - -use strict; -use Bloom::Faster; -use Getopt::Long; - -use vars qw($USAGE); - -BEGIN { - $USAGE = -qq{bloombuild.pl < file.fa > file.obj --k/--kmer Word length for K-mers inserted into the filter (defaults to '21') --f/--falseposprob The false positive probaility that you accept (defaults to '0.0005') --r/--reference A file containing a list of filenames for reference genome data. Each line contains -a filename. Each file is a Fasta file. (defaults to STDIN) - -NOTE: Presently the Bloom::FASTER module has problems -with garbage collection which only enables one reference to built each time the program loads. -Supplying references in batch might cause a high false positive rate. - --os/--outfilesuffix Output suffix for the created Bloom filter (defaults to .obj) --dbpr/--databsepath The unixpath to the references (defaults to STDIN) --dbpo/--databsepath The unixpath to the output directory (defaults to STDIN) -}; - -} -my ($rformat, $oformat, $outfilesuffix, $kmerlength, $falseposratebloom, $dbpr, $dbpo, -$help) = ('fasta','bloom', ".obj", 21, 0.0005,".", "."); -my ($ref); - -GetOptions('k|kmerlength:i' => \$kmerlength, -'f|falseposratebloom:s' => \$falseposratebloom, -'r|reference:s' => \$ref, -'os|outfile:s' => \$outfilesuffix, -'dbpr|databaseref:s' => \$dbpr, -'dbpo|databaseout:s' => \$dbpo, -'h|help' => \$help, -); - -die $USAGE if( $help ); - -#Inputs references to create bloom filters - -if ($ref) { - - } -else { - -($ref) = @ARGV; - -if (@ARGV != 1) { - my $verbosity = 0; - if (@ARGV == 0) { - $verbosity = 2; - } - print"\n"; - pod2usage({-message => "Must supply reference.\n", - -verbose => $verbosity - }); - } -} -#Reads reference list -ReferenceList($ref); - -my $id; -my $seq; -my %seqs; -my $headercount; - -my $refid; -my @refs; -my $refcount=0; - -my $keycount; -my $filtercounter=0; - -for (my $refid=0;$refidnew({n => $keycount, e => $falseposratebloom}); - -} - - -sub Savebloom { - - $$_[1]->to_file("$dbpo/$_[0]"); - -} - - -sub Blank { - - %seqs=(); - undef($$_[1]); -} - - -sub Targetkeys { - - my ($seq) = $_[0]; - - my $lengthseq = length($seq)-($kmerlength-1); #Stops sliding window at the end - - for (my $i=0;$i<$lengthseq;$i++) { - - $$_[1]->add (substr ($seq, $i, $kmerlength) ); - - } - - print STDERR "Added Reference Keys to Filter", "\n"; - return; - -} - - -sub Refseq { - -open(FASTA, "<$dbpr/$_[0]") or die "Can't open $_[0]:$!, \n"; - while() { - - if (/>/) { - - $seq =""; - chomp($id = $'); #' - $headercount++; - } - - else { - - chomp($seq .= $_); - } - - } - - $seqs{$id}= $seq; - print STDERR "Finished Reading Reference Sequence","\n"; - close(FASTA); - #Determines bitvector length & creates vector - Bloomfilter(length($seqs{$id}), $$_[1] ); - return; - -} - -sub ReferenceList { - -open(BLOOM, "<$_[0]") or die "Can't open $_[0]:$!, \n"; - - while() { - - if (/\S+/) { - - chomp; - $refcount++; - push @refs, $_; - } - - - } - - close(BLOOM); - if ($refcount == 0) { - print STDERR "Did not find any files in reference file '$ref'\n"; - } - - print STDERR "Finished Reading Bloom List","\n"; - return; -} - - - - - - diff --git a/bin/facs.pl b/bin/facs.pl deleted file mode 100755 index aa1b1c8..0000000 --- a/bin/facs.pl +++ /dev/null @@ -1,516 +0,0 @@ -#!/usr/bin/perl -w - -# Loads query in FASTA format, K-mer length and Bloom Filters. Interrogates -# queries against filters and classifies queries onto genomes. -#The algorithm loops trough all queries for one filter at a time. -# Copyright 2011 Henrik Stranneheim - -=pod - -=head1 NAME - -facs - Filter reads of DNA - -=head1 SYNOPSIS - -facs k bloomfilterlist queryfile outprefix - -=head1 DESCRIPTION - -Build Bloom filters from reference genomes in fasta files using bloombuild.pl. These -can later be queried using FACS. - -This version of facs.pl is based on facs_2.pl, using the more accurate scoring -system. - -facs interrogates queries against filters and classifies queries -onto genomes. The algorithm loops trough all queries for one filter -at a time. - -Results are written to two files - -1. Reads matching a reference - -2. Reads not matching any reference - -=head1 Arguments - --b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN) - --op/--outfileprefix Output prefix for the output files (defaults to "") - --osma/--outfilesuffix Output suffix for the matches output files - --osmi/--outfilesuffix Output suffix for the mismatches output files - --k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21') - --f/--falseposprob The false positive probaility that you accept (defaults to '0.0005') - --dbpr/--databsepath The unix path to the bloom filters (defaults to STDIN) - --o/--outdirectory The unix path to the output directory (defaults to STDIN) - --mc/--matchcutoff The percent identity to classify a match (defaults to 80) - --lc/--lengthcutoff The minimum required read length (defaults to 60) - --qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1) - -Note: You have to use the same false positive rate and K-mer length as was used when the bloom filters were created. - -=head1 I/O - -Input format (FASTA/Pearson) - -Output format (FASTA/Pearson) - -=head1 SEE ALSO - -bloombuild - -=cut - -use strict; -use Bloom::Faster; -use Pod::Usage; -use Pod::Text; -use Getopt::Long; - -use vars qw($USAGE); - -BEGIN { - $USAGE = -qq{facs.pl < queryfile.fa -b bloomfilter.obj > file_suffix.fasta --k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21') --f/--falseposprob The false positive probaility that you accept (defaults to '0.0005') --b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN) --op/--outfileprefix Output prefix for the output files (defaults to "") --osma/--outfilesuffix Output suffix for the matching output files (defaults to "_matching.fasta") --osmi/--outfilesuffix Output suffix for the mismatching output files (defaults to "_mismatching.fasta") --dbpr/--databsepath The unixpath to the bloom filters (defaults to STDIN) --o/--outputdirectory The unixpath to the output directory (defaults to STDIN) --mc/--matchcutoff The percent identity to classify a match (defaults to "0.8"~80%) --lc/--lengthcutoff The minimum required read length (defaults to 60) --qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1) -}; - -} -my ($rformat, $oformat, $outfileprefix, $matchoutfilesuffix, $mismatchoutfilesuffix, $kmerlength, -$falseposratebloom, $dbpr, $od, $matchcutoff, $lengthcutoff, $nrqp, -$help) = ('fasta','fasta', "", "_matching.fasta", "_mismatching.fasta", 21, 0.0005,".", ".", 0.8, 60, 1); -my ($queryfile, $bloomfilterlist); - -GetOptions('k|kmerlength:i' => \$kmerlength, -'f|falseposratebloom:s' => \$falseposratebloom, -'b|bloomfilter:s' => \$bloomfilterlist, -'op|outfileprefix:s' => \$outfileprefix, -'osma|matchoutfilesuffix:s' => \$matchoutfilesuffix, -'osmi|mismatchoutfilesuffix:s' => \$mismatchoutfilesuffix, -'dbpr|databaseref:s' => \$dbpr, -'o|outdirectory:s' => \$od, -'mc|matchcutoff:s' => \$matchcutoff, -'lc|lengthcutoff:i' => \$lengthcutoff, -'qp|quickpasses:i' => \$nrqp, -'h|help' => \$help, -); - -die $USAGE if( $help ); - -if ($queryfile) { - - } -else { - -($queryfile) = @ARGV; - -if (@ARGV != 1) { - my $verbosity = 0; - if (@ARGV == 0) { - $verbosity = 2; - } - print"\n"; - pod2usage({-message => "Must supply a query file and list of bloom filter(s).\n", - -verbose => $verbosity - }); - } -} - -my $refid; -my $refcount; -my @refs; - -my $queryid; -my %querys; -my $queryseq; - -my $filter; -my $kmermatchcountqp=0; -my $kmermatchcount=0; -my $kmernomatchcount=0; -my $bloomnomatchcount=0; -my $bloommatchcount=0; -my %allshort; -my @matches; -my @matchesid; -my @matchscore; -my $norevcheck=0; -my $n_shorts = 0; -my $trackmatch=0; -my $trackposition=0; - - -my $queryheadercount=0; - -Querysseqs($queryfile); #reads query sequences and stores it in %querys - -Bloomfilterlist($bloomfilterlist); #reads reference genome list and stores it in @refs - - -for (my $refid=0;$refid$lengthcutoff ) { #Minimum length cut-off - - #Divides querie into K-mers, checks filter, calculates match score and classifies sequences - Sortquerykeys($querys{$queryid}, $kmerlength,$queryid); - - if ($norevcheck eq 0) { - #Translates and reverses queries - $querys{$queryid} =~tr/ATGC/TACG/; - $querys{$queryid} = reverse $querys{$queryid}; - Sortquerykeys($querys{$queryid}, $kmerlength,$queryid); - } - $norevcheck=0; - - } - else { - $allshort{$queryid}= $querys{$queryid}; #Saves all queries that did not surpass length cut-off - delete $querys{$queryid}; #Deletes queries that did not surpass length cut-off from further querying - $n_shorts++; - } - - } - - print STDERR "Finished with Classification of Query Keys","\n"; - - #Writes matches to file - Write_matches($outfileprefix . "_" . $refs[$refid]); - Print(); - #Resets parameters - Blank(); - if ( $refid>=(scalar(@refs)-1) ) { # prints all Mismatches to file - - for my $nomatchid (keys %querys) { #Collects all unclassified sequences - - $allshort{$nomatchid} = $querys{$nomatchid}; - - } - - Write_mismatches($outfileprefix . "_" . $refs[$refid]); - - } -} - - -#Prints matches and match score -for (my $matchid=0;$matchid $matchcutoff) { #For matches - - push (@matches, $_[2]); #Saves match headers - $norevcheck=1; - $bloommatchcount++; - $kmermatchcount=0; - $kmernomatchcount=0; - $trackmatch = 0; - $trackposition = 0; - return; - } - $kmermatchcount=0; - $kmernomatchcount=0; - $trackmatch = 0; - $trackposition = 0; - return; - } - } - } - -} - -sub CheckfilterQP { - -# $_[0] = Position in query -# $_[1] = my $query - - if ($filter-> check( $_[1] ) ) { - - ++$kmermatchcountqp; #Adds to match score - return; - } - - else { - - $_[0] = $_[0] + $kmerlength-1; #Increments position in query - return; - } - -} - -sub Checkfilter { - -# $_[0] = Position in query -# $_[1] = my $query - - if ($trackposition >= $kmerlength ) { - - $trackmatch = 0; - $trackposition = 0; - } - - if ($filter-> check( $_[1] ) ) { - - if ($trackmatch eq 0) { - - $kmermatchcount = $kmermatchcount + $kmerlength-1; #Adds to match score - $trackmatch = 1; - $trackposition++; - } - else { - $kmermatchcount++; - $trackposition++; - } - return; - } - - else { - - $trackposition++; - - return; - } - -} - - -sub Querysseqs { - - open(QUERY, "<$_[0]") or die "Can't open $_[0]:$!, \n"; - - while () { - chomp $_; - - if (m/^\s+$/) { # Avoid blank lines - next; - } - - if (/>(\S+)/) { - - $queryseq =""; - $queryid = $1; - $queryheadercount++; - } - else { - $queryseq .= $_; - } - - $querys{$queryid}=$queryseq; - } - - $queryheadercount = scalar(keys(%querys)); - close(QUERY); - print STDERR "Finished Reading Query Sequences","\n"; - return; -} - -sub Bloomfilterlist { - my $filename = shift; - - open(REF, "<$filename") or die "Can't open $filename:$!, \n"; - - $refcount = 0; - while() { - - - if (/\S+/) { - - chomp; - $refcount++; - push @refs, $_; - } - - } - - close(REF); - print STDERR "Finished Reading Ref Filter List","\n"; - return; -} - - -sub Write_matches { - - my $filename = shift; - $filename .= $matchoutfilesuffix; - - open (GENOME, ">$od/$filename") or die "Can't write to $od/$filename: $!\n"; - - my $assemblysseq; - - while (@matches) { - - my $matchid = pop @matches; - $assemblysseq .= $querys{$matchid}; - - print GENOME '>', $matchid,"\n"; - - for (my $i=0;$i<(length($assemblysseq)/60);$i++) { - - print GENOME substr($assemblysseq,$i*60,60),"\n"; - - } - $assemblysseq=""; - delete $querys{$matchid}; - - } - close (GENOME); - return; -} - -sub Write_mismatches { - my $filename = shift; - $filename .= $mismatchoutfilesuffix; - - open (GENOME, ">$od/$filename") or die "Can't write to $od/$filename: $!\n"; - - my $assemblysseq; - - foreach my $id (keys %allshort) { - - $assemblysseq .= $allshort{$id}; - - print GENOME '>', $id,"\n"; - - for (my $i=0;$i<(length($assemblysseq)/60);$i++) { - - print GENOME substr($assemblysseq,$i*60,60),"\n"; - - } - - $assemblysseq=""; - - } - close (GENOME); - return; -} - diff --git a/bin/facs_batch.pl b/bin/facs_batch.pl deleted file mode 100755 index 50788a4..0000000 --- a/bin/facs_batch.pl +++ /dev/null @@ -1,518 +0,0 @@ -#!/usr/bin/perl -w - -# Loads query in FASTA format, K-mer length and Bloom Filters. Interrogates -# queries against filters and classifies queries onto genomes. -#The algorithm loops trough all queries for one filter at a time. -# Copyright 2011 Henrik Stranneheim - -=pod - -=head1 NAME - -facs - Filter reads of DNA - -=head1 SYNOPSIS - -facs queryfile -b bloomfilterlist - -=head1 DESCRIPTION - -Build Bloom filters from reference genomes in fasta files using bloombuild.pl. These -can later be queried using FACS. - -This version of facs.pl is based on facs_2.pl, using the more accurate scoring -system. - -facs interrogates queries against filters and classifies queries -onto genomes. The algorithm loops trough all queries for one filter -at a time. - -Results are written to two files - -1. Reads matching a reference - -2. Reads not matching any reference - -=head3 COMMANDS AND OPTIONS - --b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN) - --op/--outfileprefix Output prefix for the output files (defaults to "") - --osma/--outfilesuffix Output suffix for the matches output files - --osmi/--outfilesuffix Output suffix for the mismatches output files - --k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21') - --f/--falseposprob The false positive probaility that you accept (defaults to '0.0005') - --dbpr/--databsepath The unix path to the bloom filters (defaults to STDIN) - --o/--outdirectory The unix path to the output directory (defaults to STDIN) - --mc/--matchcutoff The percent identity to classify a match (defaults to 80) - --lc/--lengthcutoff The minimum required read length (defaults to 60) - --qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1) - --bs/--batchsize Number of reads that are analysed in batch (defaults to 5000) - -Note: You have to use the same false positive rate and K-mer length as was used when the bloom filters were created. - -=head1 I/O - -Input format (FASTA/Pearson) - -Output format (FASTA/Pearson) - -=head1 SEE ALSO - -bloombuild - -=cut - -use strict; -use Bloom::Faster; -use Pod::Usage; -use Pod::Text; -use Getopt::Long; - -use vars qw($USAGE); - -BEGIN { - $USAGE = -qq{facs.pl < queryfile.fa -b bloomfilter.obj > file_suffix.fasta --k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21') --f/--falseposprob The false positive probaility that you accept (defaults to '0.0005') --b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN) --op/--outfileprefix Output prefix for the output files (defaults to "") --osma/--outfilesuffix Output suffix for the matching output files (defaults to "_matching.fasta") --osmi/--outfilesuffix Output suffix for the mismatching output files (defaults to "_mismatching.fasta") --dbpr/--databsepath The unixpath to the bloom filters (defaults to STDIN) --o/--outputdirectory The unixpath to the output directory (defaults to STDIN) --mc/--matchcutoff The percent identity to classify a match (defaults to "0.8"~80%) --lc/--lengthcutoff The minimum required read length (defaults to 60) --qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1) --bs/--batchsize Number of reads that are analysed in batch (defaults to 5000) -}; - -} -my ($rformat, $oformat, $outfileprefix, $matchoutfilesuffix, $mismatchoutfilesuffix, $kmerlength, -$falseposratebloom, $dbpr, $od, $matchcutoff, $lengthcutoff, $nrqp, $bs, -$help) = ('fasta','fasta', "", "_matching.fasta", "_mismatching.fasta", 21, 0.0005,".", ".", 0.8, 60, 1, 5000); -my ($queryfile, $bloomfilterlist); - -GetOptions('k|kmerlength:i' => \$kmerlength, -'f|falseposratebloom:s' => \$falseposratebloom, -'b|bloomfilter:s' => \$bloomfilterlist, -'op|outfileprefix:s' => \$outfileprefix, -'osma|matchoutfilesuffix:s' => \$matchoutfilesuffix, -'osmi|mismatchoutfilesuffix:s' => \$mismatchoutfilesuffix, -'dbpr|databaseref:s' => \$dbpr, -'o|outdirectory:s' => \$od, -'mc|matchcutoff:s' => \$matchcutoff, -'lc|lengthcutoff:i' => \$lengthcutoff, -'qp|quickpasses:i' => \$nrqp, -'bs|batchsize:i' => \$bs, -'h|help' => \$help, -); - -die $USAGE if( $help ); - -if ($queryfile) { - - } -else { - -($queryfile) = @ARGV; - -if (@ARGV != 1) { - my $verbosity = 0; - if (@ARGV == 0) { - $verbosity = 2; - } - print"\n"; - pod2usage({-message => "Must supply a query file and list of bloom filter(s).\n", - -verbose => $verbosity - }); - } -} - -my $refid; -my $refcount; -my @refs; -my %refs; - -my $queryid; -my %querys; -my $queryseq; - -my $filter; -my $kmermatchcountqp=0; -my $kmermatchcount=0; -my $kmernomatchcount=0; -my $bloomnomatchcount=0; -my $bloommatchcount=0; -my %allshort; -my @matches; -my $norevcheck=0; -my $n_shorts = 0; -my $trackmatch=0; -my $trackposition=0; - -my $queryheadercount=0; - -Bloomfilterlist($bloomfilterlist); #reads reference genome list and stores it in @refs - -Querysseqs($queryfile); #reads query sequences and stores it in %querys - -sub Classify { - -for (my $refid=0;$refid$lengthcutoff ) { #Minimum length cut-off - - #Divides querie into K-mers, checks filter, calculates match score and classifies sequences - Sortquerykeys($querys{$queryid}, $kmerlength,$queryid, $refs[$refid]); - - if ($norevcheck eq 0) { - #Translates and reverses queries - $querys{$queryid} =~tr/ATGC/TACG/; - $querys{$queryid} = reverse $querys{$queryid}; - Sortquerykeys($querys{$queryid}, $kmerlength,$queryid, $refs[$refid]); - } - $norevcheck=0; - - } - else { - $allshort{$queryid}= \$querys{$queryid}; #Saves all queries that did not surpass length cut-off - $n_shorts++; - } - - } - - - #Writes matches to file - Write_matches($outfileprefix . "_" . $refs[$refid]); - #Resets parameters - Blank(); - if ( $refid>=(scalar(@refs)-1) ) { - - for my $nomatchid (keys %querys) { #Collects all unclassified sequences - - $allshort{$nomatchid} = \$querys{$nomatchid}; - - } - - Write_mismatches($outfileprefix . "_"); # prints all Mismatches to file - - } - } - return; -} - -Print(); - -for $refid (keys %refs) { #Prints matches and match score - -print $refid, "\t"; - - if ($refs{$refid}) { - - print $refs{$refid},"\n"; - } - else { - print "\n"; - } - -} - -undef($filter); - -########## -#End of main program -########## - -########## -###Subroutines: -# my $refid - Capture filename in reference list -# my @refs - Stores filename in reference list -# my $refcount - Counts the number of filenames in reference file -########## -# my $queryid - Capture each header in queryfile -# my %querys - Stores header and corresponding sequences in queryfile -# my $queryseq - Capture each sequences in queryfile -# my $queryheadercount=0 - Counts number of queries -########## -# my $filter - Bloom filter object -# my $kmermatchcountqp=0 - Number of matching K-mers per query in Quick pass -# my $kmermatchcount=0 - Number of matching nucleotides (K-mers*K-mer length) per queries, match score -# my $kmernomatchcount=0 - Number of mismatching K-mers per queries -# my $bloomnomatchcount=0 - Number of classifed queries -# my $bloommatchcount=0 - Number of John Does -# my %allshort - Stores all queries below length cut-off and at the end collects all John Does for printing -# my @matches - Stores query id for classification -# my %refs - Prints all reference headers and number of classified queries -# my $loopcount=0 - Loop for classifiying both forward and complementary reverse -# my $n_shorts = 0 - Counts all short sequences -# my $trackmatch=0 - Tracks when to give a K-mer length score -# my $trackposition=0 - Tracks position within length of K-mer within last match -########## - - -sub LoadBloom { - - my $filtername = shift; - $filter = new Bloom::Faster("$dbpr/$filtername"); - -} - - -sub Print { - print STDERR "Finished with Classification of Query Keys","\n"; - print STDERR "Number of sequences in original query file:","\t", $queryheadercount,"\n"; - print STDERR "Number of short queries: \t", $n_shorts, "\n"; - - -} - -sub Blank { - - $bloomnomatchcount =0; - $bloommatchcount=0; - $kmermatchcountqp=0; - $kmermatchcount=0; - $kmernomatchcount=0; - @matches = (); - #@matchscore = (); -} - - -sub Sortquerykeys { - -#$_[0] = $queryseq -#$_[1] = $kmerlength -#$_[2] = $queryid -#$_[3] = $refid - - my $lengthseq = length($_[0]) -($_[1]-1); #Stops sliding window at the end - - for (my $i=0;$i<$lengthseq;$i++) { - - my $query = substr ($_[0], $i, $_[1]); - - #Quick pass of query - CheckfilterQP($i, $query); - - if ($kmermatchcountqp) { - - if ($kmermatchcountqp eq $nrqp) { #For queryies passing quickpass - - $kmermatchcountqp=0; - $i=$lengthseq; - - for (my $k=0;$k<$lengthseq;$k++) { - - my $query = substr ($_[0], $k, $_[1]); - - #Queries the Bloom filter - Checkfilter($k, $query); - - } - - if ( ( $kmermatchcount/length($_[0]) ) > $matchcutoff) { #For matches - - push @matches, \$_[2]; #Saves match headers - $refs{$_[3]}++; - $norevcheck=1; - $bloommatchcount++; - $kmermatchcount=0; - $kmernomatchcount=0; - $trackmatch = 0; - $trackposition = 0; - return; - } - $kmermatchcount=0; - $kmernomatchcount=0; - $trackmatch = 0; - $trackposition = 0; - return; - } - } - } - -} - -sub CheckfilterQP { - -# $_[0] = Position in query -# $_[1] = my $query - - if ($filter-> check( $_[1] ) ) { - - ++$kmermatchcountqp; #Adds to match score - return; - } - - else { - - $_[0] = $_[0] + $kmerlength-1; #Increments position in query - return; - } - -} - -sub Checkfilter { - -# $_[0] = Position in query -# $_[1] = my $query - - if ($trackposition >= $kmerlength ) { - - $trackmatch = 0; - $trackposition = 0; - } - - if ($filter-> check( $_[1] ) ) { - - if ($trackmatch eq 0) { - - $kmermatchcount = $kmermatchcount + $kmerlength-1; #Adds to match score - $trackmatch = 1; - $trackposition++; - } - else { - $kmermatchcount++; - $trackposition++; - } - return; - } - - else { - - $trackposition++; - - return; - } - -} - - -sub Querysseqs { - - open(QUERY, "<$_[0]") or die "Can't open $_[0]:$!, \n"; - - while () { - chomp $_; - - if (m/^\s+$/) { # Avoid blank lines - next; - } - - if (/>(\S+)/) { - - if (scalar(keys(%querys)) eq $bs) { #Batch size - - print $queryheadercount=$queryheadercount + scalar(keys(%querys)), "\n"; - Classify(); - %querys = (); - - } - $queryseq =""; - $queryid = $1; - - } - else { - $queryseq .= $_; - } - - $querys{$queryid}=$queryseq; - } - - $queryheadercount=$queryheadercount + scalar(keys(%querys)); - Classify(); #Catch the remainaing reads - close(QUERY); - print STDERR "Finished Reading Query Sequences","\n"; - return; -} - -sub Bloomfilterlist { - my $filename = shift; - - open(REF, "<$filename") or die "Can't open $filename:$!, \n"; - - $refcount = 0; - while() { - - - if (/\S+/) { - - chomp; - $refcount++; - push @refs, $_; - } - - } - - close(REF); - print STDERR "Finished Reading Ref Filter List","\n"; - return; -} - - -sub Write_matches { - - my $filename = shift; - $filename .= $matchoutfilesuffix; - - open (GENOME, ">>$od/$filename") or die "Can't write to $od/$filename: $!\n"; - - while (@matches) { - - my $matchid = pop @matches; - - print GENOME '>', $$matchid,"\n"; - - for (my $i=0;$i<(length($querys{$$matchid})/60);$i++) { - - print GENOME substr($querys{$$matchid},$i*60,60),"\n"; - } - - delete $querys{$$matchid}; - - } - - close (GENOME); - return; -} - -sub Write_mismatches { - my $filename = shift; - $filename .= $mismatchoutfilesuffix; - - open (GENOME, ">>$od/$filename") or die "Can't write to $od/$filename: $!\n"; - - foreach my $id (keys %allshort) { - - print GENOME '>', $id,"\n"; - - for (my $i=0;$i<(length(${$allshort{$id}})/60);$i++) { - - print GENOME substr(${$allshort{$id}},$i*60,60),"\n"; - - } - - delete $allshort{$id}; - } - close (GENOME); - return; -} - diff --git a/doc/GC_.py b/doc/GC_.py new file mode 100644 index 0000000..57c9fef --- /dev/null +++ b/doc/GC_.py @@ -0,0 +1,48 @@ +from Bio import SeqIO +from Bio.Seq import Seq, reverse_complement, transcribe, back_transcribe, translate +import re +import time + +A_t = 0; +G_t = 0; +C_t = 0; +T_t = 0; +All = 0; +All_r = 0; + +''' +x1 = open("test.fna") +for seq in SeqIO.parse(x1,'fasta'): + All_r+=1; + for word in seq: + All+=1; + if word=='A': + A_t+=1; + elif word=='C': + C_t+=1; + elif word=='G': + G_t+=1; + elif word=='T': + T_t+=1; + +print 'A_t->',A_t,'G_t->',G_t,'C_t->',C_t,'T_t->',T_t; +print 'ALL->',All; +print 'average length->', All//All_r; +''' +signal = 0; +New_string = ''; +x1 = open("10xtest_GC44.fasta") +for seq in SeqIO.parse(x1,'fasta'): + if signal == 0: + New_string+='>' + New_string+=seq.id; + New_string+='\n' + New_string+=seq.seq.tostring(); + New_string+='\n'; + signal = 1; + elif signal == 1: + signal = 0; + +x2 = open("single.fasta",'w') +x2.write(New_string) +x2.close() diff --git a/doc/ROC.m b/doc/ROC.m new file mode 100644 index 0000000..0535a6c --- /dev/null +++ b/doc/ROC.m @@ -0,0 +1,169 @@ +%%% OLD +%{ +all = [93645 93641 93629 93611 93448 93224 85442 64825 33489 +88850 85935 75140 52472 26807 10760 4177 2180 1830 +37009 16940 6085 2700 1934 1818 1803 1791 1786 +5116 2254 1874 1819 1805 1798 1790 1786 1767 +2068 1857 1822 1810 1803 1793 1789 1778 1748 +1909 1829 1813 1805 1798 1791 1783 1763 1733 +] + +tp = [1794 1794 1794 1794 1794 1794 1794 1794 1794 +1794 1794 1794 1794 1794 1788 1783 1778 1778 +1786 1780 1778 1778 1778 1778 1778 1776 1776 +1779 1778 1778 1778 1777 1777 1776 1774 1763 +1778 1778 1778 1778 1777 1777 1776 1773 1745 +1778 1778 1778 1778 1777 1776 1771 1758 1731 +] +%} + +%%% NEW +%{ +all = [76239 18890 3488 1844 1755 1450 544 22 0 +3803 1810 1792 1747 1521 830 181 8 0 +2020 1793 1768 1765 1237 555 96 4 0 +1901 1789 1750 1575 1055 413 66 1 0 +1850 1776 1726 1475 894 325 49 1 0 +1800 1772 1680 1368 763 259 39 1 0 +] + +tp = [1794 1789 1778 1778 1748 1446 544 22 0 +1789 1778 1776 1741 1519 829 181 8 0 +1778 1776 1764 1764 1236 554 96 4 0 +1776 1773 1747 1574 1054 413 66 1 0 +1771 1766 1723 1474 893 325 49 1 0 +1763 1763 1678 1367 762 259 39 1 0 +] +%} + +%%% CHB +%{ +all = [91326 62121 19346 4677 1926 1783 1581 357 0 +6351 2003 1813 1799 1777 1693 1037 108 0 +1845 1813 1799 1786 1750 1535 756 55 0 +1828 1806 1795 1775 1715 1391 581 42 0 +1819 1806 1790 1764 1658 1241 484 31 0 +1812 1804 1783 1749 1607 1112 389 21 0 +] + +tp = [1794 1794 1787 1778 1778 1774 1577 357 0 +1778 1778 1778 1778 1769 1691 1036 108 0 +1778 1778 1778 1773 1747 1534 755 55 0 +1778 1778 1777 1766 1713 1390 580 42 0 +1778 1778 1774 1758 1656 1240 483 31 0 +1778 1778 1770 1744 1605 1111 388 21 0 +] +%} +sample = 1794 +%%%%---------------------------------------------------------------- +%%% OLD + +all = [82122 79488 76829 73919 70699 67584 65067 63269 +72104 69232 66812 64945 63704 62011 62169 61265 +65793 64489 63690 63050 62545 62011 61310 59976 +64111 63532 63020 62570 62126 61537 60605 58694 +63632 63107 62683 62237 61781 61113 59840 57237 +63332 62871 62431 62007 61497 60642 58906 55662 +] + +tp = [55137 55137 55137 55136 55136 55133 55119 55045 +55137 55137 55137 55136 55130 55114 55005 54568 +55137 55136 55135 55133 55123 55041 54715 53724 +55137 55136 55134 55125 55089 54895 54297 52690 +55132 55131 55128 55108 55013 54680 53716 51389 +55133 55132 55124 55070 54920 54394 52969 50011 +] +%%%NEW +%{ +all= [63031 61442 56426 41210 17921 3350 98 0 +62015 59624 51000 32294 12226 2158 70 0 +61427 57620 46338 26309 9075 1559 56 0 +60771 55443 42006 22076 7199 1110 54 0 +59949 53054 37785 18688 5825 825 50 0 +58947 50482 34152 15981 4691 675 46 0 +] +tp = [55118 54623 50503 36586 15450 2754 83 0 +55001 53382 45495 28197 10117 1649 57 0 +54769 51680 41154 22636 7348 1147 43 0 +54374 49726 37128 18770 5744 819 41 0 +53784 47516 33212 15703 4558 603 37 0 +52983 45140 29880 13301 3609 487 34 0 +] +%} +%%%CHB +%{ +all = [65308 63416 62267 60141 50654 24228 2155 0 +63226 62430 61349 57411 43730 17341 1322 0 +62747 61979 60040 54627 38021 13519 870 0 +62424 61571 59317 51727 33488 10850 672 0 +62176 61141 58029 48752 29403 8825 557 0 +61909 60533 56481 45688 25842 7332 480 0 +] +tp = [55136 55130 55018 53720 45230 21061 1714 0 +55133 55103 54688 51451 38737 14643 999 0 +55129 55010 54070 48933 33411 11168 655 0 +55113 54875 53229 46254 29224 8812 493 0 +55084 54647 52129 43747 24549 7082 396 0 +55009 54246 50786 40617 22213 5813 333 0 +] +%} +sample = 55137 +fp = all-tp +fn = sample-tp +tn = 100000-sample-fp +tpr = tp./(tp+fn); +fpr = fp./(fp+tn); + +Q_f = fp./all +Q_f; +hold on +%for i=1:6, + +%tpr = tp(1,:)./(tp(1,:)+fn(1,:)); +%fpr = fp(1,:)./(fp(1,:)+tn(1,:)); +%plot(fpr(1,:),tpr(1,:),'--rs'); %12 +%plot(fpr(2,:),tpr(2,:),'--bs'); %13 +%plot(fpr(3,:),tpr(3,:),'--cs'); %14 +%plot(fpr(4,:),tpr(4,:),'-gs'); %15 +%plot(fpr(5,:),tpr(5,:),'-ms'); %16 +%plot(fpr(6,:),tpr(6,:),'-ks'); %17 +%end +hold off +%fp = fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp + +%{ +%vx = [0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1] +vx = [0.3 0.4 0.5 0.6 0.7 0.8 0.9 1] +hold on +plot(vx,Q_f(1,:),'--rs'); %12 +plot(vx,Q_f(2,:),'--bs'); %13 +plot(vx,Q_f(3,:),'--cs'); %14 +plot(vx,Q_f(4,:),'-gs'); %15 +plot(vx,Q_f(5,:),'-ms'); %16 +plot(vx,Q_f(6,:),'-ks'); %17 +hold off +%acc = (tp+tn)/100000 +%tpr +%fpr +%acc +%} +%plot(tpr,tpr,'black') + + +%{ +tp = [1794 1794 1792 1786 1779 1778 1778 1778 1775 +1781 1778 1778 1778 1776 1771 1774 1754 1721 +1778 1778 1778 1778 1776 1770 1758 1720 1617 +1778 1778 1778 1776 1772 1761 1732 1661 1531 +1778 1778 1778 1773 1765 1748 1702 1602 1420 +1778 1778 1777 1768 1758 1730 1649 1521 1308 +] + +all = [92129 74168 37245 14002 5541 2677 1923 1811 1787 +8113 2340 1847 1812 1800 1795 1784 1763 1725 +1856 1819 1807 1798 1792 1781 1762 1723 1618 +1837 1812 1801 1794 1783 1767 1735 1663 1532 +1824 1802 1800 1789 1775 1752 1704 1603 1421 +1818 1804 1797 1780 1767 1733 1651 1522 1309 +] +%} \ No newline at end of file diff --git a/doc/prop.m b/doc/prop.m new file mode 100644 index 0000000..b8837c0 --- /dev/null +++ b/doc/prop.m @@ -0,0 +1,41 @@ +hit =[1670777 +464386 +131302 +44158 +22436 +17064 +] + +hit1 = [10342350 +3092821 +842175 +241199 +88697 +50011 +] + +unhit = [47329223 +48335614 +48468698 +48355842 +48177564 +47982936 +] + +unhit1 = [39457650 +46507179 +48557825 +48958801 +48911303 +48749989 +] + +total = hit+unhit +total1 = hit1+unhit1 + +polp = hit./total +polp1 = hit1./total1 +hold on +plot(polp,'-rs'); +plot(polp1,'-ks'); +hold off \ No newline at end of file diff --git a/drass/Makefile b/drass/Makefile new file mode 100644 index 0000000..fb19009 --- /dev/null +++ b/drass/Makefile @@ -0,0 +1,44 @@ +CFLAGS=-O3 -DFIFO -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE -Wall -fopenmp -g -DNODEBUG -lm -lz +.PHONY: tests clean valgrind +.SUFFIXES:.c .o +PROG=facs + +LOBJS= big_query.o bloom.o file_dir.o good_build.o lookup8.o suggestions.o tool.o simple_check_1_ge.o simple_remove.o +AOBJS= big_query.o bloom.o file_dir.o good_build.o lookup8.o suggestions.o tool.o simple_check_1_ge.o simple_remove.o + +all:$(PROG) + +tests: python + nosetests -v -s ../tests + +valgrind: python + valgrind --tool=memcheck --suppressions=../tests/utils/valgrind-python.supp nosetests -v -s ../tests/test_basic.py + +mpi: + @echo Make sure you have MPI support on your cluster hint: module load openmpi + #mpicc -c *.c ${CFLAGS} + #mpicc -c mpi_decon.c -O3 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE + #mpicc -o mpi_decon mpi_decon.o bloom.o suggestions.o lookup8.o -lm ${CFLAGS} + #mpicc -o mpi_bloom mpi_bloom.o bloom.o suggestions.o lookup8.o file_dir.o -lm ${CFLAGS} + #mpicc -o mpi_bloom_l mpi_bloom_l.o bloom.o suggestions.o lookup8.o file_dir.o -lm ${CFLAGS} + #mpirun -np 1 ./mpi_bloom_l -l tzcoolman -q test.fna + +python: + rm -rf build/ ${PROG}.so && python setup.py build_ext --inplace && python setup.py develop + +clean: + rm -f core.* vgcore.* *.o *.so *.a *.info ${PROG} + + +.c.o: + $(CC) -c $(DFLAGS) $(INCLUDES) $< -o $@ $(CFLAGS) + +${PROG}:lib${PROG}.a $(AOBJS) + +${PROG}:lib${PROG}.a $(AOBJS) main.o + $(CC) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -l${PROG} $(LIBS) $(CFLAGS) + +lib${PROG}.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) + +main.o: big_query.h bloom.h build.h check.h file_dir.h hashes.h tool.h remove.h remove_l.h diff --git a/drass/big_query.c b/drass/big_query.c new file mode 100644 index 0000000..58f83d5 --- /dev/null +++ b/drass/big_query.c @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tool.h" +#include "bloom.h" +#include "check.h" +#include "file_dir.h" +#include "big_query.h" +/*-------------------------------------*/ +#include +/*-------------------------------------*/ +#define ONEG 1000000000 +#define ONE 100 +/*-------------------------------------*/ + + +static int +query_usage(void) +{ + fprintf(stderr, "\nUsage: ./facs query [options]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, "\t-r reference bloom filter to query against\n"); + return 1; +} + +int bq_main(int argc, char** argv) +{ + if (argc < 2) return query_usage(); + +/*-------defaults for bloom filter building-------*/ + int opt; + float tole_rate = 0; + float sampling_rate = 1; + + char* ref = NULL; + char* list = NULL; + char* target_path = NULL; + char* source = NULL; + + while ((opt = getopt (argc, argv, "s:t:r:o:q:l:h")) != -1) { + switch (opt) { + case 't': + (optarg) && ((tole_rate = atof(optarg)), 1); + break; + case 's': + (optarg) && ((sampling_rate = atof(optarg)), 1); + break; + case 'o': + (optarg) && ((target_path = optarg), 1); + break; + case 'q': + (optarg) && (source = optarg, 1); + break; + case 'r': + (optarg) && (ref = optarg, 1); + break; + case 'l': + (optarg) && (list = optarg, 1); + break; + case 'h': + return query_usage(); + case '?': + printf ("Unknown option: -%c\n", (char) optopt); + return query_usage(); + } + } + + return query(source, ref, tole_rate, sampling_rate, list, target_path); +} + +int query(char* query, char* bloom_filter, double tole_rate, double sampling_rate, char* list, char* target_path) +{ + + gzFile zip; + int type = 0; + BIGCAST offset = 0; + char *detail = (char*) malloc((ONE*ONE*ONE)*sizeof(char)); + char *position = (char*) malloc((ONEG+1)*sizeof(char)); + + bloom *bl_2 = NEW (bloom); + F_set *File_head = make_list (bloom_filter, list); + + File_head->reads_num = 0; + File_head->reads_contam = 0; + File_head->hits = 0; + File_head->filename = bloom_filter; + load_bloom (File_head->filename, bl_2); //load a bloom filter + if (tole_rate==0) + tole_rate = mco_suggestion (bl_2->k_mer); + + if ((zip = gzopen(query, "rb"))<0) { + perror("query open error...\n"); + exit(-1); + } + + if (strstr(query, ".fastq") || strstr(query, ".fq")) + type = 2; + else + type = 1; + + + while (offset!=-1) { + offset = CHUNKer(zip,offset,ONEG,position,type); + Queue *head = NEW (Queue); + head->location = NULL; + Queue *tail = NEW (Queue); + head->next = tail; + Queue *head2 = head; + get_parainfo (position, head); + +#pragma omp parallel +{ +#pragma omp single nowait + { + while (head != tail) + { +#pragma omp task firstprivate(head) + { + if (head->location!=NULL) { + if (type == 1) { + fasta_process (bl_2, head, tail, File_head, sampling_rate, + tole_rate); + } else { + fastq_process (bl_2, head, tail, File_head, sampling_rate, + tole_rate); + } + } + } + head = head->next; + } // End of firstprivate + } // End of single - no implied barrier (nowait) +} // End of parallel region - implied barrier + //evaluate (detail, File_head->filename, File_head); + + if (position != NULL) { + memset (position, 0, strlen(position)); + //free (position); + } + else { + perror("Cannot memset, wrong position on fastq file\n"); + exit(-1); + } + + clean_list (head2, tail); + + } //end while + free(position); + evaluate (detail, File_head->filename, File_head); + gzclose(zip); + bloom_destroy (bl_2); + statistic_save (detail, query, target_path); + + return 0; +} + +char *strrstr(char *s, char *str) +{ + char *p; + int len = strlen(s); + for (p = s + len - 1; p >= s; p--) { + if ((*p == *str) && (memcmp(p, str, strlen(str)) == 0)) + return p; + } + return NULL; +} + +void clean_list (Queue* head, Queue *tail) +{ +Queue *element; +while (head!=tail) + { + element = head->next; + memset(head,0,sizeof(Queue)); + free(head); + head = element; + } +free(tail); +} + + +BIGCAST CHUNKer(gzFile zip,BIGCAST offset,int chunk,char *data,int type) +{ + char c, v; + char *pos = NULL; + int length = 0; + + if (type == 2) + v = '@'; + else + v = '>'; + + if (offset == 0) + while (offset <10*ONE) + { + c = gzgetc(zip); + if (c == v) + break; + offset++; + } + + gzseek (zip,offset,SEEK_SET); + gzread (zip,data,chunk); + + if (data != NULL) + length = strlen(data); + + if (length>=chunk) { + if (type == 2) { + pos = strrstr (data,"\n+"); + pos = bac_2_n (pos-1); + } else { + pos = strrchr (data,'>')-1; + } + } + + if (pos) { + offset += (pos-data); + memset (pos, 0, strlen(pos)); + } + + if (length +extern char *bac_2_n (char *filename); +extern char *strrstr(char *s, char *str); +//extern BIGCAST get_size (char *filename); +extern void clean_list (Queue* head, Queue *tail); +extern BIGCAST CHUNKer(gzFile zip,BIGCAST offset,int chunk,char *data,int type); +extern BIGCAST CHUNKgz(gzFile zip, BIGCAST offset,int chunk,char *position,char *extra,int type); +extern int bq_main (int argc, char **argv); +extern int query (char* query, char* bloom_filter, double tole_rate, double sampling_rate, char* list, char* target_path); +#endif diff --git a/drass/bloom.c b/drass/bloom.c new file mode 100644 index 0000000..d132efd --- /dev/null +++ b/drass/bloom.c @@ -0,0 +1,639 @@ +#define _LARGEFILE_SOURCE +#define _LARGEFILE64_SOURCE +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include "bloom.h" +#include "hashes.h" +#include "file_dir.h" + +/*---------------------------*/ +#include +#include +#include +#include +#include +#include + +int seed[20] = + { 152501029, 152501717, 152503097, 152500171, 152500157, 152504837, + 10161313, 10371313, 10431313, 10501313, 10581313, 10611313, 10641313, + 10651313, + 10671313, 10731313, 10821313, 10881313, 10951313, 11001313 +}; + +int +bloom_init (bloom * bloom, BIGNUM size, BIGNUM capacity, double error_rate, + int hashes, hash_t hash, int flags) +{ + if (size < 1) + { + fprintf (stderr, "overflow1\n"); + return -1; + } + else + { + /* this may waste a little time, but we need to ensure + * that our array has a prime number of elements, even + * if we have been requested to do otherwise */ + bloom->stat.elements = find_close_prime (size); + } + + if (hashes < 1) + { +#ifdef DEBUG + fprintf (stderr, "hashes was %d,size %lld\n", hashes, size); +#endif + return -1; + + } + else + { + bloom->stat.ideal_hashes = hashes; + } + + if (hash == NULL) + { + bloom->hash = (hash_t) HASH_FUNC; + } + else + { + bloom->hash = hash; + } + + bloom->inserts = 0; + + /** + If error rate and capacity were not specified, but size and num hashes were, + we can calculate the missing elements. + **/ + if (capacity == 0 || error_rate == 0) + { + // From wikipedia, num hashes k that minimizes probability of error is k =~ (0.7 m) / n + // Therefore n =~ (0.7 m) / k + bloom->stat.capacity = 0.7 * bloom->stat.elements / hashes; + bloom->stat.e = powf (2.0, (float) -1 * hashes); + } + else + { + bloom->stat.capacity = capacity; + bloom->stat.e = error_rate; + } + +#ifdef DEBUG + fprintf (stderr, "bloom_init(%lld,%d) => (%lld,%d) =>%f\n", + (BIGCAST) size, hashes, (BIGCAST) bloom->stat.elements, + bloom->stat.ideal_hashes, bloom->stat.e); +#endif + + if ((size > TOPLIMIT)) + { + fprintf (stderr, "overflow2\n"); + return -2; + } + + /* allocate our array of bytes. where m is the size of our desired + * bit vector, we allocate m/8 + 1 bytes. */ + if ((bloom->vector = (char *) malloc (sizeof (char) * + ((long long) (bloom->stat.elements / + 8) + 1))) == NULL) + { + perror ("malloc"); + return -1; + } + else + memset (bloom->vector, 0, + sizeof (char) * ((long long) (bloom->stat.elements / 8) + 1)); + + /* generate a collection of random integers, to use later + * when salting our keys before hashing them */ + + //sketchy_randoms(&bloom->random_nums,hashes); + //bloom->vector = "11111111"; + //printf("vector size-> %d\n",sizeof(bloom->vector)); + //memset(bloom->vector,0,sizeof(bloom->vector)); + + return 0; +} + +void +bloom_destroy (bloom * bloom) +{ + + memset (bloom->vector, 0, + sizeof (char) * ((long long) (bloom->stat.elements / 8) + 1)); + free (bloom->vector); + +} + +int +bloom_check (bloom * bloom, char *str) +{ +//printf("In bloom_check\n"); + return bloom_test (bloom, str, RO); +} + +int +bloom_add (bloom * bloom, char *str) +{ + int ret; + //printf("key--> %s\n",str); + ret = bloom_test (bloom, str, SET); + if (ret == 0) + { + bloom->inserts++; + } + return ret; +} + +int +bloom_test (bloom * bloom, char *str, int mode) +{ + int i, hit; + BIGNUM ret; + //printf("In test\n"); + /* as many times as our ideal hash count dictates, salt our key + * and hash it into the bit vector */ + hit = 1; + for (i = 0; i < bloom->stat.ideal_hashes; i++) + { + + ret = bloom_hash (bloom, str, i, bloom->k_mer); + + if (!test (bloom->vector, ret)) + { + hit = 0; + if (mode == SET) + { + set (bloom->vector, ret); + } + else + { + /* if we are merely testing, we are done. */ + return hit; + } + } + } + + return hit; +} + +BIGNUM +bloom_hash (bloom * bloom, char *str, int i, int length) +{ + BIGNUM ret = 0; + + ret = (BIGNUM) hash5 (str, seed[i], length) % (BIGNUM) bloom->stat.elements; + + return ret; +} + +int +set (char *big, BIGNUM index) +{ + deref dr; + + finder (index, &dr); + big[dr.index] += dr.spot; + + return 0; +} + +int +test (char *big, BIGNUM index) +{ + deref dr; + char bucket; + + finder (index, &dr); + bucket = big[dr.index]; + + if ((bucket & dr.spot) == dr.spot) + { + return 1; + } + else + { + return 0; + } + +} + +int +finder (BIGNUM index, deref * dr) +{ + + //dr->index = (BIGNUM) (index / 8); + //dr->spot = pow (2, (index % 8)); + dr->index = (BIGNUM) (index >> 3); + //dr->spot = pow (2, (index % 8)); + //dr->spot = 0x80; + //dr->spot = dr->spot >> (index & 0x07); + dr->spot = pow(2,(index & 0x07)); + return 0; +} + + +BIGNUM +report_capacity (bloom * bloom) +{ + return bloom->stat.capacity; +} + +char* +prefix_make (char *filename, char *prefix, char *target) +{ + char *position1 = strrchr (filename, '/'); + + char *bloom_file = (char *) malloc (300 * sizeof (char)); + memset (bloom_file, 0, 300); + if (is_dir(target)) { + strcat (bloom_file,target); + strcat (bloom_file,filename); + } else if (target) { + strcat (bloom_file,target); + } + else if (target!=NULL && prefix!=NULL) { + if (position1!=NULL) + strncat (bloom_file,position1,strrchr(position1,'.')-position1); + else + strncat (bloom_file,filename,strrchr(filename,'.')-filename); + strcat (bloom_file, ".bloom"); + } + else + { + if (position1!=NULL) + strncat (bloom_file,position1,strrchr(position1,'.')-position1); + else + strncat (bloom_file,filename,strrchr(filename,'.')-filename); + } + + return bloom_file; +} + +int +save_bloom (char *filename, bloom * bl, char *prefix, char *target) +{ + char *bloom_file = NULL; + int fd; + + bloom_file = prefix_make(filename, prefix, target); + +#ifdef DEBUG + printf("Bloom file to be written in: %s\n", bloom_file); +#endif + + + //if (bloom_file[0]=='/') + // bloom_file++; + if (prefix==NULL && target==NULL) + strcat (bloom_file,".bloom"); + else if (is_dir(target)) + strcat (bloom_file,".bloom"); + +#ifdef __APPLE__ + fd = open (bloom_file, O_RDWR | O_CREAT, PERMS); +#else // assume linux + fd = open (bloom_file, O_RDWR | O_CREAT | O_LARGEFILE, PERMS); +#endif + + if (fd < 0) + { + perror (bloom_file); + return -1; + } + + BIGNUM total_size = + sizeof (bloom) + sizeof (char) * ((long long) (bl->stat.elements / 8) + + 1) + + sizeof (int) * (bl->stat.ideal_hashes + 1); + +#ifdef __APPLE__ + if (ftruncate (fd, total_size) < 0) +#else + if (ftruncate64 (fd, total_size) < 0) +#endif + { + printf ("[%d]-ftruncate64 error: %s/n", errno, strerror (errno)); + close (fd); + return 0; + } + + if(write (fd, bl, sizeof (bloom)) < 0) { + perror (" error writing bloom file "); + exit (EXIT_FAILURE); + } + + total_size = (long long) (bl->stat.elements / 8) + 1; + + BIGNUM off = 0; + while (total_size > TWOG) + { + if(write(fd, bl->vector + off, sizeof (char) * TWOG) < 0) { + perror (" error writing bloom file "); + exit (EXIT_FAILURE); + } + total_size -= TWOG; + off += TWOG; + } + if (write (fd, bl->vector + off, sizeof (char) * total_size) < 0){ + perror (" error writing bloom file "); + exit (EXIT_FAILURE); + }; + close (fd); + + memset (bl->vector, 0, + sizeof (char) * ((long long) (bl->stat.elements / 8) + 1)); + +#ifdef DEBUG + printf ("big file process OK\n"); +#endif + return 0; + +} + +int +load_bloom (char *filename, bloom * bl) +{ + int fd = 0; + int ret; + +#ifdef DEBUG + printf ("bloom name->%s\n", filename); +#endif + +#ifdef __APPLE__ + fd = open (filename, O_RDONLY, PERMS); +#else + fd = open64 (filename, O_RDONLY, PERMS); +#endif + if (fd < 0) { + perror (filename); + return -1; + } + + if (read (fd, bl, sizeof (bloom)) < 0){ + perror("Problem reading bloom filter"); + }; + + bl->vector = + (char *) malloc (sizeof (char) * + ((long long) (bl->stat.elements / 8) + 1)); + + BIGNUM off = 0, total_size = ((long long) (bl->stat.elements / 8) + 1); + + while (total_size > TWOG) { + ret = read(fd, bl->vector + off, sizeof (char) * TWOG); + if (ret < 0) + perror("Problem reading bloom filter"); + total_size -= TWOG; + off += TWOG; + } + + ret = read (fd, bl->vector + off, sizeof (char) * total_size); + +#ifdef DEBUG + if (ret > 0) + printf ("bloom filter read successfully\n"); + else ret = errno; +#endif + + close (fd); + return ret; +} + +void +write_result (char *filename, char *detail) +{ + int fd; + + fd = open (filename, O_CREAT | O_RDWR, S_IRWXU); + if (write (fd, detail, strlen (detail)) < 0) { + perror (" error writing result file "); + exit (EXIT_FAILURE); + } + + close (fd); +} + +void +rev_trans (char *s) +{ + + int i; + int j; + + for (i = 0, j = strlen (s) - 1; i < j; ++i, --j) + { + char c = s[i]; + s[i] = s[j]; + s[j] = c; + } + + i = 0; + + while (i < strlen (s)) + { + switch (s[i]) + { + case 'A': + s[0] = 'T'; + break; + case 'C': + s[0] = 'G'; + break; + case 'G': + s[0] = 'C'; + break; + case 'T': + s[0] = 'A'; + break; + case 'a': + s[0] = 't'; + break; + case 'c': + s[0] = 'g'; + break; + case 'g': + s[0] = 'c'; + break; + case 't': + s[0] = 'a'; + break; + } + s++; + } + +} + +char * +mmaping (char *source) +{ + + struct stat statbuf; + + int src; + char *sm; + + if ((src = open(source, O_RDONLY | O_LARGEFILE)) < 0) + { + perror (" open source "); + exit (EXIT_FAILURE); + } + + if (fstat (src, &statbuf) < 0) + { + perror (" fstat source "); + exit (EXIT_FAILURE); + } + + sm = + mmap (0, (BIGCAST) statbuf.st_size, PROT_READ, MAP_SHARED | MAP_NORESERVE, + src, 0); + + if (MAP_FAILED == sm) + { + perror (" mmap source "); + exit (EXIT_FAILURE); + } + + return sm; +} + +void +build_help () +{ + printf ("USAGE\n"); + printf + ("##########################################################################\n"); + printf ("---Bloom build----\n"); + printf ("# ./facs -m b [option] [option] [option] [option]