From 133813ef0e4dab2c8eb485d037be1afa5ca56a7c Mon Sep 17 00:00:00 2001
From: Roman Valls Guimera <roman@scilifelab.se>
Date: Thu, 31 Jan 2013 09:33:12 +0100
Subject: [PATCH] Graduating FACS C implementation to SciLifeLab repository

---
 .gitignore                       |   14 +
 README                           |   31 -
 README.md                        |  111 ++++
 bin/bloombuild.pl                |  268 --------
 bin/facs.pl                      |  516 ---------------
 bin/facs_batch.pl                |  518 ---------------
 doc/GC_.py                       |   48 ++
 doc/ROC.m                        |  169 +++++
 doc/prop.m                       |   41 ++
 drass/Makefile                   |   44 ++
 drass/big_query.c                |  277 ++++++++
 drass/big_query.h                |   15 +
 drass/bloom.c                    |  639 ++++++++++++++++++
 drass/bloom.h                    |  133 ++++
 drass/build.h                    |   12 +
 drass/check.h                    |   14 +
 drass/facs.c                     |   79 +++
 drass/file_dir.c                 |  164 +++++
 drass/file_dir.h                 |   11 +
 drass/good_build.c               |  265 ++++++++
 drass/hashes.h                   |   10 +
 drass/lookup8.c                  |  570 ++++++++++++++++
 drass/lookup8.h                  |    7 +
 drass/main.c                     |   60 ++
 drass/mpi_bloom.c                | 1055 ++++++++++++++++++++++++++++++
 drass/remove.h                   |   11 +
 drass/remove_l.h                 |   12 +
 drass/setup.cfg                  |    3 +
 drass/setup.py                   |   34 +
 drass/simple_check_1_ge.c        |  264 ++++++++
 drass/simple_remove.c            |  347 ++++++++++
 drass/simple_remove_l.c          |  392 +++++++++++
 drass/suggestions.c              |  164 +++++
 drass/tool.c                     |  453 +++++++++++++
 drass/tool.h                     |   14 +
 tests/test_basic.py              |   66 ++
 tests/test_thousand_genomes.py   |   51 ++
 tests/utils/__init__.py          |    0
 tests/utils/fastq_dummy.py       |   36 +
 tests/utils/helpers.py           |  143 ++++
 tests/utils/valgrind-python.supp |  391 +++++++++++
 41 files changed, 6119 insertions(+), 1333 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 README
 create mode 100644 README.md
 delete mode 100755 bin/bloombuild.pl
 delete mode 100755 bin/facs.pl
 delete mode 100755 bin/facs_batch.pl
 create mode 100644 doc/GC_.py
 create mode 100644 doc/ROC.m
 create mode 100644 doc/prop.m
 create mode 100644 drass/Makefile
 create mode 100644 drass/big_query.c
 create mode 100644 drass/big_query.h
 create mode 100644 drass/bloom.c
 create mode 100644 drass/bloom.h
 create mode 100644 drass/build.h
 create mode 100644 drass/check.h
 create mode 100644 drass/facs.c
 create mode 100644 drass/file_dir.c
 create mode 100644 drass/file_dir.h
 create mode 100644 drass/good_build.c
 create mode 100644 drass/hashes.h
 create mode 100644 drass/lookup8.c
 create mode 100644 drass/lookup8.h
 create mode 100644 drass/main.c
 create mode 100644 drass/mpi_bloom.c
 create mode 100644 drass/remove.h
 create mode 100644 drass/remove_l.h
 create mode 100644 drass/setup.cfg
 create mode 100755 drass/setup.py
 create mode 100644 drass/simple_check_1_ge.c
 create mode 100644 drass/simple_remove.c
 create mode 100644 drass/simple_remove_l.c
 create mode 100644 drass/suggestions.c
 create mode 100644 drass/tool.c
 create mode 100644 drass/tool.h
 create mode 100644 tests/test_basic.py
 create mode 100644 tests/test_thousand_genomes.py
 create mode 100644 tests/utils/__init__.py
 create mode 100755 tests/utils/fastq_dummy.py
 create mode 100644 tests/utils/helpers.py
 create mode 100644 tests/utils/valgrind-python.supp

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e311cd1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+*.o
+*.so
+*.a
+*.pyc
+*.swp
+*.bloom
+*.fasta
+*.fastq
+*.info
+core.*
+vgcore.*
+*.egg-info
+tests/data
+build/
diff --git a/README b/README
deleted file mode 100644
index f2d8528..0000000
--- a/README
+++ /dev/null
@@ -1,31 +0,0 @@
-facs README
-
-For more information, please visit the FACS website:
-http://facs.scilifelab.se
-
-Overview
-
-New generation sequence technologies and the sequencing of increasingly
-complex datasets demand new efficient and specialized sequence analysis
-algorithms. Often, it is only the 'novel' sequences in a complex dataset that
-are of interest and the superfluous sequences need to be removed. A novel
-algorithm, FACS (Fast and Accurate Classification of Sequences), is introduced
-that can accurately and rapidly align sequences to a reference sequence. FACS
-was first optimized and validated using a synthetic metagenome dataset. An
-experimental metagenome dataset was then used to show that FACS is at least
-three times faster and more accurate than BLAT and SSAHA2 in classifying
-sequences when using references larger than 50Mbp.
-
-Citation
-
-Reference:
-Henrik Stranneheim, Max Käller, Tobias Allander, Björn Andersson,
-Lars Arvestad, Joakim Lundeberg. Classification of DNA sequences using Bloom
-filters. Bioinformatics, 2010 July 1; 26(13):1595-1600.
-
-Published online 2010 May 13. http://dx.doi.org/doi:10.1093/bioinformatics/btq230
-
-Contact
-
-Henrik Stranneheim <henrik.stranneheim@scilifelab.se>
-Lars Arvestad <arve+facs@csc.kth.se>
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6526cf8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,111 @@
+FACS (Fast and Accurate Classification of Sequences) C implementation
+======================================================================
+
+[![Build Status](https://travis-ci.org/tzcoolman/DRASS.png?branch=master)](https://travis-ci.org/tzcoolman/DRASS)
+
+WARNING: This program is under active development and this documentation might not reflect reality.
+Please file a GitHub issue and we will take care of it as soon as we can.
+
+Overview
+--------
+
+* 'build' is for building a bloom filter from a reference file.
+It supports large genome files (>4GB), such as the human genome.
+* 'query' is for querying a fastq/fasta file against the bloom filter.
+* 'remove' is for removing contamination sequences from a fastq/fasta file.
+
+
+Quickstart
+----------
+
+In order to fetch the source code, compile and run tests:
+
+```
+$ git clone https://github.com/tzcoolman/DRASS.git && cd drass && make -j8 && make tests
+```
+
+Please note that python's <a href="https://github.com/brainsik/virtualenv-burrito">virtualenv</a> is needed to run the tests.
+
+Usage
+------
+
+FACS uses a command-line structure similar to the one found in the popular <a href="https://github.com/lh3/bwa">bwa</a>.
+There are three main commands: build, query and remove. Each of them might have slightly different flags but should
+behave similarly.
+
+```
+$ ./facs -h
+
+Program: facs (Sequence decontamination using bloom filters)
+Version: 0.1
+Contact: Enze Liu <enze.liu@scilifelab.se>
+
+Usage:   facs <command> [options]
+
+Command: build         build a bloom filter from a FASTA reference file
+         query         query a bloom filter given a FASTQ/FASTA file
+         remove        remove (contamination) sequences from FASTQ/FASTA file
+```
+
+For example, to build a bloom filter out of a FASTA reference genome, one should type:
+
+```
+$ ./facs build -r ecoli.fasta -o ecoli.bloom
+```
+
+That would generate an ecoli bloom filter that could be used to query a FASTQ file:
+
+```
+$ ./facs query -b ecoli.bloom -r contaminated_sample.fastq.gz
+```
+
+Note that both plaintext fastq files and gzip-compressed files are supported transparently
+to the user.
+
+The query command returns some metrics indicating how many reads might be contaminated with
+ecoli in that particular sample:
+
+```
+{
+        "total_read_count": 201,
+        "contaminated_reads": 1,
+        "total_hits": 90,
+        "contamination_rate": 0.004975,
+        "bloom_filename":"tests/data/bloom/U00096.2.bloom"
+}
+```
+
+Finally, if one wants to remove those reads from the sample, one should run the following
+command:
+
+```
+$ ./facs remove -b ecoli.bloom -r contaminated_sample.fastq.gz -o discarded_reads.fastq
+```
+
+Where "discarded_reads.fastq" contains the reads that have been filtered out from the original
+fastq file.
+
+
+Python interface
+----------------
+
+A python C-Extension provides a very simple API to build, query and remove sequences,
+just as described above with the plain C-based commandline.
+
+```
+$ python
+Python 2.6.6 (r266:84292, Jun 18 2012, 09:57:52) 
+[GCC 4.4.6 20110731 (Red Hat 4.4.6-3)] on linux2
+Type "help", "copyright", "credits" or "license" for more information.
+>>> import facs
+>>> facs.build("ecoli.fasta", "ecoli.bloom")
+>>> facs.query("contaminated_sample.fastq.gz", "ecoli.bloom")
+>>> facs.remove("contaminated_sample.fastq.gz", "ecoli.bloom")
+```
+
+
+Notes
+-----
+
+* All three commands can be executed on both Linux and Mac systems, but they don't support building and loading large bloom filters on Mac systems.
+* FACS supports fasta and fastq formats. Make sure you use the correct extension name: .fna or .fasta and .fastq, respectively.
diff --git a/bin/bloombuild.pl b/bin/bloombuild.pl
deleted file mode 100755
index 5dcd9f1..0000000
--- a/bin/bloombuild.pl
+++ /dev/null
@@ -1,268 +0,0 @@
-#!/usr/bin/perl -w
-
-# Builds Bloom filters with a given K-mer length, false positive rate
-# and reference in a FASTA format
-
-=head1 NAME
-
- bloombuild - build Bloom filters from reference genomes in FASTA files.
-
-=head1 SYNOPSIS
-
- bloombuild [options] -r reference.fasta
-
-=head1 DESCRIPTION
-
- Build Bloom filters from reference genomes in fasta files. These
- can later be queried using FACS.
-
- -r/--reference A file containing a list of filenames for reference genome data.
- Each line contains a filename. 
- Each file is a Fasta file. (defaults to STDIN)
-
- NOTE: Presently the Bloom::FASTER module has problems
- with garbage collection which only enables one reference to built each time
- the program loads. Supplying references in batch might cause a high false
- positive rate. 
-
-=head1 OPTIONS
-
--os/--outfile Output suffix for the created Bloom filter (defaults to .obj)
-
--k/--kmer Word length for K-mers inserted into the filter (defaults to '21')
-
--f/--falseposprob The false positive probaility that you accept (defaults to '0.0005')
-
--dbpr/--databsepath The unixpath to the references (defaults to STDIN)
-
--dbpo/--databsepath The unixpath to the output directory (defaults to STDIN)
-
-=head1 I/O
-Input format (FASTA/Pearson)
-
-Output format (Bloom filter)
-
-=head1 SEE ALSO
-
-FACS
-
-=cut
-
-use strict;
-use Bloom::Faster;
-use Getopt::Long;
-
-use vars qw($USAGE);
-
-BEGIN {
-    $USAGE =
-qq{bloombuild.pl < file.fa > file.obj
--k/--kmer Word length for K-mers inserted into the filter (defaults to '21')
--f/--falseposprob The false positive probaility that you accept (defaults to '0.0005')
--r/--reference A file containing a list of filenames for reference genome data. Each line contains 
-a filename. Each file is a Fasta file. (defaults to STDIN)
-
-NOTE: Presently the Bloom::FASTER module has problems
-with garbage collection which only enables one reference to built each time the program loads. 
-Supplying references in batch might cause a high false positive rate.
-
--os/--outfilesuffix Output suffix for the created Bloom filter (defaults to .obj)
--dbpr/--databsepath The unixpath to the references (defaults to STDIN)
--dbpo/--databsepath The unixpath to the output directory (defaults to STDIN)
-};
-
-}
-my ($rformat, $oformat, $outfilesuffix, $kmerlength, $falseposratebloom, $dbpr, $dbpo, 
-$help) = ('fasta','bloom', ".obj", 21, 0.0005,".", ".");
-my ($ref);
-
-GetOptions('k|kmerlength:i' => \$kmerlength,
-'f|falseposratebloom:s' => \$falseposratebloom,
-'r|reference:s' => \$ref,
-'os|outfile:s' => \$outfilesuffix,
-'dbpr|databaseref:s'  => \$dbpr,
-'dbpo|databaseout:s'  => \$dbpo,
-'h|help' => \$help,
-);
-
-die $USAGE if( $help );
-
-#Inputs references to create bloom filters
-
-if ($ref) {
-
-	}
-else {
-
-($ref) = @ARGV;
-
-if (@ARGV != 1) {
-  my $verbosity = 0;
-  if (@ARGV == 0) {
-    $verbosity = 2;
-  }
-  print"\n";
-  pod2usage({-message => "Must supply reference.\n",
-	     -verbose => $verbosity
-	    });
-	}
-}
-#Reads reference list
-ReferenceList($ref);
-
-my $id;	
-my $seq;
-my %seqs;
-my $headercount;
-
-my $refid;
-my @refs;
-my $refcount=0;
-
-my $keycount;
-my $filtercounter=0;
-
-for (my $refid=0;$refid<scalar(@refs);$refid++) { 
-
-	print STDERR "BloomFilter Counter:","\t", $refid,"\n";
-	print STDERR "Reference Id:","\t", $refs[$refid],"\n";
-	print STDERR "K-mer length:","\t", $kmerlength,"\n";
-	print STDERR "False Positive Rate Bloom Filter:","\t", $falseposratebloom,"\n"; 
-
-    Refseq($refs[$refid], $refid);
-    	
-
-    for $id (keys %seqs) {	
-
-    #Adds keys to the filter
-    Targetkeys($seqs{$id}, $refid); 
-    
-    $id .=$outfilesuffix;
-
-    #Saves Bloom filter 
-    Savebloom($id, $refid);	
-    
-    #Resets %seqs & undefines bloom filter
-    Blank($refid);
-
-    }
-}
-
-##########
-#End of main program
-##########
-
-##########
-##Subroutines
-# my $id - Capture header in reference file	
-# my $seq - Capture sequence in reference file
-# my %seqs - Stores header and sequence in reference file  
-# my $headercount - Counts the number of headers in reference file
-##########
-# my $refid - Capture filename in reference list
-# my @refs - Stores filename in reference list
-# my $refcount - Counts the number of filenames in reference file
-##########
-# my $keycount - Number of keys in filter
-# my $filtercounter - Counts the number of filters created
-##########
-
-
-sub Bloomfilter {
-    
-	$keycount = $_[0];
-	$$_[1] = Bloom::Faster->new({n => $keycount, e => $falseposratebloom});
-    
-}
-
-
-sub Savebloom {
-
-    $$_[1]->to_file("$dbpo/$_[0]");
-    
-}
-
-
-sub Blank {
-
-    %seqs=();
-    undef($$_[1]);
-}
-
-
-sub Targetkeys {
-    
-      my ($seq) = $_[0]; 
-    
-	my $lengthseq = length($seq)-($kmerlength-1); #Stops sliding window at the end
-
-	for (my $i=0;$i<$lengthseq;$i++) {	
-
-	$$_[1]->add (substr ($seq, $i, $kmerlength) );
-	
-    }
-
-    print STDERR "Added Reference Keys to Filter", "\n";
-    return;
-    
-}
-
-
-sub Refseq {
-    
-open(FASTA, "<$dbpr/$_[0]") or die "Can't open $_[0]:$!, \n";
-    while(<FASTA>) {
-
-	if (/>/) {
-	   
-	    $seq ="";
-	    chomp($id = $'); #'
-	    $headercount++;
-	}
-	
-	else {
-
-	    chomp($seq .= $_);
-	}
-	
-    }
-
-    $seqs{$id}= $seq;
-    print STDERR "Finished Reading Reference Sequence","\n";
-    close(FASTA);
-	#Determines bitvector length & creates vector
-	Bloomfilter(length($seqs{$id}), $$_[1] );  
-    return;
-    
-}
-
-sub ReferenceList {
-   
-open(BLOOM, "<$_[0]") or die "Can't open $_[0]:$!, \n";
-
-    while(<BLOOM>) {
-	
-	if (/\S+/) {
-
-      chomp;
-      $refcount++;
-      push @refs, $_;
-    }
-
-
-    }
-    
-    close(BLOOM);
-  if ($refcount == 0) {
-    print STDERR "Did not find any files in reference file '$ref'\n";
-  }
-
-    print STDERR "Finished Reading Bloom List","\n"; 
-    return;
-}
-
-
-
-
-
-
diff --git a/bin/facs.pl b/bin/facs.pl
deleted file mode 100755
index aa1b1c8..0000000
--- a/bin/facs.pl
+++ /dev/null
@@ -1,516 +0,0 @@
-#!/usr/bin/perl -w
-
-# Loads query in FASTA format, K-mer length and Bloom Filters. Interrogates
-# queries against filters and classifies queries onto genomes. 
-#The algorithm loops trough all queries for one filter at a time. 
-# Copyright 2011 Henrik Stranneheim
-
-=pod
-
-=head1 NAME
-
-facs - Filter reads of DNA
-
-=head1 SYNOPSIS
-
-facs k bloomfilterlist queryfile outprefix
-
-=head1 DESCRIPTION
-
-Build Bloom filters from reference genomes in fasta files using bloombuild.pl. These
-can later be queried using FACS.
-
-This version of facs.pl is based on facs_2.pl, using the more accurate scoring
-system.
-
-facs interrogates queries against filters and classifies queries
-onto genomes. The algorithm loops trough all queries for one filter
-at a time.
-
-Results are written to two files
-
-1. Reads matching a reference
-
-2. Reads not matching any reference
-
-=head1 Arguments
-
--b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN)
-
--op/--outfileprefix Output prefix for the output files (defaults to "")
-
--osma/--outfilesuffix Output suffix for the matches output files
-
--osmi/--outfilesuffix Output suffix for the mismatches output files
-
--k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21')
-
--f/--falseposprob The false positive probaility that you accept (defaults to '0.0005')
-
--dbpr/--databsepath The unix path to the bloom filters (defaults to STDIN)
-
--o/--outdirectory The unix path to the output directory (defaults to STDIN)
-
--mc/--matchcutoff The percent identity to classify a match (defaults to 80)
-
--lc/--lengthcutoff The minimum required read length (defaults to 60)
-
--qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1)
-
-Note: You have to use the same false positive rate and K-mer length as was used when the bloom filters were created.
-
-=head1 I/O
-
-Input format (FASTA/Pearson)
-
-Output format (FASTA/Pearson)
-
-=head1 SEE ALSO
-
-bloombuild
-
-=cut
-
-use strict;
-use Bloom::Faster;
-use Pod::Usage;
-use Pod::Text;
-use Getopt::Long;
-
-use vars qw($USAGE);
-
-BEGIN {
-    $USAGE =
-qq{facs.pl < queryfile.fa -b bloomfilter.obj > file_suffix.fasta
--k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21')
--f/--falseposprob The false positive probaility that you accept (defaults to '0.0005')
--b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN)
--op/--outfileprefix Output prefix for the output files (defaults to "")
--osma/--outfilesuffix Output suffix for the matching output files (defaults to "_matching.fasta")
--osmi/--outfilesuffix Output suffix for the mismatching output files (defaults to "_mismatching.fasta")
--dbpr/--databsepath The unixpath to the bloom filters (defaults to STDIN)
--o/--outputdirectory The unixpath to the output directory (defaults to STDIN)
--mc/--matchcutoff The percent identity to classify a match (defaults to "0.8"~80%)
--lc/--lengthcutoff The minimum required read length (defaults to 60)
--qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1)
-};
-
-}
-my ($rformat, $oformat, $outfileprefix, $matchoutfilesuffix, $mismatchoutfilesuffix, $kmerlength, 
-$falseposratebloom, $dbpr, $od, $matchcutoff, $lengthcutoff, $nrqp, 
-$help) = ('fasta','fasta', "", "_matching.fasta", "_mismatching.fasta", 21, 0.0005,".", ".", 0.8, 60, 1);
-my ($queryfile, $bloomfilterlist);
-
-GetOptions('k|kmerlength:i' => \$kmerlength,
-'f|falseposratebloom:s' => \$falseposratebloom,
-'b|bloomfilter:s' => \$bloomfilterlist,
-'op|outfileprefix:s' => \$outfileprefix,
-'osma|matchoutfilesuffix:s' => \$matchoutfilesuffix,
-'osmi|mismatchoutfilesuffix:s' => \$mismatchoutfilesuffix,
-'dbpr|databaseref:s'  => \$dbpr,
-'o|outdirectory:s'  => \$od,
-'mc|matchcutoff:s'  => \$matchcutoff,
-'lc|lengthcutoff:i'  => \$lengthcutoff,
-'qp|quickpasses:i'  => \$nrqp,
-'h|help' => \$help,
-);
-
-die $USAGE if( $help );
-
-if ($queryfile) {
-
-	}
-else {
-
-($queryfile) = @ARGV;
-
-if (@ARGV != 1) {
-  my $verbosity = 0;
-  if (@ARGV == 0) {
-    $verbosity = 2;
-  }
-  print"\n";
-  pod2usage({-message => "Must supply a query file and list of bloom filter(s).\n",
-	     -verbose => $verbosity
-	    });
-	}
-}
-
-my $refid;
-my $refcount;
-my @refs;
-
-my $queryid;
-my %querys;
-my $queryseq;
-
-my $filter;
-my $kmermatchcountqp=0;
-my $kmermatchcount=0;
-my $kmernomatchcount=0;
-my $bloomnomatchcount=0;
-my $bloommatchcount=0;
-my %allshort;
-my @matches;
-my @matchesid;
-my @matchscore;
-my $norevcheck=0;
-my $n_shorts = 0; 
-my $trackmatch=0;
-my $trackposition=0;
-
-
-my $queryheadercount=0;
-
-Querysseqs($queryfile); #reads query sequences and stores it in %querys
-
-Bloomfilterlist($bloomfilterlist); #reads reference genome list and stores it in @refs
-
-
-for (my $refid=0;$refid<scalar(@refs);$refid++) { 
-
-	#Loads Bloom filter from reference array
-	LoadBloom($refs[$refid]);
-	push @matchesid, $refs[$refid];  #Stores reference id for classification stats later
-
-	print STDERR "Filter:","\t", $refs[$refid],"\n";
-	print STDERR "Targetlength:","\t", $kmerlength,"\n"; 
-    	
-   	for $queryid (keys %querys) {  
-
-
-		if ( length($querys{$queryid})>$lengthcutoff ) { #Minimum length cut-off
-	
-		#Divides querie into K-mers, checks filter, calculates match score and classifies sequences
-		Sortquerykeys($querys{$queryid}, $kmerlength,$queryid);
-		
-			if ($norevcheck eq 0) {
-			#Translates and reverses queries
-			$querys{$queryid} =~tr/ATGC/TACG/;
-			$querys{$queryid} = reverse $querys{$queryid};
-			Sortquerykeys($querys{$queryid}, $kmerlength,$queryid);
-			}
-		$norevcheck=0; 
-
-		}
-		else {
-		$allshort{$queryid}= $querys{$queryid}; #Saves all queries that did not surpass length cut-off
-		delete $querys{$queryid}; #Deletes queries that did not surpass length cut-off from further querying
-		$n_shorts++;		
-		}
-
-    	}	
-    
-    	print STDERR "Finished with Classification of Query Keys","\n";
-		
-	#Writes matches to file
-	Write_matches($outfileprefix . "_" . $refs[$refid]);	
-	Print();
-	#Resets parameters
-	Blank();
-		if ( $refid>=(scalar(@refs)-1) ) {   # prints all Mismatches to file
-
-			for my $nomatchid (keys %querys) { #Collects all unclassified sequences 
-
-			$allshort{$nomatchid} = $querys{$nomatchid};
-	    
-			}
-
-		Write_mismatches($outfileprefix . "_" . $refs[$refid]);
-
-		}
-}
-
-
-#Prints matches and match score
-for (my $matchid=0;$matchid<scalar(@matchesid);$matchid++) { #repeats until no more file id
-
-print $matchesid[$matchid], "\t"; 
- $matchid++;
-   if ($matchesid[$matchid]) {
-	print $matchesid[$matchid], "\n"; 
-   }
-   else {
-	print "\n"; 
-   }
-
-}
-
-undef($filter);
-
-##########
-#End of main program
-##########
-
-##########
-###Subroutines:
-# my $refid - Capture filename in reference list
-# my @refs - Stores filename in reference list
-# my $refcount - Counts the number of filenames in reference file
-##########
-# my $queryid - Capture each header in queryfile
-# my %querys - Stores header and corresponding sequences in queryfile
-# my $queryseq - Capture each sequences in queryfile
-# my $queryheadercount=0 - Counts number of queries
-##########
-# my $filter - Bloom filter object
-# my $kmermatchcountqp=0 - Number of matching K-mers per query in Quick pass
-# my $kmermatchcount=0 - Number of matching nucleotides (K-mers*K-mer length) per queries, match score
-# my $kmernomatchcount=0 - Number of mismatching K-mers per queries
-# my $bloomnomatchcount=0 - Number of classifed queries
-# my $bloommatchcount=0 - Number of John Does
-# my %allshort - Stores all queries below length cut-off and at the end collects all John Does for printing
-# my @matches - Stores query id for classification 
-# my @matchesid - Prints all reference headers and number of classified queries
-# my $loopcount=0 - Loop for classifiying both forward and complementary reverse
-# my $n_shorts = 0 - Counts all short sequences 
-# my $trackmatch=0 - Tracks when to give a K-mer length score
-# my $trackposition=0 - Tracks position within length of K-mer within last match 
-##########
-
-
-sub LoadBloom {
- 
- my $filtername = shift;
- $filter = new Bloom::Faster("$dbpr/$filtername");
-
-}
-
-
-sub Print {
-
- print STDERR "Number of sequences in original query file:","\t", $queryheadercount,"\n";
- print STDERR "Number of remaining queries:\t", scalar (keys %querys),"\n";
- print STDERR "Number of Mapped Targets:","\t", $bloommatchcount, "\n";
- print STDERR "Number of short queries: \t", $n_shorts, "\n";
- push @matchesid, $bloommatchcount;
-
-
-}
-
-sub Blank {
-
-    $bloomnomatchcount =0;
-    $bloommatchcount=0;
-    $kmermatchcountqp=0;
-    $kmermatchcount=0;
-    $kmernomatchcount=0;
-    @matches = ();
-    @matchscore = ();     
-}
-
-
-sub Sortquerykeys {
-    
-    my $lengthseq = length($_[0]) -($_[1]-1); #Stops sliding window at the end
-	
-    for (my $i=0;$i<$lengthseq;$i++) {
-
-	my $query = substr ($_[0], $i, $_[1]);
-
-	#Quick pass of query
-	CheckfilterQP($i, $query);
-
-		if ($kmermatchcountqp) { 
-
-	            if ($kmermatchcountqp eq $nrqp) { #For queryies passing quickpass
-
-			$kmermatchcountqp=0;
-			$i=$lengthseq;
-    			
-			for (my $k=0;$k<$lengthseq;$k++) {
-
-				my $query = substr ($_[0], $k, $_[1]);
-	
-				#Queries the Bloom filter
-				Checkfilter($k, $query);
-
-    			}
-			
-			if ( ( $kmermatchcount/length($_[0]) ) > $matchcutoff) { #For matches
-					
-					push (@matches, $_[2]); #Saves match headers
-					$norevcheck=1;
-				    	$bloommatchcount++;
-	   				$kmermatchcount=0;
-	   				$kmernomatchcount=0;
-					$trackmatch = 0;
-					$trackposition = 0;
-				 	return;
-	  		}
-    			$kmermatchcount=0;
-   			$kmernomatchcount=0;
-			$trackmatch = 0;
-			$trackposition = 0;
-  			return;
-		}
-	}
-   }
-
-}
-
-sub CheckfilterQP {
-
-# $_[0] = Position in query
-# $_[1] = my $query
-     
-    if ($filter-> check( $_[1] ) ) {
-	 
-	 ++$kmermatchcountqp; #Adds to match score 
-	return;
-    }
-    
-    else {
-	
-	$_[0] = $_[0] + $kmerlength-1; #Increments position in query 
-	return;
-    }
-    
-}
-
-sub Checkfilter {
-
-# $_[0] = Position in query
-# $_[1] = my $query
-     
-    if ($trackposition >= $kmerlength ) {
-
-		$trackmatch = 0;
-		$trackposition = 0;
-    }
-
-    if ($filter-> check( $_[1] ) ) {
-	 
-		if ($trackmatch eq 0) {
-		
-		$kmermatchcount = $kmermatchcount + $kmerlength-1; #Adds to match score		
-		$trackmatch = 1;
-		$trackposition++;
-		}
-		else {
-		$kmermatchcount++;
-		$trackposition++;
-		}
-	return;
-    }
-    
-    else {
-	
-	$trackposition++;
-
-	return;
-    }
-    
-}
-
-
-sub Querysseqs {
-
-    open(QUERY, "<$_[0]") or die "Can't open $_[0]:$!, \n";    
-
-    while (<QUERY>) {
-        chomp $_;
-
-        if (m/^\s+$/) {		# Avoid blank lines
-            next;
-        }	
-
-	    if (/>(\S+)/) {
-	    
-	        $queryseq ="";
-	        $queryid = $1; 
-	        $queryheadercount++;
-	    }	
-	    else {
-	        $queryseq .= $_;
-	    }
-
-	    $querys{$queryid}=$queryseq;
-    }
-    
-    $queryheadercount = scalar(keys(%querys));
-    close(QUERY);
-    print STDERR "Finished Reading Query Sequences","\n";
-    return;
-}
-
-sub Bloomfilterlist {
- my $filename = shift;
-
- open(REF, "<$filename") or die "Can't open $filename:$!, \n";   
-
-    $refcount = 0;
-    while(<REF>) {
-
-	
-	if (/\S+/) {
-
-      chomp;
-      $refcount++;
-      push @refs, $_;
-    }
-
-    }
-    
-    close(REF);
-    print STDERR "Finished Reading Ref Filter List","\n"; 
-    return;
-}
-
-
-sub Write_matches {
-    
-	my $filename = shift;
-	$filename .= $matchoutfilesuffix;
-	
-    open (GENOME, ">$od/$filename") or die "Can't write to $od/$filename: $!\n";
-    
-    my $assemblysseq;
-
-	while (@matches) { 
-
-		my $matchid = pop @matches;
-		$assemblysseq .= $querys{$matchid};
-	
-		print GENOME '>', $matchid,"\n"; 
-	
-		for (my $i=0;$i<(length($assemblysseq)/60);$i++) {
-	    
-		    print GENOME substr($assemblysseq,$i*60,60),"\n";
-	    
-		}
-		$assemblysseq="";
-		delete $querys{$matchid};
-		
-	}
-     close (GENOME);
-    return;
-}
-
-sub Write_mismatches {
-    my $filename = shift;
-    $filename .= $mismatchoutfilesuffix;
-
-    open (GENOME, ">$od/$filename") or die "Can't write to $od/$filename: $!\n";
-
-    my $assemblysseq;
-    
-    foreach my $id (keys %allshort) {
-	
-	$assemblysseq .= $allshort{$id};
-	
-	print GENOME '>', $id,"\n";
-	
-	for (my $i=0;$i<(length($assemblysseq)/60);$i++) {
-	    
-	    print GENOME substr($assemblysseq,$i*60,60),"\n";
-	    
-	}
-	
-	$assemblysseq="";
-	
-    }
-     close (GENOME);
-    return;
-}
-
diff --git a/bin/facs_batch.pl b/bin/facs_batch.pl
deleted file mode 100755
index 50788a4..0000000
--- a/bin/facs_batch.pl
+++ /dev/null
@@ -1,518 +0,0 @@
-#!/usr/bin/perl -w
-
-# Loads query in FASTA format, K-mer length and Bloom Filters. Interrogates
-# queries against filters and classifies queries onto genomes. 
-#The algorithm loops trough all queries for one filter at a time. 
-# Copyright 2011 Henrik Stranneheim
-
-=pod
-
-=head1 NAME
-
-facs - Filter reads of DNA
-
-=head1 SYNOPSIS
-
-facs queryfile -b bloomfilterlist
-
-=head1 DESCRIPTION
-
-Build Bloom filters from reference genomes in fasta files using bloombuild.pl. These
-can later be queried using FACS.
-
-This version of facs.pl is based on facs_2.pl, using the more accurate scoring
-system.
-
-facs interrogates queries against filters and classifies queries
-onto genomes. The algorithm loops trough all queries for one filter
-at a time.
-
-Results are written to two files
-
-1. Reads matching a reference
-
-2. Reads not matching any reference
-
-=head3 COMMANDS AND OPTIONS
-
--b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN)
-
--op/--outfileprefix Output prefix for the output files (defaults to "")
-
--osma/--outfilesuffix Output suffix for the matches output files
-
--osmi/--outfilesuffix Output suffix for the mismatches output files
-
--k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21')
-
--f/--falseposprob The false positive probaility that you accept (defaults to '0.0005')
-
--dbpr/--databsepath The unix path to the bloom filters (defaults to STDIN)
-
--o/--outdirectory The unix path to the output directory (defaults to STDIN)
-
--mc/--matchcutoff The percent identity to classify a match (defaults to 80)
-
--lc/--lengthcutoff The minimum required read length (defaults to 60)
-
--qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1)
-
--bs/--batchsize Number of reads that are analysed in batch (defaults to 5000)
-
-Note: You have to use the same false positive rate and K-mer length as was used when the bloom filters were created.
-
-=head1 I/O
-
-Input format (FASTA/Pearson)
-
-Output format (FASTA/Pearson)
-
-=head1 SEE ALSO
-
-bloombuild
-
-=cut
-
-use strict;
-use Bloom::Faster;
-use Pod::Usage;
-use Pod::Text;
-use Getopt::Long;
-
-use vars qw($USAGE);
-
-BEGIN {
-    $USAGE =
-qq{facs.pl < queryfile.fa -b bloomfilter.obj > file_suffix.fasta
--k/--kmer Word length for K-mers to be queried against the bloom filter (defaults to '21')
--f/--falseposprob The false positive probaility that you accept (defaults to '0.0005')
--b/--bloomfilter A file containing a list of filenames for already created bloom filters. Each line contains the file name of the specific bloom filter. (defaults to STDIN)
--op/--outfileprefix Output prefix for the output files (defaults to "")
--osma/--outfilesuffix Output suffix for the matching output files (defaults to "_matching.fasta")
--osmi/--outfilesuffix Output suffix for the mismatching output files (defaults to "_mismatching.fasta")
--dbpr/--databsepath The unixpath to the bloom filters (defaults to STDIN)
--o/--outputdirectory The unixpath to the output directory (defaults to STDIN)
--mc/--matchcutoff The percent identity to classify a match (defaults to "0.8"~80%)
--lc/--lengthcutoff The minimum required read length (defaults to 60)
--qp/--quickpasses Number of quick passes that must match before vetting read (defaults to 1)
--bs/--batchsize Number of reads that are analysed in batch (defaults to 5000)
-};
-
-}
-my ($rformat, $oformat, $outfileprefix, $matchoutfilesuffix, $mismatchoutfilesuffix, $kmerlength, 
-$falseposratebloom, $dbpr, $od, $matchcutoff, $lengthcutoff, $nrqp, $bs, 
-$help) = ('fasta','fasta', "", "_matching.fasta", "_mismatching.fasta", 21, 0.0005,".", ".", 0.8, 60, 1, 5000);
-my ($queryfile, $bloomfilterlist);
-
-GetOptions('k|kmerlength:i' => \$kmerlength,
-'f|falseposratebloom:s' => \$falseposratebloom,
-'b|bloomfilter:s' => \$bloomfilterlist,
-'op|outfileprefix:s' => \$outfileprefix,
-'osma|matchoutfilesuffix:s' => \$matchoutfilesuffix,
-'osmi|mismatchoutfilesuffix:s' => \$mismatchoutfilesuffix,
-'dbpr|databaseref:s'  => \$dbpr,
-'o|outdirectory:s'  => \$od,
-'mc|matchcutoff:s'  => \$matchcutoff,
-'lc|lengthcutoff:i'  => \$lengthcutoff,
-'qp|quickpasses:i'  => \$nrqp,
-'bs|batchsize:i'  => \$bs,
-'h|help' => \$help,
-);
-
-die $USAGE if( $help );
-
-if ($queryfile) {
-
-	}
-else {
-
-($queryfile) = @ARGV;
-
-if (@ARGV != 1) {
-  my $verbosity = 0;
-  if (@ARGV == 0) {
-    $verbosity = 2;
-  }
-  print"\n";
-  pod2usage({-message => "Must supply a query file and list of bloom filter(s).\n",
-	     -verbose => $verbosity
-	    });
-	}
-}
-
-my $refid;
-my $refcount;
-my @refs;
-my %refs;
-
-my $queryid;
-my %querys;
-my $queryseq;
-
-my $filter;
-my $kmermatchcountqp=0;
-my $kmermatchcount=0;
-my $kmernomatchcount=0;
-my $bloomnomatchcount=0;
-my $bloommatchcount=0;
-my %allshort;
-my @matches;
-my $norevcheck=0;
-my $n_shorts = 0; 
-my $trackmatch=0;
-my $trackposition=0;
-
-my $queryheadercount=0;
-
-Bloomfilterlist($bloomfilterlist); #reads reference genome list and stores it in @refs
-
-Querysseqs($queryfile); #reads query sequences and stores it in %querys
-
-sub Classify {
-
-for (my $refid=0;$refid<scalar(@refs);$refid++) { 
-
-	#Loads Bloom filter from reference array
-	LoadBloom($refs[$refid]);
-    	
-   	for $queryid (keys %querys) {  
-
-
-		if ( length($querys{$queryid})>$lengthcutoff ) { #Minimum length cut-off
-	
-		#Divides querie into K-mers, checks filter, calculates match score and classifies sequences
-		Sortquerykeys($querys{$queryid}, $kmerlength,$queryid, $refs[$refid]);
-		
-			if ($norevcheck eq 0) {
-			#Translates and reverses queries
-			$querys{$queryid} =~tr/ATGC/TACG/;
-			$querys{$queryid} = reverse $querys{$queryid};
-			Sortquerykeys($querys{$queryid}, $kmerlength,$queryid, $refs[$refid]);
-			}
-		$norevcheck=0; 
-
-		}
-		else {
-		$allshort{$queryid}= \$querys{$queryid}; #Saves all queries that did not surpass length cut-off
-		$n_shorts++;		
-		}
-
-    }	
-    
-		
-	#Writes matches to file
-	Write_matches($outfileprefix . "_" . $refs[$refid]);	
-	#Resets parameters
-	Blank();
-		if ( $refid>=(scalar(@refs)-1) ) {   
-
-			for my $nomatchid (keys %querys) { #Collects all unclassified sequences 
-
-			$allshort{$nomatchid} = \$querys{$nomatchid};
-	    
-			}
-
-		Write_mismatches($outfileprefix . "_");		# prints all Mismatches to file
-
-		}
-	}
-	return;
-}
-
-Print();
-
-for $refid (keys %refs) {		#Prints matches and match score
-
-print $refid, "\t"; 
-
-   if ($refs{$refid}) {
-   
-	print $refs{$refid},"\n";
-   }
-   else {
-	print "\n"; 
-   }
-
-}
-
-undef($filter);
-
-##########
-#End of main program
-##########
-
-##########
-###Subroutines:
-# my $refid - Capture filename in reference list
-# my @refs - Stores filename in reference list
-# my $refcount - Counts the number of filenames in reference file
-##########
-# my $queryid - Capture each header in queryfile
-# my %querys - Stores header and corresponding sequences in queryfile
-# my $queryseq - Capture each sequences in queryfile
-# my $queryheadercount=0 - Counts number of queries
-##########
-# my $filter - Bloom filter object
-# my $kmermatchcountqp=0 - Number of matching K-mers per query in Quick pass
-# my $kmermatchcount=0 - Number of matching nucleotides (K-mers*K-mer length) per queries, match score
-# my $kmernomatchcount=0 - Number of mismatching K-mers per queries
-# my $bloomnomatchcount=0 - Number of classifed queries
-# my $bloommatchcount=0 - Number of John Does
-# my %allshort - Stores all queries below length cut-off and at the end collects all John Does for printing
-# my @matches - Stores query id for classification 
-# my %refs - Prints all reference headers and number of classified queries
-# my $loopcount=0 - Loop for classifiying both forward and complementary reverse
-# my $n_shorts = 0 - Counts all short sequences 
-# my $trackmatch=0 - Tracks when to give a K-mer length score
-# my $trackposition=0 - Tracks position within length of K-mer within last match 
-##########
-
-
-sub LoadBloom {
- 
- my $filtername = shift;
- $filter = new Bloom::Faster("$dbpr/$filtername");
-
-}
-
-
-sub Print {
- print STDERR "Finished with Classification of Query Keys","\n";
- print STDERR "Number of sequences in original query file:","\t", $queryheadercount,"\n";
- print STDERR "Number of short queries: \t", $n_shorts, "\n";
-
-
-}
-
-sub Blank {
-
-    $bloomnomatchcount =0;
-    $bloommatchcount=0;
-    $kmermatchcountqp=0;
-    $kmermatchcount=0;
-    $kmernomatchcount=0;
-    @matches = ();
-    #@matchscore = ();     
-}
-
-
-sub Sortquerykeys {
-
-#$_[0] = $queryseq
-#$_[1] = $kmerlength
-#$_[2] = $queryid
-#$_[3] = $refid
-    
-    my $lengthseq = length($_[0]) -($_[1]-1); #Stops sliding window at the end
-	
-    for (my $i=0;$i<$lengthseq;$i++) {
-
-	my $query = substr ($_[0], $i, $_[1]);
-
-	#Quick pass of query
-	CheckfilterQP($i, $query);
-
-		if ($kmermatchcountqp) { 
-
-	            if ($kmermatchcountqp eq $nrqp) { #For queryies passing quickpass
-
-			$kmermatchcountqp=0;
-			$i=$lengthseq;
-    			
-			for (my $k=0;$k<$lengthseq;$k++) {
-
-				my $query = substr ($_[0], $k, $_[1]);
-	
-				#Queries the Bloom filter
-				Checkfilter($k, $query);
-
-    			}
-			
-			if ( ( $kmermatchcount/length($_[0]) ) > $matchcutoff) { #For matches
-					
-					push @matches, \$_[2]; #Saves match headers
-					$refs{$_[3]}++;
-					$norevcheck=1;
-				    $bloommatchcount++;
-	   				$kmermatchcount=0;
-	   				$kmernomatchcount=0;
-					$trackmatch = 0;
-					$trackposition = 0;
-				 	return;
-	  		}
-    		$kmermatchcount=0;
-   			$kmernomatchcount=0;
-			$trackmatch = 0;
-			$trackposition = 0;
-  			return;
-		}
-	}
-   }
-
-}
-
-sub CheckfilterQP {
-
-# $_[0] = Position in query
-# $_[1] = my $query
-     
-    if ($filter-> check( $_[1] ) ) {
-	 
-	 ++$kmermatchcountqp; #Adds to match score 
-	return;
-    }
-    
-    else {
-	
-	$_[0] = $_[0] + $kmerlength-1; #Increments position in query 
-	return;
-    }
-    
-}
-
-sub Checkfilter {
-
-# $_[0] = Position in query
-# $_[1] = my $query
-     
-    if ($trackposition >= $kmerlength ) {
-
-		$trackmatch = 0;
-		$trackposition = 0;
-    }
-
-    if ($filter-> check( $_[1] ) ) {
-	 
-		if ($trackmatch eq 0) {
-		
-		$kmermatchcount = $kmermatchcount + $kmerlength-1; #Adds to match score		
-		$trackmatch = 1;
-		$trackposition++;
-		}
-		else {
-		$kmermatchcount++;
-		$trackposition++;
-		}
-	return;
-    }
-    
-    else {
-	
-	$trackposition++;
-
-	return;
-    }
-    
-}
-
-
-sub Querysseqs {
-
-    open(QUERY, "<$_[0]") or die "Can't open $_[0]:$!, \n";    
-
-    while (<QUERY>) {
-        chomp $_;
-
-        if (m/^\s+$/) {		# Avoid blank lines
-            next;
-        }	
-
-	    if (/>(\S+)/) {
-	    
-	 		if (scalar(keys(%querys)) eq $bs) {   #Batch size
-	    	
-	    		print $queryheadercount=$queryheadercount + scalar(keys(%querys)), "\n";
-	    		Classify();
-	    		%querys = ();
-
-	    	}
-	        $queryseq ="";
-	        $queryid = $1; 	        
-	        
-	    }	
-	    else {
-	        $queryseq .= $_;
-	    }
-
-	    $querys{$queryid}=$queryseq;
-    }
-    
-    $queryheadercount=$queryheadercount + scalar(keys(%querys));
-    Classify(); #Catch the remainaing reads
-    close(QUERY);
-    print STDERR "Finished Reading Query Sequences","\n";
-    return;
-}
-
-sub Bloomfilterlist {
- my $filename = shift;
-
- open(REF, "<$filename") or die "Can't open $filename:$!, \n";   
-
-    $refcount = 0;
-    while(<REF>) {
-
-	
-	if (/\S+/) {
-
-      chomp;
-      $refcount++;
-      push @refs, $_;
-    }
-
-    }
-    
-    close(REF);
-    print STDERR "Finished Reading Ref Filter List","\n"; 
-    return;
-}
-
-
-sub Write_matches {
-    
-	my $filename = shift;
-	$filename .= $matchoutfilesuffix;
-	
-    open (GENOME, ">>$od/$filename") or die "Can't write to $od/$filename: $!\n";
-
-	while (@matches) { 
-
-		my $matchid = pop @matches;
-		
-		print GENOME '>', $$matchid,"\n"; 
-
-	    for (my $i=0;$i<(length($querys{$$matchid})/60);$i++) {
-	    
-	    	print GENOME substr($querys{$$matchid},$i*60,60),"\n";
-		}
-		
-		delete $querys{$$matchid};
-		
-	}
-
-     close (GENOME);
-    return;
-}
-
-sub Write_mismatches {
-    my $filename = shift;
-    $filename .= $mismatchoutfilesuffix;
-
-    open (GENOME, ">>$od/$filename") or die "Can't write to $od/$filename: $!\n";
-   
-    foreach my $id (keys %allshort) {
-	
-	print GENOME '>', $id,"\n";
-	
-	for (my $i=0;$i<(length(${$allshort{$id}})/60);$i++) {
-	    
-	    print GENOME substr(${$allshort{$id}},$i*60,60),"\n";
-	    
-	}
-	
-	delete $allshort{$id};
-    }
-     close (GENOME);
-    return;
-}
-
diff --git a/doc/GC_.py b/doc/GC_.py
new file mode 100644
index 0000000..57c9fef
--- /dev/null
+++ b/doc/GC_.py
@@ -0,0 +1,48 @@
+from Bio import SeqIO
+from Bio.Seq import Seq, reverse_complement, transcribe, back_transcribe, translate
+import re
+import time
+
+A_t = 0;
+G_t = 0;
+C_t = 0;
+T_t = 0;
+All = 0;
+All_r = 0;
+
+'''
+x1 = open("test.fna")
+for seq in SeqIO.parse(x1,'fasta'):
+    All_r+=1;
+    for word in seq:
+        All+=1;
+        if word=='A':
+            A_t+=1;
+        elif word=='C':
+            C_t+=1;
+        elif word=='G':
+            G_t+=1;
+        elif word=='T':
+            T_t+=1;
+
+print 'A_t->',A_t,'G_t->',G_t,'C_t->',C_t,'T_t->',T_t;
+print 'ALL->',All;
+print 'average length->', All//All_r;
+'''
+# Copy every other record of 10xtest_GC44.fasta (the 1st, 3rd, 5th, ...) into
+# single.fasta, written as one header line plus one sequence line per record.
+signal = 0
+kept = []
+# The with-block closes the input handle as soon as parsing is done.
+with open("10xtest_GC44.fasta") as x1:
+    for seq in SeqIO.parse(x1, 'fasta'):
+        if signal == 0:
+            # str() is used because Seq.tostring() is absent from newer Biopython releases.
+            kept.append('>' + seq.id + '\n' + str(seq.seq) + '\n')
+        # Flip the flag so that kept and skipped records alternate.
+        signal = 1 - signal
+New_string = ''.join(kept)
+
+# Write the whole selection in one go; the with-block closes the output file.
+with open("single.fasta", 'w') as x2:
+    x2.write(New_string)
diff --git a/doc/ROC.m b/doc/ROC.m
new file mode 100644
index 0000000..0535a6c
--- /dev/null
+++ b/doc/ROC.m
@@ -0,0 +1,169 @@
+%%% OLD
+%{
+all = [93645	93641	93629	93611	93448	93224	85442	64825	33489
+88850	85935	75140	52472	26807	10760	4177	2180	1830
+37009	16940	6085	2700	1934	1818	1803	1791	1786
+5116	2254	1874	1819	1805	1798	1790	1786	1767
+2068	1857	1822	1810	1803	1793	1789	1778	1748
+1909	1829	1813	1805	1798	1791	1783	1763	1733
+]
+
+tp = [1794	1794	1794	1794	1794	1794	1794	1794	1794
+1794	1794	1794	1794	1794	1788	1783	1778	1778
+1786	1780	1778	1778	1778	1778	1778	1776	1776
+1779	1778	1778	1778	1777	1777	1776	1774	1763
+1778	1778	1778	1778	1777	1777	1776	1773	1745
+1778	1778	1778	1778	1777	1776	1771	1758	1731
+]
+%}
+
+%%% NEW
+%{
+all = [76239	18890	3488	1844	1755	1450	544	22	0
+3803	1810	1792	1747	1521	830	181	8	0
+2020	1793	1768	1765	1237	555	96	4	0
+1901	1789	1750	1575	1055	413	66	1	0
+1850	1776	1726	1475	894	325	49	1	0
+1800	1772	1680	1368	763	259	39	1	0
+]
+
+tp = [1794	1789	1778	1778	1748	1446	544	22	0
+1789	1778	1776	1741	1519	829	181	8	0
+1778	1776	1764	1764	1236	554	96	4	0
+1776	1773	1747	1574	1054	413	66	1	0
+1771	1766	1723	1474	893	325	49	1	0
+1763	1763	1678	1367	762	259	39	1	0
+]
+%}
+
+%%% CHB
+%{
+all = [91326	62121	19346	4677	1926	1783	1581	357	0
+6351	2003	1813	1799	1777	1693	1037	108	0
+1845	1813	1799	1786	1750	1535	756	55	0
+1828	1806	1795	1775	1715	1391	581	42	0
+1819	1806	1790	1764	1658	1241	484	31	0
+1812	1804	1783	1749	1607	1112	389	21	0
+]
+
+tp = [1794	1794	1787	1778	1778	1774	1577	357	0
+1778	1778	1778	1778	1769	1691	1036	108	0
+1778	1778	1778	1773	1747	1534	755	55	0
+1778	1778	1777	1766	1713	1390	580	42	0
+1778	1778	1774	1758	1656	1240	483	31	0
+1778	1778	1770	1744	1605	1111	388	21	0
+]
+%}
+sample = 1794
+%%%%----------------------------------------------------------------
+%%% OLD
+
+all = [82122	79488	76829	73919	70699	67584	65067	63269
+72104	69232	66812	64945	63704	62011	62169	61265
+65793	64489	63690	63050	62545	62011	61310	59976
+64111	63532	63020	62570	62126	61537	60605	58694
+63632	63107	62683	62237	61781	61113	59840	57237
+63332	62871	62431	62007	61497	60642	58906	55662
+]
+
+tp = [55137	55137	55137	55136	55136	55133	55119	55045
+55137	55137	55137	55136	55130	55114	55005	54568
+55137	55136	55135	55133	55123	55041	54715	53724
+55137	55136	55134	55125	55089	54895	54297	52690
+55132	55131	55128	55108	55013	54680	53716	51389
+55133	55132	55124	55070	54920	54394	52969	50011
+]
+%%%NEW
+%{
+all= [63031	61442	56426	41210	17921	3350	98	0
+62015	59624	51000	32294	12226	2158	70	0
+61427	57620	46338	26309	9075	1559	56	0
+60771	55443	42006	22076	7199	1110	54	0
+59949	53054	37785	18688	5825	825	50	0
+58947	50482	34152	15981	4691	675	46	0
+]
+tp = [55118	54623	50503	36586	15450	2754	83	0
+55001	53382	45495	28197	10117	1649	57	0
+54769	51680	41154	22636	7348	1147	43	0
+54374	49726	37128	18770	5744	819	41	0
+53784	47516	33212	15703	4558	603	37	0
+52983	45140	29880	13301	3609	487	34	0
+]
+%}
+%%%CHB
+%{
+all = [65308	63416	62267	60141	50654	24228	2155	0
+63226	62430	61349	57411	43730	17341	1322	0
+62747	61979	60040	54627	38021	13519	870	0
+62424	61571	59317	51727	33488	10850	672	0
+62176	61141	58029	48752	29403	8825	557	0
+61909	60533	56481	45688	25842	7332	480	0
+]
+tp = [55136	55130	55018	53720	45230	21061	1714	0
+55133	55103	54688	51451	38737	14643	999	0
+55129	55010	54070	48933	33411	11168	655	0
+55113	54875	53229	46254	29224	8812	493	0
+55084	54647	52129	43747	24549	7082	396	0
+55009	54246	50786	40617	22213	5813	333	0
+] 
+%}
+sample = 55137
+fp = all-tp
+fn = sample-tp
+tn = 100000-sample-fp
+tpr = tp./(tp+fn);
+fpr = fp./(fp+tn);
+
+Q_f = fp./all
+Q_f;
+hold on
+%for i=1:6,
+
+%tpr = tp(1,:)./(tp(1,:)+fn(1,:));
+%fpr = fp(1,:)./(fp(1,:)+tn(1,:));
+%plot(fpr(1,:),tpr(1,:),'--rs');  %12
+%plot(fpr(2,:),tpr(2,:),'--bs');  %13
+%plot(fpr(3,:),tpr(3,:),'--cs');  %14
+%plot(fpr(4,:),tpr(4,:),'-gs');   %15
+%plot(fpr(5,:),tpr(5,:),'-ms');   %16
+%plot(fpr(6,:),tpr(6,:),'-ks');   %17
+%end 
+hold off
+%fp = fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp.*fp
+
+%{
+%vx = [0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1]
+vx = [0.3 0.4 0.5 0.6 0.7 0.8 0.9 1]
+hold on 
+plot(vx,Q_f(1,:),'--rs');  %12
+plot(vx,Q_f(2,:),'--bs');  %13
+plot(vx,Q_f(3,:),'--cs');  %14
+plot(vx,Q_f(4,:),'-gs');   %15
+plot(vx,Q_f(5,:),'-ms');   %16
+plot(vx,Q_f(6,:),'-ks');   %17
+hold off
+%acc = (tp+tn)/100000
+%tpr
+%fpr
+%acc
+%}
+%plot(tpr,tpr,'black')
+
+
+%{
+tp = [1794	1794	1792	1786	1779	1778	1778	1778	1775
+1781	1778	1778	1778	1776	1771	1774	1754	1721
+1778	1778	1778	1778	1776	1770	1758	1720	1617
+1778	1778	1778	1776	1772	1761	1732	1661	1531
+1778	1778	1778	1773	1765	1748	1702	1602	1420
+1778	1778	1777	1768	1758	1730	1649	1521	1308
+]
+
+all = [92129	74168	37245	14002	5541	2677	1923	1811	1787
+8113	2340	1847	1812	1800	1795	1784	1763	1725
+1856	1819	1807	1798	1792	1781	1762	1723	1618
+1837	1812	1801	1794	1783	1767	1735	1663	1532
+1824	1802	1800	1789	1775	1752	1704	1603	1421
+1818	1804	1797	1780	1767	1733	1651	1522	1309
+]
+%}
\ No newline at end of file
diff --git a/doc/prop.m b/doc/prop.m
new file mode 100644
index 0000000..b8837c0
--- /dev/null
+++ b/doc/prop.m
@@ -0,0 +1,41 @@
+hit =[1670777
+464386
+131302
+44158
+22436
+17064
+]
+
+hit1 = [10342350
+3092821
+842175
+241199
+88697
+50011
+]
+
+unhit = [47329223
+48335614
+48468698
+48355842
+48177564
+47982936
+]
+
+unhit1 = [39457650
+46507179
+48557825
+48958801
+48911303
+48749989
+]
+
+total = hit+unhit
+total1 = hit1+unhit1
+
+polp = hit./total
+polp1 = hit1./total1
+hold on
+plot(polp,'-rs');
+plot(polp1,'-ks');
+hold off
\ No newline at end of file
diff --git a/drass/Makefile b/drass/Makefile
new file mode 100644
index 0000000..fb19009
--- /dev/null
+++ b/drass/Makefile
@@ -0,0 +1,44 @@
+CFLAGS=-O3 -DFIFO -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE -Wall -fopenmp -g -DNODEBUG -lm -lz
+.PHONY: tests clean valgrind
+.SUFFIXES:.c .o
+PROG=facs
+
+LOBJS= big_query.o bloom.o file_dir.o good_build.o lookup8.o suggestions.o tool.o simple_check_1_ge.o simple_remove.o
+AOBJS= big_query.o bloom.o file_dir.o good_build.o lookup8.o suggestions.o tool.o simple_check_1_ge.o simple_remove.o
+
+all:$(PROG)
+
+tests: python
+	nosetests -v -s ../tests
+
+valgrind: python
+	valgrind --tool=memcheck --suppressions=../tests/utils/valgrind-python.supp nosetests -v -s ../tests/test_basic.py
+
+mpi:
+	@echo Make sure you have MPI support on your cluster hint: module load openmpi
+	#mpicc -c *.c ${CFLAGS}
+	#mpicc -c mpi_decon.c -O3 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE 
+	#mpicc -o mpi_decon mpi_decon.o bloom.o suggestions.o lookup8.o  -lm ${CFLAGS}
+	#mpicc -o mpi_bloom mpi_bloom.o bloom.o suggestions.o lookup8.o file_dir.o -lm ${CFLAGS}
+	#mpicc -o mpi_bloom_l mpi_bloom_l.o bloom.o suggestions.o lookup8.o file_dir.o -lm ${CFLAGS}
+	#mpirun -np 1 ./mpi_bloom_l -l tzcoolman  -q test.fna
+
+python:
+	rm -rf build/ ${PROG}.so && python setup.py build_ext --inplace && python setup.py develop
+
+clean:
+	rm -f core.* vgcore.* *.o *.so *.a *.info ${PROG}
+
+
+.c.o:
+		$(CC) -c $(DFLAGS) $(INCLUDES) $< -o $@ $(CFLAGS) 
+
+${PROG}:lib${PROG}.a $(AOBJS)
+
+${PROG}:lib${PROG}.a $(AOBJS) main.o
+		$(CC) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -l${PROG} $(LIBS) $(CFLAGS) 
+
+lib${PROG}.a:$(LOBJS)
+		$(AR) -csru $@ $(LOBJS)
+
+main.o: big_query.h bloom.h build.h check.h file_dir.h hashes.h tool.h remove.h remove_l.h
diff --git a/drass/big_query.c b/drass/big_query.c
new file mode 100644
index 0000000..58f83d5
--- /dev/null
+++ b/drass/big_query.c
@@ -0,0 +1,277 @@
+#include <zlib.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include "tool.h"
+#include "bloom.h"
+#include "check.h"
+#include "file_dir.h"
+#include "big_query.h"
+/*-------------------------------------*/
+#include <omp.h>
+/*-------------------------------------*/
+#define ONEG 1000000000
+#define ONE 100
+/*-------------------------------------*/
+
+
+static int
+query_usage(void)
+{
+    /* Writes the "facs query" help text to stderr; always returns 1. */
+    fputs("\nUsage: ./facs query [options]\n" "Options:\n"
+          "\t-r reference bloom filter to query against\n", stderr);
+    return 1;
+}
+
+int bq_main(int argc, char** argv)
+{
+  /* Entry point of "facs query": reads the command-line options and forwards them to
+   * query().  Returns query()'s result, or 1 once the usage text has been printed. */
+  if (argc < 2) return query_usage();
+  int opt;
+  float tole_rate = 0;      /* -t; handed to query() unchanged when the option is absent */
+  float sampling_rate = 1;  /* -s */
+  char *ref = NULL, *list = NULL, *target_path = NULL, *source = NULL;
+
+  while ((opt = getopt (argc, argv, "s:t:r:o:q:l:h")) != -1) {
+      switch (opt) {  /* getopt() always sets optarg for the ':' options returned here */
+          case 't':
+              tole_rate = atof(optarg);
+              break;
+          case 's':
+              sampling_rate = atof(optarg);
+              break;
+          case 'o':    
+              target_path = optarg;
+              break;
+          case 'q':  
+              source = optarg;  
+              break;
+          case 'r':  
+              ref = optarg;  
+              break;
+          case 'l':
+              list = optarg;  
+              break;
+          case 'h':
+              return query_usage();
+          case '?':
+              printf ("Unknown option: -%c\n", (char) optopt);
+              return query_usage();
+      } 
+  } 
+  /* query() opens the file and searches its name unconditionally, so -q is mandatory. */
+  if (source == NULL) {
+      fprintf(stderr, "Missing query file (-q)\n");
+      return query_usage();
+  }
+  return query(source, ref, tole_rate, sampling_rate, list, target_path);
+}
+
+int query(char* query, char* bloom_filter, double tole_rate, double sampling_rate, char* list, char* target_path)
+{
+  /* Loads the bloom filter `bloom_filter`, then reads `query` through zlib in chunks of
+   * ONEG bytes and passes every queued read to fasta_process()/fastq_process() (the latter
+   * when the name contains ".fastq" or ".fq").  A tole_rate of 0 is replaced by
+   * mco_suggestion().  Returns 0; exits on allocation or open failure. */
+  gzFile zip;
+  int type = 0;
+  BIGCAST offset = 0;
+  char *detail = (char*) malloc((ONE*ONE*ONE)*sizeof(char));
+  char *position =  (char*) calloc(ONEG+1, sizeof(char));  /* zeroed: CHUNKer runs strlen() on it */
+
+  if (detail == NULL || position == NULL) {  /* stop before any read lands in a NULL buffer */
+      perror("query buffer allocation error...\n");
+      exit(-1);
+  }
+
+  bloom *bl_2 = NEW (bloom);
+  F_set *File_head = make_list (bloom_filter, list);
+
+  File_head->reads_num = 0;
+  File_head->reads_contam = 0;
+  File_head->hits = 0;
+  File_head->filename = bloom_filter;
+  load_bloom (File_head->filename, bl_2);  //load a bloom filter
+  if (tole_rate==0)
+      tole_rate = mco_suggestion (bl_2->k_mer); 
+
+  /* gzopen() reports failure by returning NULL, not a negative value. */
+  if ((zip = gzopen(query, "rb")) == NULL) {
+	 perror("query open error...\n");
+     exit(-1);
+  }
+
+  if (strstr(query, ".fastq") || strstr(query, ".fq"))
+      type = 2;
+  else
+      type = 1;
+
+  while (offset!=-1) {   
+	offset = CHUNKer(zip,offset,ONEG,position,type);
+    Queue *head = NEW (Queue);
+    head->location = NULL;
+    Queue *tail = NEW (Queue);
+    head->next = tail;
+    Queue *head2 = head;
+    get_parainfo (position, head);
+
+#pragma omp parallel
+{
+#pragma omp single nowait
+	{
+	  while (head != tail)
+	    {
+#pragma omp task firstprivate(head)
+	      {
+	if (head->location!=NULL) {
+            if (type == 1) {
+                fasta_process (bl_2, head, tail, File_head, sampling_rate,
+                               tole_rate);
+		    } else {
+                fastq_process (bl_2, head, tail, File_head, sampling_rate,
+                               tole_rate);
+            }
+         }
+	  }
+	      head = head->next;
+	    }       // End of firstprivate
+	}			// End of single - no implied barrier (nowait)
+}				// End of parallel region - implied barrier
+
+  /* Blank the text of this chunk so that the next read starts from a zeroed buffer. */
+  memset (position, 0, strlen(position));
+  clean_list (head2, tail);
+    }				//end while
+
+  free(position);
+  evaluate (detail, File_head->filename, File_head);
+  gzclose(zip);
+  bloom_destroy (bl_2);
+  statistic_save (detail, query, target_path);
+  
+  return 0;
+}
+
+char *strrstr(char *s, char *str)
+{
+    /* Last occurrence of str in s, or NULL.  Only offsets where str still fits are tried. */
+    size_t len = strlen(s), sub = strlen(str), i;
+    for (i = (sub && sub <= len) ? len - sub + 1 : 0; i > 0; i--) {
+        if (memcmp(s + i - 1, str, sub) == 0)
+            return s + i - 1;
+    }   
+    return NULL;
+}
+
+void clean_list (Queue* head, Queue *tail)
+{
+    /* Frees every node from head through tail; each node before tail is zeroed first. */
+    Queue *node, *following;
+    for (node = head; node != tail; node = following)
+    {
+        following = node->next;
+        memset(node,0,sizeof(Queue));
+        free(node);
+    }
+    free(tail);
+}
+
+
+BIGCAST CHUNKer(gzFile zip,BIGCAST offset,int chunk,char *data,int type)
+{
+    /* Loads up to `chunk` bytes starting at `offset` into `data` (which must hold chunk+1
+     * bytes), cuts the buffer back to the last record boundary and returns the offset for
+     * the next call, or -1 once a short (final) chunk has been read. */
+    char c, v = (type == 2) ? '@' : '>';   /* v: record marker, '@' for type 2, else '>' */
+    char *pos = NULL;
+    int length;
+
+    if (offset == 0)
+        while (offset <10*ONE)             /* skip leading bytes up to the first marker */
+        {
+            c = gzgetc(zip);
+            if (c == v)
+                break;
+            offset++;
+        }
+        
+    gzseek (zip,offset,SEEK_SET);
+    length = gzread (zip,data,chunk);
+    if (length < 0)
+        length = 0;                        /* read error: handled like an empty last chunk */
+    data[length] = '\0';                   /* gzread() does not terminate what it reads */
+
+    if (length>=chunk) {                   /* full chunk: the last record may be cut off */
+        if (type == 2) {
+            pos = strrstr (data,"\n+");
+            pos = (pos && pos > data) ? bac_2_n (pos-1) : NULL;
+        } else {
+            pos = strrchr (data,'>');
+            pos = (pos && pos > data) ? pos-1 : NULL;
+        }
+    }
+
+    if (pos && pos > data) {
+        offset += (pos-data);
+        memset (pos, 0, strlen(pos));
+    } else if (length>=chunk)
+        offset += length;                  /* no boundary found: still advance, never re-read */
+
+    if (length<chunk)
+        offset=-1;
+
+    return offset;
+}
+
+BIGCAST CHUNKgz(gzFile zip, BIGCAST offset,int chunk,char *position,char *extra,int type)
+{   /* Fills `position` with `extra` followed by up to `chunk` bytes read from `zip`; returns the advanced offset. */
+	        memset(position,0,chunk);
+  			  char c, *position2 = position;   /* position2 keeps the start of the buffer */
+  			  char *x;
+  			  int num=0;
+  	      if (offset == 0)
+          while (offset <10*ONE)
+          {
+             c = gzgetc(zip);
+    	       if ((c == '@' && type==2)&&(c == '>' && type==1))   /* NOTE(review): both halves can never hold at once, so this break is unreachable; '||' looks intended - confirm */
+       	          break;
+             offset++;
+          }		
+          if (extra!=NULL)
+          	  {
+          	  memcpy(position,extra,strlen(extra));   /* carry-over text goes in front of the new data */
+          	  position+=strlen(extra);
+          	  }
+          free(extra);   /* releases the buffer the caller passed in */
+          while (((c=gzgetc(zip))!=EOF)&&(num<chunk))   /* NOTE(review): c is a char, so a 0xFF byte may compare equal to EOF where char is signed; the bound ignores the bytes copied from extra */
+          {
+             *position=c;
+             position++;
+             num++;
+          }
+	        x = strrstr(position2,"\n@");   /* last "\n@" in the buffer, whatever `type` is; a NULL result is not handled */
+	        extra = (char*)malloc((position-x+1)*sizeof(char));   /* only the local copy of `extra` changes; the caller never sees this block */
+	        memcpy (x,extra,position-x+1);   /* NOTE(review): copies FROM the fresh, uninitialised block INTO the buffer; the arguments look swapped - confirm */
+	        offset+=(position-x+1);
+
+        return offset;	
+}
+
+char *bac_2_n (char *filename)
+{
+     /* Walks backwards from `filename` to the nearest '\n' (staying put if it already
+      * points at one), then on to the '\n' before that, and returns the character right
+      * after this second newline.  Nothing bounds the walk, so both must exist. */
+     char *cursor = filename;
+     while (*cursor!='\n') cursor--;
+     do { cursor--; } while (*cursor!='\n');
+     return cursor + 1;
+}
diff --git a/drass/big_query.h b/drass/big_query.h
new file mode 100644
index 0000000..7a2ab44
--- /dev/null
+++ b/drass/big_query.h
@@ -0,0 +1,15 @@
+#ifndef _BIGQUERY
+#define _BIGQUERY
+
+#include "check.h"
+#include "bloom.h"
+#include <zlib.h>
+extern char *bac_2_n (char *filename);   /* walks back over two '\n' and returns the character after the second */
+extern char *strrstr(char *s, char *str);   /* last occurrence of str in s, NULL when there is none */
+//extern BIGCAST get_size (char *filename);
+extern void clean_list (Queue* head, Queue *tail);   /* frees every node from head through tail */
+extern BIGCAST CHUNKer(gzFile zip,BIGCAST offset,int chunk,char *data,int type);   /* reads the chunk at offset into data; returns the next offset, -1 after the last chunk */
+extern BIGCAST CHUNKgz(gzFile zip, BIGCAST offset,int chunk,char *position,char *extra,int type);   /* byte-wise reader that prepends `extra`; frees `extra` */
+extern int bq_main (int argc, char **argv);   /* "facs query" entry point: parses the options and calls query() */
+extern int query (char* query, char* bloom_filter, double tole_rate, double sampling_rate, char* list, char* target_path);   /* runs the reads of a FASTA/FASTQ file against one bloom filter; returns 0 */
+#endif
diff --git a/drass/bloom.c b/drass/bloom.c
new file mode 100644
index 0000000..d132efd
--- /dev/null
+++ b/drass/bloom.c
@@ -0,0 +1,639 @@
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <errno.h>
+#include "bloom.h"
+#include "hashes.h"
+#include "file_dir.h"
+
+/*---------------------------*/
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+
+/* Seed table for the hash functions: bloom_hash() passes seed[i] to hash5()
+ * for the i-th hash, so no more than 20 hash functions can be addressed. */
+int seed[20] = {
+  152501029, 152501717, 152503097, 152500171, 152500157,
+  152504837, 10161313, 10371313, 10431313, 10501313,
+  10581313, 10611313, 10641313, 10651313, 10671313,
+  10731313, 10821313, 10881313, 10951313, 11001313
+};
+
+/* Prepares `bloom` for use as an empty filter.
+ *
+ * size       - requested number of bits; replaced by find_close_prime(size)
+ * capacity   - expected number of keys, or 0 to derive it from size/hashes
+ * error_rate - target false-positive rate, or 0 to derive it from hashes
+ * hashes     - number of hash functions (must be >= 1)
+ * hash       - hash function to record in the filter; NULL selects HASH_FUNC
+ * flags      - accepted but not used
+ *
+ * Returns 0 on success, -1 when size or hashes is below 1 or the bit vector
+ * cannot be allocated, and -2 when size exceeds TOPLIMIT. */
+int
+bloom_init (bloom * bloom, BIGNUM size, BIGNUM capacity, double error_rate,
+	    int hashes, hash_t hash, int flags)
+{
+  size_t vector_bytes;
+
+  if (size < 1)
+    {
+      fprintf (stderr, "overflow1\n");
+      return -1;
+    }
+  /* the bit array always gets a prime number of slots, whatever was asked */
+  bloom->stat.elements = find_close_prime (size);
+
+  if (hashes < 1)
+    {
+#ifdef DEBUG
+      fprintf (stderr, "hashes was %d,size %lld\n", hashes, size);
+#endif
+      return -1;
+    }
+  bloom->stat.ideal_hashes = hashes;
+
+  bloom->hash = (hash != NULL) ? hash : (hash_t) HASH_FUNC;
+  bloom->inserts = 0;
+
+  if (capacity != 0 && error_rate != 0)
+    {
+      bloom->stat.capacity = capacity;
+      bloom->stat.e = error_rate;
+    }
+  else
+    {
+      /* capacity or error rate missing: derive both from the size and the
+       * hash count, using k =~ (0.7 m) / n, i.e. n =~ (0.7 m) / k */
+      bloom->stat.capacity = 0.7 * bloom->stat.elements / hashes;
+      bloom->stat.e = powf (2.0, (float) -1 * hashes);
+    }
+
+#ifdef DEBUG
+  fprintf (stderr, "bloom_init(%lld,%d) => (%lld,%d) =>%f\n",
+	   (BIGCAST) size, hashes, (BIGCAST) bloom->stat.elements,
+	   bloom->stat.ideal_hashes, bloom->stat.e);
+#endif
+
+  if (size > TOPLIMIT)
+    {
+      fprintf (stderr, "overflow2\n");
+      return -2;
+    }
+
+  /* m bits are stored in m/8 + 1 zero-initialised bytes */
+  vector_bytes = sizeof (char) * ((long long) (bloom->stat.elements / 8) + 1);
+  bloom->vector = (char *) calloc (vector_bytes, 1);
+  if (bloom->vector == NULL)
+    {
+      perror ("malloc");
+      return -1;
+    }
+
+  return 0;
+}
+
+/* Releases the bit vector owned by `bloom` and clears the pointer, so a
+ * second call on the same filter is harmless.  The bloom struct itself is
+ * not freed; that remains the caller's job.
+ *
+ * The vector is no longer zeroed before being freed: it can be gigabytes
+ * large and nothing can observe its contents afterwards. */
+void
+bloom_destroy (bloom * bloom)
+{
+  if (bloom == NULL)
+    return;
+
+  free (bloom->vector);		/* free(NULL) is a no-op */
+  bloom->vector = NULL;
+}
+
+/* Read-only membership query: returns what bloom_test() reports for `str`
+ * in RO mode (1 when every probed bit is set, 0 otherwise). */
+int
+bloom_check (bloom * bloom, char *str)
+{
+  int found = bloom_test (bloom, str, RO);
+
+  return found;
+}
+
+/* Inserts `str` into the filter.  Returns 1 when all of its bits were set
+ * already and 0 when at least one bit had to be set; only in the latter
+ * case is the insert counter advanced. */
+int
+bloom_add (bloom * bloom, char *str)
+{
+  int already_present = bloom_test (bloom, str, SET);
+
+  if (!already_present)
+    bloom->inserts++;
+
+  return already_present;
+}
+
+/* Probes the filter with every hash function configured for it.
+ *
+ * mode == SET: bits that are still clear are set; the result is 1 only when
+ *              all bits were set beforehand.
+ * otherwise:   nothing is modified and the scan stops with 0 at the first
+ *              clear bit; 1 means every bit was set. */
+int
+bloom_test (bloom * bloom, char *str, int mode)
+{
+  int round;
+  int all_set = 1;
+
+  for (round = 0; round < bloom->stat.ideal_hashes; round++)
+    {
+      BIGNUM bit = bloom_hash (bloom, str, round, bloom->k_mer);
+
+      if (test (bloom->vector, bit))
+	continue;
+
+      /* a pure lookup can stop at the first missing bit */
+      if (mode != SET)
+	return 0;
+
+      set (bloom->vector, bit);
+      all_set = 0;
+    }
+
+  return all_set;
+}
+
+/* Maps `str` to a bit position for the i-th hash function: hash5() over
+ * `length` bytes, seeded with seed[i], reduced modulo the number of bits in
+ * the filter. */
+BIGNUM
+bloom_hash (bloom * bloom, char *str, int i, int length)
+{
+  BIGNUM digest = (BIGNUM) hash5 (str, seed[i], length);
+  BIGNUM slots = (BIGNUM) bloom->stat.elements;
+
+  return digest % slots;
+}
+
+/* Sets bit `index` in the byte vector `big`.  Always returns 0.
+ *
+ * The mask is OR-ed in rather than added, so setting a bit that is already
+ * set leaves the byte unchanged instead of carrying into neighbouring bits. */
+int
+set (char *big, BIGNUM index)
+{
+  deref dr;
+
+  finder (index, &dr);
+  big[dr.index] |= dr.spot;
+
+  return 0;
+}
+
+/* Returns 1 when bit `index` of the byte vector `big` is set, 0 otherwise. */
+int
+test (char *big, BIGNUM index)
+{
+  deref where;
+
+  finder (index, &where);
+
+  return (big[where.index] & where.spot) == where.spot;
+}
+
+/* Splits a bit position into a byte offset (index / 8) and a single-bit
+ * mask (1 << (index % 8)) and stores both in `dr`.  Always returns 0.
+ *
+ * The mask is built with an integer shift.  Going through pow() meant
+ * converting the double 128.0 to `char` for bit 7, which is out of range
+ * (undefined behaviour) wherever char is signed. */
+int
+finder (BIGNUM index, deref * dr)
+{
+  dr->index = (BIGNUM) (index >> 3);
+  dr->spot = (char) (1u << (index & 0x07));
+  return 0;
+}
+
+
+/* Accessor for the capacity recorded in the filter's statistics. */
+BIGNUM
+report_capacity (bloom * bloom)
+{
+  BIGNUM capacity = bloom->stat.capacity;
+
+  return capacity;
+}
+
+/* Sizes used by prefix_make(): the historical minimum buffer size, and the
+ * spare room left so callers can append ".bloom" as save_bloom() does. */
+enum { PREFIX_MAKE_MIN_BUF = 300, PREFIX_MAKE_SUFFIX_ROOM = 8 };
+
+/* Builds the output path for a bloom file in a freshly allocated buffer
+ * that the caller must free.
+ *
+ *   target is a directory : target immediately followed by filename
+ *   target is another path: a copy of target
+ *   target is NULL        : filename from its last '/' (the slash is kept,
+ *                           as before) up to its last '.', or to the end
+ *                           when there is no '.'
+ *
+ * `prefix` is accepted for interface compatibility; it never influenced the
+ * result.  The buffer is zero-filled, at least PREFIX_MAKE_MIN_BUF bytes
+ * long and always has PREFIX_MAKE_SUFFIX_ROOM spare bytes after the text.
+ * Returns NULL when memory cannot be allocated. */
+char* 
+prefix_make (char *filename, char *prefix, char *target)
+{
+    const char *stem;
+    const char *dot;
+    size_t stem_len;
+    size_t need;
+    char *bloom_file;
+
+    (void) prefix;
+
+    if (target != NULL) {
+        size_t target_len = strlen (target);
+        /* only a directory target gets the file name appended */
+        size_t name_len = is_dir (target) ? strlen (filename) : 0;
+
+        need = target_len + name_len + PREFIX_MAKE_SUFFIX_ROOM;
+        if (need < PREFIX_MAKE_MIN_BUF)
+            need = PREFIX_MAKE_MIN_BUF;
+        bloom_file = (char *) calloc (need, sizeof (char));
+        if (bloom_file == NULL)
+            return NULL;
+        memcpy (bloom_file, target, target_len);
+        memcpy (bloom_file + target_len, filename, name_len);
+        return bloom_file;
+    }
+
+    stem = strrchr (filename, '/');
+    if (stem == NULL)
+        stem = filename;
+
+    /* a name without an extension is copied whole */
+    dot = strrchr (stem, '.');
+    stem_len = (dot != NULL) ? (size_t) (dot - stem) : strlen (stem);
+
+    need = stem_len + PREFIX_MAKE_SUFFIX_ROOM;
+    if (need < PREFIX_MAKE_MIN_BUF)
+        need = PREFIX_MAKE_MIN_BUF;
+    bloom_file = (char *) calloc (need, sizeof (char));
+    if (bloom_file == NULL)
+        return NULL;
+    memcpy (bloom_file, stem, stem_len);
+
+    return bloom_file;
+}
+
+/* Writes exactly `len` bytes from `buf` to `fd`, retrying after short
+ * writes and EINTR and never asking for more than TWOG bytes in one call.
+ * Returns 0 on success, -1 on error with errno set. */
+static int
+bloom_write_all (int fd, const char *buf, BIGNUM len)
+{
+  while (len > 0)
+    {
+      size_t want = (len > TWOG) ? (size_t) TWOG : (size_t) len;
+      ssize_t done = write (fd, buf, want);
+
+      if (done < 0)
+	{
+	  if (errno == EINTR)
+	    continue;
+	  return -1;
+	}
+      if (done == 0)
+	{
+	  /* no progress at all: report it instead of spinning forever */
+	  errno = EIO;
+	  return -1;
+	}
+      buf += done;
+      len -= (BIGNUM) done;
+    }
+  return 0;
+}
+
+/* Stores the filter `bl` on disk: the raw bloom struct followed by its bit
+ * vector.  The path comes from prefix_make(filename, prefix, target), with
+ * ".bloom" appended when neither prefix nor target was given or when target
+ * is a directory.
+ *
+ * Returns 0 on success and -1 when the path cannot be built, the file
+ * cannot be opened or resized, or closing it fails.  A failed write
+ * terminates the process, as before.  After a successful save the
+ * in-memory bit vector is zeroed.
+ * NOTE(review): zeroing the vector is kept from the original; confirm that
+ * callers rely on it. */
+int
+save_bloom (char *filename, bloom * bl, char *prefix, char *target)
+{
+  BIGNUM vector_bytes = (BIGNUM) ((long long) (bl->stat.elements / 8) + 1);
+  BIGNUM total_size;
+  char *bloom_file;
+  int fd;
+
+  bloom_file = prefix_make (filename, prefix, target);
+  if (bloom_file == NULL)
+    {
+      perror ("prefix_make");
+      return -1;
+    }
+
+#ifdef DEBUG
+  printf("Bloom file to be written in: %s\n", bloom_file);
+#endif
+
+  /* relies on prefix_make() leaving room for the suffix */
+  if ((prefix == NULL && target == NULL)
+      || (target != NULL && is_dir (target)))
+    strcat (bloom_file, ".bloom");
+
+#ifdef __APPLE__
+  fd = open (bloom_file, O_RDWR | O_CREAT, PERMS);
+#else // assume linux
+  fd = open (bloom_file, O_RDWR | O_CREAT | O_LARGEFILE, PERMS);
+#endif
+
+  if (fd < 0)
+    {
+      perror (bloom_file);
+      free (bloom_file);
+      return -1;
+    }
+
+  /* the file is sized slightly larger than the data written below */
+  total_size = sizeof (bloom) + sizeof (char) * vector_bytes +
+    sizeof (int) * (bl->stat.ideal_hashes + 1);
+
+#ifdef __APPLE__
+  if (ftruncate (fd, total_size) < 0)
+#else
+  if (ftruncate64 (fd, total_size) < 0)
+#endif
+    {
+      printf ("[%d]-ftruncate64 error: %s\n", errno, strerror (errno));
+      close (fd);
+      free (bloom_file);
+      return -1;
+    }
+
+  if (bloom_write_all (fd, (const char *) bl, sizeof (bloom)) < 0
+      || bloom_write_all (fd, bl->vector, vector_bytes) < 0)
+    {
+      perror (" error writing bloom file ");
+      exit (EXIT_FAILURE);
+    }
+
+  if (close (fd) < 0)
+    {
+      perror (bloom_file);
+      free (bloom_file);
+      return -1;
+    }
+  free (bloom_file);
+
+  memset (bl->vector, 0, sizeof (char) * vector_bytes);
+
+#ifdef DEBUG
+  printf ("big file process OK\n");
+#endif
+  return 0;
+}
+
+/* Reads exactly `len` bytes from `fd` into `buf`, retrying after short
+ * reads and EINTR and never asking for more than TWOG bytes in one call.
+ * Returns 0 on success, -1 on error or premature end of file (errno is set
+ * to EIO in the latter case). */
+static int
+bloom_read_all (int fd, char *buf, BIGNUM len)
+{
+  while (len > 0)
+    {
+      size_t want = (len > TWOG) ? (size_t) TWOG : (size_t) len;
+      ssize_t got = read (fd, buf, want);
+
+      if (got < 0)
+	{
+	  if (errno == EINTR)
+	    continue;
+	  return -1;
+	}
+      if (got == 0)
+	{
+	  errno = EIO;		/* file shorter than its header promises */
+	  return -1;
+	}
+      buf += got;
+      len -= (BIGNUM) got;
+    }
+  return 0;
+}
+
+/* Loads a filter written by save_bloom() from `filename` into `bl`: first
+ * the raw bloom struct, then a newly allocated bit vector of
+ * stat.elements / 8 + 1 bytes, which the caller releases with
+ * bloom_destroy().
+ *
+ * Returns a positive value on success (the size of the final chunk of the
+ * vector, as before) and -1 on failure.  On failure bl->vector is NULL if
+ * the header was read, otherwise `bl` is left untouched or partly filled. */
+int
+load_bloom (char *filename, bloom * bl)
+{
+  BIGNUM vector_bytes;
+  int fd;
+
+#ifdef DEBUG
+  printf ("bloom name->%s\n", filename);
+#endif
+
+#ifdef __APPLE__
+  fd = open (filename, O_RDONLY, PERMS);
+#else
+  fd = open64 (filename, O_RDONLY, PERMS);
+#endif
+  if (fd < 0) {
+      perror (filename);
+      return -1;
+  }
+
+  if (bloom_read_all (fd, (char *) bl, sizeof (bloom)) < 0) {
+      perror ("Problem reading bloom filter");
+      close (fd);
+      return -1;
+  }
+
+  /* the pointer stored in the file belongs to the process that wrote it */
+  bl->vector = NULL;
+  vector_bytes = (BIGNUM) ((long long) (bl->stat.elements / 8) + 1);
+
+  bl->vector = (char *) malloc (sizeof (char) * vector_bytes);
+  if (bl->vector == NULL) {
+      perror ("malloc");
+      close (fd);
+      return -1;
+  }
+
+  if (bloom_read_all (fd, bl->vector, vector_bytes) < 0) {
+      perror ("Problem reading bloom filter");
+      free (bl->vector);
+      bl->vector = NULL;
+      close (fd);
+      return -1;
+  }
+
+#ifdef DEBUG
+  printf ("bloom filter read successfully\n");
+#endif
+
+  close (fd);
+  /* same value the last read() used to return: the remainder after whole
+   * TWOG-sized chunks, in the range 1..TWOG */
+  return (int) ((vector_bytes - 1) % TWOG + 1);
+}
+
+/* Writes the string `detail` to `filename`, creating the file if needed
+ * and replacing any previous contents.  Any failure is reported with
+ * perror() and terminates the process.
+ *
+ * The file is truncated on open; without that, overwriting a longer
+ * existing file left the tail of the old contents behind. */
+void
+write_result (char *filename, char *detail)
+{
+  size_t left = strlen (detail);
+  int fd;
+
+  fd = open (filename, O_CREAT | O_RDWR | O_TRUNC, S_IRWXU);
+  if (fd < 0) {
+      perror (filename);
+      exit (EXIT_FAILURE);
+  }
+
+  /* write() may transfer fewer bytes than requested; keep going */
+  while (left > 0) {
+      ssize_t done = write (fd, detail, left);
+
+      if (done < 0 && errno == EINTR)
+          continue;
+      if (done <= 0) {
+          perror (" error writing result file ");
+          exit (EXIT_FAILURE);
+      }
+      detail += done;
+      left -= (size_t) done;
+  }
+
+  if (close (fd) < 0) {
+      perror (" error writing result file ");
+      exit (EXIT_FAILURE);
+  }
+}
+
+/* Returns the complementary base for A/C/G/T in either case; every other
+ * character is returned unchanged. */
+static char
+rev_trans_complement (char base)
+{
+  switch (base)
+    {
+    case 'A': return 'T';
+    case 'C': return 'G';
+    case 'G': return 'C';
+    case 'T': return 'A';
+    case 'a': return 't';
+    case 'c': return 'g';
+    case 'g': return 'c';
+    case 't': return 'a';
+    default:  return base;
+    }
+}
+
+/* Replaces the NUL-terminated sequence `s` in place with its reverse
+ * complement.  Characters other than A/C/G/T (either case) are only moved.
+ *
+ * Done in one pass from both ends; the string length is measured once
+ * instead of once per character. */
+void
+rev_trans (char *s)
+{
+  size_t len = strlen (s);
+  char *lo;
+  char *hi;
+
+  if (len == 0)
+    return;
+
+  lo = s;
+  hi = s + len - 1;
+  while (lo < hi)
+    {
+      char front = rev_trans_complement (*lo);
+
+      *lo++ = rev_trans_complement (*hi);
+      *hi-- = front;
+    }
+
+  /* odd length: the middle base stays in place but is still complemented */
+  if (lo == hi)
+    *lo = rev_trans_complement (*lo);
+}
+
+char *
+mmaping (char *source)
+{
+
+  struct stat statbuf;
+
+  int src;
+  char *sm;
+
+  if ((src = open(source, O_RDONLY | O_LARGEFILE)) < 0)
+    {
+      perror (" open source ");
+      exit (EXIT_FAILURE);
+    }
+
+  if (fstat (src, &statbuf) < 0)
+    {
+      perror (" fstat source ");
+      exit (EXIT_FAILURE);
+    }
+
+  sm =
+    mmap (0, (BIGCAST) statbuf.st_size, PROT_READ, MAP_SHARED | MAP_NORESERVE,
+	  src, 0);
+
+  if (MAP_FAILED == sm)
+    {
+      perror (" mmap source ");
+      exit (EXIT_FAILURE);
+    }
+
+  return sm;
+}
+
+/* Prints the usage text of the bloom-build mode on stdout and exits with
+ * status 1. */
+void
+build_help ()
+{
+  static const char *const usage[] = {
+    "USAGE\n",
+    "##########################################################################\n",
+    "---Bloom build----\n",
+    "#  ./facs -m b [option] [option] [option] [option] <option>\n",
+    "#\n",
+    "#  Options:\n",
+    "#  -m Mode selection: b or build can be taken\n",
+    "#  -r reference file name or directory name\n",
+    "#  -l a list containing multiple reference filenames\n",
+    "!!! either -r or -l can only be allowed each time !!!\n",
+    "#  -k k_mer size (default size 21)\n",
+    "#  -e error rate (default rate 0.0005)\n",
+    "#  -b 1 means show help description; 0 means normal bloom-build\n",
+    "#  -o output file name (default file is saved as the same as binary file)\n",
+    "##########################################################################\n"
+  };
+  size_t row;
+
+  for (row = 0; row < sizeof usage / sizeof usage[0]; row++)
+    fputs (usage[row], stdout);
+
+  exit (1);
+}
+
+/* Prints the usage text of the contamination-check mode on stdout and
+ * exits with status 1. */
+void
+check_help ()
+{
+  static const char *const usage[] = {
+    "USAGE\n",
+    "##########################################################################\n",
+    "---contamination check---\n",
+    "#  ./facs -m c [option] [option] [option] [option] [option] <option>\n",
+    "#\n",
+    "#  Options:\n",
+    "#  -m Mode selection: b or build can be taken\n",
+    "#  -t tolerant rate (default rate 0.8)\n",
+    "#  -s sampling rate (default rate 1)\n",
+    "#  -q query file name\n",
+    "#  -l a list containing all bloom files\n",
+    "#  -r single reference bloom filter file or directory\n",
+    "!!! either -r or -l can only be allowed each time !!!\n",
+    "#  -b 1 means show help description; 0 means normal check\n",
+    "#  -o output file name (default file is saved as the same path as the binary file)\n",
+    "!  Either '-q' or '-l' is used at one run.\n",
+    "##########################################################################\n"
+  };
+  size_t row;
+
+  for (row = 0; row < sizeof usage / sizeof usage[0]; row++)
+    fputs (usage[row], stdout);
+
+  exit (1);
+}
+
+/* Prints the usage text of the contamination-remove mode on stdout and
+ * exits with status 1. */
+void
+remove_help ()
+{
+  /* adjacent literals are joined by the compiler into one usage text */
+  fputs ("USAGE\n"
+	 "##########################################################################\n"
+	 "---contamination remove---\n"
+	 "#  ./facs -m r [option] [option] [option] [option] <option>\n"
+	 "#\n"
+	 "#  Options:\n"
+	 "#  -m Mode selection: b or build can be taken\n"
+	 "#  -t tolerant rate (default rate 0.8)\n"
+	 "#  -q query file name\n"
+	 "#  -l a list containing all bloom files\n"
+	 "#  -r reference bloom filter file or dir\n"
+	 "!!! either -r or -l can only be allowed each time !!!\n"
+	 "#  -b 1 means show help description; 0 means normal decontamination\n"
+	 "#  -o output file name (default file is saved as the same path as the binary file)\n"
+	 "##########################################################################\n",
+	 stdout);
+  exit (1);
+}
+
+/* Prints the usage text of the list-mode contamination removal on stdout
+ * and exits with status 1. */
+void
+remove_l_help ()
+{
+  /* adjacent literals are joined by the compiler into one usage text */
+  fputs ("USAGE\n"
+	 "##########################################################################\n"
+	 "---contamination remove list mode---\n"
+	 "Pretty like mode r but with slight difference\n"
+	 "reads will be classified to the most like reference if\nmultiple reference files exist\n"
+	 "#  ./facs -m l [option] [option] [option] [option] <option>\n"
+	 "#\n"
+	 "#  Options:\n"
+	 "#  -m Mode selection: b or build can be taken\n"
+	 "#  -t tolerant rate (default rate 0.8)\n"
+	 "#  -q query file name\n"
+	 "#  -l a list containing all bloom files\n"
+	 "#  -r reference bloom filter file or dir\n"
+	 "!!! either -r or -l can only be allowed each time !!!\n"
+	 "#  -b 1 means show help description; 0 means normal decontamination\n"
+	 "#  -o output file name (default file is saved as the same path as the binary file)\n"
+	 "##########################################################################\n",
+	 stdout);
+  exit (1);
+}
+
+/* Reads the stream named `fifoname` into a newly allocated, NUL-terminated
+ * buffer that the caller must free.  At most TWOG / 2 bytes are kept;
+ * longer input is truncated with a warning on stderr.  Failure to open the
+ * stream or to allocate the buffer is reported with perror() and
+ * terminates the process.
+ *
+ * Fixes: the character is held in an int so EOF is distinguishable from
+ * data bytes; the buffer is terminated right after the data; writes can no
+ * longer run past the end of the buffer. */
+char *
+large_load (char *fifoname)
+{
+  const size_t capacity = TWOG / 2;
+  size_t used = 0;
+  int ch = EOF;
+  FILE *fd;
+  char *data;
+
+  printf ("fifoname->%s\n", fifoname);
+#ifdef __APPLE__
+  fd = fopen (fifoname, "r");
+#else
+  fd = fopen64 (fifoname, "r");
+#endif
+  if (fd == NULL) {
+      perror (fifoname);
+      exit (EXIT_FAILURE);
+  }
+
+  data = (char *) malloc ((capacity + 1) * sizeof (char));
+  if (data == NULL) {
+      perror ("malloc");
+      fclose (fd);
+      exit (EXIT_FAILURE);
+  }
+
+  while (used < capacity && (ch = fgetc (fd)) != EOF)
+      data[used++] = (char) ch;
+  data[used] = '\0';
+
+  /* buffer full although the stream has not ended */
+  if (used == capacity && ch != EOF && fgetc (fd) != EOF)
+      fprintf (stderr, "large_load: input truncated at %lld bytes\n",
+               (long long int) capacity);
+
+#ifdef DEBUG
+  printf ("data length->%lld\n", (long long int) strlen(data));
+#endif
+
+  fclose (fd);
+  return data;
+}
diff --git a/drass/bloom.h b/drass/bloom.h
new file mode 100644
index 0000000..cdd2c0c
--- /dev/null
+++ b/drass/bloom.h
@@ -0,0 +1,133 @@
+#ifndef _BLOOM
+#define _BLOOM
+
+#include <limits.h>
+
+#define BIGNUM unsigned long long
+#define BIGNUM_STR "unsigned long"
+#define BIGCAST long long
+#define TOPLIMIT LONG_MAX
+#define PERMS 0644
+#define HUN 1000
+#define NEW(type) (type *) malloc(sizeof(type))
+
+#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
+#define TWOG 2000000000
+#define hashsize(n) ((BIGNUM)1<<(n))
+#define hashmask(n) (hashsize(n) - 1)
+
+#if !HAVE_SQRTL
+#define sqrtl(val) ((long double)sqrt((double)val))
+#endif
+
+#define SALTRAND 99999999
+#define CONS 1234567
+
+#define HASH_FUNC hash5
+
+typedef struct /* position of one bit in the byte vector, filled in by finder() */
+{
+	BIGNUM index; /* byte offset into the vector (bit position >> 3) */
+        char spot; /* mask selecting the bit inside that byte */
+
+} deref;
+
+typedef BIGNUM (*hash_t)(char *str);
+
+typedef struct 
+{
+	int *num;
+	int cnt;
+} randoms;
+
+typedef struct bloomstat
+{
+	BIGNUM elements; /* size of array */
+	int ideal_hashes; /* num hash functions */
+	BIGNUM capacity; /* number of elements */
+	double e; /* max error rate */
+} bl_stat;
+
+typedef struct /* an in-memory bloom filter; written to disk verbatim by save_bloom() */
+{
+	char *vector; /* bit array; bloom_init() allocates stat.elements / 8 + 1 bytes */
+	hash_t hash; /* recorded by bloom_init(); NOTE(review): bloom_hash() calls hash5 directly and does not appear to use it */
+ 	BIGNUM inserts; /* advanced by bloom_add() whenever a key sets at least one new bit */
+  struct bloomstat stat; /* sizing parameters: bit count, hash count, capacity, error rate */
+  int k_mer; /* key length in bytes handed to the hash function by bloom_test() */
+  int dx; /* set from dx_add(k_mer) by the build code; meaning not visible here */
+  float mcf; /* NOTE(review): not referenced in this part of the source */
+} bloom;
+
+typedef struct info
+{
+     char *location;
+     short *score;
+     short *number;                 			
+     struct info *next;          
+} Queue;
+
+typedef struct file_list /* singly linked list of files, built by make_list() */
+{
+	char *filename; /* path of this entry */
+        short number; /* running index assigned by make_list() */
+        BIGCAST reads_num; /* zeroed by make_list() on the first node; presumably a read counter - confirm against the check/remove code */
+        BIGCAST reads_contam; /* zeroed by make_list() on the first node; presumably counts contaminated reads - confirm */
+        BIGCAST hits; /* NOTE(review): not set anywhere in this part of the source */
+        BIGCAST all_k; /* NOTE(review): not set anywhere in this part of the source */
+	struct file_list *next; /* following entry */
+} F_set;
+/* these are modes to test_all() */
+#define RO 0
+#define SET 1
+#define BVERBOSE 2
+
+/* errors */
+
+#define ERR_MALLOC 1
+#define ERR_OVERFLOW 2
+#define ERR_UNKNOWN 3
+
+BIGNUM mkprime(BIGNUM startval);
+
+
+extern int bloom_init(bloom *bloom, BIGNUM size, BIGNUM capacity,
+                      double error_rate, int hashes, hash_t hash, int flags);
+
+extern int bloom_check(bloom *bloom,char *str);
+extern int bloom_add(bloom *bloom,char *str);
+extern int bloom_test(bloom *bloom,char *str,int MODE);
+extern void bloom_destroy(bloom *bloom);
+
+extern int sketchy_randoms(randoms *rands,int cnt);
+extern int finder (BIGNUM index,deref *dr);
+extern int set(char *big,BIGNUM index);
+extern int test (char *big, BIGNUM index);
+extern BIGNUM bloom_hash(bloom *bloom,char *str, int i, int length);
+extern int bloom_hash_old(bloom *bloom,char *str, int i);
+
+extern BIGNUM find_close_prime(BIGNUM m);
+extern int get_suggestion(struct bloomstat *stats, BIGNUM n,double e);
+extern BIGCAST get_size (char *filename);
+extern int kmer_suggestion (BIGCAST size);
+extern float mco_suggestion (int k_mer);
+extern int is_prime(BIGNUM m);
+extern void get_rec (struct bloomstat *stat);
+extern BIGNUM report_capacity(bloom *bloom);
+
+extern void write_result (char *filename, char *detail);
+extern void build_help(void);
+extern void check_help(void);
+extern void remove_help(void);
+extern void remove_l_help(void);
+extern int save_bloom (char *filename, bloom *bl, char *prefix, char *target);
+extern int load_bloom (char *filename, bloom *bl);
+extern void rev_trans (char *s);
+//extern void cat_print(char *merge, char *remove);
+
+//extern void kmer_suggestion (BIGCAST size);
+//extern float mcf_suggestion (int k_mer);
+extern char *large_load (char *fifoname);
+extern char *mmaping (char *source);
+extern char *prefix_make (char *filename, char *prefix, char *target);
+#endif
diff --git a/drass/build.h b/drass/build.h
new file mode 100644
index 0000000..a744e59
--- /dev/null
+++ b/drass/build.h
@@ -0,0 +1,12 @@
+#ifndef _BUILD
+#define _BUILD
+
+#include "bloom.h"
+extern void ref_add (bloom *bl, char *position);
+extern void fastq_add (bloom *bl, char *position);
+extern void fasta_add (bloom *bl, char *position);
+extern char *fasta_data (bloom *bl_2, char *data);
+extern void init_bloom (bloom *bl, BIGNUM capacity, float error_rate, int k_mer, char *filename);
+extern int build(char *ref_name, char *target_path, int k_mer, double error_rate, char *prefix);
+extern int build_main(int argc, char **argv);
+#endif
diff --git a/drass/check.h b/drass/check.h
new file mode 100644
index 0000000..881afbd
--- /dev/null
+++ b/drass/check.h
@@ -0,0 +1,14 @@
+#ifndef _QUERY
+#define _QUERY
+
+#include "bloom.h"
+extern int check_main (int argc, char **argv);
+extern void evaluate (char *detail, char *filename, F_set *File_head);
+extern void statistic_save (char *detail, char *filename, char* prefix);
+void fasta_process (bloom * bl, Queue * info, Queue * tail, F_set *File_head, float sampling_rate, float tole_rate);
+void fastq_process (bloom * bl, Queue * info, Queue * tail, F_set *File_head, float sampling_rate, float tole_rate);
+int check_all(char *source, char *ref, float tole_rate, float sampling_rate, char *list, char *prefix);
+//extern int check(char *query, char *reference, char l, char *target_path, float sampling_rate, float tole_rate);
+//extern int checky (char *query, char *reference, char l, char *target_path, float sampling_rate, float tole_rate);
+
+#endif
diff --git a/drass/facs.c b/drass/facs.c
new file mode 100644
index 0000000..4caee88
--- /dev/null
+++ b/drass/facs.c
@@ -0,0 +1,79 @@
+#include <Python.h>
+#include "bloom.h"
+#include "file_dir.h"
+#include "tool.h"
+#include "build.h"
+#include "check.h"
+#include "big_query.h"
+#include "remove.h"
+
+static char module_docstring[] =
+    "This module provides an interface for building and querying FACS bloom filters";
+static char bloom_docstring[] =
+    "Builds a FACS bloom filter and performs queries against it.";
+
+/* Available functions */
+static PyObject *facs_bloom_build(PyObject *self, PyObject *args);
+static PyObject *facs_bloom_query(PyObject *self, PyObject *args);
+static PyObject *facs_bloom_remove(PyObject *self, PyObject *args);
+
+/* Method table of the `facs` module.
+ *
+ * All three handlers take (self, args) and parse positional arguments
+ * only, so they are registered as METH_VARARGS.  Advertising METH_KEYWORDS
+ * as well made the interpreter call them with a third (kwargs) argument
+ * that their signature does not have. */
+static PyMethodDef module_methods[] = {
+        {"build", facs_bloom_build, METH_VARARGS, bloom_docstring},
+        {"query", facs_bloom_query, METH_VARARGS, bloom_docstring},
+        {"remove", facs_bloom_remove, METH_VARARGS, bloom_docstring},
+        {NULL, NULL, 0, NULL}
+};
+
+/* Module entry point: registers the `facs` module with its method table
+ * and docstring.  The returned module object is not needed afterwards, and
+ * there is nothing further to do when registration fails. */
+PyMODINIT_FUNC initfacs(void)
+{
+    (void) Py_InitModule3("facs", module_methods, module_docstring);
+}
+
+
+/* Python binding for query(): facs.query(query_file, bloom_file
+ * [, tole_rate [, sampling_rate]]).  tole_rate defaults to 0 and
+ * sampling_rate to 1; list and target path are always passed as NULL.
+ * Returns query()'s result as a Python int, or NULL when the arguments
+ * cannot be parsed. */
+static PyObject *facs_bloom_query(PyObject *self, PyObject *args)
+{
+   char *qry;
+   char *bloom;
+   double tole_rate = 0;
+   double sampling_rate = 1;
+
+   if (PyArg_ParseTuple(args, "ss|dd", &qry, &bloom, &tole_rate, &sampling_rate))
+       return Py_BuildValue("i", query(qry, bloom, tole_rate, sampling_rate, NULL, NULL));
+
+   return NULL;
+}
+
+/* Python binding for build(): facs.build(source, bloom_file
+ * [, k_mer [, error_rate [, prefix]]]).  k_mer defaults to 0 (build() then
+ * picks a size itself) and error_rate to 0.0005.  Returns build()'s result
+ * as a Python int, or NULL when the arguments cannot be parsed.
+ *
+ * `prefix` is optional, so it must start out as NULL: PyArg_ParseTuple
+ * leaves omitted optional arguments untouched, and an indeterminate
+ * pointer was handed to build() before. */
+static PyObject *facs_bloom_build(PyObject *self, PyObject *args)
+{
+   char *source = NULL;
+   char *bloom_filter = NULL;
+   char *prefix = NULL;
+   int ret;
+
+   //FACS operational defaults
+   int k_mer=0;
+   double error_rate=0.0005;
+
+   if (!PyArg_ParseTuple(args, "ss|ids", &source, &bloom_filter, &k_mer, &error_rate, &prefix))
+       return NULL;
+   
+   ret = build(source, bloom_filter, k_mer, error_rate, prefix);
+
+   return Py_BuildValue("i", ret);
+}
+
+
+/* Python binding for remove_reads(): facs.remove(source, reference
+ * [, list [, prefix [, tole_rate]]]).  tole_rate defaults to 0.  Returns
+ * remove_reads()'s result as a Python int, or NULL when the arguments
+ * cannot be parsed.
+ * NOTE(review): `list` and `prefix` are parsed but NULL is passed in their
+ * place, exactly as before; confirm whether they should be forwarded. */
+static PyObject *facs_bloom_remove(PyObject *self, PyObject *args)
+{
+   char *src;
+   char *ref;
+   char *list = NULL;
+   char *prefix = NULL;
+   double tole_rate = 0;
+
+   if (PyArg_ParseTuple(args, "ss|ssd", &src, &ref, &list, &prefix, &tole_rate))
+       return Py_BuildValue("i", remove_reads(src, ref, NULL, NULL, tole_rate));
+
+   return NULL;
+}
diff --git a/drass/file_dir.c b/drass/file_dir.c
new file mode 100644
index 0000000..e211b8e
--- /dev/null
+++ b/drass/file_dir.c
@@ -0,0 +1,164 @@
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <limits.h>
+#include "file_dir.h"
+#include "bloom.h"
+
+
+/* Joins directory `path` and `file_name` into `file_path`, inserting a '/'
+ * between them unless `path` already ends with one.  An empty `path`
+ * yields just `file_name`; it used to make the function look at the byte
+ * in front of the output buffer.
+ *
+ * The caller must provide room for strlen(path) + strlen(file_name) + 2
+ * bytes in `file_path`. */
+void
+get_file_path (const char *path, const char *file_name, char *file_path)
+{
+  size_t dir_len = strlen (path);
+
+  memcpy (file_path, path, dir_len);
+  if (dir_len > 0 && path[dir_len - 1] != '/')
+    file_path[dir_len++] = '/';
+  strcpy (file_path + dir_len, file_name);
+}
+
+/* Returns 1 when `path` names a directory and 0 otherwise, including when
+ * `path` is NULL or cannot be examined.  lstat() is used, so a symbolic
+ * link to a directory counts as "not a directory".
+ *
+ * The NULL check matters: callers in this project pass an absent target
+ * path straight through. */
+int
+is_dir (const char *path)
+{
+  struct stat statbuf;
+
+  if (path == NULL)
+    return 0;
+  if (lstat (path, &statbuf) != 0)
+    return 0;
+
+  return S_ISDIR (statbuf.st_mode) != 0;
+}
+
+/* Returns 1 when `path` names a regular file and 0 otherwise, including
+ * when it cannot be examined.  lstat() is used, so symbolic links are not
+ * followed. */
+int
+is_file (const char *path)
+{
+  struct stat info;
+
+  if (lstat (path, &info) != 0)
+    return 0;
+
+  return S_ISREG (info.st_mode) != 0;
+}
+
+/* Returns 1 for the directory entries "." and "..", 0 for any other name. */
+int
+is_special_dir (const char *path)
+{
+  if (path[0] != '.')
+    return 0;
+  if (path[1] == '\0')
+    return 1;			/* "." */
+
+  return path[1] == '.' && path[2] == '\0';	/* ".." */
+}
+
+/* Allocates a zero-filled list node for `filename`, links it behind `tail`
+ * and returns it as the new tail.  Running out of memory is fatal. */
+static F_set *
+make_list_append (F_set * tail, char *filename, short number)
+{
+  F_set *node = (F_set *) calloc (1, sizeof (F_set));
+
+  if (node == NULL)
+    {
+      perror ("calloc");
+      exit (EXIT_FAILURE);
+    }
+  node->filename = filename;
+  node->number = number;
+  tail->next = node;
+  return node;
+}
+
+/* Builds the list of files to process, from one of three sources:
+ *
+ *   list_user != NULL      : a text file with one file name per line
+ *                            (empty lines are skipped)
+ *   file_user is a file    : that single file (the string is not copied)
+ *   file_user is a directory: every entry whose name contains ".bloom"
+ *
+ * Entries are numbered from 0 in list order, all counters start at 0 and
+ * the list is NULL-terminated.  Returns the first node, or NULL when
+ * nothing was found.  A directory that cannot be opened, or running out of
+ * memory, terminates the process.
+ *
+ * Fixes over the previous version: the list loop tested the pointer
+ * instead of the character it points to; nodes were linked through an
+ * uninitialised `next`; the address of the counter was stored in `number`;
+ * names were copied into fixed 300-byte buffers; skipped directory entries
+ * leaked their buffer; the directory handle was never closed. */
+F_set *
+make_list (char *file_user, char *list_user)
+{
+  F_set anchor;			/* stack dummy in front of the first node */
+  F_set *tail = &anchor;
+  int number = 0;
+
+  anchor.next = NULL;
+
+  if (list_user)
+    {
+      char *cursor = mmaping (list_user);
+
+      while (*cursor != '\0')
+	{
+	  char *eol = strchr (cursor, '\n');
+	  size_t len = (eol != NULL) ? (size_t) (eol - cursor) : strlen (cursor);
+
+	  if (len > 0)
+	    {
+	      char *name = (char *) malloc (len + 1);
+
+	      if (name == NULL)
+		{
+		  perror ("malloc");
+		  exit (EXIT_FAILURE);
+		}
+	      memcpy (name, cursor, len);
+	      name[len] = '\0';
+	      tail = make_list_append (tail, name, (short) number);
+	      number++;
+	    }
+
+	  if (eol == NULL)
+	    break;		/* last line had no trailing newline */
+	  cursor = eol + 1;
+	}
+    }
+  else if (is_file (file_user))
+    {
+      tail = make_list_append (tail, file_user, (short) number);
+    }
+  else if (is_dir (file_user))
+    {
+      DIR *dir = opendir (file_user);
+      struct dirent *dir_info;
+
+      if (dir == NULL)
+	{
+	  perror ("Empty dir\n");
+	  exit (-1);
+	}
+
+      while ((dir_info = readdir (dir)) != NULL)
+	{
+	  char *file_path;
+
+	  if (is_special_dir (dir_info->d_name))
+	    continue;
+	  if (!strstr (dir_info->d_name, ".bloom"))
+	    continue;
+
+	  /* directory + '/' + entry name + terminating NUL */
+	  file_path = (char *) calloc (strlen (file_user)
+				       + strlen (dir_info->d_name) + 2, 1);
+	  if (file_path == NULL)
+	    {
+	      perror ("calloc");
+	      exit (EXIT_FAILURE);
+	    }
+	  get_file_path (file_user, dir_info->d_name, file_path);
+
+	  printf ("file_path->%s\n", file_path);
+	  tail = make_list_append (tail, file_path, (short) number);
+	  printf ("fset->%d\n", tail->number);
+	  number++;
+	}
+      closedir (dir);
+    }
+
+  return anchor.next;
+}
+
+/*
+void main()
+{
+//char *list = "test_l";
+char *list;
+char *single_file = "single_file";
+char *dir = "/home/apple/xt/dir";
+
+F_set *head;
+
+head = make_list(single_file,list);
+
+head = head->next;
+
+while (head)
+      {
+      printf("position->%s\n",head->filename);
+      //printf("next->%d\n",head->next);
+      head = head->next;
+      }
+
+return;
+}
+*/
diff --git a/drass/file_dir.h b/drass/file_dir.h
new file mode 100644
index 0000000..7fc663c
--- /dev/null
+++ b/drass/file_dir.h
@@ -0,0 +1,11 @@
+#ifndef _FILE_DIR
+#define _FILE_DIR
+
+#include "bloom.h"
+extern int is_dir(const char *path);
+extern int is_file(const char *path);
+extern int is_special_dir(const char *path);
+extern F_set *make_list(char *file_user, char *list_user);
+extern void get_file_path(const char *path, const char *file_name,  char *file_path);
+
+#endif
diff --git a/drass/good_build.c b/drass/good_build.c
new file mode 100644
index 0000000..ce67403
--- /dev/null
+++ b/drass/good_build.c
@@ -0,0 +1,265 @@
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*-------------------------------------*/
+//for file mapping in Linux
+#include<fcntl.h>
+#include<unistd.h>
+#include<sys/stat.h>
+#include<sys/time.h>
+#include<sys/mman.h>
+#include<sys/types.h>
+/*-------------------------------------*/
+#include "build.h"
+#include "bloom.h"
+#include "file_dir.h"
+#include "tool.h"
+
+/* Prints the short usage text of the build sub-command on stderr.
+ * Always returns 1 so callers can write `return build_usage();`. */
+static int
+build_usage(void)
+{
+    fputs("\nUsage: ./facs build [options]\n"
+          "Options:\n"
+          "\t-r path/to/something.fasta\n"
+          "\t-o have to use it, but does not write file! :_/ XXX\n",
+          stderr);
+    return 1;
+}
+
+/* Command-line entry point of the build sub-command.
+ *
+ *   -r FILE   reference to build a filter from
+ *   -l FILE   list of references, one filter per entry
+ *   -o PATH   output file or directory
+ *   -k N      k-mer size (0 / absent: chosen from the file size)
+ *   -e RATE   error rate (default 0.0005)
+ *   -h        usage
+ *
+ * Returns build_usage()'s value (1) for -h, an unknown option or missing
+ * input; the result of build() in single-file mode; 0 in list mode.
+ *
+ * Fixes: the option string declared -e and -l without an argument, so both
+ * were silently ignored; a NULL list was printed through %s; the unused
+ * queue nodes and the placeholder list node were leaked; a missing -r was
+ * passed on as a NULL file name. */
+int
+build_main (int argc, char **argv)
+{
+  if (argc < 2) return build_usage();
+
+/*-------defaults for bloom filter building-------*/ 
+  int opt;
+  int k_mer = 0;
+  float error_rate = 0.0005;
+
+  char* list = NULL;
+  char* prefix = NULL;
+  char* target_path = NULL;
+  char* source = NULL;
+
+  printf ("1st command->%s\n",argv[0]);
+  while ((opt = getopt (argc, argv, "e:k:o:r:l:h")) != -1) {
+      switch (opt) {
+          case 'e':
+              error_rate = atof (optarg);
+              break;
+          case 'k':
+              k_mer = atoi (optarg);
+              printf ("k_mer->%d\n",k_mer);
+              break;
+          case 'o':
+              target_path = optarg;
+              break;
+          case 'r':
+              source = optarg;
+              break;
+          case 'l':
+              list = optarg;
+              break;
+          case 'h':
+              return build_usage();
+          case '?':
+              printf ("Unknown option: -%c\n", (char) optopt);
+              return build_usage();
+          default:
+              return build_usage();
+      } 
+  } 
+ 
+  printf("LIST IS %s\n", list ? list : "(null)");
+ 
+  if (!list) {
+      /* single-reference mode needs a reference */
+      if (!source)
+          return build_usage();
+#ifdef DEBUG
+      printf("[bloom build]: source is %s\n", source);
+      printf("[bloom build]: target is %s\n", target_path);
+#endif
+      return build(source, target_path, k_mer, error_rate, NULL);
+  }
+
+  bloom *bl_2 = NEW (bloom);
+  if (bl_2 == NULL) {
+      perror ("malloc");
+      return 1;
+  }
+
+  F_set *File_head = make_list (source, list);
+
+  while (File_head) {
+#ifdef DEBUG
+      printf ("Path->%s\n", File_head->filename);
+#endif
+      //map query- into memory--------------
+      char *position = mmaping (File_head->filename);
+      size_t mapped = strlen (position);
+      /* FASTA: every byte counts; otherwise (FASTQ) about half is sequence */
+      BIGNUM capacity = (*position == '>') ? mapped : mapped / 2;
+
+      init_bloom (bl_2, capacity, error_rate, k_mer, File_head->filename);
+      ref_add (bl_2, position);
+      save_bloom (File_head->filename, bl_2, prefix,target_path);
+      bloom_destroy (bl_2);
+
+      munmap (position, mapped);
+      File_head = File_head->next;
+  }
+
+  free (bl_2);
+  return 0;
+}
+
+/* Sizes and initialises `bl` for `capacity` keys at `error_rate`:
+ * get_suggestion() fills in the statistics, bloom_init() allocates the
+ * vector, then the k-mer size is taken from `k_mer` or, when that is 0,
+ * from kmer_suggestion() on the size of `filename`.  Finally dx is derived
+ * from the chosen k-mer size. */
+void
+init_bloom (bloom * bl, BIGNUM capacity, float error_rate, int k_mer, char* filename)
+{
+  const int flags = 3;
+
+  get_suggestion (&bl->stat, capacity, error_rate);
+
+#ifdef DEBUG
+  printf ("Capacity: %lld\n", bl->stat.capacity);
+  printf ("Vector size: %lld\n", bl->stat.elements);
+  printf ("Ideal hashes: %d\n", bl->stat.ideal_hashes);
+  printf ("Error rate: %f\n", bl->stat.e);
+  printf ("Real size: %lld\n", bl->stat.elements / 8);
+#endif
+
+  bloom_init (bl, bl->stat.elements, bl->stat.capacity, bl->stat.e,
+	      bl->stat.ideal_hashes, NULL, flags);
+
+  printf ("k_mer->%d\n",k_mer);
+  bl->k_mer = (k_mer == 0) ? kmer_suggestion (get_size (filename)) : k_mer;
+  bl->dx = dx_add (bl->k_mer);
+}
+
+/* Builds a bloom filter from the reference file `ref_name` and saves it
+ * via save_bloom() using `target_path`.
+ *
+ * k_mer      - k-mer size, or 0 to let kmer_suggestion() choose from the
+ *              file size
+ * error_rate - target false-positive rate
+ * prefix     - accepted for interface compatibility; not used
+ *
+ * Returns 0 on success and -1 when the filter cannot be allocated,
+ * initialised or saved (previously every outcome was reported as 0).  The
+ * filter, its vector and the file mapping are released before returning. */
+int
+build(char *ref_name, char *target_path, int k_mer, double error_rate, char *prefix)
+{
+  char *position = mmaping (ref_name);
+  size_t ref_len = strlen (position);
+  int rc = -1;
+  bloom *bl = NEW (bloom);
+
+  (void) prefix;
+
+  if (bl == NULL)
+    {
+      perror ("malloc");
+      munmap (position, ref_len);
+      return -1;
+    }
+
+  if (k_mer!=0)
+      bl->k_mer = k_mer;
+  else
+      bl->k_mer = kmer_suggestion(get_size(ref_name));
+  printf ("k_mer->%d\n",bl->k_mer);
+  bl->stat.e = error_rate;
+  bl->dx = dx_add (bl->k_mer);
+  bl->stat.capacity = ref_len;
+  get_rec (&bl->stat);
+
+  /* only touch the vector when it really was allocated */
+  if (bloom_init (bl, bl->stat.elements, bl->stat.capacity, bl->stat.e,
+		  bl->stat.ideal_hashes, NULL, 3) == 0)
+    {
+      ref_add (bl, position);
+      if (save_bloom (ref_name, bl, NULL, target_path) >= 0)
+	rc = 0;
+      bloom_destroy (bl);
+    }
+
+  free (bl);
+  munmap (position, ref_len);
+
+  return rc;
+}
+
+/*-------------------------------------*/
+/* Skips a FASTA header line: returns the first character after the next
+ * newline in `full`.  When the header is the last line and has no newline,
+ * the terminating NUL is returned, so the caller's loop ends instead of
+ * receiving an invalid pointer. */
+char *
+fasta_title (char *full)
+{
+  char *eol = strchr (full, '\n');
+
+  if (eol == NULL)
+    return full + strlen (full);
+
+  return eol + 1;
+}
+
+/*-------------------------------------*/
+/* Feeds a NUL-terminated FASTA text into the filter: header lines
+ * (starting with '>') are skipped via fasta_title(), everything else is
+ * handed to fasta_data(), which returns where to continue. */
+void
+fasta_add (bloom * bl, char *position)
+{
+  char *cursor = position;
+
+  while (*cursor != '\0')
+    cursor = (*cursor == '>') ? fasta_title (cursor)
+                              : fasta_data (bl, cursor);
+}
+
+/*-------------------------------------*/
+/* Adds every k-mer of every read in the NUL-terminated FASTQ text at
+ * `position` to the filter.  Records are taken as four lines each: header,
+ * sequence, separator, quality.  Parsing stops at the first record that is
+ * not complete.
+ *
+ * Each line is now located by its own newline.  Reads shorter than k
+ * contribute nothing (before, the window ran on into the following lines),
+ * a missing newline no longer leads to arithmetic on a NULL pointer, and a
+ * trailing '\r' is not treated as part of the sequence. */
+void
+fastq_add (bloom * bl, char *position)
+{
+  size_t k;
+  char *key;
+
+  if (bl->k_mer < 1)
+    return;
+  k = (size_t) bl->k_mer;
+
+  key = (char *) malloc (k * sizeof (char) + 1);
+  if (key == NULL)
+    {
+      perror ("malloc");
+      exit (EXIT_FAILURE);
+    }
+  key[k] = '\0';
+
+  while (position[0] != '\0')
+    {
+      char *seq, *seq_end, *eol;
+      size_t len, start;
+
+      /* line 1: header */
+      eol = strchr (position, '\n');
+      if (!eol) break;
+      seq = eol + 1;
+
+      /* line 2: sequence */
+      seq_end = strchr (seq, '\n');
+      if (!seq_end) break;
+      len = (size_t) (seq_end - seq);
+      if (len > 0 && seq[len - 1] == '\r')
+        len--;
+
+      for (start = 0; start + k <= len; start++) {
+          memcpy (key, seq + start, sizeof (char) * k);
+          bloom_add (bl, key);
+      }
+
+      /* line 3: separator */
+      eol = strchr (seq_end + 1, '\n');
+      if (!eol) break;
+
+      /* line 4: quality */
+      eol = strchr (eol + 1, '\n');
+      if (!eol) break;
+
+      position = eol + 1;
+    }
+  free (key);
+}
+
+/*-------------------------------------*/
+/* Adds the k-mers of one FASTA sequence body to the filter.  `data` points
+ * at sequence text; line breaks inside it are ignored, so k-mers span
+ * lines.  Returns a pointer to the next '>' or to the terminating NUL,
+ * whichever ends the sequence.
+ *
+ * Only complete k-mers are inserted.  Previously the shorter tail pieces
+ * were inserted as well, and because the hash always reads k_mer bytes
+ * those insertions read uninitialised bytes of the key buffer.  Windows
+ * that would start on a line break are skipped; they only repeated the
+ * k-mer starting at the next base. */
+char *
+fasta_data (bloom * bl_2, char *data)
+{
+  size_t k;
+  char *key;
+  char *p = data;
+
+  if (bl_2->k_mer < 1)
+    {
+      /* nothing can be inserted; just find the end of this sequence */
+      while (*p != '>' && *p != '\0')
+        p++;
+      return p;
+    }
+  k = (size_t) bl_2->k_mer;
+
+  key = (char *) malloc (k * sizeof (char) + 1);
+  if (key == NULL)
+    {
+      perror ("malloc");
+      exit (EXIT_FAILURE);
+    }
+  key[k] = '\0';
+
+  while (*p != '>' && *p != '\0') {
+      size_t n = 0, m = 0;
+
+      if (*p == '\r' || *p == '\n') {
+          p++;
+          continue;
+      }
+
+      /* gather k bases starting at p, stepping over line breaks */
+      while (n < k && p[m] != '>' && p[m] != '\0') {
+          if (p[m] != '\r' && p[m] != '\n')
+            key[n++] = p[m];
+          m++;
+      }
+
+      if (n < k) {
+          /* fewer than k bases remain, so no later window can be complete;
+           * p[m] is the terminator that stopped the scan */
+          p += m;
+          break;
+      }
+
+      bloom_add (bl_2, key);
+      p++;
+  }
+
+  free (key);
+  return p;
+}
+/*-------------------------------------*/
+/* Dispatches on the first character of the reference text: '>' means
+ * FASTA, '@' means FASTQ.  Anything else is reported and terminates the
+ * process with status -1. */
+void ref_add (bloom * bl, char *position)
+{
+    switch (*position)
+    {
+    case '>':
+        fasta_add (bl, position);
+        return;
+    case '@':
+        fastq_add (bl, position);
+        return;
+    default:
+        perror ("wrong format\n");
+        exit (-1);
+    }
+}
diff --git a/drass/hashes.h b/drass/hashes.h
new file mode 100644
index 0000000..8e56ea4
--- /dev/null
+++ b/drass/hashes.h
@@ -0,0 +1,10 @@
+#ifndef _HASHES
+#define _HASHES
+/* Hash function prototypes.  BIGNUM is not declared here, so the includer must declare it first. */
+extern BIGNUM hash1 (char *str, int seed);
+extern BIGNUM hash2 (char *str, int seed);	/* NOTE(review): lookup8.c defines a hash2 taking (ub8 *, ub8, ub8) -- confirm which one is linked */
+extern BIGNUM hash3 (char *str, int seed);	/* NOTE(review): lookup8.c defines a hash3 taking (ub1 *, ub8, ub8) -- confirm which one is linked */
+extern BIGNUM hash4 (char *str, int seed);
+extern BIGNUM hash5 (char *str, int seed, int length);	/* NOTE(review): lookup8.c defines hash5 with a const char * key */
+/* NOTE(review): definitions of hash1 and hash4 are not visible from this part of the patch. */
+#endif
diff --git a/drass/lookup8.c b/drass/lookup8.c
new file mode 100644
index 0000000..cd158cd
--- /dev/null
+++ b/drass/lookup8.c
@@ -0,0 +1,570 @@
+/*
+--------------------------------------------------------------------
+lookup8.c, by Bob Jenkins, January 4 1997, Public Domain.
+hash(), hash2(), hash3, and mix() are externally useful functions.
+Routines to test the hash are included if SELF_TEST is defined.
+You can use this free for any purpose.  It has no warranty.
+
+2009: This is obsolete.  I recently timed lookup3.c as being faster 
+at producing 64-bit results.
+--------------------------------------------------------------------
+*/
+#define SELF_TEST
+
+#include "bloom.h"
+#include "lookup8.h"
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+typedef unsigned long long ub8;	/* unsigned 8-byte quantities */
+typedef unsigned long int ub4;	/* nominally unsigned 4-byte quantities; NOTE(review): unsigned long is 8 bytes on LP64 targets */
+typedef unsigned char ub1;	/* one byte of key material */
+typedef long unsigned int lui;	/* cast target for the %lx / %ld printf arguments in the drivers */
+
+//#define hashsize(n) ((ub8)1<<(n))
+//#define hashmask(n) (hashsize(n)-1)
+
+/*
+--------------------------------------------------------------------
+mix -- mix 3 64-bit values reversibly.
+mix() takes 48 machine instructions, but only 24 cycles on a superscalar
+  machine (like Intel's new MMX architecture).  It requires 4 64-bit
+  registers for 4::2 parallelism.
+All 1-bit deltas, all 2-bit deltas, all deltas composed of top bits of
+  (a,b,c), and all deltas of bottom bits were tested.  All deltas were
+  tested both on random keys and on keys that were nearly all zero.
+  These deltas all cause every bit of c to change between 1/3 and 2/3
+  of the time (well, only 113/400 to 287/400 of the time for some
+  2-bit delta).  These deltas all cause at least 80 bits to change
+  among (a,b,c) when the mix is run either forward or backward (yes it
+  is reversible).
+This implies that a hash using mix64 has no funnels.  There may be
+  characteristics with 3-bit deltas or bigger, I didn't test for those.
+Usage: a, b and c are modified in place and expanded many times without
+  parentheses, so pass plain 64-bit unsigned variables, not expressions.
+*/
+#define mix64(a,b,c) \
+{ \
+  a -= b; a -= c; a ^= (c>>43); \
+  b -= c; b -= a; b ^= (a<<9); \
+  c -= a; c -= b; c ^= (b>>8); \
+  a -= b; a -= c; a ^= (c>>38); \
+  b -= c; b -= a; b ^= (a<<23); \
+  c -= a; c -= b; c ^= (b>>5); \
+  a -= b; a -= c; a ^= (c>>35); \
+  b -= c; b -= a; b ^= (a<<49); \
+  c -= a; c -= b; c ^= (b>>11); \
+  a -= b; a -= c; a ^= (c>>12); \
+  b -= c; b -= a; b ^= (a<<18); \
+  c -= a; c -= b; c ^= (b>>22); \
+}
+
+/*
+--------------------------------------------------------------------
+hash() -- hash a variable-length key into a 64-bit value
+  k     : the key (the unaligned variable-length array of bytes)
+  len   : the length of the key, counting by bytes
+  level : can be any 8-byte value
+Returns a 64-bit value.  Every bit of the key affects every bit of
+the return value.  No funnels.  Every 1-bit and 2-bit delta achieves
+avalanche.  About 41+5len instructions.
+
+The best hash table sizes are powers of 2.  There is no need to do
+mod a prime (mod is sooo slow!).  If you need less than 64 bits,
+use a bitmask.  For example, if you need only 10 bits, do
+  h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
+If you are hashing n strings (ub1 **)k, do it like this:
+  for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
+
+By Bob Jenkins, Jan 4 1997.  bob_jenkins@burtleburtle.net.  You may
+use this code any way you wish, private, educational, or commercial,
+but I would appreciate if you give me credit.
+
+See http://burtleburtle.net/bob/hash/evahash.html
+Use for hash table lookup, or anything where one collision in 2^^64
+is acceptable.  Do NOT use for cryptographic purposes.
+--------------------------------------------------------------------
+*/
+
+ub8
+hash (register ub1 *k, register ub8 length, register ub8 level)
+/*
+     k      : the key, read one byte at a time and assembled low byte first
+     length : the length of the key, in bytes
+     level  : the previous hash, or an arbitrary value; it seeds a and b
+   Returns c after the final mix64 round. */
+{
+  register ub8 a, b, c, len;
+
+  /* Set up the internal state */
+  len = length;
+  a = b = level;		/* the previous hash value */
+  c = 0x9e3779b97f4a7c13LL;	/* the golden ratio; an arbitrary value */
+
+  /*---------------------------------------- handle most of the key */
+  while (len >= 24)
+    {
+      a += (k[0] + ((ub8) k[1] << 8) + ((ub8) k[2] << 16) + ((ub8) k[3] << 24)
+	    + ((ub8) k[4] << 32) + ((ub8) k[5] << 40) + ((ub8) k[6] << 48) +
+	    ((ub8) k[7] << 56));
+      b +=
+	(k[8] + ((ub8) k[9] << 8) + ((ub8) k[10] << 16) +
+	 ((ub8) k[11] << 24) + ((ub8) k[12] << 32) + ((ub8) k[13] << 40) +
+	 ((ub8) k[14] << 48) + ((ub8) k[15] << 56));
+      c +=
+	(k[16] + ((ub8) k[17] << 8) + ((ub8) k[18] << 16) +
+	 ((ub8) k[19] << 24) + ((ub8) k[20] << 32) + ((ub8) k[21] << 40) +
+	 ((ub8) k[22] << 48) + ((ub8) k[23] << 56));
+      mix64 (a, b, c);
+      k += 24;
+      len -= 24;
+    }
+
+  /*------------------------------------- handle the last 23 bytes */
+  c += length;
+  switch (len)			/* all the case statements fall through */
+    {
+    case 23:
+      c += ((ub8) k[22] << 56);
+    case 22:
+      c += ((ub8) k[21] << 48);
+    case 21:
+      c += ((ub8) k[20] << 40);
+    case 20:
+      c += ((ub8) k[19] << 32);
+    case 19:
+      c += ((ub8) k[18] << 24);
+    case 18:
+      c += ((ub8) k[17] << 16);
+    case 17:
+      c += ((ub8) k[16] << 8);
+      /* the first byte of c is reserved for the length */
+    case 16:
+      b += ((ub8) k[15] << 56);
+    case 15:
+      b += ((ub8) k[14] << 48);
+    case 14:
+      b += ((ub8) k[13] << 40);
+    case 13:
+      b += ((ub8) k[12] << 32);
+    case 12:
+      b += ((ub8) k[11] << 24);
+    case 11:
+      b += ((ub8) k[10] << 16);
+    case 10:
+      b += ((ub8) k[9] << 8);
+    case 9:
+      b += ((ub8) k[8]);
+    case 8:
+      a += ((ub8) k[7] << 56);
+    case 7:
+      a += ((ub8) k[6] << 48);
+    case 6:
+      a += ((ub8) k[5] << 40);
+    case 5:
+      a += ((ub8) k[4] << 32);
+    case 4:
+      a += ((ub8) k[3] << 24);
+    case 3:
+      a += ((ub8) k[2] << 16);
+    case 2:
+      a += ((ub8) k[1] << 8);
+    case 1:
+      a += ((ub8) k[0]);
+      /* case 0: nothing left to add */
+    }
+  mix64 (a, b, c);
+  /*-------------------------------------------- report the result */
+  return c;
+}
+
+/*
+--------------------------------------------------------------------
+ This works on all machines, is identical to hash() on little-endian 
+ machines, and it is much faster than hash(), but it requires
+ -- that the key be an array of ub8's, and
+ -- that all your machines have the same endianness, and
+ -- that the length be the number of ub8's in the key
+--------------------------------------------------------------------
+*/
+ub8
+hash2 (register ub8 *k, register ub8 length, register ub8 level)
+/*     
+     k      : the key, as an array of ub8 words
+     length : the length of the key, counted in ub8 words (not bytes)
+     level  : the previous hash, or an arbitrary value; it seeds a and b
+   Returns c after the final mix64 round. */
+{
+  register ub8 a, b, c, len;
+
+  /* Set up the internal state */
+  len = length;
+  a = b = level;		/* the previous hash value */
+  c = 0x9e3779b97f4a7c13LL;	/* the golden ratio; an arbitrary value */
+
+  /*---------------------------------------- handle most of the key */
+  while (len >= 3)
+    {
+      a += k[0];
+      b += k[1];
+      c += k[2];
+      mix64 (a, b, c);
+      k += 3;
+      len -= 3;
+    }
+
+  /*-------------------------------------- handle the last 2 ub8's */
+  c += (length << 3);
+  switch (len)			/* all the case statements fall through */
+    {
+      /* c is reserved for the length */
+    case 2:
+      b += k[1];
+    case 1:
+      a += k[0];
+      /* case 0: nothing left to add */
+    }
+  mix64 (a, b, c);
+  /*-------------------------------------------- report the result */
+  return c;
+}
+
+/*
+--------------------------------------------------------------------
+ This is identical to hash() on little-endian machines, and it is much
+ faster than hash(), but a little slower than hash2(), and it requires
+ -- that all your machines be little-endian, for example all Intel x86
+    chips or all VAXen.  It gives wrong results on big-endian machines.
+--------------------------------------------------------------------
+*/
+
+ub8
+hash3 (register ub1 *k, register ub8 length, register ub8 level)
+/*
+     k      : the key bytes; 8-byte aligned keys take the word-at-a-time path
+     length : the length of the key, in bytes
+     level  : the previous hash, or an arbitrary value; it seeds a and b
+   Returns c after the final mix64 round. */
+{
+  register ub8 a, b, c, len;
+
+  /* Set up the internal state */
+  len = length;
+  a = b = level;		/* the previous hash value */
+  c = 0x9e3779b97f4a7c13LL;	/* the golden ratio; an arbitrary value */
+
+  /*---------------------------------------- handle most of the key */
+  if (((size_t) k) & 7)
+    {
+      while (len >= 24)
+	{
+	  a +=
+	    (k[0] + ((ub8) k[1] << 8) + ((ub8) k[2] << 16) +
+	     ((ub8) k[3] << 24) + ((ub8) k[4] << 32) + ((ub8) k[5] << 40) +
+	     ((ub8) k[6] << 48) + ((ub8) k[7] << 56));
+	  b +=
+	    (k[8] + ((ub8) k[9] << 8) + ((ub8) k[10] << 16) +
+	     ((ub8) k[11] << 24) + ((ub8) k[12] << 32) + ((ub8) k[13] << 40) +
+	     ((ub8) k[14] << 48) + ((ub8) k[15] << 56));
+	  c +=
+	    (k[16] + ((ub8) k[17] << 8) + ((ub8) k[18] << 16) +
+	     ((ub8) k[19] << 24) + ((ub8) k[20] << 32) + ((ub8) k[21] << 40) +
+	     ((ub8) k[22] << 48) + ((ub8) k[23] << 56));
+	  mix64 (a, b, c);
+	  k += 24;
+	  len -= 24;
+	}
+    }
+  else
+    {
+      while (len >= 24)		/* aligned */
+	{
+	  a += *(ub8 *) (k + 0);	/* NOTE(review): reads bytes through a ub8 lvalue -- check aliasing rules for the callers' buffers */
+	  b += *(ub8 *) (k + 8);
+	  c += *(ub8 *) (k + 16);
+	  mix64 (a, b, c);
+	  k += 24;
+	  len -= 24;
+	}
+    }
+
+  /*------------------------------------- handle the last 23 bytes */
+  c += length;
+  switch (len)			/* all the case statements fall through */
+    {
+    case 23:
+      c += ((ub8) k[22] << 56);
+    case 22:
+      c += ((ub8) k[21] << 48);
+    case 21:
+      c += ((ub8) k[20] << 40);
+    case 20:
+      c += ((ub8) k[19] << 32);
+    case 19:
+      c += ((ub8) k[18] << 24);
+    case 18:
+      c += ((ub8) k[17] << 16);
+    case 17:
+      c += ((ub8) k[16] << 8);
+      /* the first byte of c is reserved for the length */
+    case 16:
+      b += ((ub8) k[15] << 56);
+    case 15:
+      b += ((ub8) k[14] << 48);
+    case 14:
+      b += ((ub8) k[13] << 40);
+    case 13:
+      b += ((ub8) k[12] << 32);
+    case 12:
+      b += ((ub8) k[11] << 24);
+    case 11:
+      b += ((ub8) k[10] << 16);
+    case 10:
+      b += ((ub8) k[9] << 8);
+    case 9:
+      b += ((ub8) k[8]);
+    case 8:
+      a += ((ub8) k[7] << 56);
+    case 7:
+      a += ((ub8) k[6] << 48);
+    case 6:
+      a += ((ub8) k[5] << 40);
+    case 5:
+      a += ((ub8) k[4] << 32);
+    case 4:
+      a += ((ub8) k[3] << 24);
+    case 3:
+      a += ((ub8) k[2] << 16);
+    case 2:
+      a += ((ub8) k[1] << 8);
+    case 1:
+      a += ((ub8) k[0]);
+      /* case 0: nothing left to add */
+    }
+  mix64 (a, b, c);
+  /*-------------------------------------------- report the result */
+  return c;
+}
+
+#ifdef SELF_TEST
+
+/* Timing driver: chain-hash the prefixes of length 0..255 of a zeroed buffer. */
+void
+driver1 ()
+{
+  ub1 buf[256] = { 0 };		/* hash() takes a byte pointer and reads through it */
+  ub8 i;
+  ub8 h = 0;
+
+  for (i = 0; i < 256; ++i)
+    {
+      h = hash (buf, i, h);	/* each result seeds the next call */
+    }
+}
+
+/* check that every input bit changes every output bit half the time */
+#define HASHSTATE 1	/* number of ub8 result words tracked per hash */
+#define HASHLEN   1	/* not referenced by the drivers visible in this file */
+#define MAXPAIR 80	/* upper bound on k in the key-pair loop, which steps by 2 */
+#define MAXLEN 5	/* key lengths 0..MAXLEN-1 are exercised */
+void
+driver2 ()
+{
+  ub1 qa[MAXLEN + 1], qb[MAXLEN + 2], *a = &qa[0], *b = &qb[1];
+  ub8 c[HASHSTATE], d[HASHSTATE], i, j = 0, k, l, m, z;
+  ub8 e[HASHSTATE], f[HASHSTATE], g[HASHSTATE], h[HASHSTATE];
+  ub8 x[HASHSTATE], y[HASHSTATE];
+  ub8 hlen;
+
+  printf ("No more than %d trials should ever be needed \n", MAXPAIR / 2);
+  for (hlen = 0; hlen < MAXLEN; ++hlen)
+    {
+      z = 0;
+      for (i = 0; i < hlen; ++i)
+			    /*----------------------- for each byte, */
+	{
+	  for (j = 0; j < 8; ++j)
+			    /*------------------------ for each bit, */
+	    {
+	      for (m = 0; m < 8; ++m)
+			    /*--------- for several possible levels, */
+		{
+		  for (l = 0; l < HASHSTATE; ++l)
+		    e[l] = f[l] = g[l] = h[l] = x[l] = y[l] = ~((ub8) 0);
+
+	  /*---- check that every input bit affects every output bit */
+		  for (k = 0; k < MAXPAIR; k += 2)
+		    {
+		      ub8 finished = 1;
+		      /* keys have one bit different */
+		      for (l = 0; l < hlen + 1; ++l)
+			{
+			  a[l] = b[l] = (ub1) 0;
+			}
+		      /* have a and b be two keys differing in only one bit */
+		      a[i] ^= (k << j);
+		      a[i] ^= (k >> (8 - j));
+		      c[0] = hash (a, hlen, m);
+		      b[i] ^= ((k + 1) << j);
+		      b[i] ^= ((k + 1) >> (8 - j));
+		      d[0] = hash (b, hlen, m);
+		      /* check every bit is 1, 0, set, and not set at least once */
+		      for (l = 0; l < HASHSTATE; ++l)
+			{
+			  e[l] &= (c[l] ^ d[l]);
+			  f[l] &= ~(c[l] ^ d[l]);
+			  g[l] &= c[l];
+			  h[l] &= ~c[l];
+			  x[l] &= d[l];
+			  y[l] &= ~d[l];
+			  if (e[l] | f[l] | g[l] | h[l] | x[l] | y[l])
+			    finished = 0;
+			}
+		      if (finished)
+			break;
+		    }
+		  if (k > z)
+		    z = k;	/* z tracks the largest k reached so far for this length */
+		  if (k == MAXPAIR)
+		    {
+		      printf ("Some bit didn't change: ");
+		      printf ("%.8lx %.8lx %.8lx %.8lx %.8lx %.8lx  ",
+			      (lui) e[0], (lui) f[0], (lui) g[0], (lui) h[0], (lui) x[0], (lui) y[0]);
+		      printf ("i %ld j %ld m %ld len %ld\n",
+			      (ub4) i, (ub4) j, (ub4) m, (ub4) hlen);
+		    }
+		  if (z == MAXPAIR)
+		    goto done;	/* give up on this key length after a failure */
+		}
+	    }
+	}
+    done:
+      if (z < MAXPAIR)
+	{
+	  printf ("required  %ld  trials\n", (ub4) (z / 2));
+	}
+    }
+  printf ("\n");
+}
+
+/* Check for reading beyond the end of the buffer and alignment problems */
+void
+driver3 ()
+{
+  ub1 buf[MAXLEN + 20], *b;
+  ub8 len;
+  ub1 q[] =
+    "This is the time for all good men to come to the aid of their country";
+  ub1 qq[] =
+    "xThis is the time for all good men to come to the aid of their country";
+  ub1 qqq[] =
+    "xxThis is the time for all good men to come to the aid of their country";
+  ub1 qqqq[] =
+    "xxxThis is the time for all good men to come to the aid of their country";
+  ub1 o[] =
+    "xxxxThis is the time for all good men to come to the aid of their country";
+  ub1 oo[] =
+    "xxxxxThis is the time for all good men to come to the aid of their country";
+  ub1 ooo[] =
+    "xxxxxxThis is the time for all good men to come to the aid of their country";
+  ub1 oooo[] =
+    "xxxxxxxThis is the time for all good men to come to the aid of their country";
+  ub8 h, i, j, ref, x, y;
+
+  printf ("Endianness.  These should all be the same:\n");
+  h = hash (q + 0, (ub8) (sizeof (q) - 1), (ub8) 0);	/* same text hashed from array offsets 0..7 */
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (qq + 1, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (qqq + 2, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (qqqq + 3, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (o + 4, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (oo + 5, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (ooo + 6, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  h = hash (oooo + 7, (ub8) (sizeof (q) - 1), (ub8) 0);
+  printf ("%.8lx%.8lx\n", (ub4) h, (ub4) (h >> 32));
+  printf ("\n");
+  for (h = 0, b = buf + 1; h < 8; ++h, ++b)
+    {
+      for (i = 0; i < MAXLEN; ++i)
+	{
+	  len = i;
+	  for (j = 0; j < i; ++j)
+	    *(b + j) = 0;
+
+	  /* these should all be equal */
+	  ref = hash (b, len, (ub8) 1);
+	  *(b + i) = (ub1) ~ 0;	/* poison the byte just past the key */
+	  *(b - 1) = (ub1) ~ 0;	/* poison the byte just before the key */
+	  x = hash (b, len, (ub8) 1);
+	  y = hash (b, len, (ub8) 1);
+	  if ((ref != x) || (ref != y))
+	    {
+	      printf ("alignment error: %.8lx %.8lx %.8lx %ld %ld\n", (lui) ref,
+                                                                  (lui) x,
+		                                                          (lui) y, 
+                                                                  (lui) h, 
+                                                                  (lui) i);
+	    }
+	}
+    }
+}
+
+/* check for problems with nulls */
+void
+driver4 ()
+{
+  ub1 buf[1];
+  ub8 h, i, state[HASHSTATE];
+
+  /* state[] is filled below but not read again in this function */
+  buf[0] = ~0;
+  for (i = 0; i < HASHSTATE; ++i)
+    state[i] = 1;
+  printf ("These should all be different\n");
+  for (i = 0, h = 0; i < 8; ++i)
+    {
+      h = hash (buf, (ub8) 0, h);	/* zero-length key: only the chained level changes */
+      printf ("%2ld  0-byte strings, hash is  %.8lx%.8lx\n", (ub4) i,
+	      (ub4) h, (ub4) (h >> 32));
+    }
+}
+
+
+//int main()
+//{
+// driver1();   /* test that the key is hashed: used for timings */
+//  driver2();   /* test that whole key is hashed thoroughly */
+//  driver3();   /* test that nothing but the key is hashed */
+//  driver4();   /* test hashing multiple buffers (all buffers are null) */
+//  return 1;
+//}
+#endif /* SELF_TEST */
+/* hash5: hash the first `length` bytes of `key` with hash3(), seeded by
+   `seed`, and keep the low 36 bits of the result.  Defined outside the
+   SELF_TEST guard so it is still built when the test drivers are not.
+   NOTE(review): the mask has nine F digits (36 bits) -- confirm this is the
+   intended range for the filter index. */
+BIGNUM
+hash5 (const char *key, const int seed, int length)
+{
+  BIGNUM ret = hash3 ((unsigned char *) key, length, seed);
+
+  return ret & 0xFFFFFFFFF;
+}
+/*
+int main()
+{
+char *key = "TTTTTTTTTTTTAAAAAGCCC";
+hash5(key);
+char *key2 ="TTTTTGGGTTTTAAAAAGCCC";
+hash5(key2);
+}
+*/
diff --git a/drass/lookup8.h b/drass/lookup8.h
new file mode 100644
index 0000000..64feae8
--- /dev/null
+++ b/drass/lookup8.h
@@ -0,0 +1,7 @@
+#ifndef _LOOKUP8
+#define _LOOKUP8
+void driver1(void);	/* timing loop over hash() */
+void driver2(void);	/* checks that every input bit affects every output bit */
+void driver3(void);	/* checks alignment handling and reads outside the key */
+void driver4(void);	/* chains hashes of zero-length keys */
+#endif
diff --git a/drass/main.c b/drass/main.c
new file mode 100644
index 0000000..436a00e
--- /dev/null
+++ b/drass/main.c
@@ -0,0 +1,60 @@
+#include "check.h"
+#include "build.h"
+#include "remove.h"
+#include "remove_l.h"
+#include "big_query.h"
+/*------------------------------*/  
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+/*------------------------------*/ 
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "0.1"
+#endif
+
+static int
+usage(void)
+{
+    /* Write the synopsis to stderr; the constant 1 lets callers `return usage();`. */
+    fputs("\n" "Program: facs (Sequence decontamination using bloom filters)\n", stderr);
+    fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+    fputs("Contact: Enze Liu <enze.liu@scilifelab.se>\n\n"
+          "Usage:   facs <command> [options]\n\n", stderr);
+    fputs("Command: build         build a bloom filter from a FASTA reference file\n"
+          "         query         query a bloom filter given a FASTQ/FASTA file\n"
+          "         remove        remove (contamination) sequences from FASTQ/FASTA file\n"
+          "\n", stderr);
+    return 1;
+}
+
+
+/* Entry point of the facs front end.  argv[1] selects the sub-command
+   ("build", "query" or "remove"); its handler receives the remaining
+   arguments and its return value becomes the exit status.  A missing or
+   unrecognised command prints the usage text and yields usage()'s
+   non-zero status. */
+int main (int argc, char **argv)
+{
+  int ret = 0;
+
+  if (argc < 2)
+    return usage();
+
+  /* Shift the vector so each handler sees its own name as argv[0]. */
+  if (strcmp(argv[1], "build") == 0)
+    ret = build_main(argc-1, argv+1);
+  else if (strcmp(argv[1], "query") == 0)
+    ret = bq_main(argc-1, argv+1);
+  else if (strcmp(argv[1], "remove") == 0)
+    ret = remove_main(argc-1, argv+1);
+  else
+    {
+      /* An unknown command is an error: propagate usage()'s status
+         rather than falling through with ret == 0 (success). */
+      ret = usage();
+    }
+
+  return ret;
+}
diff --git a/drass/mpi_bloom.c b/drass/mpi_bloom.c
new file mode 100644
index 0000000..c6078c4
--- /dev/null
+++ b/drass/mpi_bloom.c
@@ -0,0 +1,1055 @@
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <math.h>
+#include <time.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*-------------------------------------*/
+//for file mapping in Linux
+#include<fcntl.h>
+#include<unistd.h>
+#include<sys/stat.h>
+#include<sys/time.h>
+#include<sys/mman.h>
+#include<sys/types.h>
+/*-------------------------------------*/
+#include "bloom.h"
+#include "hashes.h"
+/*-------------------------------------*/
+//openMP library
+#include<omp.h>
+//MPICH/OPENMPI
+#include<mpi.h>
+/*-------------------------------------*/
+//#define PERMS 0600
+//#define NEW(type) (type *) malloc(sizeof(type))
+/* MPI communicator size and this process's rank; both are filled in by main. */
+int ntask = 0, mytask = 0;
+/* Page-based partitioning: get_size sets total_piece, PAGE, CHUNK and total_size; struc_init sets share and offset; ammaping sets buffer. reads_num and reads_contam are bumped in fastq_process. */
+long long total_piece, PAGE, buffer, share, offset, reads_num =
+  0, reads_contam = 0, checky = 0, CHUNK, total_size = 0;
+/* error_rate, sampling_rate and tole_rate come from -e, -s and -t in init; contamination_rate is not assigned in the visible part of this file. */
+float error_rate, sampling_rate, contamination_rate, tole_rate;
+/* k_mer is replaced by the loaded filter's value in main; mode must be 1 or 2; type is 1 for FASTA input, anything else is handled as FASTQ. */
+int k_mer = 21, mode, mytask, ntask, type = 2, excel1, excel2, last_piece =
+  0, extra_piece = 0;
+int last = 0;
+/* source is the -q argument, all_ref the -r argument, position the chunk returned by ammaping, prefix the -o argument. */
+char *source, *all_ref, *position, *prefix;
+/* Work list of slice start positions; get_parainfo links new nodes in after head, and main walks it until tail. */
+Queue *head, *head2, *tail;
+/* Filter object allocated in struc_init and passed to load_bloom in main. */
+bloom *bl_2;
+/* stat result for the query file, written by get_size and ammaping. */
+struct stat statbuf;
+/* Forward declarations: setup, chunk processing and reporting. */
+void list_init ();
+void struc_init ();
+void get_parainfo (char *full);
+void get_size (char *strFileName);
+void init (int argc, char **argv);
+void fasta_process (bloom * bl, Queue * info);
+void fastq_process (bloom * bl, Queue * info);
+void evaluate (char *detail, char *filename);
+void statistic_save (char *detail, char *filename);
+/* Forward declarations: counter gathering and per-read checks. */
+int gather ();
+int fastq_full_check (bloom * bl, char *p, int distance);
+int fasta_full_check (bloom * bl, char *begin, char *next, char *model);
+int fastq_read_check (char *begin, int length, char *model, bloom * bl);
+int fasta_read_check (char *begin, char *next, char *model, bloom * bl);
+/* Forward declarations: helpers that return a position in the input. */
+char *jump (char *target);
+char *ammaping (char *source);
+//char* reallocate(Queue *info);
+/*-------------------------------------*/
+/* MPI + OpenMP query driver.  Every rank parses the options, loads the
+   filter named by -r, then walks its share of the -q file in mmap'ed
+   chunks, handing each list node built by get_parainfo to fasta_process
+   or fastq_process as an OpenMP task.  gather() then combines the
+   counters and rank 0 prints the timing and calls evaluate() and
+   statistic_save().  Returns 0, or -1 on a bad mode or lack of memory. */
+int
+main (int argc, char **argv)
+{
+  long sec;
+  struct timezone tz;
+  struct timeval tv, tv2;
+  char *detail;
+
+  MPI_Init (&argc, &argv);
+  MPI_Comm_size (MPI_COMM_WORLD, &ntask);
+  MPI_Comm_rank (MPI_COMM_WORLD, &mytask);
+
+  if (mytask == 0)		//starting time
+    {
+      gettimeofday (&tv, &tz);
+    }
+
+  init (argc, argv);		//initialize 
+
+  if (mode != 1 && mode != 2)
+    {
+      fprintf (stderr, "Mode select error.\n");
+      MPI_Finalize ();		//never leave without shutting MPI down
+      return -1;
+    }
+
+  /* Report buffer handed to evaluate() and statistic_save(); calloc
+     supplies the zero fill and snprintf bounds the initial text. */
+  detail = (char *) calloc (1000 * 1000, sizeof (char));
+  if (!detail)
+    {
+      perror ("calloc");
+      MPI_Finalize ();
+      return -1;
+    }
+  snprintf (detail, 1000 * 1000, "query-->%s", source);
+
+  struc_init ();		//structure init
+
+  load_bloom (all_ref, bl_2);
+
+  k_mer = bl_2->k_mer;
+
+  while (share > 0)
+    {
+      position = ammaping (source);
+
+      list_init ();
+
+      get_parainfo (position);
+
+      head = head->next;
+
+#pragma omp parallel
+      {
+#pragma omp single nowait
+	{
+	  while (head != tail)
+	    {
+
+#pragma omp task firstprivate(head)
+	      {
+		printf ("position->%0.10s\n", head->location);
+
+		if (type == 1)
+		  fasta_process (bl_2, head);
+		else
+		  fastq_process (bl_2, head);
+
+	      }
+	      head = head->next;
+	    }
+	}
+      }
+      munmap (position, buffer * PAGE);
+
+      share -= buffer;
+
+      offset += buffer;
+    }
+  printf ("finish processing...\n");
+
+  MPI_Barrier (MPI_COMM_WORLD);	//wait until all threads finish jobs
+
+  gather ();			//gather all matched and missed info
+
+  if (mytask == 0)		//finishing time
+    {
+      gettimeofday (&tv2, &tz);
+      sec = tv2.tv_sec - tv.tv_sec;
+
+      printf ("total=%ld sec\n", sec);
+      printf ("all->%d\n", excel1);
+      printf ("excecuted->%d\n", excel2);
+
+      evaluate (detail, all_ref);
+      statistic_save (detail, source);
+    }
+
+  free (detail);
+  MPI_Finalize ();
+
+  return 0;
+}
+
+/*-------------------------------------*/
+void
+init (int argc, char **argv)
+{
+  if (argc == 1 || !strcmp (argv[1], "-h") || !strcmp (argv[1], "-help"))
+    {
+      help ();
+      check_help ();
+    }
+/*-------default-------*/
+  mode = 1;
+  k_mer = 21;
+  tole_rate = 0.8;
+  error_rate = 0.0005;
+  sampling_rate = 1;
+  prefix = NULL;
+
+  int x;
+  while ((x = getopt (argc, argv, "e:k:m:t:o:r:q:s:")) != -1)
+    {
+      //printf("optind: %d\n", optind);
+      switch (x)
+	{
+	case 'e':
+	  printf ("Error rate: \nThe argument of -e is %s\n", optarg);
+	  (optarg) && ((error_rate = atof (optarg)), 1);
+	  break;
+	case 'k':
+	  printf ("K_mer size: \nThe argument of -k is %s\n", optarg);
+	  (optarg) && ((k_mer = atoi (optarg)), 1);
+	  break;
+	case 'm':
+	  printf ("Mode : \nThe argument of -m is %s\n", optarg);
+	  (optarg) && ((mode = atoi (optarg)), 1);
+	  break;
+	case 't':
+	  printf ("Tolerant rate: \nThe argument of -t is %s\n", optarg);
+	  (optarg) && ((tole_rate = atof (optarg)), 1);
+	  break;
+	case 's':
+	  printf ("Sampling rate: \nThe argument of -s is %s\n", optarg);
+	  (optarg) && ((sampling_rate = atof (optarg)), 1);
+	  break;
+	case 'o':
+	  printf ("Output : \nThe argument of -o is %s\n", optarg);
+	  (optarg) && ((prefix = optarg), 1);
+	  break;
+	case 'r':
+	  printf ("Bloom list : \nThe argument of -r is %s\n", optarg);
+	  (optarg) && (all_ref = optarg, 1);
+	  break;
+	case 'q':
+	  printf ("Query : \nThe argument of -q is %s\n", optarg);
+	  (optarg) && (source = optarg, 1);
+	  break;
+	case '?':
+	  printf ("Unknown option: -%c\n", (char) optopt);
+	  exit (0);
+	}
+
+    }
+
+  if (strstr (source, ".fasta") || strstr (source, ".fna"))
+    type = 1;
+
+  if ((!all_ref) || (!source))
+    exit (0);
+
+  if (mode != 1 && mode != 2)
+    {
+      perror ("Mode select error.");
+      exit (0);
+    }
+
+}
+
+/*-------------------------------------*/
+/* Allocate the filter object and split the query file's whole pages
+   evenly across the MPI ranks: rank r starts at page r * (even share),
+   and the last rank additionally takes the remainder pages. */
+void
+struc_init ()
+{
+  bl_2 = NEW (bloom);
+
+  get_size (source);		//get total size of file
+
+  share = total_piece / ntask;	//every task gets an equal piece
+
+  /* The start page must come from the even share: computing it after the
+     remainder is added would push the last rank past its real start. */
+  offset = share * mytask;
+
+  if (total_piece % ntask != 0 && mytask == (ntask - 1))
+    share += (total_piece % ntask);	//last rank takes the leftover pages
+}
+
+/*-------------------------------------*/
+/* Map the next chunk of `source` read-only: min(share, CHUNK) pages
+   starting at page `offset`; `buffer` is set to that page count.  The
+   descriptor is closed once the mapping exists (the mapping stays valid),
+   so repeated calls do not leak descriptors.  Exits on any failure. */
+char *
+ammaping (char *source)
+{
+  int src;
+  char *sm;
+
+  if ((src = open (source, O_RDONLY | O_LARGEFILE)) < 0)
+    {
+      perror (" open source ");
+      exit (EXIT_FAILURE);
+    }
+
+  if (fstat (src, &statbuf) < 0)
+    {
+      perror (" fstat source ");
+      exit (EXIT_FAILURE);
+    }
+
+  /* The counters are long long, hence %lld rather than %d. */
+  printf ("share->%lld PAGES per node\n", share);
+  buffer = (share >= CHUNK) ? CHUNK : share;
+  printf ("total pieces->%lld\n", total_piece);
+  printf ("PAGE->%lld\n", PAGE);
+  printf ("node %d chunk size %lld buffer size %lld offset %lld\n", mytask,
+	  CHUNK, buffer, offset);
+
+  sm = mmap (0, buffer * PAGE, PROT_READ, MAP_SHARED | MAP_NORESERVE, src, offset * PAGE);	//everytime we process a chunk of data
+  if (MAP_FAILED == sm)
+    {
+      perror (" mmap source ");
+      exit (EXIT_FAILURE);
+    }
+
+  close (src);
+
+  return sm;
+}
+
+/*-------------------------------------*/
+/* Set PAGE, CHUNK (1 GB in pages), total_size and total_piece (whole pages only) for the file; exit if stat fails. */
+void
+get_size (char *strFileName)
+{
+  if (stat (strFileName, &statbuf) < 0)
+    {
+      perror (" stat source ");
+      exit (EXIT_FAILURE);
+    }
+  PAGE = getpagesize ();	//get memory PAGE definition 
+  total_piece = statbuf.st_size / PAGE;
+  total_size = statbuf.st_size;
+  CHUNK = 1000 * 1000 * 1000 * 1 / PAGE;	//1GB
+}
+
+/* Build the work list for the chunk mapped at `full`: one Queue node per processor reported by omp_get_num_procs(), each located at a record start searched from that slice's byte offset. */
+void
+get_parainfo (char *full)
+{
+  printf ("distributing...\n");
+  /* temp holds the record start found for the current slice */
+  char *temp = full;
+  /* one slice per processor */
+  int cores = omp_get_num_procs ();
+  /* bytes per slice (integer division) */
+  int offsett = buffer * PAGE / cores;
+  /* the last slice absorbs the division remainder */
+  last_piece = buffer * PAGE - (cores - 1) * offsett;
+
+  int add = 0;
+
+  printf ("last piece->%d\n", last_piece);
+  /* new nodes are linked in after the current list head */
+  Queue *pos = head;
+  /* type 1 = FASTA: slices start at the next '>' */
+  if (type == 1)
+    {
+      for (add = 0; add < cores; add++)
+	{
+	  Queue *x = NEW (Queue);
+	  temp = strchr (full, '>');	//drop the possible fragment
+	  if (add != 0)
+	    temp = strchr (full + offsett * add, '>');	/* NOTE(review): result is not checked for NULL */
+	  x->location = temp;
+	  x->number = add;
+	  x->next = pos->next;
+	  pos->next = x;
+	  pos = pos->next;
+	}
+    }				// end if
+  /* otherwise FASTQ: slices start after the next "\n@" */
+  else
+    {
+      for (add = 0; add < cores; add++)
+	{
+	  Queue *x = NEW (Queue);
+	  if (add == 0 && *full != '@')
+	    temp = strstr (full, "\n@") + 1;	//drop the fragment
+	  if (add != 0)
+	    temp = strstr (full + offsett * add, "\n@") + 1;	/* NOTE(review): a failed search yields NULL + 1 here */
+	  x->location = temp;
+	  x->number = add;
+	  x->next = pos->next;
+	  pos->next = x;
+	  pos = pos->next;
+	}			//end else  
+    }
+  /* NOTE(review): the searches assume the mapped chunk is NUL-terminated -- confirm against ammaping */
+}
+
+/*-------------------------------------*/
+/* Sum the per-rank counters reads_num, reads_contam and checky on rank 0.
+   Every other rank sends its three long long counters with tag 1, each as
+   exactly one MPI_LONG_LONG element, and rank 0 receives them rank by rank
+   in the same order.  Always returns 1. */
+int
+gather ()
+{
+  printf ("gathering...\n");
+  if (mytask == 0)
+    {
+      MPI_Status status;
+      int i;
+
+      for (i = 1; i < ntask; i++)
+	{
+	  long long temp = 0, temp2 = 0, temp3 = 0;
+	  /* Count and datatype must describe the 8-byte variable exactly:
+	     a larger count lets MPI write past it. */
+	  MPI_Recv (&temp, 1, MPI_LONG_LONG, i, 1, MPI_COMM_WORLD, &status);
+	  MPI_Recv (&temp2, 1, MPI_LONG_LONG, i, 1, MPI_COMM_WORLD, &status);
+	  MPI_Recv (&temp3, 1, MPI_LONG_LONG, i, 1, MPI_COMM_WORLD, &status);
+	  printf ("RECEIVED %lld from thread %d\n", temp, i);
+	  reads_num += temp;
+	  reads_contam += temp2;
+	  checky += temp3;
+	}
+    }
+  else
+    {
+      MPI_Send (&reads_num, 1, MPI_LONG_LONG, 0, 1, MPI_COMM_WORLD);
+      MPI_Send (&reads_contam, 1, MPI_LONG_LONG, 0, 1, MPI_COMM_WORLD);
+      MPI_Send (&checky, 1, MPI_LONG_LONG, 0, 1, MPI_COMM_WORLD);
+    }
+  return 1;
+}
+
+/* Process the FASTQ records of one work-list slice: count each scanned read in reads_num and each read for which fastq_read_check returns 0 in reads_contam. */
+void
+fastq_process (bloom * bl, Queue * info)
+{
+  printf ("fastq processing...\n");
+  char *p = info->location;
+  char *next, *temp, *temp_piece = NULL;
+  /* a slice ends where the next node's slice begins */
+  if (info->next != tail)
+    next = info->next->location;
+  /* the last slice is copied to a zero-filled private buffer and ends at its last "\n@" */
+  else
+    {
+      printf ("last_piece  %d\n", last_piece);
+      temp_piece = (char *) malloc ((last_piece + 1) * sizeof (char));
+      memset (temp_piece, 0, last_piece + 1);
+      memcpy (temp_piece, info->location, last_piece - PAGE);	/* NOTE(review): copies last_piece - PAGE bytes, not last_piece -- confirm intent */
+      temp_piece[last_piece] = '\0';
+
+      temp = temp_piece;
+      while ((temp = strstr (temp, "\n@")))
+	{
+	  next = temp + 1;
+	  temp++;
+	}
+      p = temp_piece;
+    }
+  /* NOTE(review): next stays unset if the copy holds no "\n@" */
+  while (p != next)
+    {
+
+      temp = jump (p);		//generate random number and judge if need to scan this read
+      if (p != temp)
+	{
+	  p = temp;
+	  continue;
+	}
+      if (p == '\0' || p == NULL)	/* NOTE(review): both tests compare the pointer itself; *p is never examined */
+	break;
+#pragma omp atomic
+      reads_num++;
+      p = strchr (p, '\n') + 1;
+      int distance = strchr (p, '\n') - p;
+      /* distance is the length of the sequence line */
+      if (!fastq_read_check (p, distance, "normal", bl))
+#pragma omp atomic
+	reads_contam++;
+//printf("p->%s\n",p);
+      p = strchr (p, '\n') + 1;
+      p = strchr (p, '\n') + 1;
+      p = strchr (p, '\n') + 1;
+    }				// outside while
+  printf ("finish process...\n");
+  if (temp_piece)
+    free (temp_piece);
+}
+
+/*-------------------------------------*/
+int
+fastq_read_check (char *begin, int length, char *model, bloom * bl)
+{				// Quick k-mer screen of one read before the full scoring pass.
+  // begin/length: the sequence line; model: "normal" or "reverse"; bl: filter.
+  if (length < k_mer)		// no complete k-mer; the tail branch below needs one
+    return 1;
+  char *p = begin;
+  int distance = length;
+  int signal = 0;
+  char *previous = begin, *key = (char *) malloc (k_mer * sizeof (char) + 1);
+  // Probes consecutive non-overlapping k-mers, then the read's last k_mer bytes.
+  // Returns fastq_full_check's result on a decisive probe, otherwise 1.
+  while (distance > 0)
+    {
+      // signal is set once the tail window has been probed
+      if (signal == 1)
+	break;
+
+      if (distance >= k_mer)
+	{
+	  memcpy (key, p, sizeof (char) * k_mer);	//need to be tested
+	  key[k_mer] = '\0';
+	  previous = p;
+	  p += k_mer;
+	  distance -= k_mer;
+	}
+
+      else
+	{
+	  memcpy (key, previous + distance, sizeof (char) * k_mer);
+	  p += (k_mer - distance);
+	  signal = 1;
+	}
+
+      if (strcmp (model, "reverse") == 0)	// compare contents, not literal addresses
+	rev_trans (key);
+
+      // mode 1: a filter hit is decisive; any other mode: a filter miss is.
+      if (mode == 1)
+	{
+	  if (bloom_check (bl, key))
+	    {
+	      // Release the scratch key (it used to leak here), then score the read.
+	      free (key);
+	      return fastq_full_check (bl, begin, length);
+	    }
+	  // No hit for this k-mer: fall through and let the loop probe the
+	  // next window.
+	}
+
+      else
+	{
+	  if (!bloom_check (bl, key))
+	    {
+	      // Release the scratch key (it used to leak here), then score the read.
+	      free (key);
+	      return fastq_full_check (bl, begin, length);
+	    }
+	  // This k-mer is in the filter: fall through and let the loop probe
+	  // the next window.
+	}
+    }				// inner while
+
+  free (key);
+
+  if (strcmp (model, "normal") == 0)	//use recursion to check the sequence forward and backward
+    return fastq_read_check (begin, length, "reverse", bl);
+  else
+    return 1;
+}
+
+/*-------------------------------------*/
+int
+fastq_full_check (bloom * bl, char *p, int distance)
+{
+  // Scores one read with a k-mer window that advances one base at a time.
+  // bl: bloom filter; p: first base of the read; distance: the read's length.
+  // Returns 0 when match_s / length reaches tole_rate, otherwise 1.
+  int signal = 0, pre_kmer = -1;
+  // pre_kmer: 1 after a window that hit, 0 after a miss, -1 before any probe.
+  int count_mis = 0, label_m = 0, label_mis = 0, count = 0, match_s = 0;
+  // key holds one NUL-terminated k-mer at a time.
+  char *previous, *key = (char *) malloc (k_mer * sizeof (char) + 1);
+  // length keeps the original read length; distance is consumed by the loop.
+  // checky is incremented once per call.
+  int length = distance;
+#pragma omp atomic
+  checky++;
+
+  while (distance >= k_mer)
+    {
+      memcpy (key, p, sizeof (char) * k_mer);
+      key[k_mer] = '\0';
+      previous = p;
+      p += 1;
+      if (bloom_check (bl, key))
+	{
+	  count++;
+	  if (pre_kmer == 1)
+	    {
+	      label_m++;
+	      if (count < (k_mer-1))
+		match_s++;
+	      else
+		{
+		  match_s += count;
+		  count = 0;
+		}
+	    }
+	  else
+	    {
+	      label_m += k_mer;
+	      match_s += k_mer - 1;
+	    }
+	  pre_kmer = 1;
+	  // a hit after a miss (or the first hit) adds k_mer - 1 to match_s
+	}
+      else
+	{
+	  count = 0;
+	  pre_kmer = 0;
+	}
+      distance--;
+    }				// end while
+  free (key);
+
+  label_mis = length - label_m;	// computed, but the verdict below uses match_s only
+
+  if (((float) match_s / (float) (length)) >= tole_rate)
+    return 0;
+  else
+    return 1;
+}
+
+/*-------------------------------------*/
+void
+fasta_process (bloom * bl, Queue * info)
+{				// Tally reads and contaminated reads of one FASTA piece.
+  printf ("fasta processing...\n");
+  char *p = info->location;
+  char *next, *temp, *temp_next, *temp_piece = NULL;
+  // bl is handed to fasta_read_check; info is the queue node of this piece.
+  if (info->next != tail)
+    next = info->next->location;
+  // Last node: scan a NUL-terminated private copy up to its final '>'.
+  else
+    {
+      last = 1;
+
+      printf ("last_piece %d\n", last_piece);
+
+      temp_piece = (char *) malloc ((last_piece + 1) * sizeof (char));
+
+      memset (temp_piece, 0, last_piece + 1);
+
+      memcpy (temp_piece, info->location, last_piece - PAGE);
+
+      temp_piece[last_piece] = '\0';
+      next = strrchr (temp_piece, '>');
+      if (!next)		// no header in the copy => nothing to scan, and
+	next = temp_piece;	// the diagnostics below never see a NULL pointer
+
+      printf ("temp_piece->%.30s\n", temp_piece);	// the 0 flag is undefined for %s
+      printf ("next->%.30s\n", next);
+      printf ("length->%zu\n", strlen (temp_piece));	// size_t needs %zu
+      printf ("test->%td\n", next - temp_piece);	// ptrdiff_t needs %td
+      p = temp_piece;
+      //printf ("p->%0.20s\n", p);
+    }
+  // Record loop: jump () may move p ahead to subsample reads.
+  while (p != next)
+    {
+/*      
+      if (last == 1){
+         //printf ("p->%0.20s\n",p);
+         printf("loop\n");
+      }
+*/
+      //printf("here\n");
+      temp = jump (p);		//generate random number and judge if need to scan this read
+      if (p != temp)
+	{
+	  p = temp;
+	  continue;
+	}
+#pragma omp atomic
+      reads_num++;
+      //if (last == 1)
+      //printf("before\n");
+
+      temp_next = strchr (p + 1, '>');
+
+      //if (last == 1)
+      //if (temp_next)
+      //printf("temp_next->%0.20s\n",temp_next);
+      //else
+      //printf("boom\n");
+
+      if (!temp_next)
+	temp_next = next;
+
+      if (!fasta_read_check (p, temp_next, "normal", bl))
+	{
+#pragma omp atomic
+	  reads_contam++;
+	}
+
+      p = temp_next;
+      /*
+         if (last == 1)
+         {
+         printf ("p->%0.20s\n", p);
+         printf ("temp_next->%0.20s\n",temp_next);
+         if (p == next)
+         printf("ja\n");
+         }
+       */
+    }
+  if (temp_piece)
+    free (temp_piece);
+}
+
+/*-------------------------------------*/
+int
+fasta_read_check (char *begin, char *next, char *model, bloom * bl)
+{
+  // Quick k-mer screen of one FASTA record before the full scoring pass.
+  //   begin: the record's '>' header line; next: where the screen stops;
+  //   model: "normal" or "reverse" (rev_trans is applied to every key);
+  //   bl:    bloom filter the keys are probed against.
+  // Returns fasta_full_check's result on a decisive probe, otherwise 1 after
+  // both orientations were screened.
+
+  char *p = strchr (begin + 1, '\n') + 1;	// first byte after the header line
+
+  // calloc: strlen (key) and pre_key are read before k_mer bytes were stored.
+
+  char *key = (char *) calloc (k_mer + 1, sizeof (char));
+
+  char *pre_key = (char *) calloc (k_mer + 1, sizeof (char));
+
+  int n = 0, m = 0, count_enter = 0;
+
+  // n: bases stored in key; m: bytes consumed from p; count_enter: line breaks.
+
+  key[k_mer] = '\0';
+
+
+  while (p != next)
+    {
+
+      //if (last == 1)
+      //{
+      //printf("p->%0.30s\n",p);
+      //printf("next->%0.30s\n",next);
+      /*
+         printf("next->%0.30s\n",next);
+         char *mov=p;
+         while (mov != next)
+         {
+         printf("(((->%0.30s\n",mov);
+         mov = strchr(mov,'>')+1;
+         }
+       */
+      //}
+
+      while (n < k_mer)
+	{
+	  if (p[m] == '>' || p[m] == '\0')
+	    {
+	      m--;
+	      break;
+	    }
+
+	  if (p[m] != '\r' && p[m] != '\n')
+	    key[n++] = p[m];
+	  else
+	    count_enter++;
+	  m++;
+	}			//inner while
+
+      if (m == 0)
+	break;
+
+      if (strlen (key) == k_mer)
+	memcpy (pre_key, key, sizeof (char) * (k_mer + 1));
+
+      else
+	{
+	  char *temp_key = (char *) malloc ((k_mer + 1) * sizeof (char));
+	  // Short tail: front-pad with the end of the previous key.  The +1 above holds the NUL copied below.
+	  memcpy (temp_key, pre_key + strlen (key), k_mer - strlen (key));
+	  memcpy (temp_key + k_mer - strlen (key), key,
+		  sizeof (char) * (strlen (key) + 1));
+	  free (key);
+	  key = temp_key;
+
+	}
+
+      p += m;
+
+      n = 0;
+
+      m = 0;
+
+      //if (last == 1)
+      //printf("key->%s\n",key);
+
+      if (strcmp (model, "reverse") == 0)	// compare contents, not literal addresses
+	rev_trans (key);
+
+      if (mode == 1)
+	{
+	  // mode 1: a filter hit is decisive
+	  if (bloom_check (bl, key))
+	    {
+	      free (pre_key);	// both scratch buffers used to leak here
+	      free (key);
+	      return fasta_full_check (bl, begin, next, model);
+	    }
+	  // no hit: keep probing
+	}			//outside if
+
+      else
+	{
+	  if (!bloom_check (bl, key))
+	    {
+	      free (pre_key);	// both scratch buffers used to leak here
+	      free (key);
+	      return fasta_full_check (bl, begin, next, model);
+	    }
+	  // any other mode: a filter miss is decisive;
+	  // this key was found, keep probing
+	}			//outside else
+      memset (key, 0, k_mer);
+    }				//outside while
+
+  free (pre_key);
+  free (key);
+
+  if (strcmp (model, "normal") == 0)	//use recursion to check the sequence forward and backward
+    return fasta_read_check (begin, next, "reverse", bl);
+  else
+    {
+//printf("in\n");
+/*
+if (mode==1)
+label_mis+=(next-start-count_enter+1);
+else
+label_m+=(next-start-count_enter+1);
+*/
+//printf("one read finish...\n");
+      return 1;
+    }
+}
+
+/*-------------------------------------*/
+int
+fasta_full_check (bloom * bl, char *begin, char *next, char *model)
+{
+  // Scores one FASTA record; returns 0 if match_s / bases >= tole_rate, else 1.
+#pragma omp atomic
+  checky++;
+  // bl: filter; begin: '>' header; next: end of the record; model: orientation.
+  int label_m = 0, match_s = 0, count = 0;
+
+  // n: bases in key; m: bytes read from p; pre_kmer: 1 hit / 0 miss / -1 none yet.
+
+  int n = 0, m = 0, count_enter = 0, pre_kmer = -1;
+
+  char *key = (char *) malloc ((k_mer + 1) * sizeof (char));
+
+  // Skip the header line; scoring starts at the first sequence byte.
+
+  begin = strchr (begin + 1, '\n') + 1;
+
+  char *p = begin;
+
+  while (p != next)
+    {
+      if (*p == '\n')
+	count_enter++;
+      p++;
+    }
+
+  p = begin;
+
+  while (*p != '>' && *p != '\0')
+    {
+      while (n < k_mer)
+	{
+	  // collect the next k_mer bases, skipping line breaks
+	  if (p[m] == '>' || p[m] == '\0')
+	    {
+	      m--;
+	      break;
+	    }
+
+	  if (p[m] != '\r' && p[m] != '\n')
+	    key[n++] = p[m];
+
+	  m++;
+	}
+      key[n] = '\0';
+
+      if (strcmp (model, "reverse") == 0)	// compare contents, not literal addresses
+	rev_trans (key);
+
+      if (strlen (key) == k_mer)
+	if (bloom_check (bl, key))
+	  {
+	    count++;
+	    if (pre_kmer == 1)
+	      {
+		label_m++;
+		if (count < (k_mer-1))
+		  match_s++;
+		else
+		  {
+		    match_s += count;
+		    count = 0;
+		  }
+	      }
+	    else
+	      {
+		label_m += k_mer;
+		match_s += k_mer - 1;
+	      }
+	    pre_kmer = 1;
+	    // a hit after a miss (or the first hit) adds k_mer - 1 to match_s
+	  }
+	else			// binds to the bloom_check test, not to the strlen test
+	  {
+	    count = 0;
+	    pre_kmer = 0;
+	  }
+
+      p++;
+      if (p[0] == '\n')
+	p++;
+      n = 0;
+      m = 0;
+    }				// end of while
+  free (key);			// the scratch key was never released before
+  if (((float) (match_s) / (float) (next - begin - count_enter)) >= (tole_rate))	//match >tole_rate considered as contaminated
+    return 0;
+  else
+    return 1;
+}
+
+/*-------------------------------------*/
+void
+evaluate (char *detail, char *filename)
+{				// Report the counters of one filter on stdout and append them to detail.
+  char buffer[100] = { 0 };	// scratch for one formatted report line
+  printf ("all->%d\n", reads_num);
+  printf ("contam->%d\n", reads_contam);
+  printf ("possbile->%d\n", checky);
+  contamination_rate = (double) (reads_contam) / (double) (reads_num);
+  if ((mode == 1 && contamination_rate == 0)
+      || (mode == 2 && contamination_rate == 1))
+    printf ("clean data...\n");
+  else if (mode == 1)
+    printf ("contamination rate->%f\n", contamination_rate);
+  else
+    printf ("contamination rate->%f\n", 1 - contamination_rate);	// other modes print the complement
+  // detail is extended with strcat and no bounds check; filename follows "bloom->".
+  strcat (detail, "\nxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n");
+  strcat (detail, "bloom->");
+  strcat (detail, filename);
+  strcat (detail, "   \n");
+  sprintf (buffer, "all->%d\n", reads_num);
+  strcat (detail, buffer);
+  memset (buffer, 0, 100);
+  sprintf (buffer, "contam->%d\n", reads_contam);
+  strcat (detail, buffer);
+  memset (buffer, 0, 100);
+  sprintf (buffer, "possbile->%d\n", checky);
+  strcat (detail, buffer);
+  memset (buffer, 0, 100);
+  sprintf (buffer, "contamination rate->%f", contamination_rate);	// stored uncomplemented in every mode
+  strcat (detail, buffer);
+  memset (buffer, 0, 100);
+  reads_num = 0;		// counters are reset for the next filter; checky is not
+  reads_contam = 0;
+  contamination_rate = 0;
+}
+
+/*-------------------------------------*/
+char *
+jump (char *target)
+{
+  // Sampling helper: returns target itself, or the start of the next record.
+  float seed = rand () % 10;
+  // seed is 0..9; the record is skipped when seed >= sampling_rate * 10.
+  if (seed >= (float) sampling_rate * 10)
+    {
+      // type 1 records start at '>', all other types at an '@' after a newline.
+      char *point;
+
+      if (type == 1)
+	point = strchr (target + 1, '>');	//point to >
+      else
+	point = strstr (target + 1, "\n@");	//point to the newline before @
+
+      if (point)		// the old "strstr (...) + 1" turned NULL into a bogus non-NULL pointer
+	target = (type == 1) ? point : point + 1;
+    }
+  return target;
+}
+
+/*-------------------------------------*/
+void
+statistic_save (char *detail, char *filename)
+{
+  // Writes detail to "<dir><stem>.info".  <dir> is the global prefix when it
+  // is set, otherwise the directory part of filename (up to the last '/');
+  // <stem> is the base name of filename without its last extension.
+
+  char *slash = strrchr (filename, '/');
+  char *base = slash ? slash + 1 : filename;
+  // Only a '.' inside the base name counts; without one the whole base name
+  // is the stem (the old code subtracted from a NULL strrchr result).
+  char *dot = strrchr (base, '.');
+  char *dir = prefix ? prefix : filename;
+  size_t dir_len = prefix ? strlen (prefix) : (size_t) (base - filename);
+  size_t stem_len = dot ? (size_t) (dot - base) : strlen (base);
+  // Sized from the inputs; the fixed 200/100-byte buffers could overflow.
+  char *save_file = (char *) malloc (dir_len + stem_len + sizeof (".info"));
+
+  printf ("filename->%s\n", filename);
+
+  if (!save_file)
+    {
+      fprintf (stderr, "statistic_save: out of memory\n");
+      return;
+    }
+
+  memcpy (save_file, dir, dir_len);
+  memcpy (save_file + dir_len, base, stem_len);
+  memcpy (save_file + dir_len + stem_len, ".info", sizeof (".info"));
+
+  printf ("bloom name->%s\n", save_file);
+
+  write_result (save_file, detail);
+  free (save_file);		// the name buffers used to be leaked
+}
+
+void
+list_init ()
+{				// Create the two sentinel nodes of the global piece queue.
+  head = NEW (Queue);
+  // NOTE(review): only head->next is assigned here; confirm that NEW or the
+  tail = NEW (Queue);
+  // callers initialise the remaining fields (location, tail->next).
+  head->next = tail;
+}
+
+/*
+
+char* reallocate(Queue *info)
+{
+  char *temp_piece = (char *) malloc (last_piece*sizeof(char)+1);
+  memcpy (temp_piece,info->location,last_piece);
+  //rintf("here\n");
+  temp_piece[last_piece]='\0';
+  return temp_piece;
+}
+
+*/
diff --git a/drass/remove.h b/drass/remove.h
new file mode 100644
index 0000000..c9359c7
--- /dev/null
+++ b/drass/remove.h
@@ -0,0 +1,11 @@
+#ifndef FACS_REMOVE_H
+#define FACS_REMOVE_H
+// Guard name is specific to this header so it cannot collide with remove_l.h's guard.
+#include "bloom.h"
+extern void fasta_process_m (bloom * bl, Queue * info, Queue * tail, float tole_rate, F_set *File_head);
+extern void fastq_process_m (bloom * bl, Queue * info, Queue * tail, float tole_rate, F_set *File_head);
+extern int remove_main(int argc, char** argv);
+extern int remove_reads(char *source, char *ref, char *list, char *prefix, float tole_rate);
+extern void save_result (char *source, char *obj_file, int type, char *prefix, char *clean, char *clean2, char *contam, char *contam2);
+
+#endif
diff --git a/drass/remove_l.h b/drass/remove_l.h
new file mode 100644
index 0000000..a105546
--- /dev/null
+++ b/drass/remove_l.h
@@ -0,0 +1,12 @@
+#ifndef FACS_REMOVE_L_H
+#define FACS_REMOVE_L_H
+// Guard name is specific to this header so it cannot collide with remove.h's guard.
+#include "bloom.h"
+extern void fasta_process_ml (F_set * File_head, bloom * bl, Queue * info, Queue * tail, char *clean, char *contam, float tole_rate);
+extern void fastq_process_ml (F_set * File_head, bloom * bl, Queue * info, Queue * tail, char *clean, char *contam, float tole_rate);
+extern void save_result_ml (char *source, char *obj_file, char *data, char *data2, int flag, int type, char* prefix);
+extern void all_save (F_set * File_head2, Queue * head2, Queue * tail, char *source, char *clean, char *clean2, char *contam, char *contam2, char *position, int type, char *prefix);
+extern int count_read (char *begin, char *next, int type);
+extern int remove_main_l(float tole_rate, char *source, char *ref, char *list, char *prefix, int help);
+
+#endif
diff --git a/drass/setup.cfg b/drass/setup.cfg
new file mode 100644
index 0000000..01bb954
--- /dev/null
+++ b/drass/setup.cfg
@@ -0,0 +1,3 @@
+[egg_info]
+tag_build = dev
+tag_svn_revision = true
diff --git a/drass/setup.py b/drass/setup.py
new file mode 100755
index 0000000..1c20fc0
--- /dev/null
+++ b/drass/setup.py
@@ -0,0 +1,34 @@
+from setuptools import setup, find_packages
+from setuptools.extension import Extension
+import sys, os
+# Build script for the "facs" C extension module.
+version = '0.1'
+# NOTE(review): the C sources in this patch define _FILE_OFFSET_BITS (leading underscore); confirm FILE_OFFSET_BITS / LARGE_FILE below are the intended macro names.
+c_ext = Extension("facs", define_macros = [('DEBUG', '1'), ('FIFO', '1'), ('FILE_OFFSET_BITS', '64'), ('LARGE_FILE', '1')],
+                           sources = ["facs.c", "tool.c", "bloom.c", "good_build.c",
+                                      "suggestions.c", "lookup8.c", "file_dir.c",
+                                      "simple_check_1_ge.c", "big_query.c", "simple_remove.c"],
+                           extra_compile_args = ['-fopenmp'],  # the C sources use OpenMP pragmas
+                           extra_link_args=['-lgomp', '-lz'])
+# Package metadata; ext_modules wires in the C extension defined above.
+setup(name='facs',
+      version=version,
+      description="FACS bloom filter implementation",
+      long_description="""FACS you""",
+      ext_modules=[c_ext],
+      classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+      keywords='bloom filter probabilistic',
+      author='Enze Liu, Lars Arvestad, Henrik Stranneheim, Roman Valls Guimera',
+      author_email='roman@scilifelab.se',
+      url='http://facs.scilifelab.se/',
+      license='GPLv3',
+      packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+      include_package_data=True,
+      zip_safe=False,
+      install_requires=[
+          # -*- Extra requirements: -*-
+      ],
+      entry_points="""
+      # -*- Entry points: -*-
+      """,
+      )
diff --git a/drass/simple_check_1_ge.c b/drass/simple_check_1_ge.c
new file mode 100644
index 0000000..206af97
--- /dev/null
+++ b/drass/simple_check_1_ge.c
@@ -0,0 +1,264 @@
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*-------------------------------------*/
+//for file mapping in Linux
+#include<fcntl.h>
+#include<unistd.h>
+#include<sys/stat.h>
+#include<sys/time.h>
+#include<sys/mman.h>
+#include<sys/types.h>
+/*-------------------------------------*/
+#include "tool.h"
+#include "check.h"
+#include "bloom.h"
+#include "file_dir.h"
+/*-------------------------------------*/
+//openMP library
+#include<omp.h>
+/*-------------------------------------*/
+
+int check_main (int argc, char **argv)
+{
+   if (argc<2)  check_help();
+   // Parses the options of the check sub-command and hands them to check_all.
+/*-------defaults for the check options-------*/ 
+  int opt;
+  float tole_rate = 0;		// 0 makes check_all ask mco_suggestion for a value
+  float sampling_rate = 1;
+  char* ref = NULL;
+  char* list = NULL;
+  char* target_path = NULL;
+  char* source = NULL;
+  while ((opt = getopt (argc, argv, "s:t:r:o:q:l:h")) != -1) {
+      switch (opt) {
+          case 't':		// -t: tole_rate
+              (optarg) && ((tole_rate = atof(optarg)), 1);	// assign only when an argument is present
+              break;
+          case 's':		// -s: sampling_rate
+              (optarg) && ((sampling_rate = atof(optarg)), 1);
+              break;
+          case 'o':		// -o: passed to check_all as prefix
+              (optarg) && ((target_path = optarg), 1);
+              break;
+          case 'q':		// -q: passed to check_all as source
+              (optarg) && (source = optarg, 1);  
+              break;
+          case 'r':		// -r: passed to check_all as ref
+              (optarg) && (ref = optarg, 1);  
+              break;
+          case 'l':		// -l: passed to check_all as list
+              (optarg) && (list = optarg, 1);  
+              break;
+          case 'h':		// NOTE(review): no break; if check_help returns, control falls into '?' - confirm
+              check_help();
+          case '?':
+              printf ("Unknown option: -%c\n", (char) optopt);
+              check_help();
+      } 
+  } 
+  return check_all (source, ref, tole_rate, sampling_rate, list, target_path);
+}
+
+int check_all (char *source, char *ref, float tole_rate, float sampling_rate, char *list, char *prefix)
+{
+  // Screens the reads of 'source' against every filter from make_list (ref, list).
+  char *position;
+  char *detail = (char *) malloc (1000 * 1000 * sizeof (char));
+  memset (detail, 0, 1000 * 1000);
+  int type = 0;
+  // Per filter: process every queue piece in OpenMP tasks, then evaluate. Returns 1.
+  Queue *head = NEW (Queue);
+  Queue *tail = NEW (Queue);
+  bloom *bl_2 = NEW (bloom);
+  Queue *head2;
+  head->location=NULL;
+  head2 = head;
+  head->next = tail;
+  // make_list supplies the filter list; the extra NEW (F_set) here was leaked.
+  F_set *File_head = make_list (ref, list);
+
+  /*-------------------------------------*/
+  position = mmaping (source);
+  type = get_parainfo (position,head);
+  /*-------------------------------------*/
+  while (File_head)
+    {
+      load_bloom (File_head->filename, bl_2);
+      if (tole_rate==0)
+          tole_rate = mco_suggestion(bl_2->k_mer);
+#pragma omp parallel
+      {
+#pragma omp single nowait
+	{
+	  while (head != tail) {
+#pragma omp task firstprivate(head)
+	      { 
+		if (head->location!=NULL)
+                  {
+		  if (type == 1)
+		    fasta_process (bl_2, head, tail, File_head, sampling_rate,
+				   tole_rate);
+		  else
+		    fastq_process (bl_2, head, tail, File_head, sampling_rate,
+		  		   tole_rate);
+		  }
+	      }
+	      head = head->next;
+	    }
+	}			// End of single - no implied barrier (nowait)
+      }				// End of parallel region - implied barrier
+      evaluate (detail, File_head->filename, File_head);
+      /*-------------------------------------*/
+      File_head = File_head->next;
+      head = head2;
+      bloom_destroy (bl_2);
+      
+    }				//end while
+  statistic_save (detail, source, prefix);
+  munmap (position, strlen (position));
+  free (detail);		// the report buffer used to be leaked
+  //check ("test.fna","k_12.bloom","r", prefix, 1, 0.8);
+  return 1;
+}
+
+/*-------------------------------------*/
+void
+fastq_process (bloom * bl, Queue * info, Queue *tail, F_set * File_head,
+	       float sampling_rate, float tole_rate)
+{
+  // Counts reads and contaminated reads of one FASTQ piece into File_head.
+  char *p = info->location;
+  char *next = NULL, *temp = NULL, *temp_piece = NULL;
+  // A piece that does not start with '@' is ignored; the last piece ends at the NUL.
+  if (info->location[0] != '@') {
+    return;
+  } else if (info->next != tail && info->next->location!=NULL) {
+    next = info->next->location;
+  } else {
+    next = strchr (p, '\0');
+  }
+
+  while (p != next)
+    {
+      // jump () either returns p or a later record start (read subsampling)
+      temp = jump (p, 2, sampling_rate);	//generate random number and judge if need to scan this read
+
+      if (p != temp)
+	{
+	  p = temp;
+	  continue;
+	}
+
+#pragma omp atomic
+      File_head->reads_num++;
+
+      p = strchr (p, '\n') + 1;
+      if (fastq_read_check (p, strchr (p, '\n') - p, 'n', bl, tole_rate, File_head)> 0) {
+#pragma omp atomic
+	File_head->reads_contam++;
+      }
+      // three line steps: past the line at p and the two lines after it
+      p = strchr (p, '\n') + 1;
+      p = strchr (p, '\n') + 1;
+      p = strchr (p, '\n') + 1;
+    }				// outside while
+  if (temp_piece)		// temp_piece is never assigned in this function
+    free (temp_piece);
+
+}
+
+/*-------------------------------------*/
+void
+fasta_process (bloom * bl, Queue * info, Queue * tail, F_set * File_head,
+	       float sampling_rate, float tole_rate)
+{				// Counts reads and contaminated reads of one FASTA piece into File_head.
+  #ifdef DEBUG
+  printf ("fasta processing...\n");
+  #endif
+  char *temp_next, *next, *temp;
+  // The piece ends where the next node starts, or at the NUL for the last node.
+  if (info->location == NULL)
+    return;
+  else if (info->next != tail)
+    next = info->next->location;
+  else
+    next = strchr (info->location, '\0');
+
+  char *p = info->location;
+
+  while (p != next)
+    {
+      temp = jump (p, 1, sampling_rate);	//generate random number and judge if need to scan this read
+      // jump () either returns p or a later record start (read subsampling)
+      if (p != temp)
+	{
+	  p = temp;
+	  continue;
+	}
+
+#pragma omp atomic
+      File_head->reads_num++;
+      // the record runs up to the next '>' or, failing that, to the piece end
+      temp_next = strchr (p + 1, '>');
+      if (!temp_next)
+	temp_next = next;
+
+      if (fasta_read_check (p, temp_next, 'n', bl, tole_rate, File_head) > 0)
+	{
+#pragma omp atomic
+	  File_head->reads_contam++;
+	}
+
+      p = temp_next;
+    }
+}
+
+/*-------------------------------------*/
+void
+evaluate (char *detail, char *filename, F_set * File_head)
+{				// Print one filter's counters as JSON and append a summary row to detail.
+  char buffer[200] = { 0 };	// scratch for the summary row
+  float contamination_rate =
+    (float) (File_head->reads_contam) / (float) (File_head->reads_num);
+  // filename is echoed as "bloom_filename"; counters come from File_head.
+// JSON output format by default
+  printf("{\n");
+  printf ("\t\"total_read_count\": %lld,\n", File_head->reads_num);
+  printf ("\t\"contaminated_reads\": %lld,\n", File_head->reads_contam);
+  printf ("\t\"total_hits\": %lld,\n", File_head->hits);
+  printf ("\t\"contamination_rate\": %f,\n", contamination_rate);
+  printf ("\t\"bloom_filename\":\"%s\"\n", filename);
+  printf("}\n");
+  // The column header and the filter name are appended only in DEBUG builds.
+#ifdef DEBUG
+  strcat (detail, "Bloomfile\tAll\tContam\tcontam_rate\n");
+  strcat (detail, filename);
+#endif
+  // detail is extended with strcat and no bounds check.
+  sprintf (buffer, "  %lld\t%lld\t%f\n", File_head->reads_num,
+	   File_head->reads_contam, contamination_rate);
+  strcat (detail, buffer);
+}
+
+/*-------------------------------------*/
+void
+statistic_save (char *detail, char *filename, char *prefix)
+{				// Write detail to the file named by prefix_make (...) plus ".info".
+  char *save_file = NULL;
+  save_file = prefix_make (filename, NULL, prefix);
+  if (save_file[0]=='/')	// a leading '/' is dropped from the name
+      save_file++;
+  strcat (save_file,".info");	// NOTE(review): appends in place - assumes prefix_make returns a writable buffer with spare room; confirm
+  // save_file is not released here.
+#ifdef DEBUG
+  printf ("Basename->%s\n", filename);
+  printf ("Info name->%s\n", save_file);
+#endif
+  write_result (save_file, detail);
+}
diff --git a/drass/simple_remove.c b/drass/simple_remove.c
new file mode 100644
index 0000000..011f985
--- /dev/null
+++ b/drass/simple_remove.c
@@ -0,0 +1,347 @@
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*-------------------------------------*/
+//for file mapping in Linux and timing
+#include<fcntl.h>
+#include<unistd.h>
+#include<sys/stat.h>
+#include<sys/time.h>
+#include<sys/mman.h>
+#include<sys/types.h>
+/*-------------------------------------*/
+#include "tool.h"
+#include "bloom.h"
+#include "remove.h"
+#include "file_dir.h"
+/*-------------------------------------*/
+//openMP library
+#include<omp.h>
+//#include<mpi.h>
+/*-------------------------------------*/
+char *clean, *contam;
+/*-------------------------------------*/
+
+int remove_main(int argc, char** argv)
+{				// Parses the options of the remove sub-command and hands them to remove_reads.
+  if (argc < 2) remove_help();
+/*-------defaults for the remove options-------*/ 
+  int opt;
+  float tole_rate = 0;		// 0 makes remove_reads ask mco_suggestion for a value
+  char* ref = NULL;
+  char* list = NULL;
+  char* target_path = NULL;
+  char* source = NULL;
+  while ((opt = getopt (argc, argv, "t:r:o:q:l:h")) != -1) {
+      switch (opt) {
+          case 't':		// -t: tole_rate
+              (optarg) && ((tole_rate = atof(optarg)), 1);	// assign only when an argument is present
+              break;
+          case 'o':		// -o: passed to remove_reads as prefix
+              (optarg) && ((target_path = optarg), 1);
+              break;
+          case 'q':		// -q: passed to remove_reads as source
+              (optarg) && (source = optarg, 1);  
+              break;
+          case 'r':		// -r: passed to remove_reads as ref
+              (optarg) && (ref = optarg, 1);  
+              break;
+          case 'l':		// -l: passed to remove_reads as list
+              (optarg) && (list = optarg, 1);  
+              break;
+          case 'h':		// NOTE(review): no break; if remove_help returns, control falls into '?' - confirm
+              remove_help();
+          case '?':
+              printf ("Unknown option: -%c\n", (char) optopt);
+              remove_help();
+      } 
+  } 
+  return remove_reads(source, ref, list, target_path, tole_rate);
+}
+int remove_reads(char *source, char *ref, char *list, char *prefix, float tole_rate)
+{
+  // Splits the reads of 'source' into clean/contaminated files, once per filter. Returns 0.
+  int type = 1;
+  char *position;
+  char *clean2;			// start of the clean buffer ('clean' is its write cursor)
+  char *contam2;		// start of the contaminated buffer ('contam' is its write cursor)
+  size_t length;		// size of the mapped input text, measured once
+  bloom *bl_2 = NEW (bloom);
+  Queue *head = NEW (Queue);
+  Queue *tail = NEW (Queue);
+  head->next = tail;
+  Queue *head2 = head;
+  head->location = NULL;	// same start state as check_all; the tasks test location for NULL
+  F_set *File_head = make_list (ref, list);	// the extra NEW (F_set) here was leaked
+  position = mmaping (source);
+  type = get_parainfo (position, head);
+  length = strlen (position);
+  clean = (char *) malloc ((length + 1) * sizeof (char));	// +1 keeps a terminator when every read lands in one buffer
+  contam = (char *) malloc ((length + 1) * sizeof (char));
+  clean2 = clean;
+  contam2 = contam;
+  while (File_head)
+    {
+      memset (clean2, 0, length + 1);
+      memset (contam2, 0, length + 1);
+      load_bloom (File_head->filename, bl_2);
+      if (tole_rate==0)
+      	tole_rate = mco_suggestion (bl_2->k_mer);
+#pragma omp parallel
+      {
+#pragma omp single nowait
+	{
+	  while (head != tail)
+	    {
+#pragma omp task firstprivate(head)
+	      {
+		if (head->location!=NULL) {
+		  if (type == 1)
+		    fasta_process_m (bl_2, head, tail, tole_rate, File_head);
+		  else
+		    fastq_process_m (bl_2, head, tail, tole_rate, File_head);
+                }
+	      }
+	      head = head->next;	// was placed after the loop body, so the loop never advanced
+	    }
+	}			// End of single - no implied barrier (nowait)
+      }				// End of parallel region - implied barrier
+      save_result (source, File_head->filename, type, prefix, clean, clean2,
+		   contam, contam2);
+      clean = clean2;		// rewind both write cursors for the next filter
+      contam = contam2;
+      File_head = File_head->next;
+      head = head2;
+      bloom_destroy (bl_2);
+    }				//end while
+  munmap (position, length);
+  free (clean2);
+  free (contam2);
+  printf ("finish processing...\n");
+  return 0;
+}
+
+/*-------------------------------------*/
+void
+fastq_process_m (bloom * bl, Queue * info, Queue * tail, float tole_rate, F_set *File_head)
+{
+  // Copies every record of one FASTQ piece to the global clean or contam buffer.
+  int read_num = 0, read_contam = 0;
+  char *p = info->location;
+  char *next, *temp_start, *temp_end, *temp_piece = NULL;
+  // The piece ends where the next node starts, or at the NUL for the last node.
+  if (info->next == NULL)
+    return;
+
+  else if (info->next != tail)
+    next = info->next->location;
+
+  else
+    next = strchr (p, '\0');
+
+  while (p != next)
+    {
+
+      read_num++;
+
+      temp_start = p;
+
+      if (p == NULL || *p == '\0')	// was p == '\0', i.e. a second NULL test
+	break;
+
+      p = strchr (p, '\n') + 1;
+      // the record ends at the next "\n@" or, failing that, at the NUL
+      temp_end = strstr (p, "\n@");
+
+      if (!temp_end)
+	temp_end = strchr (p, '\0');
+      int result =
+	fastq_read_check (p, strchr (p, '\n') - p, 'n', bl, tole_rate, File_head);
+      // result 0 goes to the clean buffer, result > 0 to the contam buffer
+      if (result == 0)
+	{
+#pragma omp critical
+	  {
+	    memcpy (clean, temp_start, temp_end - temp_start);
+	    clean += (temp_end - temp_start);
+	    if (*temp_end != '\0')
+	      {
+		clean[0] = '\n';
+		clean++;
+	      }
+	  }
+	}
+      else if (result > 0)
+	{
+#pragma omp critical
+	  {
+            read_contam++;
+	    memcpy (contam, temp_start, temp_end - temp_start);
+	    contam += (temp_end - temp_start);
+	    if (*temp_end != '\0')
+	      {
+		contam[0] = '\n';
+		contam++;
+	      }
+	  }
+	}
+
+
+      if (*temp_end == '\0')
+	break;
+
+      p = temp_end + 1;
+
+    }				// outside while
+//free(key);
+  if (temp_piece)
+    free (temp_piece);
+}
+
+/*-------------------------------------*/
+void
+fasta_process_m (bloom * bl, Queue * info, Queue * tail, float tole_rate, F_set *File_head)
+{				// Copies every record of one FASTA piece to the global clean or contam buffer.
+  printf ("fasta processing...\n");
+  // read_num / read_contam are local tallies, printed at the end.
+  int read_num = 0, read_contam = 0;
+
+  char *p = info->location;
+
+  char *next;
+
+  char *temp = p;
+  // The piece ends where the next node starts, or at the NUL for the last node.
+  if (info->next == NULL)
+    return;
+  else if (info->next != tail)
+    next = info->next->location;
+  else
+    next = strchr (p, '\0');
+
+  while (p != next)
+    {
+      read_num++;
+      temp = strchr (p + 1, '>');	// the record runs up to the next '>' ...
+      if (!temp)
+	temp = next;		// ... or, failing that, to the piece end
+      // result 0 goes to the clean buffer, result > 0 to the contam buffer
+      int result = fasta_read_check (p, temp, 'n', bl, tole_rate, File_head);
+      if (result == 0)
+	{
+#pragma omp critical
+	  {
+	    memcpy (clean, p, temp - p);
+	    clean += (temp - p);
+	  }
+	}
+      else if (result > 0)
+	{
+#pragma omp critical
+	  {
+            read_contam++;
+	    memcpy (contam, p, temp - p);
+	    contam += (temp - p);
+	  }
+	}
+      p = temp;
+    }
+  printf ("all->%d\ncontam->%d\n", read_num, read_contam);
+}
+
+/*-------------------------------------*/
+void
+save_result (char *source, char *obj_file, int type, char *prefix,
+	     char *clean_cur, char *clean2, char *contam_cur, char *contam2)
+{				// Write clean2 / contam2 to their result files, then empty both buffers.
+  printf ("source->%s\n",source);
+  printf ("obj_file->%s\n",obj_file);
+  printf ("prefix->%s\n", prefix ? prefix : "(null)");	// %s must not receive NULL
+  char *so = NULL, *obj = NULL, *so_base, *obj_base, *so_dot, *obj_dot;
+  // Names: <dir><source stem>_<filter stem>_contam|_clean.<fasta|fastq>.
+  char *match = (char *) malloc (400 * sizeof (char)),
+    *mismatch = (char *) malloc (400 * sizeof (char)),
+    *so_name = (char *) malloc (200 * sizeof (char)),
+    *obj_name = (char *) malloc (200 * sizeof (char));
+  // NOTE: the name buffers are fixed-size; very long paths are not handled.
+  memset (match, 0, 400);
+  memset (mismatch, 0, 400);
+  memset (so_name, 0, 200);
+  memset (obj_name, 0, 200);
+
+  so = strrchr (source, '/');
+  obj = strrchr (obj_file,'/');
+  // Base names: the text after the last '/', or the whole path without one.
+  so_base = so ? so + 1 : source;
+  obj_base = obj ? obj + 1 : obj_file;
+  // Stems: base name up to its last '.'; a name without '.' is used whole
+  // (the old code subtracted from a NULL strrchr result in that case).
+  so_dot = strrchr (so_base, '.');
+  obj_dot = strrchr (obj_base, '.');
+  if (so_dot)
+    strncat (so_name, so_base, so_dot - so_base);
+  else
+    strcat (so_name, so_base);
+  if (obj_dot)
+    strncat (obj_name, obj_base, obj_dot - obj_base);
+  else
+    strcat (obj_name, obj_base);
+
+  if (prefix)
+    {
+      strcat (match, prefix);
+      strcat (mismatch, prefix);
+    }
+  else if (so)
+    {
+      strncat (match, source, so_base - source);
+      strncat (mismatch, source, so_base - source);
+    }
+  // Without a prefix the directory of 'source' (including the '/') is used;
+  // a bare file name gets no directory part at all.
+
+  strcat (match, so_name);
+  strcat (mismatch, so_name);
+
+
+  strcat (match, "_");
+  strcat (mismatch, "_");
+
+
+  strcat (match, obj_name);
+  strcat (mismatch, obj_name);
+
+  // "match" names the contaminated reads, "mismatch" the clean ones.
+  strcat (match, "_contam");
+  strcat (mismatch, "_clean");
+
+  if (type == 1)
+    {
+      strcat (match, ".fasta");
+      strcat (mismatch, ".fasta");
+    }
+  else
+    {
+      strcat (match, ".fastq");
+      strcat (mismatch, ".fastq");
+    }
+  printf ("match->%s\n", match);
+  printf ("mis->%s\n", mismatch);
+
+  write_result (match, contam2);
+  write_result (mismatch, clean2);
+  free (match);
+  free (mismatch);
+  free (so_name);
+  free (obj_name);
+  memset (contam2, 0, strlen (contam2));
+  memset (clean2, 0, strlen (clean2));
+  clean = clean2;		// file-scope cursors: the parameters used to be named clean/contam,
+  contam = contam2;		// shadowed them, and turned this rewind into a no-op
+}
+
+/*-------------------------------------*/
diff --git a/drass/simple_remove_l.c b/drass/simple_remove_l.c
new file mode 100644
index 0000000..2be92bf
--- /dev/null
+++ b/drass/simple_remove_l.c
@@ -0,0 +1,392 @@
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*-------------------------------------*/
+//for file mapping in Linux
+#include<fcntl.h>
+#include<unistd.h>
+#include<sys/stat.h>
+#include<sys/time.h>
+#include<sys/mman.h>
+#include<sys/types.h>
+/*-------------------------------------*/
+#include "tool.h"
+#include "bloom.h"
+#include "remove_l.h"
+#include "file_dir.h"
+/*-------------------------------------*/
+//openMP library
+#include<omp.h>
+/*-------------------------------------*/
+char *clean, *contam;	/* output buffers: remove_main_l allocates them and passes them to the *_process_ml workers and to all_save */
+/*-------------------------------------*/
+int
+remove_main_l (float tole_rate, char *source, char *ref, char *list, char *prefix, int help)
+{
+  /* Screen the reads of SOURCE against every bloom filter that
+     make_list (ref, list) returns, then let all_save write the outcome.
+     HELP == 1 prints the usage text and exits.  Always returns 0.  */
+  if (help == 1)
+    {
+      remove_l_help ();
+      exit (1);
+    }
+  long sec;
+  struct timezone tz;
+  struct timeval tv, tv2;
+  gettimeofday (&tv, &tz);
+  char *position;
+  int type = 1;
+  size_t length;
+  char *clean2, *contam2;
+  bloom *bl_2 = NEW (bloom);
+  Queue *head = NEW (Queue);
+  Queue *tail = NEW (Queue);
+  head->next = tail;
+  Queue *head2 = head;
+  /* make_list supplies the list; the extra NEW (F_set) was overwritten at once */
+  F_set *File_head = make_list (ref, list);
+  F_set *File_head2 = File_head;
+  position = mmaping (source);
+  type = get_parainfo (position, head);
+  length = strlen (position);
+  /* zero-filled and one byte longer than the input, so the strlen () calls
+     made while saving always find a terminator */
+  clean = (char *) calloc (length + 1, sizeof (char));
+  contam = (char *) calloc (length + 1, sizeof (char));
+  /* take the buffer starts only now: captured before the allocation they
+     were whatever the globals held earlier, not these buffers */
+  clean2 = clean;
+  contam2 = contam;
+  while (File_head)
+    {
+      load_bloom (File_head->filename, bl_2);
+      printf ("File name->%s ", File_head->filename);
+      printf ("File number->%d\n", File_head->number);
+#pragma omp parallel
+      {
+#pragma omp single nowait
+        {
+          while (head != tail)
+            {
+#pragma omp task firstprivate(head)
+              {
+                if (head->location)
+                  {
+                    if (type == 1)
+                      fasta_process_ml (File_head, bl_2, head,tail,clean,contam,tole_rate);
+                    else
+                      fastq_process_ml (File_head, bl_2, head,tail,clean,contam,tole_rate);
+                  }
+              }
+              head = head->next;
+            }
+        }                       // End of single - no implied barrier (nowait)
+      }                         // End of parallel region - implied barrier
+      head = head2;
+      bloom_destroy (bl_2);
+      File_head = File_head->next;
+    }                           // End outside while
+  all_save (File_head2, head2, tail, source, clean, clean2, contam, contam2,
+            position, type, prefix);
+  munmap (position, length);
+  printf ("finish processing...\n");
+  /* always measured: without DEBUG the original printed sec uninitialised */
+  gettimeofday (&tv2, &tz);
+  sec = tv2.tv_sec - tv.tv_sec;
+  printf ("total=%ld sec\n", sec);
+  return 0;
+}
+
+/* Worker for one FASTQ chunk (info->location up to the next queue entry): a read whose check
+   returns 0 is appended to CLEAN, a positive score is recorded in info->score/info->number.
+   NOTE(review): CLEAN is a by-value cursor, so its advance is not seen by other calls - confirm.  */
+void
+fastq_process_ml (F_set * File_head, bloom * bl, Queue * info,  Queue * tail, char *clean, char *contam, float tole_rate)
+{
+  printf ("fastq processing...\n");
+  int read_num = 0, result = 0, countup = 0;
+  char *p = info->location;
+  char *next, *temp_start, *temp_end, *seq_end;
+
+  if (info->next == NULL)
+    return;
+  else if (info->next != tail)
+    next = info->next->location;
+  else
+    next = strchr (p, '\0');
+
+  if (info->score == NULL)
+    {
+      read_num = count_read (p, next, 2);
+      printf ("read_num->%d\n", read_num);
+      /* calloc: the scores are compared below before they are ever written */
+      info->score = (short *) calloc (read_num, sizeof (short));
+      info->number = (short *) calloc (read_num, sizeof (short));
+    }
+  while (p != next)
+    {
+      temp_start = p;
+      /* look at the character: the original compared the pointer with '\0' */
+      if (p == NULL || *p == '\0')
+        break;
+      p = strchr (p, '\n');     /* end of the header line */
+      if (p == NULL)
+        break;
+      p++;                      /* first base of the read */
+      temp_end = strstr (p, "\n@");
+      if (!temp_end)
+        temp_end = strchr (p, '\0');
+      seq_end = strchr (p, '\n');
+      if (seq_end == NULL)
+        seq_end = temp_end;
+
+      /* tool.h declares model as a char and requires File_head; "normal"
+         with five arguments did not match that prototype */
+      result = fastq_read_check (p, seq_end - p, 'n', bl, tole_rate, File_head);
+
+      if (result == 0)
+        {
+#pragma omp critical
+          {
+            memcpy (clean, temp_start, temp_end - temp_start);
+            clean += (temp_end - temp_start);
+            if (*temp_end != '\0')
+              {
+                clean[0] = '\n';
+                clean++;
+              }
+          }
+        }
+      else if (info->score[countup] < result)
+        {
+          info->score[countup] = result;        //record score
+          info->number[countup] = File_head->number;    //record bloom number
+        }
+      if (*temp_end == '\0')
+        break;
+      p = temp_end + 1;
+      countup++;
+    }                           // end while
+}
+
+/* Worker for one FASTA chunk (info->location up to the next queue entry): a record whose check
+   returns 0 is appended to CLEAN, a positive score is recorded in info->score/info->number.
+   NOTE(review): CLEAN is a by-value cursor, so its advance is not seen by other calls - confirm.  */
+void
+fasta_process_ml (F_set * File_head, bloom * bl, Queue * info, Queue * tail, char *clean, char *contam, float tole_rate)
+{
+  printf ("fasta processing...\n");
+
+  int read_num = 0, result = 0, countup = 0;
+  char *p = info->location;
+  char *next;
+  char *temp = p;
+
+  if (info->next == NULL)
+    return;
+  else if (info->next != tail)
+    next = info->next->location;
+  else
+    next = strchr (p, '\0');
+
+  if (info->score == NULL)
+    {
+      read_num = count_read (p, next, 1);
+      /* calloc: the scores are compared below before they are ever written */
+      info->score = (short *) calloc (read_num, sizeof (short));
+      info->number = (short *) calloc (read_num, sizeof (short));
+    }
+
+  while (p != next)
+    {
+      temp = strchr (p + 1, '>');
+      if (!temp)
+        temp = next;
+
+      /* tool.h declares model as a char and requires File_head; "normal"
+         with five arguments did not match that prototype */
+      result = fasta_read_check (p, temp, 'n', bl, tole_rate, File_head);
+      if (result == 0)
+        {
+#pragma omp critical
+          {
+            memcpy (clean, p, temp - p);
+            clean += (temp - p);
+          }
+        }
+      /* any other result falls through to the cursor advance; the original's
+         `continue` on -1 never moved P and would have looped forever */
+      else if (info->score[countup] < result)
+        {
+          info->score[countup] = result;        //record score
+          info->number[countup] = File_head->number;    //record bloom number
+        }
+
+      countup++;
+
+      p = temp;
+    }                           // end while
+}
+
+/*
+ * Write the buffer DATA2 to a result file and clear the buffer afterwards.
+ *
+ * The file name is built as
+ *   <dir><source stem>_<obj_file stem>_<clean|contam>.<fasta|fastq>
+ * where <dir> is PREFIX when given, otherwise the directory part of SOURCE
+ * (empty when SOURCE has none); a stem is a base name without its last
+ * extension.  FLAG == 0 selects "_clean", anything else "_contam";
+ * TYPE == 1 selects ".fasta", anything else ".fastq".
+ *
+ * DATA is accepted for interface compatibility only: the original merely
+ * assigned DATA2 to this by-value parameter, which no caller could observe.
+ */
+void
+save_result_ml (char *source, char *obj_file, char *data, char *data2,
+                int flag, int type, char *prefix)
+{
+  printf ("saving...\n");
+  const char *label = (flag == 0) ? "_clean" : "_contam";
+  const char *ext = (type == 1) ? ".fasta" : ".fastq";
+  const char *dir = "";
+  const char *so, *obj, *dot;
+  int dir_len = 0, so_len, obj_len, need;
+  char *match;
+
+  (void) data;
+
+  /* base names: the text after the last '/', or the whole path */
+  so = strrchr (source, '/');
+  so = so ? so + 1 : source;
+  obj = strrchr (obj_file, '/');
+  obj = obj ? obj + 1 : obj_file;
+
+  /*
+   * Stems: look for the extension inside the base name only.  The original
+   * searched the whole path, so a missing '.' (or one that only occurs in a
+   * directory name) produced a NULL or negative length for strncat.
+   */
+  dot = strrchr (so, '.');
+  so_len = dot ? (int) (dot - so) : (int) strlen (so);
+  dot = strrchr (obj, '.');
+  obj_len = dot ? (int) (dot - obj) : (int) strlen (obj);
+
+  if (prefix)
+    {
+      dir = prefix;
+      dir_len = (int) strlen (prefix);
+    }
+  else if (so != source)
+    {
+      dir = source;
+      dir_len = (int) (so - source);    /* keeps the trailing '/' */
+    }
+
+  /*
+   * Size the name exactly instead of strcat-ing into fixed HUN-sized
+   * buffers, which overflowed on long paths.
+   */
+  need = snprintf (NULL, 0, "%.*s%.*s_%.*s%s%s", dir_len, dir, so_len, so,
+                   obj_len, obj, label, ext);
+  match = (char *) malloc ((size_t) need + 1);
+  if (match == NULL)
+    {
+      perror ("save_result_ml");
+      return;
+    }
+  snprintf (match, (size_t) need + 1, "%.*s%.*s_%.*s%s%s", dir_len, dir,
+            so_len, so, obj_len, obj, label, ext);
+  printf ("match->%s\n", match);
+
+  write_result (match, data2);
+  free (match);
+
+  /* clear what was just written so the buffer can be reused */
+  memset (data2, 0, strlen (data2));
+}
+
+/* Return 1 plus the number of marker searches made while walking from DATA towards NEXT.  */
+int
+count_read (char *data, char *next, int type)
+{
+  /*
+   * TYPE == 1 searches for '>' (FASTA); any other value searches for "\n@"
+   * (FASTQ).  The tally starts at 1 and grows once per search; the walk stops
+   * when the cursor equals NEXT or no further marker exists.
+   */
+  char *cursor;
+  int total = 1;
+  printf ("count_read\n");
+  for (cursor = data; cursor != next;)
+    {
+      total++;
+      cursor = (type == 1) ? strchr (cursor + 1, '>')
+                           : strstr (cursor + 1, "\n@");
+      if (cursor == NULL)
+        break;
+    }
+  return total;
+}
+
+/* Write the clean buffer, then for every filter in File_head2 collect the reads scored against it and write them.  */
+void
+all_save (F_set * File_head2, Queue * head2, Queue * tail, char *source, char *clean,
+          char *clean2, char *contam, char *contam2, char *position, int type,
+          char *prefix)
+{
+  char *pos, *next, *temp_next;
+  int countup;
+  Queue *head;
+  save_result_ml (source, File_head2->filename, clean, clean2, 0, type, prefix);        // save the clean data
+  free (clean2);                /* not touched again: the original memset it after this free */
+  while (File_head2)
+    {
+      /* start every filter with an empty, zeroed buffer; the original zeroed
+         it right before saving, so nothing collected was ever written */
+      memset (contam2, 0, strlen (position));
+      contam = contam2;
+      for (head = head2->next; head != tail; head = head->next)
+        {
+          if (head->location == NULL || head->score == NULL)
+            continue;           /* no text or no scores recorded for this chunk */
+          countup = 0;
+          pos = head->location;
+          if (head->next != tail && head->next->location != NULL)
+            next = head->next->location;
+          else
+            next = strchr (head->location, '\0');
+          while (pos != next)
+            {
+              if (type == 1)
+                temp_next = strchr (pos + 1, '>');
+              else
+                {
+                  /* step past the '\n' so POS stays on a record start and
+                     can meet NEXT, as in fastq_process_ml */
+                  temp_next = strstr (pos + 1, "\n@");
+                  if (temp_next != NULL)
+                    temp_next++;
+                }
+              if (temp_next == NULL)
+                temp_next = next;
+              if (head->score[countup] > 0
+                  && (head->number[countup] == File_head2->number))
+                {
+                  memcpy (contam, pos, temp_next - pos);
+                  contam += temp_next - pos;
+                }
+              countup++;
+              pos = temp_next;
+            }
+        }
+      /* save_result_ml takes exactly these 7 arguments; the original named save_result here */
+      save_result_ml (source, File_head2->filename, contam, contam2, 1, type,
+                      prefix);
+      File_head2 = File_head2->next;
+    }
+}
diff --git a/drass/suggestions.c b/drass/suggestions.c
new file mode 100644
index 0000000..5b3b6cb
--- /dev/null
+++ b/drass/suggestions.c
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "bloom.h"
+/*------------------------------*/
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#define MB 1048576
+
+/* Return hits/total as a float, or 0 when hits per (total / 100 MB) stays below the random-hit level for K_MER; other K_MER values exit.  */
+float get_probability (BIGCAST hits, BIGCAST total, int k_mer)
+{
+  double times = (double) total / (100 * MB);
+  double prob = 0;
+  int rand_hit = 0;
+  switch (k_mer)
+    {
+    case 9:
+      rand_hit = 10;
+      break;
+    case 12:
+      rand_hit = 20;
+      break;
+    case 15:
+      rand_hit = 40;
+      break;
+    case 18:
+      rand_hit = 80;
+      break;
+    case 21:
+      rand_hit = 100;
+      break;
+    default:
+      printf ("cant handle this k_mer so far\n");
+      exit (-1);
+    }
+  prob = (total == 0) ? 0 : (double) hits / times;      /* nothing scanned: report 0 instead of dividing by zero */
+  if (prob < rand_hit)
+    return 0;
+  /* convert before dividing: assuming BIGCAST (bloom.h) is an integer type, hits/total truncated the ratio */
+  return (float) ((double) hits / (double) total);
+}
+
+BIGCAST get_size (char *filename)      /* size of FILENAME in bytes; prints an error and exits when it cannot be opened or examined */
+{
+  struct stat statbuf;
+  int fd = open (filename, O_RDONLY);  /* a plain int: the descriptor used to be kept in a BIGCAST */
+  if (fd < 0 || fstat (fd, &statbuf) < 0)
+    {
+      printf ("open file error...\n");
+      exit (-1);
+    }
+  close (fd);                          /* the original never closed it */
+  return statbuf.st_size;
+}
+
+int kmer_suggestion (BIGCAST size)
+{
+  /*
+   * Map SIZE to a suggested k-mer length.  Each row pairs an exclusive
+   * upper bound, expressed in units of MB, with the value returned for
+   * sizes below it; rows are tried in order and 20 is returned when no
+   * bound applies:
+   *
+   *     size <   1 * MB  ->  12
+   *     size <  20 * MB  ->  15
+   *     size <  50 * MB  ->  17
+   *     size < 200 * MB  ->  18
+   *     otherwise        ->  20
+   *
+   * SIZE is presumably a byte count such as get_size returns - confirm
+   * against the callers.  Only the value is returned; no bloom structure
+   * is modified here.
+   */
+  static const int bound_mb[] = { 1, 20, 50, 200 };
+  static const int k_for_bound[] = { 12, 15, 17, 18 };
+  const int rows = (int) (sizeof bound_mb / sizeof bound_mb[0]);
+  int row;
+
+  for (row = 0; row < rows; row++)
+    {
+      if (size < bound_mb[row] * MB)
+        {
+          return k_for_bound[row];
+        }
+    }
+
+  return 20;
+}
+
+float mco_suggestion (int k_mer)
+{
+  /* 0.4 for k_mer 15..17, 0.3 for anything shorter or longer */
+  int mid_range = (k_mer >= 15) && (k_mer < 18);
+
+  if (mid_range)
+    return 0.4;
+  return 0.3;
+}
+
+int
+get_suggestion (struct bloomstat *stats, BIGNUM n, double e)
+{
+  /* store the requested error rate and capacity, then let get_rec fill in the rest; always returns 0 */
+  stats->e = e;
+  stats->capacity = n;
+  get_rec (stats);
+  return 0;
+}
+
+BIGNUM
+find_close_prime (BIGNUM m)
+{
+  /* First odd value >= M that is_prime accepts. */
+  BIGNUM candidate = m;
+
+  if (candidate % 2 == 0)
+    candidate++;
+  for (; !is_prime (candidate); candidate += 2)
+    ;
+  return candidate;
+}
+
+/*
+Given the desired capacity (stat->capacity) and error rate (stat->e), fill in
+the number of hash functions (stat->ideal_hashes) and array size (stat->elements)
+*/
+void
+get_rec (struct bloomstat *stat)
+{
+  /* ideal k = log (e) / log (0.5); the cast now covers the whole quotient instead of truncating log (e) first */
+  stat->ideal_hashes = (int) (log (stat->e) / log (0.5));
+  stat->elements =
+    find_close_prime ((BIGNUM) 13 * stat->capacity *
+                      (BIGNUM) stat->ideal_hashes / (BIGNUM) 9);
+  /*
+     recalculate k with the actual m, not the ideal
+     wouldn't need to if it wasn't prime, but that causes problems
+     for hash algs
+   */
+  stat->ideal_hashes = 9 * stat->elements / (13 * stat->capacity);
+}
+
+int
+is_prime (BIGNUM m)
+{
+  /*
+   * Trial division by every odd number up to sqrt (m).  Returns 1 when M is
+   * prime, 0 otherwise.  The original stepped by 8 and so never tried the
+   * divisors 9, 17, 25, ... (289 = 17 * 17 passed as prime), rejected 3, and
+   * truncated M to int before the final modulo.
+   */
+  BIGNUM d;
+  if (m < 2 || (m % 2 == 0 && m != 2))
+    return 0;
+  for (d = 3; d <= m / d; d += 2)
+    if (m % d == 0)
+      return 0;
+  return 1;
+}
diff --git a/drass/tool.c b/drass/tool.c
new file mode 100644
index 0000000..3517189
--- /dev/null
+++ b/drass/tool.c
@@ -0,0 +1,453 @@
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <limits.h>
+#include <omp.h>
+#include "tool.h"
+#include "bloom.h"
+#include "file_dir.h"
+/*-------------------------------------*/
+
+int
+fastq_read_check (char *begin, int length, char model, bloom * bl, float tole_rate, F_set *File_head)
+{
+  /*
+   * Quick screen of one read of LENGTH bases starting at BEGIN: sample
+   * non-overlapping k-mers and, on the first bloom hit, run
+   * fastq_full_check over the whole read.  MODEL 'n' scans the read as
+   * given and, when that yields nothing, repeats the scan with MODEL 'r'
+   * (keys passed through rev_trans).  Returns the positive score reported
+   * by fastq_full_check, or 0.
+   *
+   * The loop only runs while more than k_mer bases remain, which made the
+   * original's tail branch (and its signal/previous variables) unreachable;
+   * it is dropped here, so a trailing fragment is still not sampled.
+   */
+  char *p = begin;
+  int distance = length;
+  int result = 0;
+  char *key = (char *) malloc (bl->k_mer * sizeof (char) + 1);
+
+  while (distance > bl->k_mer)
+    {
+      memcpy (key, p, sizeof (char) * bl->k_mer);
+      key[bl->k_mer] = '\0';
+      p += bl->k_mer;
+      distance -= bl->k_mer;
+
+      if (model == 'r')
+        rev_trans (key);
+
+      if (bloom_check (bl, key))
+        {
+          result = fastq_full_check (bl, begin, length, model, tole_rate, File_head);
+          /* a hit settles the forward scan either way; the reverse scan
+             keeps sampling until the full check succeeds */
+          if (result > 0 || model == 'n')
+            break;
+        }
+    }                           //outside while
+  free (key);                   /* was leaked on every path, once per recursion */
+
+  if (result > 0)
+    return result;
+  if (model == 'r')
+    return 0;
+  return fastq_read_check (begin, length, 'r', bl, tole_rate, File_head);
+}
+
+/* Score one read against bloom filter BL by looking up every k_mer-long window of the DISTANCE characters at P.  */
+int
+fastq_full_check (bloom * bl, char *p, int distance, char model,
+		  float tole_rate, F_set *File_head)
+{
+  /* MODEL 'r' passes each key through rev_trans before the lookup.  Adds the number of hits to
+     File_head->hits and (length - k_mer) to File_head->all_k.  Returns match_s when the
+     computed ratio reaches TOLE_RATE, otherwise 0.  */
+  int length = distance;
+
+  int count = 0, match_s = 0, mark = 1, match_time = 0;
+
+  float result;
+
+  char *key = (char *) malloc (bl->k_mer * sizeof (char) + 1);
+
+  short prev = 0, conse = 0; 
+
+  while (distance >= bl->k_mer)
+    {
+      memcpy (key, p, sizeof (char) * bl->k_mer);
+      key[bl->k_mer] = '\0';
+      p += 1;
+
+      if (model == 'r')
+	rev_trans (key);
+
+      if (count >= bl->k_mer)
+	{
+	  mark = 1;
+	  count = 0;
+	}
+ if (strlen (key) == bl->k_mer)
+	{
+	  if (bloom_check (bl, key))
+	    {
+	      match_time++;
+	      if (prev == 1)
+                  conse++;
+              else
+                  {
+                  conse+=bl->k_mer;
+                  prev = 1;
+                  }
+	      if (mark == 1)
+		{
+		  match_s += (bl->k_mer - 1);
+		  mark = 0;
+		}
+	      else
+		match_s++;
+	    }
+
+	  else
+	    {
+	      prev = 0;
+	      /* a miss ends the current run of consecutive hits */
+	    }
+	  count++;
+	}			//outside if
+	distance--;
+    }				// end while: the window has moved one character per pass
+  free (key);
+  result = (float)(match_time*bl->k_mer+conse)/(float)(length*bl->k_mer-2*bl->dx+conse);
+  /* conse grows by k_mer when a run of hits starts and by 1 for each further consecutive hit */
+  #pragma omp atomic
+  File_head->hits+=match_time;
+  #pragma omp atomic 
+  File_head->all_k+=(length-bl->k_mer);
+  if (result >= tole_rate)
+    return match_s;
+  else
+    return 0;
+}
+
+/*-------------------------------------*/
+int
+fasta_read_check (char *begin, char *next, char model, bloom * bl,
+                  float tole_rate, F_set *File_head)
+{
+  /*
+   * Quick screen of the FASTA record starting at BEGIN (its header line) and
+   * ending at NEXT: sample consecutive k-mers from the sequence, skipping line
+   * breaks, and on the first bloom hit run fasta_full_check over the record.
+   * MODEL 'n' scans the bases as given and, when that yields nothing, repeats
+   * the scan with MODEL 'r' (keys passed through rev_trans).  Returns 1 for a
+   * record without sequence lines, the positive fasta_full_check score, or 0.
+   */
+  char *p = strchr (begin + 1, '\n');
+
+  /* test the strchr result itself; NULL + 1 could never compare equal to NULL */
+  if (!p || *(++p) == '>')
+    return 1;
+
+  /* the original read n and m before assigning them */
+  int n = 0, m = 0, result = 0, have_full_key = 0;
+  char *key = (char *) malloc ((bl->k_mer + 1) * sizeof (char));
+  char *pre_key = (char *) malloc ((bl->k_mer + 1) * sizeof (char));
+
+  while (p != next)
+    {
+      while (n < bl->k_mer)
+        {
+          if (p[m] == '>' || p[m] == '\0')
+            {
+              m--;
+              break;
+            }
+
+          if (p[m] != '\r' && p[m] != '\n')
+            key[n++] = p[m];
+          m++;
+        }                       //inner while
+      key[n] = '\0';            /* terminate at what was actually collected */
+
+      if (m == 0)
+        break;
+
+      if (n == bl->k_mer)
+        {
+          memcpy (pre_key, key, sizeof (char) * (bl->k_mer + 1));
+          have_full_key = 1;
+        }
+      else if (!have_full_key)
+        break;                  /* record shorter than one k-mer */
+      else
+        {
+          /* Short remainder: build the record's last k_mer bases from the
+             tail of the previous key plus the remainder.  Done in place; the
+             original copied k_mer + 1 bytes into a k_mer-byte temporary. */
+          memmove (key + bl->k_mer - n, key, sizeof (char) * (n + 1));
+          memcpy (key, pre_key + n, sizeof (char) * (bl->k_mer - n));
+        }
+      p += m;
+      n = 0;
+      m = 0;
+
+      if (model == 'r')
+        rev_trans (key);
+
+      if (bloom_check (bl, key))
+        {
+          result = fasta_full_check (bl, begin, next, model, tole_rate, File_head);
+          if (result > 0 || model == 'n')
+            break;
+        }
+    }                           //outside while
+  free (key);                   /* both buffers were leaked on every path */
+  free (pre_key);
+
+  if (result > 0)
+    return result;
+  if (model == 'r')
+    return 0;
+  return fasta_read_check (begin, next, 'r', bl, tole_rate, File_head);
+}
+
+/*-------------------------------------*/
+int
+fasta_full_check (bloom * bl, char *begin, char *next, char model,
+                  float tole_rate, F_set *File_head)
+{
+  /* Score the FASTA record whose header starts at BEGIN and which ends at NEXT: every k_mer-long
+     window of the sequence (line breaks skipped, rev_trans applied for MODEL 'r') is looked up in
+     BL.  Adds the hit count to File_head->hits and the sequence length minus k_mer to
+     File_head->all_k.  Returns match_s when the ratio reaches TOLE_RATE, otherwise 0.  */
+  int match_s = 0, count = 0, mark = 1;
+  int n = 0, m = 0, count_enter = 0, match_time = 0;
+  short previous = 0, conse = 0;
+  float result;
+  char *key = (char *) malloc ((bl->k_mer + 1) * sizeof (char));
+
+  begin = strchr (begin + 1, '\n') + 1;
+
+  char *p = begin;
+
+  while (p != next)
+    {
+      if (*p == '\n')
+        count_enter++;
+      p++;
+    }
+
+  p = begin;
+
+  while (*p != '>' && *p != '\0')
+    {
+      while (n < bl->k_mer)
+        {
+          if (p[m] == '>' || p[m] == '\0')
+            {
+              m--;
+              break;
+            }
+
+          if (p[m] != '\r' && p[m] != '\n')
+            key[n++] = p[m];
+
+          m++;
+        }
+      key[n] = '\0';
+
+      if (model == 'r')
+        rev_trans (key);
+      /* every k_mer windows the next hit earns the k_mer - 1 bonus again */
+      if (count >= bl->k_mer)
+        {
+          mark = 1;
+          count = 0;
+        }
+      if (strlen (key) == bl->k_mer)
+        {
+          if (bloom_check (bl, key))
+            {
+              match_time++;
+              if (previous == 1)
+                  conse++;
+              else
+                  {
+                  conse+=bl->k_mer;
+                  previous = 1;
+                  }
+              if (mark == 1)
+                {
+                  match_s += (bl->k_mer - 1);
+                  mark = 0;
+                }
+              else
+                match_s++;
+            }
+
+          else
+            {
+              previous = 0;
+              /* a miss ends the current run of consecutive hits */
+            }
+
+          count++;
+        }                       //outside if
+      /* slide the window by one base, stepping over a line break */
+      p++;
+      if (p[0] == '\n')
+        p++;
+      n = 0;
+      m = 0;
+    }                           // end of while
+  free (key);                   /* was leaked on every call */
+  /*
+   * Ratio: k_mer-weighted hits plus the consecutive-hit bonus conse, over
+   * the k_mer-weighted sequence length (next - begin without the counted
+   * line breaks) minus 2 * bl->dx, plus the same bonus.  The earlier
+   * commented-out variants of this formula are dropped.
+   */
+  result = (float)(match_time*bl->k_mer+conse)/(float)((next-begin-count_enter)*bl->k_mer-2*bl->dx+conse);
+
+  #pragma omp atomic
+  File_head->hits+=match_time;
+  #pragma omp atomic
+  File_head->all_k+=(next-begin-count_enter-bl->k_mer);
+
+  if (result >= tole_rate)      //match >tole_rate considered as contaminated
+    return match_s;
+  else
+    return 0;
+}
+
+int
+get_parainfo (char *full, Queue * head)
+{
+#ifdef DEBUG
+  printf ("distributing...\n");
+#endif
+  /* Split the text at FULL into up to omp_get_num_procs () chunks and link one Queue node per
+     chunk after HEAD, ahead of whatever HEAD->next was.  Returns 1 when FULL starts with '>'
+     (FASTA) and 2 when it starts with '@' (FASTQ); any other first character ends the
+     program.  */
+  int type = 0;
+  char *previous = NULL;
+  char *temp = full;
+  int cores = omp_get_num_procs ();
+  short add = 0;
+  int offset = 0, length = 0;
+  Queue *pos = head;
+  if (full != NULL)
+    {
+      offset = strlen (full) / cores;
+      if (*full == '>')
+        type = 1;
+      else if (*full == '@')
+        type = 2;
+      else
+        {
+          perror ("wrong format\n");
+          exit (-1);
+        }
+    }
+  if (type == 1)
+    {
+      for (add = 0; add < cores; add++)
+        {
+          Queue *x = NEW (Queue);
+          if (add != 0)
+            temp = strchr (full + offset * add, '>');
+          x->location = temp;
+          /* the original stored &add, the address of this function's local counter */
+          x->number = NULL;
+          x->score = NULL;
+          x->next = pos->next;
+          pos->next = x;
+          pos = pos->next;
+        }
+    }
+  else
+    {
+      char *tx = strchr (full, '\n');
+      length = strchr (tx + 1, '\n') - (tx + 1);
+      printf ("reads length->%d\n", length);
+      for (add = 0; add < cores; add++)
+        {
+          if (add != 0)
+            temp = fastq_relocate (full, offset * add, length);
+          if (previous == temp)
+            continue;           /* same boundary again: no node needed */
+          previous = temp;
+          Queue *x = NEW (Queue);       /* created only when linked; skipped nodes were never freed */
+          x->location = temp;
+          x->number = NULL;
+          x->score = NULL;
+          x->next = pos->next;
+          pos->next = x;
+          pos = pos->next;
+        }
+    }
+  return type;
+}
+
+/* When rand () % 10 >= SAMPLING_RATE * 10, return the start of the record after TARGET (TYPE 1: next '>'; otherwise the line after the quality line); else TARGET.  */
+char *
+jump (char *target, int type, float sampling_rate)
+{
+  float seed = rand () % 10;
+  char *point = NULL;
+  if (seed >= (float) sampling_rate * 10)
+    {
+      if (type == 1)
+        point = strchr (target + 1, '>');       //point to >
+      else
+        {
+          /* Check every lookup before stepping on: the original added 1 to
+             possible NULL results, so its final test could never fail.  */
+          point = strstr (target + 1, "\n+");   //newline before +
+          if (point)
+            point = strchr (point + 1, '\n');   //end of the + line
+          if (point)
+            point = strchr (point + 1, '\n');   //end of the quality line
+          if (point)
+            point++;                            //next read starting
+        }
+    }
+  return point ? point : target;        /* no following record: stay where we are */
+}
+
+/* Return the start of the first FASTQ record found after DATA + OFFSET, or NULL when there is none (or OFFSET is 0).  */
+char *fastq_relocate (char *data, int offset, int length){
+     char *target=NULL;
+     (void) length;   /* unused; kept for the callers */
+
+     /* a record boundary is located through its "+" separator line */
+     if(data != NULL && offset != 0)
+        target = strstr (data + offset, "\n+");
+
+     /* Step over the "+" line and the quality line.  Each lookup is checked:
+        the original added 1 to a possible NULL and returned a wild pointer.  */
+     if (target != NULL)
+         target = strchr (target+1,'\n');
+     if (target != NULL)
+         target = strchr (target+1,'\n');
+
+     return target ? target+1 : NULL;
+}
+/*-------------------------------------*/
+int 
+dx_add (int k_mer)
+{
+   /* 1 + 2 + ... + (k_mer - 1); 0 when k_mer <= 1 */
+   int total = 0, term = k_mer;
+   while (term > 1)
+        total += --term;
+   return total;
+}
diff --git a/drass/tool.h b/drass/tool.h
new file mode 100644
index 0000000..8e01fd3
--- /dev/null
+++ b/drass/tool.h
@@ -0,0 +1,14 @@
+#ifndef _TOOL
+#define _TOOL
+/* Read-screening helpers implemented in tool.c. */
+#include "bloom.h"
+extern int dx_add (int k_mer);	/* 1 + 2 + ... + (k_mer - 1) */
+extern int get_parainfo (char *full, Queue *head);	/* link per-core chunks of FULL after HEAD; 1 for '>' input, 2 for '@' input */
+extern char *fastq_relocate (char *data, int offset, int length);	/* record start found after DATA + OFFSET, via the "\n+" separator */
+extern char *jump (char *target, int type, float sampling_rate);	/* randomly move TARGET on to the following record */
+int fastq_full_check (bloom * bl, char *p, int distance,  char model, float tole_rate, F_set *File_head);	/* window-by-window score of one read; 0 below TOLE_RATE */
+int fasta_full_check (bloom * bl, char *begin, char *next, char model, float tole_rate, F_set *File_head);	/* same for the FASTA record BEGIN..NEXT */
+extern int fastq_read_check (char *begin, int length, char model, bloom * bl, float tole_rate, F_set *File_head);	/* sampled screen; MODEL 'n' also retries with 'r' */
+extern int fasta_read_check (char *begin, char *next, char model, bloom * bl, float tole_rate, F_set *File_head);	/* sampled screen of a FASTA record; MODEL 'n' also retries with 'r' */
+
+#endif
diff --git a/tests/test_basic.py b/tests/test_basic.py
new file mode 100644
index 0000000..7a7559e
--- /dev/null
+++ b/tests/test_basic.py
@@ -0,0 +1,66 @@
+import os
+import sys
+import errno
+import glob
+import facs
+import unittest
+import subprocess
+import contextlib
+import collections
+
+import utils.helpers as helper
+
+
+class DrassBasicTest(unittest.TestCase):
+    """Build and query some simple bloom filters.
+    """
+    def setUp(self):
+        self.data_dir  = os.path.join(os.path.dirname(__file__), "data")
+        self.reference = os.path.join(os.path.dirname(__file__), "data", "reference")
+        self.bloom_dir = os.path.join(os.path.dirname(__file__), "data", "bloom")
+        self.custom_dir = os.path.join(os.path.dirname(__file__), "data", "custom")
+        self.synthetic_fastq = os.path.join(os.path.dirname(__file__), "data", "synthetic_fastq")
+
+        self.fastq_nreads = [1, 8, 200]
+
+        helper._mkdir_p(self.data_dir)
+        helper._mkdir_p(self.bloom_dir)
+        helper._mkdir_p(self.custom_dir)
+        helper._mkdir_p(self.synthetic_fastq)
+
+        # Downloads reference genome(s)
+        helper._download_test_files(self.data_dir)
+
+    def test_1_build_ref(self):
+        """ Build bloom filters out of the reference genomes directory.
+        """
+        for ref in os.listdir(self.reference):
+            facs.build(os.path.join(self.reference, ref),
+                       os.path.join(self.bloom_dir, os.path.splitext(ref)[0]+".bloom"))
+
+    def test_2_query(self):
+        """ Generate dummy fastq files and query them against the E. coli filter.
+        """
+        for nreads in self.fastq_nreads:
+            test_fname = "test%s.fastq" % nreads
+            helper._generate_dummy_fastq(os.path.join(self.synthetic_fastq, test_fname), nreads)
+            facs.query(os.path.join(self.synthetic_fastq, test_fname),
+                        os.path.join(self.bloom_dir, "U00096.2.bloom"))
+
+
+    def test_3_query_custom(self):
+        """ Query against the uncompressed FastQ files manually deposited in data/custom folder.
+        """
+        for sample in glob.glob(os.path.join(self.custom_dir, "*.fastq")):
+            print("\nQuerying against uncompressed sample %s" % sample)
+            # glob already returns the path including custom_dir; joining it again doubled a relative prefix
+            facs.query(sample, os.path.join(self.bloom_dir, "U00096.2.bloom"))
+
+
+    def test_4_query_custom_small_compressed(self):
+        """ Query gzip compressed fastq files (less than 20MB).
+        """
+        for sample in glob.glob(os.path.join(self.custom_dir, "*.fastq.gz")):
+            print("\nQuerying against compressed sample %s" % sample)
+            if os.path.getsize(sample) < 20*1024*1024:  # was 20*1024*1204
+                facs.query(sample, os.path.join(self.bloom_dir, "U00096.2.bloom"))
diff --git a/tests/test_thousand_genomes.py b/tests/test_thousand_genomes.py
new file mode 100644
index 0000000..670ba4e
--- /dev/null
+++ b/tests/test_thousand_genomes.py
@@ -0,0 +1,51 @@
+import os
+import sys
+import errno
+import glob
+import facs
+import unittest
+import subprocess
+import contextlib
+import collections
+
+
+class ThousandGenomesTest(unittest.TestCase):
+    """Query 1000 genomes sequencing data against an existing bloom filter.
+    """
+    def setUp(self):
+        self.custom_dir = os.path.join(os.path.dirname(__file__), "data", "custom")
+        self.bloom_dir = os.path.join(os.path.dirname(__file__), "data", "bloom")
+        self._install_1000g_test_files(self.custom_dir)
+
+    def test_query_NA21137_illumina(self):
+        """ Query gzip compressed fastq files
+        """
+        for sample in glob.glob(os.path.join(self.custom_dir, "*.fastq.gz")):
+            # glob already returns the path including custom_dir
+            facs.query(sample, os.path.join(self.bloom_dir, "U00096.2.bloom"))
+
+    #XXX
+    #def test_1_query_NA21137_454(self):
+    #def test_1_query_NA21137_iontorrent(self):
+
+    def _install_1000g_test_files(self, data_dir):
+        """Download 1000 genomes exome data
+
+        See ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/sequence_indices/20120522.sequence.index
+        for an index of recent sequencing runs.
+
+        Here sequencing data from individual NA21137 has (arbitrarily) been chosen for download. Sequencing was
+        done at BROAD institute on a Illumina HiSeq 2000.
+        """
+
+        individual = "NA21137"
+        fname = "SRR362119.filt.fastq.gz"
+        base_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/%s" % individual
+        fastq_url = "/".join([base_url, "sequence_read", fname])  # a URL, not a filesystem path
+        dst = os.path.join(data_dir, fname)
+        if not os.path.isdir(data_dir):
+            os.makedirs(data_dir)  # wget -O cannot create the target directory
+        if not os.path.exists(dst):
+            print("downloading %s from %s" % (fname, base_url))
+            cl = ["wget", fastq_url, "-O", dst]
+            subprocess.check_call(cl)
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/utils/fastq_dummy.py b/tests/utils/fastq_dummy.py
new file mode 100755
index 0000000..5e123a5
--- /dev/null
+++ b/tests/utils/fastq_dummy.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+"""
+Simple dummy FastQ generator to easily test against.
+
+Usage:
+    fastq_dummy.py <num_reads> <dst_file_path>
+
+Example:
+    ./tests/utils/fastq_dummy.py 100 ./tests/data/ecoli_dummy.fastq
+"""
+
+import os
+import sys
+if len(sys.argv) != 3:
+    sys.exit(__doc__)  # show the usage text instead of failing with an IndexError
+reads=int(sys.argv[1])
+dummy_fastq=os.path.join(os.path.dirname(sys.argv[2]), os.path.basename(sys.argv[2]))
+
+ecoli_read = """
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAA
++ 
+BPaaceeefgggfhiifghiihgiiihiiiihhhhhhhfhgcgh_fegefafhhihcegbgafdbdgggceeecdd]^aWZ^Y]bba^[_b]GTXX]aOPJPSB
+"""
+header='@HWI-ST188:2:1101:2751:1987#0/1'
+stride=13
+
+with open(dummy_fastq, "w") as f:
+    f.write(header)
+    f.write(ecoli_read)
+
+    for r in xrange(reads):
+        f.write(header + "TASK ID: " + str(r) + '\n')
+
+        f.write('GATTACAT' * stride + '\n')
+        f.write('+' + '\n')
+        f.write('arvestad' * stride + '\n')
diff --git a/tests/utils/helpers.py b/tests/utils/helpers.py
new file mode 100644
index 0000000..872f7fa
--- /dev/null
+++ b/tests/utils/helpers.py
@@ -0,0 +1,143 @@
+import os
+import collections
+import contextlib
+import subprocess
+import errno
+from contextlib import contextmanager
+
+import tempfile
+from tempfile import NamedTemporaryFile
+import functools
+import urllib
+
+
+# Aux methods
+# header starts every generated read; ecoli_read (leading newline included) completes one spiked record.
+header='@HWI-ST188:2:1101:2751:1987#0/1'
+ecoli_read = \
+"""
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAA
++ 
+@Paaceeefgggfhiifghiihgiiihiiiihhhhhhhfhgcgh_fegefafhhihcegbgafdbdgggceeecdd]^aWZ^Y]bba^[_b]GTXX]aOPJPSB
+"""
+
+def _generate_dummy_fastq(fname, num_reads):
+    """Write a dummy FastQ file to fname unless that path already exists.
+
+    The file starts with one spiked ecoli read, followed by num_reads
+    identical synthetic reads whose headers carry a running task id.
+    """
+    if os.path.exists(fname):
+        return
+
+    repeats = 13
+    bases = 'GATTACAT' * repeats + '\n'
+    quals = 'arvestad' * repeats + '\n'
+    with open(fname, "w") as out:
+        # Spiked ecoli read: the header first, ecoli_read supplies the rest of the record.
+        out.write(header + ecoli_read)
+        for idx in xrange(num_reads):
+            # The id makes each read traceable when debugging (task distribution, for instance).
+            out.write(header + 'TASK ID: ' + str(idx) + '\n' + bases + '+' + '\n' + quals)
+
+def _download_test_files(data_dir):
+    """Download required sequence and reference files into data_dir.
+
+    Each file lands in data_dir/<dirname>/<fname>; files already present are skipped.
+    """
+    DlInfo = collections.namedtuple("ecoli", "fname dirname version")
+    download_data = [DlInfo("U00096.2.fasta", "reference", None)]
+
+    for dl in download_data:
+        url = "http://togows.dbcls.jp/entry/ncbi-nucleotide/{fname}".format(fname=dl.fname)
+        dirname = os.path.join(data_dir, dl.dirname)
+        # _mkdir_p also creates a missing data_dir and tolerates an existing directory.
+        _mkdir_p(dirname)
+        if not os.path.exists(os.path.join(dirname, dl.fname)):
+            _download_to_dir(url, dirname)
+
+def _download_to_dir(url, dirname):
+    """Fetch url into dirname with wget; unpack it there when it is a gzipped tarball."""
+    fname = os.path.basename(url)
+    dst = os.path.join(dirname, fname)
+
+    cl = ["wget", url, "-O", dst]
+    subprocess.check_call(cl)
+
+    # compressed tarball? (splitext() returns a tuple, so comparing it to ".gz" never matched)
+    if fname.endswith((".tar.gz", ".tgz")):
+        cl = ["tar", "-xzvpf", dst, "-C", dirname]
+        subprocess.check_call(cl)
+
+def _mkdir_p(path):
+    """Create path and any missing parents; an already existing directory is not an error."""
+    try:
+        os.makedirs(path)
+    except OSError as err:
+        if err.errno != errno.EEXIST or not os.path.isdir(path):
+            raise
+
+@contextmanager
+def _make_tmp_dir():
+    """Yield a scratch dir under $TMPDIR (or ~/tmp when unset/empty); always remove it on exit."""
+    tmp_dir = os.environ.get("TMPDIR") or os.path.join(os.environ["HOME"], "tmp")
+    work_dir = os.path.join(tmp_dir, "cloudbiolinux")
+    if not os.path.exists(work_dir):
+        subprocess.check_call(["mkdir", "-p", work_dir])
+    try:
+        yield work_dir
+    finally:
+        if os.path.exists(work_dir):
+            subprocess.check_call(["rm", "-rf", work_dir])
+
+@contextmanager
+def cd(path):
+    previous = os.getcwd()  # remembered so the caller's working directory can be restored
+    os.chdir(path)
+    try:
+        yield
+    finally:
+        os.chdir(previous)  # runs even when the with-body raises
+
+def _fetch_and_unpack(url, need_dir=True):
+    """Run a VCS checkout command, or download and unpack an archive; return the directory name."""
+    import shlex  # local import: splits command strings into argv lists, honouring embedded quotes
+    if url.startswith(("git", "svn", "hg", "cvs")):
+        base = os.path.splitext(os.path.basename(url.split()[-1]))[0]
+        subprocess.check_call(["rm", "-rf", base])  # was env.safe_sudo(); env is not defined in this module
+        subprocess.check_call(shlex.split(url))  # url is a whole command line here, not a program name
+        return base
+    tar_file, dir_name, tar_cmd = _get_expected_file(url)
+    if not os.path.exists(tar_file):
+        subprocess.check_call(["wget", "--no-check-certificate", "-O", tar_file, url])
+    subprocess.check_call(shlex.split(tar_cmd) + [tar_file])  # was the literal strings "tar_cmd", "tar_file"
+    # NOTE(review): the _safe_dir_name helper called here is not defined in this module; need_dir is unused.
+    return dir_name
+
+def _get_expected_file(url):
+    """Return (archive name, name minus its extension, extract command) for url.
+    Raises ValueError when the name carries no known archive extension.
+    """
+    archive = os.path.split(url.split("?")[0])[-1]
+    pax = "--pax-option='delete=SCHILY.*,delete=LIBARCHIVE.*'"
+    commands = [(".tar.gz", "tar %s -xzpf" % pax), (".tgz", "tar %s -xzpf" % pax),
+                (".tar", "tar %s -xpf" % pax), (".tar.bz2", "tar %s -xjpf" % pax), (".zip", "unzip")]
+    for suffix, command in commands:
+        if archive.endswith(suffix):
+            return archive, archive[:-len(suffix)], command
+    raise ValueError("Did not find extract command for %s" % url)
+
+def _configure_make(env):  # configure, build and install into env.system_install
+    subprocess.check_call(["./configure", "--disable-werror", "--prefix=%s" % env.system_install])  # one argv entry
+    subprocess.check_call(["make", "install"])
+
+def _get_install(url, env, make_command, post_unpack_fn=None):
+    """Fetch url into a scratch dir, then run post_unpack_fn(env) (when given) and make_command(env) there.
+    """
+    with _make_tmp_dir() as scratch:
+        with cd(scratch):
+            src_dir = _fetch_and_unpack(url)
+            with cd(src_dir):
+                if post_unpack_fn:
+                    post_unpack_fn(env)
+                make_command(env)
diff --git a/tests/utils/valgrind-python.supp b/tests/utils/valgrind-python.supp
new file mode 100644
index 0000000..e50fc4b
--- /dev/null
+++ b/tests/utils/valgrind-python.supp
@@ -0,0 +1,391 @@
+#
+# This is a valgrind suppression file that should be used when using valgrind.
+#
+#  Here's an example of running valgrind:
+#
+#	cd python/dist/src
+#	valgrind --tool=memcheck --suppressions=Misc/valgrind-python.supp \
+#		./python -E -tt ./Lib/test/regrtest.py -u bsddb,network
+#
+# You must edit Objects/obmalloc.c and uncomment Py_USING_MEMORY_DEBUGGER
+# to use the preferred suppressions with Py_ADDRESS_IN_RANGE.
+#
+# If you do not want to recompile Python, you can uncomment
+# suppressions for PyObject_Free and PyObject_Realloc.
+#
+# See Misc/README.valgrind for more information.
+
+# all tool names: Addrcheck,Memcheck,cachegrind,helgrind,massif
+{
+   ADDRESS_IN_RANGE/Invalid read of size 4
+   Memcheck:Addr4
+   fun:Py_ADDRESS_IN_RANGE
+}
+
+{
+   ADDRESS_IN_RANGE/Invalid read of size 4
+   Memcheck:Value4
+   fun:Py_ADDRESS_IN_RANGE
+}
+
+{
+   ADDRESS_IN_RANGE/Invalid read of size 8 (x86_64 aka amd64)
+   Memcheck:Value8
+   fun:Py_ADDRESS_IN_RANGE
+}
+
+{
+   ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value
+   Memcheck:Cond
+   fun:Py_ADDRESS_IN_RANGE
+}
+
+#
+# Leaks (including possible leaks)
+#    Hmmm, I wonder if this masks some real leaks.  I think it does.
+#    Will need to fix that.
+#
+
+{
+   Suppress leaking the GIL.  Happens once per process, see comment in ceval.c.
+   Memcheck:Leak
+   fun:malloc
+   fun:PyThread_allocate_lock
+   fun:PyEval_InitThreads
+}
+
+{
+   Suppress leaking the GIL after a fork.
+   Memcheck:Leak
+   fun:malloc
+   fun:PyThread_allocate_lock
+   fun:PyEval_ReInitThreads
+}
+
+{
+   Suppress leaking the autoTLSkey.  This looks like it shouldn't leak though.
+   Memcheck:Leak
+   fun:malloc
+   fun:PyThread_create_key
+   fun:_PyGILState_Init
+   fun:Py_InitializeEx
+   fun:Py_Main
+}
+
+{
+   Hmmm, is this a real leak or like the GIL?
+   Memcheck:Leak
+   fun:malloc
+   fun:PyThread_ReInitTLS
+}
+
+{
+   Handle PyMalloc confusing valgrind (possibly leaked)
+   Memcheck:Leak
+   fun:realloc
+   fun:_PyObject_GC_Resize
+   fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING
+}
+
+{
+   Handle PyMalloc confusing valgrind (possibly leaked)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyObject_GC_New
+   fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING
+}
+
+{
+   Handle PyMalloc confusing valgrind (possibly leaked)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyObject_GC_NewVar
+   fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING
+}
+
+#
+# Non-python specific leaks
+#
+
+{
+   Handle pthread issue (possibly leaked)
+   Memcheck:Leak
+   fun:calloc
+   fun:allocate_dtv
+   fun:_dl_allocate_tls_storage
+   fun:_dl_allocate_tls
+}
+
+{
+   Handle pthread issue (possibly leaked)
+   Memcheck:Leak
+   fun:memalign
+   fun:_dl_allocate_tls_storage
+   fun:_dl_allocate_tls
+}
+
+###{
+###   ADDRESS_IN_RANGE/Invalid read of size 4
+###   Memcheck:Addr4
+###   fun:PyObject_Free
+###}
+###
+###{
+###   ADDRESS_IN_RANGE/Invalid read of size 4
+###   Memcheck:Value4
+###   fun:PyObject_Free
+###}
+###
+###{
+###   ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value
+###   Memcheck:Cond
+###   fun:PyObject_Free
+###}
+
+###{
+###   ADDRESS_IN_RANGE/Invalid read of size 4
+###   Memcheck:Addr4
+###   fun:PyObject_Realloc
+###}
+###
+###{
+###   ADDRESS_IN_RANGE/Invalid read of size 4
+###   Memcheck:Value4
+###   fun:PyObject_Realloc
+###}
+###
+###{
+###   ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value
+###   Memcheck:Cond
+###   fun:PyObject_Realloc
+###}
+
+###
+### All the suppressions below are for errors that occur within libraries
+### that Python uses.  The problems do not appear to be related to Python's
+### use of the libraries.
+###
+
+{
+   Generic ubuntu ld problems
+   Memcheck:Addr8
+   obj:/lib/ld-2.4.so
+   obj:/lib/ld-2.4.so
+   obj:/lib/ld-2.4.so
+   obj:/lib/ld-2.4.so
+}
+
+{
+   Generic gentoo ld problems
+   Memcheck:Cond
+   obj:/lib/ld-2.3.4.so
+   obj:/lib/ld-2.3.4.so
+   obj:/lib/ld-2.3.4.so
+   obj:/lib/ld-2.3.4.so
+}
+
+{
+   DBM problems, see test_dbm
+   Memcheck:Param
+   write(buf)
+   fun:write
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   fun:dbm_close
+}
+
+{
+   DBM problems, see test_dbm
+   Memcheck:Value8
+   fun:memmove
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   fun:dbm_store
+   fun:dbm_ass_sub
+}
+
+{
+   DBM problems, see test_dbm
+   Memcheck:Cond
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   fun:dbm_store
+   fun:dbm_ass_sub
+}
+
+{
+   DBM problems, see test_dbm
+   Memcheck:Cond
+   fun:memmove
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   obj:/usr/lib/libdb1.so.2
+   fun:dbm_store
+   fun:dbm_ass_sub
+}
+
+{
+   GDBM problems, see test_gdbm
+   Memcheck:Param
+   write(buf)
+   fun:write
+   fun:gdbm_open
+
+}
+
+{
+   ZLIB problems, see test_gzip
+   Memcheck:Cond
+   obj:/lib/libz.so.1.2.3
+   obj:/lib/libz.so.1.2.3
+   fun:deflate
+}
+
+{
+   Avoid problems w/readline doing a putenv and leaking on exit
+   Memcheck:Leak
+   fun:malloc
+   fun:xmalloc
+   fun:sh_set_lines_and_columns
+   fun:_rl_get_screen_size
+   fun:_rl_init_terminal_io
+   obj:/lib/libreadline.so.4.3
+   fun:rl_initialize
+}
+
+###
+### These occur from somewhere within the SSL, when running
+###  test_socket_ssl.  They are too general to leave on by default.
+###
+###{
+###   somewhere in SSL stuff
+###   Memcheck:Cond
+###   fun:memset
+###}
+###{
+###   somewhere in SSL stuff
+###   Memcheck:Value4
+###   fun:memset
+###}
+###
+###{
+###   somewhere in SSL stuff
+###   Memcheck:Cond
+###   fun:MD5_Update
+###}
+###
+###{
+###   somewhere in SSL stuff
+###   Memcheck:Value4
+###   fun:MD5_Update
+###}
+
+#
+# All of these problems come from using test_socket_ssl
+#
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:BN_bin2bn
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:BN_num_bits_word
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Value4
+   fun:BN_num_bits_word
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:BN_mod_exp_mont_word
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:BN_mod_exp_mont
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Param
+   write(buf)
+   fun:write
+   obj:/usr/lib/libcrypto.so.0.9.7
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:RSA_verify
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Value4
+   fun:RSA_verify
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Value4
+   fun:DES_set_key_unchecked
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Value4
+   fun:DES_encrypt2
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   obj:/usr/lib/libssl.so.0.9.7
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Value4
+   obj:/usr/lib/libssl.so.0.9.7
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:BUF_MEM_grow_clean
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:memcpy
+   fun:ssl3_read_bytes
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Cond
+   fun:SHA1_Update
+}
+
+{
+   from test_socket_ssl
+   Memcheck:Value4
+   fun:SHA1_Update
+}
+
+