Skip to content

Commit 19d3247

Browse files
committed
update alignment scripts
1 parent 38c2c4e commit 19d3247

File tree

3 files changed

+71
-23
lines changed

3 files changed

+71
-23
lines changed

eval_aligners.ipynb

+41-20
Large diffs are not rendered by default.

evaluate_alignment.sh

+14-3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ ILLUMINA_XDROP=100
1919
PACBIO_XDROP=27
2020
ILLUMINA_SEED_LENGTH=10000
2121
PACBIO_SEED_LENGTH=19
22+
PACBIO_MIN_EXACT_MATCH=0.02
23+
ILLUMINA_MIN_EXACT_MATCH=0.7
2224

2325
echo "Simulating Illumina reads"
2426
art_illumina -ss HS25 -i $INFILE -p -c 1280 -l $ILLUMINA_LENGTH -m 200 -s 10 -o $BASENAME.sim -rs $SEED -sam
@@ -28,8 +30,8 @@ bedtools getfasta -fi $BASENAME.fa -bed <(bedtools bamtobed -i $BASENAME.sim.bam
2830
reformat.sh in=$BASENAME.sim.illumina.fq out=$BASENAME.sim.illumina.fa fastawrap=-1 overwrite=true
2931

3032
echo "Simulating PacBio reads"
31-
PBSIM_DEPTH=0.0504 # chr22
32-
#PBSIM_DEPTH=0.4284 # E. coli
33+
PBSIM_DEPTH=0.0504 # illumina
34+
PBSIM_DEPTH=0.4284 # pacbio
3335
../pbsim --data-type CLR --depth $PBSIM_DEPTH --model_qc ../model_qc_clr --data-type CLR --length-mean $PACBIO_LENGTH --length-sd 0 --length-min $PACBIO_LENGTH --length-max $PACBIO_LENGTH $BASENAME.fa --prefix $BASENAME --seed $SEED
3436
cat ${BASENAME}_0001.fastq | paste - - - - | awk '!($2~/N/)' | shuf --random-source=../seed | tr "\t" "\n" > $BASENAME.sim.pacbio.fq
3537
cat ${BASENAME}_0001.maf | paste - - - - | awk -F"\t" '!($(NF-1)~/N/)' | shuf --random-source=../seed | tr "\t" "\n" > $BASENAME.sim.maf
@@ -41,10 +43,12 @@ if [[ $a == "illumina" ]]; then
4143
LENGTH=$ILLUMINA_LENGTH
4244
XDROP=$ILLUMINA_XDROP
4345
SEED_LENGTH=$ILLUMINA_SEED_LENGTH
46+
MIN_EXACT_MATCH=$ILLUMINA_MIN_EXACT_MATCH
4447
else
4548
LENGTH=$PACBIO_LENGTH
4649
XDROP=$PACBIO_XDROP
4750
SEED_LENGTH=$PACBIO_SEED_LENGTH
51+
MIN_EXACT_MATCH=$PACBIO_MIN_EXACT_MATCH
4852
fi
4953

5054
echo "Aligning to whole graph"
@@ -54,7 +58,7 @@ echo " parasail"
5458
../run_parasail.sh $BASENAME $a $LENGTH metagraph_base.out > $BASENAME.sim.$a.metagraph_base.out.results.sam
5559

5660
echo "Aligning by coords"
57-
/usr/bin/time -v $METAGRAPH align $FLAGS --align-chain --align-max-seed-length $SEED_LENGTH --align-xdrop 100 -i $BASENAME.dbg $BASENAME.sim.$a.fq -a $BASENAME.${COORD_TYPE}_coord.annodbg > $BASENAME.sim.$a.metagraph_coord.out 2> $BASENAME.sim.$a.metagraph_coord.log
61+
/usr/bin/time -v $METAGRAPH align $FLAGS --align-chain --align-min-exact-match $MIN_EXACT_MATCH --align-max-seed-length $SEED_LENGTH --align-xdrop 100 -i $BASENAME.dbg $BASENAME.sim.$a.fq -a $BASENAME.${COORD_TYPE}_coord.annodbg > $BASENAME.sim.$a.metagraph_coord.out 2> $BASENAME.sim.$a.metagraph_coord.log
5862
awk -F"\t" '{print ">1\n"$4}' $BASENAME.sim.$a.metagraph_coord.out > $BASENAME.sim.$a.metagraph_coord.out.fa
5963
echo " parasail"
6064
../run_parasail.sh $BASENAME $a $LENGTH metagraph_coord.out > $BASENAME.sim.$a.metagraph_coord.out.results.sam
@@ -84,5 +88,12 @@ bedtools getfasta -fi $BASENAME.fa -bed <(bedtools bamtobed -i $BASENAME.sim.$a.
8488
echo " parasail"
8589
../run_parasail.sh $BASENAME $a $LENGTH pufferfish.sam > $BASENAME.sim.$a.pufferfish.sam.results.sam
8690

91+
echo "GraphAligner"
92+
/usr/bin/time -v GraphAligner -g $BASENAME.gfa -f $BASENAME.sim.$a.fa -x vg -a temp.$a.gaf > $BASENAME.sim.$a.graphaligner.log 2>&1
93+
cat $BASENAME.sim.$a.fa | paste - - | cut -f1 | tr -d ">" | while read F; do echo "$(grep -F $F" " temp.$a.gaf)" | head -n 1; done > $BASENAME.sim.$a.graphaligner.gaf
94+
samtools faidx $BASENAME.fa $(awk -F"\t" '{if (NF==0){printf("'$BASENAME':1-1 ")} else {left=$3; right=$2-$4; if ($6 == ">s1") { printf("'$BASENAME':%d-%d ",$8+1-left,$9+right)} else {printf("'$BASENAME':%d-%d ",$7-$9+1-right,$7-$8+left)} }}' $BASENAME.sim.$a.graphaligner.gaf) -n 100000 | paste - - | awk '{print ">1\n"$2}' | tr -d "N" > $BASENAME.sim.$a.graphaligner.gaf.fa
95+
echo " parasail"
96+
../run_parasail.sh $BASENAME $a $LENGTH graphaligner.gaf > $BASENAME.sim.$a.graphaligner.gaf.results.sam
97+
8798
done
8899

run_parasail.sh

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
2+
3+
BASENAME=$1
4+
a=$2
5+
LENGTH=$3
6+
PROGRAM=$4
7+
8+
cat <(echo "@SQ SN:1 LN:$LENGTH") \
9+
<(echo "@SQ SN:2 LN:$LENGTH") \
10+
<(
11+
paste <(cat $BASENAME.sim.$a.refs.fa | paste - -) \
12+
<(cat $BASENAME.sim.$a.$PROGRAM.fa | paste - -) \
13+
<(cat $BASENAME.sim.$a.$PROGRAM.fa | tr -d ">" | paste - - | tr "ATGC" "TACG" | rev | awk -F"\t" '{print ">2\t"$1}') | tr '$' 'N' |
14+
while read F; do
15+
echo "$F" | tr "\t" "\n" | ../parasail_aligner -a parasail_nw_trace_striped_avx2_256_16 -c 1 -e 1 -o 1 -M 1 -X 1 -x -s 0 -l 0 -i 0 -d -b 64 -g /dev/stdout -O SAM | head -n 2
16+
done) | samtools view -h

0 commit comments

Comments
 (0)