-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmakeblastdb.sh
45 lines (35 loc) · 1.84 KB
/
makeblastdb.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
DATABASE=$1
FASTADIR=$2
ITERATIONS=$3
TMP=$4
# This shell script, `makeblastdb.sh`, is responsible for creating a BLAST database from FASTA files generated by
# `family_fasta.py`. The script performs the following steps:
#
# 1. Iterates over a specified number of iterations. Each iteration corresponds to a chunk of families generated by
# `family_fasta.py`.
# 2. For each iteration, it reads the corresponding FASTA file which contains unaligned sequences of a chunk of
# families.
# 3. It filters the sequences that have associated Open Tree of Life (OpenTOL) IDs (ott IDs) and reformats the headers
# to retain the process ID. The reformatted sequences are written to a temporary file.
# 4. After all iterations, it uses `makeblastdb` to index the temporary file, creating a BLAST database.
# 5. Finally, it removes the temporary file.
#
# The script uses command line arguments for the database file to create, directory containing the FASTA files, number
# of iterations (chunks of families), and the temporary file to use during the process. It is invoked by the Snakefile
# as a shell command in the rule `makeblastdb`.
# doing this sequentially to avoid race conditions in BLAST indexing
for i in $(seq 1 $ITERATIONS); do
# using the output from family_fasta
INFILE=${FASTADIR}/${i}-of-${ITERATIONS}/unaligned.fa
# only keep records with ott IDs, reformat the headers to retain the process ID, write to $TMP
# start new file on first iteration, then append
if [ "${i}" -eq 1 ]; then
egrep -A 1 'ott[0-9]+' ${INFILE} | awk -F'|' '/^>/ {print ">"$3; next} {print}' > ${TMP}
else
egrep -A 1 'ott[0-9]+' ${INFILE} | awk -F'|' '/^>/ {print ">"$3; next} {print}' >> ${TMP}
fi
done
# index $TMP with makeblastdb, growing the database
makeblastdb -in ${TMP} -dbtype nucl -out ${DATABASE} -parse_seqids
# remove the temporary file
rm ${TMP}