forked from EduardoCantos1998/fastascan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfastascan.v2.sh
executable file
·71 lines (56 loc) · 2.02 KB
/
fastascan.v2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#! /bin/bash
# fastascan: summarise the FASTA files found below a directory.
#
# Usage: fastascan.v2.sh [DIRECTORY]
#
# Every file or symlink below DIRECTORY (default: the current directory)
# whose name ends in ".fa" followed by zero or more letters (.fa, .fasta,
# .faa, ...; case-insensitive) is reported in a table with the columns
#   File_name  #_of_sequences  Sequence_Length  Symlink  Nuc/Prot
# followed by the totals (symlinks are excluded from the sequence and length
# totals so linked files are not counted twice) and one randomly chosen
# header line.
#
# File names are carried NUL-delimited / in arrays, so paths that contain
# spaces or glob characters are handled, and no scratch file is written to
# the working directory.

#######################################
# Print every FASTA-like path below a directory, NUL-terminated.
# Arguments: $1 - directory (or single file) to search
# Outputs:   NUL-terminated paths on stdout
#######################################
list_fasta_files() {
  find "$1" \( -type f -o -type l \) -iregex '.*\.fa[a-zA-Z]*$' -print0
}

#######################################
# Count the header lines (lines starting with '>') of one file.
# Arguments: $1 - path of the file
# Outputs:   the count on stdout; 0 when the file cannot be read
#######################################
count_sequences() {
  local count
  # grep -c exits non-zero when nothing matches; that is not an error here.
  count=$(grep --text -c '^>' -- "$1")
  printf '%s\n' "${count:-0}"
}

#######################################
# Count the residues of one file: every character on a non-header line
# except whitespace, NUL bytes and '-' (alignment gaps).
# Arguments: $1 - path of the file
# Outputs:   the number of residues on stdout
#######################################
sequence_length() {
  local chars
  chars=$(sed -e '/^>/d' -e 's/-//g' -- "$1" | tr -d '\000[:space:]' | wc -m)
  # Arithmetic expansion strips the padding some wc implementations emit.
  printf '%s\n' "$(( chars ))"
}

#######################################
# Entry point: scan, print the table, the totals and a random title.
# Arguments: $1 - optional directory to scan (default: .)
# Outputs:   the report on stdout, warnings on stderr
# Returns:   0
#######################################
main() {
  local dir=.
  if [[ -n "${1:-}" ]]; then
    if [[ -e "$1" ]]; then
      dir=$1
    else
      # The fallback to "." is kept from the original, but no longer silent.
      printf "warning: '%s' does not exist; scanning the current directory instead\n" "$1" >&2
    fi
  fi
  # Keep find from mistaking a path such as "-data" for an option.
  if [[ "$dir" == -* ]]; then
    dir=./$dir
  fi

  local -a files=()
  local path
  while IFS= read -r -d '' path; do
    files+=("$path")
  done < <(list_fasta_files "$dir")

  # Check if there are fasta files
  if (( ${#files[@]} == 0 )); then
    echo "There are no fasta files in this directory. Please try somewhere else."
    return 0
  fi

  # Build the table in memory, one tab-separated row per file.
  local -a rows=($'File_name\t#_of_sequences\tSequence_Length\tSymlink\tNuc/Prot')
  local file nums length link nucprot row
  local total_seqs=0 total_len=0
  for file in "${files[@]}"; do
    nums=$(count_sequences "$file")
    if (( nums == 0 )); then
      # Without a header there is nothing to measure or classify.
      length=0
      nucprot="Not_Determined"
    else
      length=$(sequence_length "$file")
      # Heuristic kept from the original: a protein starts with a Methionine,
      # so any sequence line beginning with M/m marks the file as protein.
      # Protein fragments without a leading M are reported as nucleotide.
      if grep --text -q '^[Mm]' -- "$file"; then
        nucprot="Protein"
      else
        nucprot="Nucleotide"
      fi
    fi

    if [[ -L "$file" ]]; then
      link="True"
    else
      link="False"
      # Only real files contribute to the totals.
      total_seqs=$(( total_seqs + nums ))
      total_len=$(( total_len + length ))
    fi

    printf -v row '%s\t%s\t%s\t%s\t%s' "$file" "$nums" "$length" "$link" "$nucprot"
    rows+=("$row")
  done

  # Print the table with all the data. Splitting on tabs only keeps a file
  # name that contains spaces inside its own column.
  if command -v column >/dev/null 2>&1; then
    printf '%s\n' "${rows[@]}" | column -t -s $'\t'
  else
    printf '%s\n' "${rows[@]}"
  fi

  echo
  printf 'Total number of fasta files:  %s\n' "${#files[@]}"
  printf 'Total number of sequences:  %s\n' "$total_seqs"
  printf 'Total length of the sequences:  %s\n' "$total_len"
  echo
  echo "A random title: "
  # grep reads each file separately (-h: no file-name prefix), so a file
  # without a trailing newline cannot swallow the next file's first header.
  local title
  title=$(grep --text -h '^>' -- "${files[@]}" | shuf -n 1)
  printf '%s\n' "$title"
  return 0
}

main "$@"