Skip to content

Commit d70ac05

Browse files
committed
v2.0
1 parent 5d4c105 commit d70ac05

12 files changed

+1418
-3543
lines changed

RUN.sh

+123-72
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
11
#!/bin/bash
22
function printVersion {
3-
printf "Spliceogen 1.0 1-March-2019\n"
3+
printf "Spliceogen 2.0 1-October-2019\n"
44
}
55
function printHelp {
66
cat <<-END
77
Usage:
88
------
99
3 required args:
10-
1) -inputVCF path/to/VCF/input(s).VCF
11-
OR
12-
-inputBED path/to/input(s).BED
13-
Note: wildcard matching of multiple files is allowed
14-
2) -gtf path/to/annotation.GTF
15-
3) -fasta path/to/genome.fasta
10+
1) -input path/to/VCF/input/file(s).VCF
11+
Note: multiple input files are accepted "eg. -input *.vcf"
12+
Note: deprecated v1.0 tags "inputVCF" and "inputBED" are still accepted
13+
2) -gtf path/to/annotation.gtf
14+
3) -fasta path/to/genome.fa
1615
optional arg:
1716
4) -branchpointer hgXX
1817
OR
@@ -22,8 +21,6 @@ END
2221
}
2322
#set default parameters
2423
POSITIONAL=()
25-
INPUTVCF="FALSE"
26-
INPUTBED="FALSE"
2724
INPUTFILES=""
2825
USEBP=""
2926
USEBPINDELS=""
@@ -43,8 +40,16 @@ case $key in
4340
printVersion
4441
exit 0
4542
;;
43+
-input)
44+
INPUTFILES="$2"
45+
shift
46+
shift
47+
while [ "$1" ] && [[ ! $1 == *-* ]]; do
48+
INPUTFILES="$INPUTFILES $1"
49+
shift
50+
done
51+
;;
4652
-inputVCF)
47-
INPUTVCF="TRUE"
4853
INPUTFILES="$2"
4954
shift
5055
shift
@@ -54,7 +59,6 @@ case $key in
5459
done
5560
;;
5661
-inputBED)
57-
INPUTBED="TRUE"
5862
INPUTFILES="$2"
5963
shift
6064
shift
@@ -97,36 +101,26 @@ elif [ ! -f "$ANNOTATION" ]; then
97101
echo "GTF annotation file not found: use -gtf path/to/gencodeXX.gtf\nExiting..."
98102
exit 1
99103
fi
100-
checkGzip=$( file --mime-type "$FASTAPATH" "$ANNOTATION" "$INPUTFILES" | grep gzip)
101-
if [ ! "$checkGzip" = "" ]; then
102-
echo "Error: Input FASTA and GTF files must be unzipped. Exiting..."
103-
exit 1
104-
fi
105-
#check input files for consistenty in "chr" nomenlature
106-
gtfChr=$(head "$ANNOTATION" | tail -1 | awk '{print $1}' | grep chr)
107-
fastaChr=$(head -1 "$FASTAPATH" | awk '{print $1}' | grep chr)
108-
firstInputFile=$(echo "$INPUTFILES" | awk '{print $1}')
109-
inputChr=$(tail -1 "$firstInputFile" | awk '{print $1}' | grep chr)
110-
warningChr="Warning: it appears the provided gtf, fasta, and input files use inconsistent Chromosome nomenclature. Eg. \"chr1\" vs \"1\". This will likely cause issues. Please edit them for consistency"
111-
if [ "$gtfChr" == "" ]; then
104+
#check input gtf/fasta "chr" nomenclature
105+
gtfChr=$(zcat -f "$ANNOTATION" | grep -v '^GL000' | tail -1 | awk '{print $1}' | grep chr)
106+
fastaChr=$(cat "$FASTAPATH" | head -1 | awk '{print $1}' | grep chr)
107+
gtfChrAdd=""
108+
gtfChrRemove="UnmatchedString"
112109
if [ "$fastaChr" != "" ]; then
113-
echo "$warningChr"
114-
elif [ "$inputChr" != "" ]; then
115-
echo "$warningChr"
110+
if [ "$gtfChr" == "" ]; then
111+
gtfChrAdd="chr"
112+
fi
113+
elif [ "$fastaChr" == "" ]; then
114+
if [ "$gtfChr" != "" ]; then
115+
gtfChrRemove="chr"
116+
fi
116117
fi
117-
else
118-
if [ "$fastaChr" == "" ]; then
119-
echo "$warningChr"
120-
elif [ "$inputChr" == "" ]; then
121-
echo "$warningChr"
122-
fi
123-
fi
124118
#prepare splice site intervals from annotation.gtf
125119
gtfBasename=$(basename $ANNOTATION)
126120
if [ ! -f data/"$gtfBasename"_SpliceSiteIntervals.txt ] || [[ "$ANNOTATION" -nt data/"$gtfBasename"_SpliceSiteIntervals.txt ]] ; then
127121
echo "Preparing splice site annotation..."
128-
grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' "$ANNOTATION" | grep -v '^GL000' |
129-
java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
122+
zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' | grep -v '^GL000' |
123+
sed "s/$gtfChrRemove//" | awk -v var="$gtfChrAdd" '{print var$0}' | java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
130124
fi
131125
#for each input VCF/BED file
132126
for FILE in $INPUTFILES; do
@@ -139,9 +133,39 @@ for FILE in $INPUTFILES; do
139133
echo "Input file: $fileID"
140134
#remove temp files from any previous run
141135
rm temp/"$fileID"* 2> /dev/null
136+
#check input file type
137+
FILETYPE=""
138+
nFields=$(zcat -f $FILE | tail -1 | wc -w)
139+
vcfHeader=$(zcat -f $FILE | head -1 | grep VCF)
140+
if [ "$nFields" -eq 4 ]; then
141+
FILETYPE="TSV"
142+
elif [ ! -z "$vcfHeader" ]; then
143+
FILETYPE="VCF"
144+
else
145+
FILETYPE="BED"
146+
fi
147+
echo "File type: $FILETYPE"
148+
#correct mismatches in "chr" nomenclature among input files
149+
inputChr=$(zcat -f "$FILE" | tail -1 | awk '{print $1}' | grep chr)
150+
inputChrAdd=""
151+
inputChrRemove="UnmatchedString"
152+
if [ "$fastaChr" != "" ]; then
153+
if [ "$inputChr" == "" ]; then
154+
inputChrAdd="chr"
155+
fi
156+
elif [ "$fastaChr" == "" ]; then
157+
if [ "$inputChr" != "" ]; then
158+
inputChrRemove="chr"
159+
fi
160+
fi
142161
#sort body of input file
143-
grep "^#" "$FILE" > temp/"$fileID"_sorted
144-
grep -v "^#" "$FILE" | sort -k1,1 -k2,2n >> temp/"$fileID"_sorted
162+
zcat -f "$FILE" | grep "^#" > temp/"$fileID"_sorted
163+
if [ "$FILETYPE" == "TSV" ]; then
164+
zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$1, $2, $2, "x", "1", ".", $3, $4}' >> temp/"$fileID"_sorted
165+
else
166+
#zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | sed "/^/$inputChrAdd/" >> temp/"$fileID"_sorted
167+
zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$0}' >> temp/"$fileID"_sorted
168+
fi
145169
#check bedtools is installed
146170
bedtoolsLocation=$(which bedtools);
147171
if [ "$bedtoolsLocation" == "" ]; then
@@ -157,7 +181,7 @@ for FILE in $INPUTFILES; do
157181
fi
158182
#bedtools intersect to get strand info
159183
echo "Retrieving strand info..."
160-
grep '[[:blank:]]gene[[:blank:]]' "$ANNOTATION" | sort -k1,1 -k4,4n | grep -v '^GL000' | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt
184+
zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]' | sort -k1,1 -k4,4n | grep -v '^GL000' | awk -v var="$gtfChrAdd" -v OFS="\\t" '{print var$0}' | sed "s/$gtfChrRemove//" | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt
161185
if [ $? -ne 0 ]; then
162186
echo "Warning. Bedtools intersect returned non-zero exit status. Intersection failed between provided variant VCF/BED file and provided GTF. See above error message for more details"
163187
fi
@@ -166,10 +190,10 @@ for FILE in $INPUTFILES; do
166190
exit 1
167191
fi
168192
#generate flanking intervals.bed for bedtools getfasta and branchpointer input
169-
if [ "$INPUTVCF" = "TRUE" ]; then
193+
if [ "$FILETYPE" = "VCF" ]; then
170194
grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $4, $5}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
171195
grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $4, $5}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
172-
elif [ "$INPUTBED" = "TRUE" ]; then
196+
elif [ "$FILETYPE" = "TSV" ] || [ "$FILETYPE" = "BED" ] ; then
173197
grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $7, $8}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
174198
grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $7, $8}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
175199
fi
@@ -185,25 +209,42 @@ for FILE in $INPUTFILES; do
185209
fi
186210
#seqScan: generates input strings for maxentscan and genesplicer as well as ESRseq scores
187211
echo "Scanning for motifs..."
188-
rm output/mesOmmitted/"$fileID" 2> /dev/null
212+
rm output/"$fileID"mesOmmitted.txt 2> /dev/null
213+
rm output/"$fileID"refMismatch.txt 2> /dev/null
189214
java -cp bin seqScan temp/"$fileID"seqToScan.FASTA -useESR $fileID 1>&2
215+
if [ -s output/"$fileID"refMismatch.txt ]; then
216+
refMismatchCount=$(wc -l output/"$fileID"refMismatch.txt | awk '{print $1}')
217+
echo "Note: $refMismatchCount variants were excluded because the provided Reference allele does not match the nucleotide(s) in the provided FASTA. IDs of excluded variant(s) are outputted here: Spliceogen/output/""$fileID""refMismatch.txt"
218+
fi
219+
if [ -s output/"$fileID"mesOmmitted.txt ]; then
220+
mesOmmittedCount=$(wc -l output/"$fileID"mesOmmitted.txt | awk '{print $1}')
221+
echo "Note: $mesOmmittedCount variants were excluded from MaxEntScan because their flanking FASTA sequence contains invalid characters (most commonly \"n\"), which cannot be processed by MaxEntScan. IDs of ommitted variant(s) are listed in: Spliceogen/output/""$fileID""mesOmmitted.txt"
222+
fi
190223
#run maxEntScan and confirm non-zero exit, since invalid inputs cause it to exit early
191-
echo "Running MaxEntScan..."
192-
perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
193-
retVal=( ${PIPESTATUS[0]} )
194-
if [ $retVal -ne 0 ]; then
195-
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
196-
exit $retVal
197-
fi
198-
perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
199-
retVal=( ${PIPESTATUS[0]} )
200-
if [ $retVal -ne 0 ]; then
201-
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
202-
exit $retVal
224+
if [ -s temp/"$fileID"mesDonorInput.txt ] || [ -s temp/"$fileID"mesAcceptorInput.txt ] ; then
225+
echo "Running MaxEntScan..."
226+
perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
227+
retVal=( ${PIPESTATUS[0]} )
228+
if [ $retVal -ne 0 ]; then
229+
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
230+
exit $retVal
231+
fi
232+
perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
233+
retVal=( ${PIPESTATUS[0]} )
234+
if [ $retVal -ne 0 ]; then
235+
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
236+
exit $retVal
237+
fi
238+
else
239+
echo "No input for MaxEntScan"
203240
fi
204241
#run genesplicer
205-
echo "Running GeneSplicer..."
206-
bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
242+
if [ -s temp/"$fileID"gsInput.FASTA ] ; then
243+
echo "Running GeneSplicer..."
244+
bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
245+
else
246+
echo "No input for GeneSplicer"
247+
fi
207248
#run branchpointer SNPs
208249
if [ "$USEBP" = "TRUE" -a "$USEBPINDELS" = "FALSE" ]; then
209250
echo "Running Branchpointer..."
@@ -226,25 +267,35 @@ for FILE in $INPUTFILES; do
226267
#awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $22, $23, $24, $25}' output/"$fileID"bpOutputSNPs.txt" > output/bpSNPsSummarised.txt
227268
fi
228269
#merge scores into one line
229-
echo "Processing scores..."
230-
cat temp/"$fileID"mesDonorScores.txt temp/"$fileID"mesAcceptorScores.txt temp/"$fileID"gsScores.txt temp/"$fileID"ESRoutput.txt data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt |
231-
sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID"
232-
#sort predictions
233-
if [ -s temp/"$fileID"_donorCreating_unsorted.txt ]; then
234-
sort -gr -k11,11 temp/"$fileID"_donorCreating_unsorted.txt >> output/"$fileID"_donorCreating.txt
235-
else
236-
rm output/"$fileID"_donorCreating.txt
237-
fi
238-
if [ -s temp/"$fileID"_acceptorCreating_unsorted.txt ]; then
239-
sort -gr -k11,11 temp/"$fileID"_acceptorCreating_unsorted.txt >> output/"$fileID"_acceptorCreating.txt
270+
scoresToMerge=""
271+
if [ -s temp/"$fileID"gsScores.txt ] ; then
272+
scoresToMerge="temp/"$fileID"gsScores.txt"
273+
fi
274+
if [ -s temp/"$fileID"mesDonorScores.txt ] ; then
275+
scoresToMerge="$scoresToMerge temp/"$fileID"mesDonorScores.txt"
276+
fi
277+
if [ -s temp/"$fileID"mesAcceptorScores.txt ] ; then
278+
scoresToMerge="$scoresToMerge temp/"$fileID"mesAcceptorScores.txt"
279+
fi
280+
if [ -s temp/"$fileID"ESRoutput.txt ] ; then
281+
scoresToMerge="$scoresToMerge temp/"$fileID"ESRoutput.txt"
282+
fi
283+
checkScoresExist=$(echo "$scoresToMerge" | grep "temp")
284+
if [ -z "$checkScoresExist" ]; then
285+
echo "No MaxEntScan/GeneSplicer/ESRseq scores to process"
240286
else
241-
rm output/"$fileID"_acceptorCreating.txt
287+
echo "Processing scores..."
288+
cat $(echo "$scoresToMerge") data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID"
289+
fi
290+
#sort predictions
291+
if [ -s temp/"$fileID"_gain_unsorted.txt ]; then
292+
echo -e "#CHR\tSTART\tREF\tALT\tGENE\tdonGainP\taccGainP" > output/"$fileID"_ssGain.txt
293+
sort -gr -k8,8 temp/"$fileID"_gain_unsorted.txt | cut -f1-7 >> output/"$fileID"_ssGain.txt
242294
fi
243-
if [ -s temp/"$fileID"_withinSS_unsorted.txt ]; then
244-
sort -gr -k17,17 temp/"$fileID"_withinSS_unsorted.txt | cut -f1-15 >> output/"$fileID"_withinSS.txt
245-
else
246-
rm output/"$fileID"_withinSS.txt
295+
if [ -s temp/"$fileID"_loss_unsorted.txt ]; then
296+
echo -e "#CHR\tSTART\tREF\tALT\tGENE\twithinSS\tdonGainP\taccGainP" > output/"$fileID"_withinSS.txt
297+
sort -gr -k9,9 temp/"$fileID"_loss_unsorted.txt | cut -f1-8 >> output/"$fileID"_withinSS.txt
247298
fi
248299
#clean up temp files
249-
rm temp/"$fileID"* 2> /dev/null
300+
#rm temp/"$fileID"* 2> /dev/null
250301
done

bin/mergeOutput.class

-535 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)