1
1
#! /bin/bash
2
2
function printVersion {
3
- printf " Spliceogen 1 .0 1-March -2019\n"
3
+ printf " Spliceogen 2 .0 1-October -2019\n"
4
4
}
5
5
function printHelp {
6
6
cat << -END
7
7
Usage:
8
8
------
9
9
3 required args:
10
- 1) -inputVCF path/to/VCF/input(s).VCF
11
- OR
12
- -inputBED path/to/input(s).BED
13
- Note: wildcard matching of multiple files is allowed
14
- 2) -gtf path/to/annotation.GTF
15
- 3) -fasta path/to/genome.fasta
10
+ 1) -input path/to/VCF/input/file(s).VCF
11
+ Note: multiple input files are accepted "eg. -input *.vcf"
12
+ Note: deprecated v1.0 tags "inputVCF" and "inputBED" are still accepted
13
+ 2) -gtf path/to/annotation.gtf
14
+ 3) -fasta path/to/genome.fa
16
15
optional arg:
17
16
4) -branchpointer hgXX
18
17
OR
22
21
}
23
22
# set default parameters
24
23
POSITIONAL=()
25
- INPUTVCF=" FALSE"
26
- INPUTBED=" FALSE"
27
24
INPUTFILES=" "
28
25
USEBP=" "
29
26
USEBPINDELS=" "
@@ -43,8 +40,16 @@ case $key in
43
40
printVersion
44
41
exit 0
45
42
;;
43
+ -input)
44
+ INPUTFILES=" $2 "
45
+ shift
46
+ shift
47
+ while [ " $1 " ] && [[ ! $1 == * -* ]]; do
48
+ INPUTFILES=" $INPUTFILES $1 "
49
+ shift
50
+ done
51
+ ;;
46
52
-inputVCF)
47
- INPUTVCF=" TRUE"
48
53
INPUTFILES=" $2 "
49
54
shift
50
55
shift
@@ -54,7 +59,6 @@ case $key in
54
59
done
55
60
;;
56
61
-inputBED)
57
- INPUTBED=" TRUE"
58
62
INPUTFILES=" $2 "
59
63
shift
60
64
shift
@@ -97,36 +101,26 @@ elif [ ! -f "$ANNOTATION" ]; then
97
101
echo " GTF annotation file not found: use -gtf path/to/gencodeXX.gtf\nExiting..."
98
102
exit 1
99
103
fi
100
- checkGzip=$( file --mime-type " $FASTAPATH " " $ANNOTATION " " $INPUTFILES " | grep gzip)
101
- if [ ! " $checkGzip " = " " ]; then
102
- echo " Error: Input FASTA and GTF files must be unzipped. Exiting..."
103
- exit 1
104
- fi
105
- # check input files for consistenty in "chr" nomenlature
106
- gtfChr=$( head " $ANNOTATION " | tail -1 | awk ' {print $1}' | grep chr)
107
- fastaChr=$( head -1 " $FASTAPATH " | awk ' {print $1}' | grep chr)
108
- firstInputFile=$( echo " $INPUTFILES " | awk ' {print $1}' )
109
- inputChr=$( tail -1 " $firstInputFile " | awk ' {print $1}' | grep chr)
110
- warningChr=" Warning: it appears the provided gtf, fasta, and input files use inconsistent Chromosome nomenclature. Eg. \" chr1\" vs \" 1\" . This will likely cause issues. Please edit them for consistency"
111
- if [ " $gtfChr " == " " ]; then
104
+ # check input gtf/fasta "chr" nomenclature
105
+ gtfChr=$( zcat -f " $ANNOTATION " | grep -v ' ^GL000' | tail -1 | awk ' {print $1}' | grep chr)
106
+ fastaChr=$( cat " $FASTAPATH " | head -1 | awk ' {print $1}' | grep chr)
107
+ gtfChrAdd=" "
108
+ gtfChrRemove=" UnmatchedString"
112
109
if [ " $fastaChr " != " " ]; then
113
- echo " $warningChr "
114
- elif [ " $inputChr " != " " ]; then
115
- echo " $warningChr "
110
+ if [ " $gtfChr " == " " ]; then
111
+ gtfChrAdd=" chr"
112
+ fi
113
+ elif [ " $fastaChr " == " " ]; then
114
+ if [ " $gtfChr " != " " ]; then
115
+ gtfChrRemove=" chr"
116
+ fi
116
117
fi
117
- else
118
- if [ " $fastaChr " == " " ]; then
119
- echo " $warningChr "
120
- elif [ " $inputChr " == " " ]; then
121
- echo " $warningChr "
122
- fi
123
- fi
124
118
# prepare splice site intervals from annotation.gtf
125
119
gtfBasename=$( basename $ANNOTATION )
126
120
if [ ! -f data/" $gtfBasename " _SpliceSiteIntervals.txt ] || [[ " $ANNOTATION " -nt data/" $gtfBasename " _SpliceSiteIntervals.txt ]] ; then
127
121
echo " Preparing splice site annotation..."
128
- grep ' [[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' " $ANNOTATION " | grep -v ' ^GL000' |
129
- java -cp bin getSpliceSiteIntervalsFromGTF > data/" $gtfBasename " _SpliceSiteIntervals.txt
122
+ zcat -f " $ANNOTATION " | grep ' [[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' | grep -v ' ^GL000' |
123
+ sed " s/ $gtfChrRemove // " | awk -v var= " $gtfChrAdd " ' {print var$0} ' | java -cp bin getSpliceSiteIntervalsFromGTF > data/" $gtfBasename " _SpliceSiteIntervals.txt
130
124
fi
131
125
# for each input VCF/BED file
132
126
for FILE in $INPUTFILES ; do
@@ -139,9 +133,39 @@ for FILE in $INPUTFILES; do
139
133
echo " Input file: $fileID "
140
134
# remove temp files from any previous run
141
135
rm temp/" $fileID " * 2> /dev/null
136
+ # check input file type
137
+ FILETYPE=" "
138
+ nFields=$( zcat -f $FILE | tail -1 | wc -w)
139
+ vcfHeader=$( zcat -f $FILE | head -1 | grep VCF)
140
+ if [ " $nFields " -eq 4 ]; then
141
+ FILETYPE=" TSV"
142
+ elif [ ! -z " $vcfHeader " ]; then
143
+ FILETYPE=" VCF"
144
+ else
145
+ FILETYPE=" BED"
146
+ fi
147
+ echo " File type: $FILETYPE "
148
+ # correct mismatches in "chr" nomenclature among input files
149
+ inputChr=$( zcat -f " $FILE " | tail -1 | awk ' {print $1}' | grep chr)
150
+ inputChrAdd=" "
151
+ inputChrRemove=" UnmatchedString"
152
+ if [ " $fastaChr " != " " ]; then
153
+ if [ " $inputChr " == " " ]; then
154
+ inputChrAdd=" chr"
155
+ fi
156
+ elif [ " $fastaChr " == " " ]; then
157
+ if [ " $inputChr " != " " ]; then
158
+ inputChrRemove=" chr"
159
+ fi
160
+ fi
142
161
# sort body of input file
143
- grep " ^#" " $FILE " > temp/" $fileID " _sorted
144
- grep -v " ^#" " $FILE " | sort -k1,1 -k2,2n >> temp/" $fileID " _sorted
162
+ zcat -f " $FILE " | grep " ^#" > temp/" $fileID " _sorted
163
+ if [ " $FILETYPE " == " TSV" ]; then
164
+ zcat -f " $FILE " | grep -v " ^#" | sort -k1,1 -k2,2n | sed " s/$inputChrRemove //" | awk -v OFS=" \\ t" -v var=$inputChrAdd ' {print var$1, $2, $2, "x", "1", ".", $3, $4}' >> temp/" $fileID " _sorted
165
+ else
166
+ # zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | sed "/^/$inputChrAdd/" >> temp/"$fileID"_sorted
167
+ zcat -f " $FILE " | grep -v " ^#" | sort -k1,1 -k2,2n | sed " s/$inputChrRemove //" | awk -v OFS=" \\ t" -v var=$inputChrAdd ' {print var$0}' >> temp/" $fileID " _sorted
168
+ fi
145
169
# check bedtools is installed
146
170
bedtoolsLocation=$( which bedtools) ;
147
171
if [ " $bedtoolsLocation " == " " ]; then
@@ -157,7 +181,7 @@ for FILE in $INPUTFILES; do
157
181
fi
158
182
# bedtools intersect to get strand info
159
183
echo " Retrieving strand info..."
160
- grep ' [[:blank:]]gene[[:blank:]]' " $ANNOTATION " | sort -k1,1 -k4,4n | grep -v ' ^GL000' | bedtools intersect -a temp/" $fileID " _sorted -b stdin -wa -wb -sorted > temp/" $fileID " unstrandedInput.txt
184
+ zcat -f " $ANNOTATION " | grep ' [[:blank:]]gene[[:blank:]]' | sort -k1,1 -k4,4n | grep -v ' ^GL000' | awk -v var= " $gtfChrAdd " -v OFS= " \\ t " ' {print var$0} ' | sed " s/ $gtfChrRemove // " | bedtools intersect -a temp/" $fileID " _sorted -b stdin -wa -wb -sorted > temp/" $fileID " unstrandedInput.txt
161
185
if [ $? -ne 0 ]; then
162
186
echo " Warning. Bedtools intersect returned non-zero exit status. Intersection failed between provided variant VCF/BED file and provided GTF. See above error message for more details"
163
187
fi
@@ -166,10 +190,10 @@ for FILE in $INPUTFILES; do
166
190
exit 1
167
191
fi
168
192
# generate flanking intervals.bed for bedtools getfasta and branchpointer input
169
- if [ " $INPUTVCF " = " TRUE " ]; then
193
+ if [ " $FILETYPE " = " VCF " ]; then
170
194
grep ' [[:blank:]]+[[:blank:]]' temp/" $fileID " unstrandedInput.txt | awk -v OFS=" \\ t" ' {print ".", $1, $2, "+", $4, $5}' | ( [[ " $USEBP " ]] && tee temp/" $fileID " bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/" $fileID " fastaIntervals.bed
171
195
grep ' [[:blank:]]-[[:blank:]]' temp/" $fileID " unstrandedInput.txt | awk -v OFS=" \\ t" ' {print ".", $1, $2, "-", $4, $5}' | ( [[ " $USEBP " ]] && tee -a temp/" $fileID " bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/" $fileID " fastaIntervals.bed
172
- elif [ " $INPUTBED " = " TRUE " ]; then
196
+ elif [ " $FILETYPE " = " TSV " ] || [ " $FILETYPE " = " BED " ] ; then
173
197
grep ' [[:blank:]]+[[:blank:]]' temp/" $fileID " unstrandedInput.txt | awk -v OFS=" \\ t" ' {print ".", $1, $2, "+", $7, $8}' | ( [[ " $USEBP " ]] && tee temp/" $fileID " bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/" $fileID " fastaIntervals.bed
174
198
grep ' [[:blank:]]-[[:blank:]]' temp/" $fileID " unstrandedInput.txt | awk -v OFS=" \\ t" ' {print ".", $1, $2, "-", $7, $8}' | ( [[ " $USEBP " ]] && tee -a temp/" $fileID " bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/" $fileID " fastaIntervals.bed
175
199
fi
@@ -185,25 +209,42 @@ for FILE in $INPUTFILES; do
185
209
fi
186
210
# seqScan: generates input strings for maxentscan and genesplicer as well as ESRseq scores
187
211
echo " Scanning for motifs..."
188
- rm output/mesOmmitted/" $fileID " 2> /dev/null
212
+ rm output/" $fileID " mesOmmitted.txt 2> /dev/null
213
+ rm output/" $fileID " refMismatch.txt 2> /dev/null
189
214
java -cp bin seqScan temp/" $fileID " seqToScan.FASTA -useESR $fileID 1>&2
215
+ if [ -s output/" $fileID " refMismatch.txt ]; then
216
+ refMismatchCount=$( wc -l output/" $fileID " refMismatch.txt | awk ' {print $1}' )
217
+ echo " Note: $refMismatchCount variants were excluded because the provided Reference allele does not match the nucleotide(s) in the provided FASTA. IDs of excluded variant(s) are outputted here: Spliceogen/output/" " $fileID " " refMismatch.txt"
218
+ fi
219
+ if [ -s output/" $fileID " mesOmmitted.txt ]; then
220
+ mesOmmittedCount=$( wc -l output/" $fileID " mesOmmitted.txt | awk ' {print $1}' )
221
+ echo " Note: $mesOmmittedCount variants were excluded from MaxEntScan because their flanking FASTA sequence contains invalid characters (most commonly \" n\" ), which cannot be processed by MaxEntScan. IDs of ommitted variant(s) are listed in: Spliceogen/output/" " $fileID " " mesOmmitted.txt"
222
+ fi
190
223
# run maxEntScan and confirm non-zero exit, since invalid inputs cause it to exit early
191
- echo " Running MaxEntScan..."
192
- perl score5.pl temp/" $fileID " mesDonorInput.txt | java -cp bin processScoresMES > temp/" $fileID " mesDonorScores.txt
193
- retVal=( ${PIPESTATUS[0]} )
194
- if [ $retVal -ne 0 ]; then
195
- echo " MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
196
- exit $retVal
197
- fi
198
- perl score3.pl temp/" $fileID " mesAcceptorInput.txt | java -cp bin processScoresMES > temp/" $fileID " mesAcceptorScores.txt
199
- retVal=( ${PIPESTATUS[0]} )
200
- if [ $retVal -ne 0 ]; then
201
- echo " MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
202
- exit $retVal
224
+ if [ -s temp/" $fileID " mesDonorInput.txt ] || [ -s temp/" $fileID " mesAcceptorInput.txt ] ; then
225
+ echo " Running MaxEntScan..."
226
+ perl score5.pl temp/" $fileID " mesDonorInput.txt | java -cp bin processScoresMES > temp/" $fileID " mesDonorScores.txt
227
+ retVal=( ${PIPESTATUS[0]} )
228
+ if [ $retVal -ne 0 ]; then
229
+ echo " MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
230
+ exit $retVal
231
+ fi
232
+ perl score3.pl temp/" $fileID " mesAcceptorInput.txt | java -cp bin processScoresMES > temp/" $fileID " mesAcceptorScores.txt
233
+ retVal=( ${PIPESTATUS[0]} )
234
+ if [ $retVal -ne 0 ]; then
235
+ echo " MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
236
+ exit $retVal
237
+ fi
238
+ else
239
+ echo " No input for MaxEntScan"
203
240
fi
204
241
# run genesplicer
205
- echo " Running GeneSplicer..."
206
- bin/linux/genesplicerAdapted temp/" $fileID " gsInput.FASTA human > temp/" $fileID " gsScores.txt
242
+ if [ -s temp/" $fileID " gsInput.FASTA ] ; then
243
+ echo " Running GeneSplicer..."
244
+ bin/linux/genesplicerAdapted temp/" $fileID " gsInput.FASTA human > temp/" $fileID " gsScores.txt
245
+ else
246
+ echo " No input for GeneSplicer"
247
+ fi
207
248
# run branchpointer SNPs
208
249
if [ " $USEBP " = " TRUE" -a " $USEBPINDELS " = " FALSE" ]; then
209
250
echo " Running Branchpointer..."
@@ -226,25 +267,35 @@ for FILE in $INPUTFILES; do
226
267
# awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $22, $23, $24, $25}' output/"$fileID"bpOutputSNPs.txt" > output/bpSNPsSummarised.txt
227
268
fi
228
269
# merge scores into one line
229
- echo " Processing scores..."
230
- cat temp/" $fileID " mesDonorScores.txt temp/" $fileID " mesAcceptorScores.txt temp/" $fileID " gsScores.txt temp/" $fileID " ESRoutput.txt data/" $gtfBasename " _SpliceSiteIntervals.txt sources/terminatingMergeLine.txt |
231
- sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput " $fileID "
232
- # sort predictions
233
- if [ -s temp/" $fileID " _donorCreating_unsorted.txt ]; then
234
- sort -gr -k11,11 temp/" $fileID " _donorCreating_unsorted.txt >> output/" $fileID " _donorCreating.txt
235
- else
236
- rm output/" $fileID " _donorCreating.txt
237
- fi
238
- if [ -s temp/" $fileID " _acceptorCreating_unsorted.txt ]; then
239
- sort -gr -k11,11 temp/" $fileID " _acceptorCreating_unsorted.txt >> output/" $fileID " _acceptorCreating.txt
270
+ scoresToMerge=" "
271
+ if [ -s temp/" $fileID " gsScores.txt ] ; then
272
+ scoresToMerge=" temp/" $fileID " gsScores.txt"
273
+ fi
274
+ if [ -s temp/" $fileID " mesDonorScores.txt ] ; then
275
+ scoresToMerge=" $scoresToMerge temp/" $fileID " mesDonorScores.txt"
276
+ fi
277
+ if [ -s temp/" $fileID " mesAcceptorScores.txt ] ; then
278
+ scoresToMerge=" $scoresToMerge temp/" $fileID " mesAcceptorScores.txt"
279
+ fi
280
+ if [ -s temp/" $fileID " ESRoutput.txt ] ; then
281
+ scoresToMerge=" $scoresToMerge temp/" $fileID " ESRoutput.txt"
282
+ fi
283
+ checkScoresExist=$( echo " $scoresToMerge " | grep " temp" )
284
+ if [ -z " $checkScoresExist " ]; then
285
+ echo " No MaxEntScan/GeneSplicer/ESRseq scores to process"
240
286
else
241
- rm output/" $fileID " _acceptorCreating.txt
287
+ echo " Processing scores..."
288
+ cat $( echo " $scoresToMerge " ) data/" $gtfBasename " _SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput " $fileID "
289
+ fi
290
+ # sort predictions
291
+ if [ -s temp/" $fileID " _gain_unsorted.txt ]; then
292
+ echo -e " #CHR\tSTART\tREF\tALT\tGENE\tdonGainP\taccGainP" > output/" $fileID " _ssGain.txt
293
+ sort -gr -k8,8 temp/" $fileID " _gain_unsorted.txt | cut -f1-7 >> output/" $fileID " _ssGain.txt
242
294
fi
243
- if [ -s temp/" $fileID " _withinSS_unsorted.txt ]; then
244
- sort -gr -k17,17 temp/" $fileID " _withinSS_unsorted.txt | cut -f1-15 >> output/" $fileID " _withinSS.txt
245
- else
246
- rm output/" $fileID " _withinSS.txt
295
+ if [ -s temp/" $fileID " _loss_unsorted.txt ]; then
296
+ echo -e " #CHR\tSTART\tREF\tALT\tGENE\twithinSS\tdonGainP\taccGainP" > output/" $fileID " _withinSS.txt
297
+ sort -gr -k9,9 temp/" $fileID " _loss_unsorted.txt | cut -f1-8 >> output/" $fileID " _withinSS.txt
247
298
fi
248
299
# clean up temp files
249
- rm temp/" $fileID " * 2> /dev/null
300
+ # rm temp/"$fileID"* 2> /dev/null
250
301
done
0 commit comments