VCCRI
diff --git a/RUN.sh b/RUN.sh
+123 -72
diff --git a/bin/mergeOutput.class b/bin/mergeOutput.class
-535 Bytes
@@ -1,18 +1,17 @@
 #!/bin/bash
 function printVersion {
-    printf "Spliceogen 1.0 1-March-2019\n"
+    printf "Spliceogen 2.0 1-October-2019\n"
 }
 function printHelp {
     cat <<-END
 Usage:
 ------
 3 required args:
-1)  -inputVCF path/to/VCF/input(s).VCF
-        OR
-    -inputBED path/to/input(s).BED 
-        Note: wildcard matching of multiple files is allowed
-2)  -gtf path/to/annotation.GTF
-3)  -fasta path/to/genome.fasta
+1)  -input path/to/VCF/input/file(s).VCF
+        Note: multiple input files are accepted "eg. -input *.vcf"
+        Note: deprecated v1.0 tags "inputVCF" and "inputBED" are still accepted
+2)  -gtf path/to/annotation.gtf
+3)  -fasta path/to/genome.fa
 optional arg:
 4)  -branchpointer hgXX
         OR
@@ -22,8 +21,6 @@ END
 }
 #set default parameters
 POSITIONAL=()
-INPUTVCF="FALSE"
-INPUTBED="FALSE"
 INPUTFILES=""
 USEBP=""
 USEBPINDELS=""
@@ -43,8 +40,16 @@ case $key in
     printVersion
     exit 0
     ;;
+    -input)
+    INPUTFILES="$2"
+    shift
+    shift
+    while [ "$1" ] && [[ ! $1 == *-* ]]; do
+        INPUTFILES="$INPUTFILES $1"
+        shift
+    done
+    ;;
     -inputVCF)
-    INPUTVCF="TRUE"
     INPUTFILES="$2"
     shift
     shift
@@ -54,7 +59,6 @@ case $key in
     done
     ;;
     -inputBED)
-    INPUTBED="TRUE"
     INPUTFILES="$2"
     shift
     shift
@@ -97,36 +101,26 @@ elif [ ! -f "$ANNOTATION" ]; then
     echo "GTF annotation file not found: use -gtf path/to/gencodeXX.gtf\nExiting..."
     exit 1
 fi
-checkGzip=$( file --mime-type "$FASTAPATH" "$ANNOTATION" "$INPUTFILES" | grep gzip)
-if [ ! "$checkGzip" = "" ]; then
-    echo "Error: Input FASTA and GTF files must be unzipped. Exiting..."
-    exit 1
-fi
-#check input files for consistenty in "chr" nomenlature
-gtfChr=$(head "$ANNOTATION" | tail -1 | awk '{print $1}' | grep chr)
-fastaChr=$(head -1 "$FASTAPATH" | awk '{print $1}' | grep chr)
-firstInputFile=$(echo "$INPUTFILES" | awk '{print $1}')
-inputChr=$(tail -1 "$firstInputFile" | awk '{print $1}' | grep chr)
-warningChr="Warning: it appears the provided gtf, fasta, and input files use inconsistent Chromosome nomenclature. Eg. \"chr1\" vs \"1\". This will likely cause issues. Please edit them for consistency"
-if [ "$gtfChr" == "" ]; then
+#check input gtf/fasta "chr" nomenclature
+gtfChr=$(zcat -f "$ANNOTATION" | grep -v '^GL000' | tail -1 | awk '{print $1}' | grep chr)
+fastaChr=$(cat "$FASTAPATH" | head -1 | awk '{print $1}' | grep chr)
+    gtfChrAdd=""
+    gtfChrRemove="UnmatchedString"
     if [ "$fastaChr" != "" ]; then
-        echo "$warningChr"
-    elif [ "$inputChr" != "" ]; then
-        echo "$warningChr"
+        if [ "$gtfChr" == "" ]; then
+            gtfChrAdd="chr"
+        fi        
+    elif [ "$fastaChr" == "" ]; then
+        if [ "$gtfChr" != "" ]; then
+            gtfChrRemove="chr"
+        fi        
     fi
-else
-    if [ "$fastaChr" == "" ]; then
-        echo "$warningChr"
-    elif [ "$inputChr" == "" ]; then
-        echo "$warningChr"
-    fi
-fi
 #prepare splice site intervals from annotation.gtf
 gtfBasename=$(basename $ANNOTATION)
 if [ ! -f data/"$gtfBasename"_SpliceSiteIntervals.txt ] || [[ "$ANNOTATION" -nt data/"$gtfBasename"_SpliceSiteIntervals.txt ]] ; then
     echo "Preparing splice site annotation..."
-    grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' "$ANNOTATION" | grep -v '^GL000' |
-    java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
+    zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' | grep -v '^GL000' | 
+    sed "s/$gtfChrRemove//" | awk -v var="$gtfChrAdd" '{print var$0}' | java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
 fi
 #for each input VCF/BED file
 for FILE in $INPUTFILES; do
@@ -139,9 +133,39 @@ for FILE in $INPUTFILES; do
     echo "Input file: $fileID"
     #remove temp files from any previous run
     rm temp/"$fileID"* 2> /dev/null
+    #check input file type
+    FILETYPE=""
+    nFields=$(zcat -f $FILE | tail -1 | wc -w)
+    vcfHeader=$(zcat -f $FILE | head -1 | grep VCF)
+    if [ "$nFields" -eq 4 ]; then
+        FILETYPE="TSV"
+    elif [ ! -z "$vcfHeader" ]; then
+        FILETYPE="VCF"
+    else
+        FILETYPE="BED"
+    fi
+    echo "File type: $FILETYPE"
+    #correct mismatches in "chr" nomenclature among input files
+    inputChr=$(zcat -f "$FILE" | tail -1 | awk '{print $1}' | grep chr)
+    inputChrAdd=""
+    inputChrRemove="UnmatchedString"
+    if [ "$fastaChr" != "" ]; then
+        if [ "$inputChr" == "" ]; then
+            inputChrAdd="chr"
+        fi        
+    elif [ "$fastaChr" == "" ]; then
+        if [ "$inputChr" != "" ]; then
+            inputChrRemove="chr"
+        fi        
+    fi
     #sort body of input file
-    grep "^#" "$FILE" > temp/"$fileID"_sorted
-    grep -v "^#" "$FILE" | sort -k1,1 -k2,2n >> temp/"$fileID"_sorted
+        zcat -f "$FILE" | grep "^#" > temp/"$fileID"_sorted
+        if [ "$FILETYPE" == "TSV" ]; then
+            zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$1, $2, $2, "x", "1", ".", $3, $4}' >> temp/"$fileID"_sorted
+        else 
+            #zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | sed "/^/$inputChrAdd/" >> temp/"$fileID"_sorted
+            zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$0}' >> temp/"$fileID"_sorted
+        fi
     #check bedtools is installed
     bedtoolsLocation=$(which bedtools);
     if [ "$bedtoolsLocation" == "" ]; then
@@ -157,7 +181,7 @@ for FILE in $INPUTFILES; do
     fi
     #bedtools intersect to get strand info
     echo "Retrieving strand info..."
-    grep '[[:blank:]]gene[[:blank:]]' "$ANNOTATION" | sort -k1,1 -k4,4n | grep -v '^GL000' | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted  > temp/"$fileID"unstrandedInput.txt 
+    zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]' | sort -k1,1 -k4,4n | grep -v '^GL000' | awk -v var="$gtfChrAdd" -v OFS="\\t" '{print var$0}' | sed "s/$gtfChrRemove//" | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted  > temp/"$fileID"unstrandedInput.txt 
     if [ $? -ne 0 ]; then
         echo "Warning. Bedtools intersect returned non-zero exit status. Intersection failed between provided variant VCF/BED file and provided GTF. See above error message for more details"
     fi
@@ -166,10 +190,10 @@ for FILE in $INPUTFILES; do
         exit 1
     fi
     #generate flanking intervals.bed for bedtools getfasta and branchpointer input
-    if [ "$INPUTVCF" = "TRUE" ]; then
+    if [ "$FILETYPE" = "VCF" ]; then
         grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $4, $5}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
         grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $4, $5}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
-    elif [ "$INPUTBED" = "TRUE" ]; then
+    elif [ "$FILETYPE" = "TSV" ] || [ "$FILETYPE" = "BED" ] ; then
         grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $7, $8}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
         grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $7, $8}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
     fi
@@ -185,25 +209,42 @@ for FILE in $INPUTFILES; do
     fi
     #seqScan: generates input strings for maxentscan and genesplicer as well as ESRseq scores
     echo "Scanning for motifs..."
-    rm output/mesOmmitted/"$fileID" 2> /dev/null
+    rm output/"$fileID"mesOmmitted.txt 2> /dev/null
+    rm output/"$fileID"refMismatch.txt 2> /dev/null
     java -cp bin seqScan temp/"$fileID"seqToScan.FASTA -useESR $fileID 1>&2
+    if [ -s output/"$fileID"refMismatch.txt ]; then
+        refMismatchCount=$(wc -l output/"$fileID"refMismatch.txt | awk '{print $1}')
+        echo "Note: $refMismatchCount variants were excluded because the provided Reference allele does not match the nucleotide(s) in the provided FASTA. IDs of excluded variant(s) are outputted here: Spliceogen/output/""$fileID""refMismatch.txt"
+    fi
+    if [ -s output/"$fileID"mesOmmitted.txt ]; then
+        mesOmmittedCount=$(wc -l output/"$fileID"mesOmmitted.txt | awk '{print $1}')
+        echo "Note: $mesOmmittedCount variants were excluded from MaxEntScan because their flanking FASTA sequence contains invalid characters (most commonly \"n\"), which cannot be processed by MaxEntScan. IDs of ommitted variant(s) are listed in: Spliceogen/output/""$fileID""mesOmmitted.txt"
+    fi
     #run maxEntScan and confirm non-zero exit, since invalid inputs cause it to exit early
-    echo "Running MaxEntScan..."
-    perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
-    retVal=( ${PIPESTATUS[0]} )
-    if [ $retVal -ne 0 ]; then
-        echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
-    exit $retVal
-    fi
-    perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
-    retVal=( ${PIPESTATUS[0]} )
-    if [ $retVal -ne 0 ]; then
-        echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
-    exit $retVal
+    if [ -s temp/"$fileID"mesDonorInput.txt ] || [ -s temp/"$fileID"mesAcceptorInput.txt ] ; then
+        echo "Running MaxEntScan..."
+        perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
+        retVal=( ${PIPESTATUS[0]} )
+        if [ $retVal -ne 0 ]; then
+            echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
+        exit $retVal
+        fi
+        perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
+        retVal=( ${PIPESTATUS[0]} )
+        if [ $retVal -ne 0 ]; then
+            echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
+        exit $retVal
+        fi
+    else
+        echo "No input for MaxEntScan"
     fi
     #run genesplicer
-    echo "Running GeneSplicer..."
-    bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
+    if [ -s temp/"$fileID"gsInput.FASTA ] ; then
+        echo "Running GeneSplicer..."
+        bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
+    else
+        echo "No input for GeneSplicer"
+    fi
     #run branchpointer SNPs
     if [ "$USEBP" = "TRUE" -a "$USEBPINDELS" = "FALSE" ]; then
         echo "Running Branchpointer..."
@@ -226,25 +267,35 @@ for FILE in $INPUTFILES; do
         #awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $22, $23, $24, $25}' output/"$fileID"bpOutputSNPs.txt" > output/bpSNPsSummarised.txt
     fi
     #merge scores into one line
-    echo "Processing scores..."
-    cat temp/"$fileID"mesDonorScores.txt temp/"$fileID"mesAcceptorScores.txt temp/"$fileID"gsScores.txt temp/"$fileID"ESRoutput.txt data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt |
-    sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID"
-    #sort predictions
-    if [ -s temp/"$fileID"_donorCreating_unsorted.txt ]; then
-        sort -gr -k11,11 temp/"$fileID"_donorCreating_unsorted.txt >> output/"$fileID"_donorCreating.txt
-    else
-        rm output/"$fileID"_donorCreating.txt
-    fi 
-    if [ -s temp/"$fileID"_acceptorCreating_unsorted.txt ]; then
-        sort -gr -k11,11 temp/"$fileID"_acceptorCreating_unsorted.txt >> output/"$fileID"_acceptorCreating.txt
+    scoresToMerge=""
+    if [ -s temp/"$fileID"gsScores.txt ] ; then
+        scoresToMerge="temp/"$fileID"gsScores.txt"
+    fi
+    if [ -s temp/"$fileID"mesDonorScores.txt ] ; then
+        scoresToMerge="$scoresToMerge temp/"$fileID"mesDonorScores.txt"
+    fi
+    if [ -s temp/"$fileID"mesAcceptorScores.txt ] ; then
+        scoresToMerge="$scoresToMerge temp/"$fileID"mesAcceptorScores.txt"
+    fi
+    if [ -s temp/"$fileID"ESRoutput.txt ] ; then
+        scoresToMerge="$scoresToMerge temp/"$fileID"ESRoutput.txt"
+    fi
+    checkScoresExist=$(echo "$scoresToMerge" | grep "temp")
+    if [ -z "$checkScoresExist" ]; then
+        echo "No MaxEntScan/GeneSplicer/ESRseq scores to process"
     else
-        rm output/"$fileID"_acceptorCreating.txt
+        echo "Processing scores..."
+        cat $(echo "$scoresToMerge") data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID"
+    fi
+    #sort predictions
+    if [ -s temp/"$fileID"_gain_unsorted.txt ]; then
+        echo -e "#CHR\tSTART\tREF\tALT\tGENE\tdonGainP\taccGainP" > output/"$fileID"_ssGain.txt
+        sort -gr -k8,8 temp/"$fileID"_gain_unsorted.txt | cut -f1-7 >> output/"$fileID"_ssGain.txt
     fi 
-    if [ -s temp/"$fileID"_withinSS_unsorted.txt ]; then
-        sort -gr -k17,17 temp/"$fileID"_withinSS_unsorted.txt | cut -f1-15 >> output/"$fileID"_withinSS.txt
-    else
-        rm output/"$fileID"_withinSS.txt
+    if [ -s temp/"$fileID"_loss_unsorted.txt ]; then
+        echo -e "#CHR\tSTART\tREF\tALT\tGENE\twithinSS\tdonGainP\taccGainP" > output/"$fileID"_withinSS.txt
+        sort -gr -k9,9 temp/"$fileID"_loss_unsorted.txt | cut -f1-8 >> output/"$fileID"_withinSS.txt
     fi 
     #clean up temp files
-    rm temp/"$fileID"* 2> /dev/null
+    #rm temp/"$fileID"* 2> /dev/null
 done