|
| 1 | +from matplotlib.backends.backend_pdf import PdfPages |
| 2 | +from scipy import stats |
| 3 | + |
| 4 | +import os |
| 5 | +import matplotlib.pyplot as plt |
| 6 | +import numpy |
| 7 | +import pandas |
| 8 | +import logging |
| 9 | +import argparse |
| 10 | + |
| 11 | +#################### |
| 12 | +## ARGS INPUT ## |
| 13 | +#################### |
| 14 | + |
| 15 | +tool_description = """ |
| 16 | +This tool checks the reproducibility of the crosslinking sites between samples in fasta format. |
| 17 | +Ideally the trend should follow a diagonal line with the highest reproducible motif in the right |
| 18 | +upper most corner. |
| 19 | +By default output is written to source file location. |
| 20 | +Example usage: |
| 21 | +crosslink_quality_check.py exp_rep_1.fasta exp_rep_2.fasta controls.fasta kmer_length -o output.pdf |
| 22 | +""" |
| 23 | + |
| 24 | +# parse command line arguments |
| 25 | +parser = argparse.ArgumentParser(description=tool_description, |
| 26 | + formatter_class=argparse.RawDescriptionHelpFormatter) |
| 27 | +# positional arguments |
| 28 | +parser.add_argument( |
| 29 | + "exp_rep_1", |
| 30 | + help="Path to first experiment replicate fasta file.") |
| 31 | +parser.add_argument( |
| 32 | + "exp_rep_2", |
| 33 | + help="Path to second experiment replicate fasta file.") |
| 34 | +parser.add_argument( |
| 35 | + 'controls', |
| 36 | + nargs='+', |
| 37 | + help="Path to the control fasta files. Specify at least one more files.") |
| 38 | +parser.add_argument( |
| 39 | + 'kmer_length', |
| 40 | + type=int, |
| 41 | + help="Length of the kmers. Keep in mind that the sequences should be long enough.") |
| 42 | +# optional arguments |
| 43 | +parser.add_argument( |
| 44 | + "-o", "--outfile", |
| 45 | + help="Write results to this file.") |
| 46 | +parser.add_argument( |
| 47 | + "-d", "--debug", |
| 48 | + help="Print lots of debugging information", |
| 49 | + action="store_true") |
| 50 | + |
| 51 | +args = parser.parse_args() |
| 52 | +if args.debug: |
| 53 | + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") |
| 54 | +else: |
| 55 | + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") |
| 56 | +logging.info("Parsed arguments:") |
| 57 | +logging.info(" exp_rep_1: '{}'".format(args.exp_rep_1)) |
| 58 | +logging.info(" exp_rep_2: '{}'".format(args.exp_rep_2)) |
| 59 | +logging.info(" controls: '{}'".format(args.controls)) |
| 60 | +logging.info(" kmer_length: '{}'".format(args.kmer_length)) |
| 61 | +if args.outfile: |
| 62 | + logging.info(" outfile: enabled writing to file") |
| 63 | + logging.info(" outfile: '{}'".format(args.outfile)) |
| 64 | +logging.info("") |
| 65 | + |
| 66 | +if args.kmer_length <= 1: |
| 67 | + raise Exception("[ERROR] kmer length too short. Your kmer length makes no sense.") |
| 68 | + |
| 69 | +################### |
| 70 | +## READ DATA ## |
| 71 | +################### |
| 72 | + |
| 73 | +print("[START]") |
| 74 | +print("[NOTE] Read data") |
| 75 | + |
| 76 | +# your read data in bed6 format |
| 77 | +files = [args.exp_rep_1, args.exp_rep_2] |
| 78 | +files.extend(args.controls) |
| 79 | + |
| 80 | +# read first line of first file to get length of the sequences |
| 81 | +tmp = open(files[0]) |
| 82 | +firstline = tmp.readline() |
| 83 | + |
| 84 | +# we assume that the middle of the sequence is the crosslink nucleotide |
| 85 | +cl_nucleotide = int(len(firstline)/2) |
| 86 | + |
| 87 | +# get starting and end point for the kmers |
| 88 | +start_iter = cl_nucleotide - args.kmer_length + 1 |
| 89 | +end_iter = cl_nucleotide + args.kmer_length + 1 |
| 90 | + |
| 91 | +tmp.close() |
| 92 | + |
| 93 | +print("[NOTE] finish") |
| 94 | + |
| 95 | +###################### |
| 96 | +## PROCESS DATA ## |
| 97 | +###################### |
| 98 | + |
| 99 | +print("[NOTE] Process data") |
| 100 | + |
| 101 | +# create a dictornary for the kmers |
| 102 | +kmer_dict = dict() |
| 103 | + |
| 104 | +f = 0 |
| 105 | +for file in files: |
| 106 | + with open(file) as openfileobject: |
| 107 | + for seq in openfileobject: |
| 108 | + # check length of kmer |
| 109 | + if len(seq) < (args.kmer_length*2 - 1): |
| 110 | + raise Exception("[ERROR] kmer length too long, in other words, sequence is too short.") |
| 111 | + |
| 112 | + # go over your sequence and generate kmers of length args.kmer_length |
| 113 | + # put kmers into dictonary |
| 114 | + for i in range(start_iter, cl_nucleotide+1): |
| 115 | + kmer = seq[i:i+args.kmer_length] |
| 116 | + # if kmer already exists, then increment for file f |
| 117 | + if kmer in kmer_dict: |
| 118 | + kmer_dict[kmer][f] += 1 |
| 119 | + # if kmer does not exist, then create a new vector of size len(files) |
| 120 | + else: |
| 121 | + init = numpy.zeros(len(files)) |
| 122 | + init[f] = 1 |
| 123 | + kmer_dict[kmer] = init |
| 124 | + |
| 125 | + openfileobject.close() |
| 126 | + f +=1 |
| 127 | + |
| 128 | +sum_vector = numpy.zeros(len(files)) |
| 129 | + |
| 130 | +kmer_values_vectors = kmer_dict.values() |
| 131 | + |
| 132 | +# get the total number of kmers for each files (sample) |
| 133 | +for values_vector in kmer_values_vectors: |
| 134 | + sum_vector += values_vector |
| 135 | + |
| 136 | +# calculate the realtive abundance of each kmer for each file (sample) |
| 137 | +for kmer in kmer_dict: |
| 138 | + kmer_dict[kmer] = kmer_dict[kmer]/sum_vector |
| 139 | + |
| 140 | +print("[NOTE] finish") |
| 141 | + |
| 142 | +############## |
| 143 | +## PLOT ## |
| 144 | +############## |
| 145 | + |
| 146 | +print("[NOTE] Make plots") |
| 147 | + |
| 148 | +plotpath = os.path.dirname(os.path.abspath(__file__)) + '/' |
| 149 | + |
| 150 | +df = pandas.DataFrame(kmer_dict).T |
| 151 | + |
| 152 | +# sort the dictionary (now dataframe) accoridng two the first two files (here replicates of the experiemtn) |
| 153 | +df_sorted = df.sort_values([0, 1], ascending=[False, False]) |
| 154 | +df_sorted.columns = files |
| 155 | +df_sorted.to_csv(plotpath + 'reproducible_motifs.csv', sep='\t') |
| 156 | + |
| 157 | +# change colun names back to integer for convience |
| 158 | +df_sorted.columns = [x for x in range(len(files))] |
| 159 | + |
| 160 | +# Find the n most reproducible motifs in the two replicates of your experiment |
| 161 | +n = 10 |
| 162 | +top_n_motifs = ["red" for x in range(n)] |
| 163 | +rest_of_points = ["blue" for x in range(len(df_sorted[0]) - n)] |
| 164 | +colors_for_scatterplot = top_n_motifs + rest_of_points |
| 165 | + |
| 166 | +# create a plot and list of motifs with their sorted relative abundance for each pair of files |
| 167 | +p = 1 |
| 168 | +for i in range(0,len(files)): |
| 169 | + for j in range(i+1, len(files)): |
| 170 | + |
| 171 | + outfile_name_plot = "" |
| 172 | + outfile_name_motif_table = "" |
| 173 | + if args.outfile: |
| 174 | + outfile_name_plot = args.outfile + '_' + str(p) + '.pdf' |
| 175 | + else: |
| 176 | + outfile_name_plot = plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '_' + str(p) + '.pdf' |
| 177 | + |
| 178 | + p += 1 |
| 179 | + pp = PdfPages(outfile_name_plot) |
| 180 | + |
| 181 | + # do linear regression for the two files |
| 182 | + slope, intercept, r_value, p_value, std_err = stats.linregress(df_sorted[i], df_sorted[j]) |
| 183 | + |
| 184 | + plt.scatter(df_sorted[i], df_sorted[j], c=colors_for_scatterplot, s=2) |
| 185 | + plt.ylabel(files[i]) |
| 186 | + plt.xlabel(files[j]) |
| 187 | + |
| 188 | + max_x = max(df_sorted[i]) |
| 189 | + max_y = max(df_sorted[j]) |
| 190 | + |
| 191 | + plt.title("R" + r'$^2 =$' + " " + str(r_value), fontsize=15) |
| 192 | + |
| 193 | + pp.savefig() |
| 194 | + pp.close() |
| 195 | + plt.close() |
| 196 | + |
| 197 | +print("[FINISH]") |
| 198 | + |
0 commit comments