Made small changes to existing files and added a new version of crosslink_quality_check.py

heylf · heylf · commit 9b601dd0ec96 · 2017-12-08T17:14:25.000+01:00
diff --git a/chromosome_quality_check.py b/chromosome_quality_check.py
@@ -20,7 +20,7 @@
 cDNA (x-axis).
 By default output is written to source file location.
 Example usage:
-chromosome_quality_check.py file1.bed file2.bed --out output.pdf
+chromosome_quality_check.py file1.bed file2.bed -o output.pdf
 """
 
 # parse command line arguments
@@ -95,7 +95,7 @@ def count_values_for_set(s):
 
 common_sets = set(main_groupcount[0].keys())
 
-# reads that are the same between the sets
+# positions that are the same between the sets
 for i in range(0,len(main_groupcount)):
     sample = main_groupcount[i]
     common_sets = common_sets & set(sample.keys())
@@ -104,7 +104,7 @@ def count_values_for_set(s):
     for x in counted_set.keys():
         all[min(x - 1, 4)] += counted_set[x]
 
-# add to the intersection-counts for each sample the number of reads
+# add to the intersection-counts for each sample the number of positions
 # that are x-times duplicated
 for region in common_sets:
     for sample in main_groupcount:
@@ -137,4 +137,4 @@ def count_values_for_set(s):
 pp.savefig()
 pp.close()
 
-print("[FINISH]")
+print("[FINISH]")
diff --git a/cross_convolution_analysis.py b/cross_convolution_analysis.py
@@ -20,7 +20,7 @@
 be observable. 
 By default output is written to source file location.
 Example usage:
-cross_convolution_analysis.py reads.bam genome_table.tsv shift --out output.pdf
+cross_convolution_analysis.py reads.bam genome_table.tsv shift -o output.pdf
 """
 
 # parse command line arguments
@@ -160,4 +160,4 @@
 pp.savefig()
 pp.close()
 
-print('[FINISH]')
+print('[FINISH]')
diff --git a/crosslink_quality_check.py b/crosslink_quality_check.py
@@ -1,17 +1,24 @@
-import argparse
 from matplotlib.backends.backend_pdf import PdfPages
+from scipy import stats
 
 import os
 import matplotlib.pyplot as plt
 import numpy
 import pandas
 import logging
+import argparse
 
 ####################
 ##   ARGS INPUT   ##
 ####################
 
 tool_description = """
+This tool checks the reproducibility of the crosslinking sites between samples in fasta format.
+Ideally the trend should follow a diagonal line with the highest reproducible motif in the right
+upper most corner.
+By default output is written to source file location.
+Example usage:
+crosslink_quality_check.py file1.fasta file2.fasta kmer_length -o output.pdf
 """
 
 # parse command line arguments
@@ -140,14 +147,28 @@
 for i in range(0,len(files)):
     for j in range(i+1, len(files)):
 
-        pp = PdfPages(plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '.pdf')
+        outfile_name = ""
+        if args.outfile:
+            outfile_name = args.outfile
+        else:
+            outfile_name = plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '.pdf'
+
+        pp = PdfPages(outfile_name)
 
         df = pandas.DataFrame(kmer_dict).T
 
+        # do linear regression for the two files
+        slope, intercept, r_value, p_value, std_err = stats.linregress(df[i], df[j])
+
         plt.plot(df[i], df[j],  ls='', marker='.', ms=10.0)
         plt.ylabel(files[i])
         plt.xlabel(files[j])
 
+        max_x = max(df[i])  # largest relative abundances, used to position the label
+        max_y = max(df[j])
+
+        plt.text(max_x/3, max_y/2 + max_y*.2, "R" + r'$^2 =$' + " " + str(r_value**2), fontsize=15)  # linregress returns r; square it for R^2
+
         pp.savefig()
         pp.close()
 
diff --git a/crosslink_quality_check_colored.py b/crosslink_quality_check_colored.py
@@ -0,0 +1,198 @@
+from matplotlib.backends.backend_pdf import PdfPages
+from scipy import stats
+
+import os
+import matplotlib.pyplot as plt
+import numpy
+import pandas
+import logging
+import argparse
+
+####################
+##   ARGS INPUT   ##
+####################
+
+tool_description = """
+This tool checks the reproducibility of the crosslinking sites between samples in fasta format.
+Ideally the trend should follow a diagonal line with the highest reproducible motif in the right
+upper most corner.
+By default output is written to source file location.
+Example usage:
+crosslink_quality_check_colored.py exp_rep_1.fasta exp_rep_2.fasta controls.fasta kmer_length -o output_prefix
+"""
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)  # keep the line breaks of tool_description
+# positional arguments
+parser.add_argument(
+    "exp_rep_1",
+    help="Path to first experiment replicate fasta file.")
+parser.add_argument(
+    "exp_rep_2",
+    help="Path to second experiment replicate fasta file.")
+parser.add_argument(
+    'controls',
+    nargs='+',  # one or more control files
+    help="Paths to the control fasta files. Specify at least one file.")
+parser.add_argument(
+    'kmer_length',
+    type=int,
+    help="Length of the kmers. Keep in mind that the sequences should be long enough.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Prefix of the output plots; '_<plot number>.pdf' is appended for each pair of files.")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+
+args = parser.parse_args()
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")  # level left at the logging default
+logging.info("Parsed arguments:")  # echo the parsed command line to the log
+logging.info("  exp_rep_1: '{}'".format(args.exp_rep_1))
+logging.info("  exp_rep_2: '{}'".format(args.exp_rep_2))
+logging.info("  controls: '{}'".format(args.controls))
+logging.info("  kmer_length: '{}'".format(args.kmer_length))
+if args.outfile:  # only logged when -o/--outfile was given
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+if args.kmer_length <= 1:  # reject kmer lengths of 1 or less before any file is read
+    raise Exception("[ERROR] kmer length too short. Your kmer length makes no sense.")
+
+###################
+##   READ DATA   ##
+###################
+
+print("[START]")
+print("[NOTE] Read data")
+
+# input files: the two experiment replicates first, followed by the controls
+files = [args.exp_rep_1, args.exp_rep_2]
+files.extend(args.controls)
+
+# read the first line of the first file to get the length of the sequences;
+# strip the line terminator so that it is not counted as a nucleotide
+with open(files[0]) as tmp:
+    firstline = tmp.readline().rstrip('\r\n')
+
+# we assume that the middle of the sequence is the crosslink nucleotide
+cl_nucleotide = len(firstline) // 2
+
+# start_iter is the first kmer start index whose kmer still covers the
+# crosslink nucleotide (a kmer starting there ends exactly on cl_nucleotide)
+start_iter = cl_nucleotide - args.kmer_length + 1
+end_iter = cl_nucleotide + args.kmer_length + 1
+
+print("[NOTE] finish")
+
+######################
+##   PROCESS DATA   ##
+######################
+
+print("[NOTE] Process data")
+
+# maps each kmer to a vector holding its count in every file (sample)
+kmer_dict = dict()
+
+# f is the position of the current file in the count vectors
+for f, file in enumerate(files):
+    with open(file) as openfileobject:
+        for seq in openfileobject:
+            # drop the line terminator so it never ends up inside a kmer
+            seq = seq.rstrip('\r\n')
+
+            # check that the sequence is long enough for the requested kmers
+            if len(seq) < (args.kmer_length*2 - 1):
+                raise Exception("[ERROR] kmer length too long, in other words, sequence is too short.")
+
+            # generate all kmers of length args.kmer_length that start between
+            # start_iter and the crosslink nucleotide and count them
+            for i in range(start_iter, cl_nucleotide+1):
+                kmer = seq[i:i+args.kmer_length]
+                # if kmer already exists, then increment for file f
+                if kmer in kmer_dict:
+                    kmer_dict[kmer][f] += 1
+                # if kmer does not exist, then create a new vector of size len(files)
+                else:
+                    init = numpy.zeros(len(files))
+                    init[f] = 1
+                    kmer_dict[kmer] = init
+
+sum_vector = numpy.zeros(len(files))
+
+kmer_values_vectors = kmer_dict.values()
+
+# get the total number of kmers for each file (sample)
+for values_vector in kmer_values_vectors:
+    sum_vector += values_vector
+
+# calculate the relative abundance of each kmer for each file (sample)
+for kmer in kmer_dict:
+    kmer_dict[kmer] = kmer_dict[kmer]/sum_vector
+
+print("[NOTE] finish")
+
+##############
+##   PLOT   ##
+##############
+
+print("[NOTE] Make plots")
+
+plotpath = os.path.dirname(os.path.abspath(__file__)) + '/'  # directory that contains this script
+
+df = pandas.DataFrame(kmer_dict).T  # transpose: one row per kmer, one column per file
+
+# sort the dictionary (now dataframe) according to the first two files (here the replicates of the experiment)
+df_sorted = df.sort_values([0, 1], ascending=[False, False])
+df_sorted.columns = files
+df_sorted.to_csv(plotpath + 'reproducible_motifs.csv', sep='\t')  # tab-separated despite the .csv suffix
+
+# change column names back to integers for convenience
+df_sorted.columns = [x for x in range(len(files))]
+
+# Colour the first n rows of df_sorted red and all remaining motifs blue;
+# n is capped so the colour list never gets longer than the data
+n = min(10, len(df_sorted))
+top_n_motifs = ["red"] * n
+rest_of_points = ["blue"] * (len(df_sorted) - n)
+colors_for_scatterplot = top_n_motifs + rest_of_points
+
+# create one scatter plot of the relative kmer abundances for each pair of files
+p = 1
+for i in range(0,len(files)):
+    for j in range(i+1, len(files)):
+
+        # p numbers the plots consecutively so that every pair gets its own file
+        if args.outfile:
+            outfile_name_plot = args.outfile + '_' + str(p) + '.pdf'
+        else:
+            outfile_name_plot = plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '_' + str(p) + '.pdf'
+
+        p  += 1
+        pp = PdfPages(outfile_name_plot)
+
+        # do linear regression for the two files
+        slope, intercept, r_value, p_value, std_err = stats.linregress(df_sorted[i], df_sorted[j])
+
+        # file i is plotted on the x-axis and file j on the y-axis
+        plt.scatter(df_sorted[i], df_sorted[j], c=colors_for_scatterplot, s=2)
+        plt.xlabel(files[i])
+        plt.ylabel(files[j])
+
+        # linregress returns the correlation coefficient r; square it to
+        # report the R^2 value that the title announces
+        plt.title("R" + r'$^2 =$' + " " + str(r_value**2), fontsize=15)
+
+        pp.savefig()
+        pp.close()
+        plt.close()
+
+print("[FINISH]")
+
diff --git a/fetch_DNA_sequence.py b/fetch_DNA_sequence.py
@@ -12,7 +12,7 @@
 sequences in as fasta.
 By default output is written to source file location.
 Example usage:
-fetch_DNA_sequence.py interval-file --out output.file
+fetch_DNA_sequence.py interval-file -o output.file
 """
 
 # parse command line arguments
@@ -55,7 +55,7 @@
 cl_regions = pandas.read_table(file, sep='\t', names=['chrom', 'start', 'stop'])
 
 # link to the reference genome in fasta format
-fastafile = pysam.Fastafile("test-data/hg18.fa")
+fastafile = pysam.Fastafile("test-data/hg19.fa")
 
 print("[NOTE] finish")
 
@@ -75,8 +75,8 @@
 
 # get sequence for coordinates
 for i in range(0,len(cl_regions)):
-   sequence_file.write(fastafile.fetch(cl_regions['chrom'][i], cl_regions['start'][i], cl_regions['stop'][i]) + '\n')
+    sequence_file.write(fastafile.fetch(cl_regions['chrom'][i], cl_regions['start'][i], cl_regions['stop'][i]) + '\n')
 
 sequence_file.close()
 
-print("[FINISH]")
+print("[FINISH]")