Skip to content

Commit 9b601dd

Browse files
committed
Made small changes to existing files and added a new version of crosslink_quality_check.py
1 parent 0c371d1 commit 9b601dd

5 files changed

+231
-12
lines changed

chromosome_quality_check.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
cDNA (x-axis).
2121
By default output is written to source file location.
2222
Example usage:
23-
chromosome_quality_check.py file1.bed file2.bed --out output.pdf
23+
chromosome_quality_check.py file1.bed file2.bed -o output.pdf
2424
"""
2525

2626
# parse command line arguments
@@ -95,7 +95,7 @@ def count_values_for_set(s):
9595

9696
common_sets = set(main_groupcount[0].keys())
9797

98-
# reads that are the same between the sets
98+
# positions that are the same between the sets
9999
for i in range(0,len(main_groupcount)):
100100
sample = main_groupcount[i]
101101
common_sets = common_sets & set(sample.keys())
@@ -104,7 +104,7 @@ def count_values_for_set(s):
104104
for x in counted_set.keys():
105105
all[min(x - 1, 4)] += counted_set[x]
106106

107-
# add to the intersection-counts for each sample the number of reads
107+
# add to the intersection-counts for each sample the number of positions
108108
# that are x-times duplicated
109109
for region in common_sets:
110110
for sample in main_groupcount:
@@ -137,4 +137,4 @@ def count_values_for_set(s):
137137
pp.savefig()
138138
pp.close()
139139

140-
print("[FINISH]")
140+
print("[FINISH]")

cross_convolution_analysis.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
be observable.
2121
By default output is written to source file location.
2222
Example usage:
23-
cross_convolution_analysis.py reads.bam genome_table.tsv shift --out output.pdf
23+
cross_convolution_analysis.py reads.bam genome_table.tsv shift -o output.pdf
2424
"""
2525

2626
# parse command line arguments
@@ -160,4 +160,4 @@
160160
pp.savefig()
161161
pp.close()
162162

163-
print('[FINISH]')
163+
print('[FINISH]')

crosslink_quality_check.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
1-
import argparse
21
from matplotlib.backends.backend_pdf import PdfPages
2+
from scipy import stats
33

44
import os
55
import matplotlib.pyplot as plt
66
import numpy
77
import pandas
88
import logging
9+
import argparse
910

1011
####################
1112
## ARGS INPUT ##
1213
####################
1314

1415
tool_description = """
16+
This tool checks the reproducibility of the crosslinking sites between samples in fasta format.
17+
Ideally the trend should follow a diagonal line with the highest reproducible motif in the right
18+
upper most corner.
19+
By default output is written to source file location.
20+
Example usage:
21+
crosslink_quality_check.py file1.fasta file2.fasta kmer_length -o output.pdf
1522
"""
1623

1724
# parse command line arguments
@@ -140,14 +147,28 @@
140147
for i in range(0,len(files)):
141148
for j in range(i+1, len(files)):
142149

143-
pp = PdfPages(plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '.pdf')
150+
outfile_name = ""
151+
if args.outfile:
152+
outfile_name = args.outfile
153+
else:
154+
outfile_name = plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '.pdf'
155+
156+
pp = PdfPages(outfile_name)
144157

145158
df = pandas.DataFrame(kmer_dict).T
146159

160+
# do linear regression for the two files
161+
slope, intercept, r_value, p_value, std_err = stats.linregress(df[i], df[j])
162+
147163
plt.plot(df[i], df[j], ls='', marker='.', ms=10.0)
148164
plt.ylabel(files[i])
149165
plt.xlabel(files[j])
150166

167+
max_x = max(df[i])
168+
max_y = max(df[j])
169+
170+
plt.text(max_x/3, max_y/2 + max_y*.2, "R" + r'$^2 =$' + " " + str(r_value), fontsize=15)
171+
151172
pp.savefig()
152173
pp.close()
153174

crosslink_quality_check_colored.py

+198
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
from matplotlib.backends.backend_pdf import PdfPages
2+
from scipy import stats
3+
4+
import os
5+
import matplotlib.pyplot as plt
6+
import numpy
7+
import pandas
8+
import logging
9+
import argparse
10+
11+
####################
12+
## ARGS INPUT ##
13+
####################
14+
15+
tool_description = """
16+
This tool checks the reproducibility of the crosslinking sites between samples in fasta format.
17+
Ideally the trend should follow a diagonal line with the highest reproducible motif in the right
18+
upper most corner.
19+
By default output is written to source file location.
20+
Example usage:
21+
crosslink_quality_check.py exp_rep_1.fasta exp_rep_2.fasta controls.fasta kmer_length -o output.pdf
22+
"""
23+
24+
# parse command line arguments
25+
parser = argparse.ArgumentParser(description=tool_description,
26+
formatter_class=argparse.RawDescriptionHelpFormatter)
27+
# positional arguments
28+
parser.add_argument(
29+
"exp_rep_1",
30+
help="Path to first experiment replicate fasta file.")
31+
parser.add_argument(
32+
"exp_rep_2",
33+
help="Path to second experiment replicate fasta file.")
34+
parser.add_argument(
35+
'controls',
36+
nargs='+',
37+
help="Path to the control fasta files. Specify at least one more files.")
38+
parser.add_argument(
39+
'kmer_length',
40+
type=int,
41+
help="Length of the kmers. Keep in mind that the sequences should be long enough.")
42+
# optional arguments
43+
parser.add_argument(
44+
"-o", "--outfile",
45+
help="Write results to this file.")
46+
parser.add_argument(
47+
"-d", "--debug",
48+
help="Print lots of debugging information",
49+
action="store_true")
50+
51+
args = parser.parse_args()
52+
if args.debug:
53+
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
54+
else:
55+
logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
56+
logging.info("Parsed arguments:")
57+
logging.info(" exp_rep_1: '{}'".format(args.exp_rep_1))
58+
logging.info(" exp_rep_2: '{}'".format(args.exp_rep_2))
59+
logging.info(" controls: '{}'".format(args.controls))
60+
logging.info(" kmer_length: '{}'".format(args.kmer_length))
61+
if args.outfile:
62+
logging.info(" outfile: enabled writing to file")
63+
logging.info(" outfile: '{}'".format(args.outfile))
64+
logging.info("")
65+
66+
if args.kmer_length <= 1:
67+
raise Exception("[ERROR] kmer length too short. Your kmer length makes no sense.")
68+
69+
###################
70+
## READ DATA ##
71+
###################
72+
73+
print("[START]")
74+
print("[NOTE] Read data")
75+
76+
# your read data in bed6 format
77+
files = [args.exp_rep_1, args.exp_rep_2]
78+
files.extend(args.controls)
79+
80+
# read first line of first file to get length of the sequences
81+
tmp = open(files[0])
82+
firstline = tmp.readline()
83+
84+
# we assume that the middle of the sequence is the crosslink nucleotide
85+
cl_nucleotide = int(len(firstline)/2)
86+
87+
# get starting and end point for the kmers
88+
start_iter = cl_nucleotide - args.kmer_length + 1
89+
end_iter = cl_nucleotide + args.kmer_length + 1
90+
91+
tmp.close()
92+
93+
print("[NOTE] finish")
94+
95+
######################
96+
## PROCESS DATA ##
97+
######################
98+
99+
print("[NOTE] Process data")
100+
101+
# create a dictornary for the kmers
102+
kmer_dict = dict()
103+
104+
f = 0
105+
for file in files:
106+
with open(file) as openfileobject:
107+
for seq in openfileobject:
108+
# check length of kmer
109+
if len(seq) < (args.kmer_length*2 - 1):
110+
raise Exception("[ERROR] kmer length too long, in other words, sequence is too short.")
111+
112+
# go over your sequence and generate kmers of length args.kmer_length
113+
# put kmers into dictonary
114+
for i in range(start_iter, cl_nucleotide+1):
115+
kmer = seq[i:i+args.kmer_length]
116+
# if kmer already exists, then increment for file f
117+
if kmer in kmer_dict:
118+
kmer_dict[kmer][f] += 1
119+
# if kmer does not exist, then create a new vector of size len(files)
120+
else:
121+
init = numpy.zeros(len(files))
122+
init[f] = 1
123+
kmer_dict[kmer] = init
124+
125+
openfileobject.close()
126+
f +=1
127+
128+
sum_vector = numpy.zeros(len(files))
129+
130+
kmer_values_vectors = kmer_dict.values()
131+
132+
# get the total number of kmers for each files (sample)
133+
for values_vector in kmer_values_vectors:
134+
sum_vector += values_vector
135+
136+
# calculate the realtive abundance of each kmer for each file (sample)
137+
for kmer in kmer_dict:
138+
kmer_dict[kmer] = kmer_dict[kmer]/sum_vector
139+
140+
print("[NOTE] finish")
141+
142+
##############
143+
## PLOT ##
144+
##############
145+
146+
print("[NOTE] Make plots")
147+
148+
plotpath = os.path.dirname(os.path.abspath(__file__)) + '/'
149+
150+
df = pandas.DataFrame(kmer_dict).T
151+
152+
# sort the dictionary (now dataframe) accoridng two the first two files (here replicates of the experiemtn)
153+
df_sorted = df.sort_values([0, 1], ascending=[False, False])
154+
df_sorted.columns = files
155+
df_sorted.to_csv(plotpath + 'reproducible_motifs.csv', sep='\t')
156+
157+
# change colun names back to integer for convience
158+
df_sorted.columns = [x for x in range(len(files))]
159+
160+
# Find the n most reproducible motifs in the two replicates of your experiment
161+
n = 10
162+
top_n_motifs = ["red" for x in range(n)]
163+
rest_of_points = ["blue" for x in range(len(df_sorted[0]) - n)]
164+
colors_for_scatterplot = top_n_motifs + rest_of_points
165+
166+
# create a plot and list of motifs with their sorted relative abundance for each pair of files
167+
p = 1
168+
for i in range(0,len(files)):
169+
for j in range(i+1, len(files)):
170+
171+
outfile_name_plot = ""
172+
outfile_name_motif_table = ""
173+
if args.outfile:
174+
outfile_name_plot = args.outfile + '_' + str(p) + '.pdf'
175+
else:
176+
outfile_name_plot = plotpath + 'Crosslink_Kmer_Quality_Check_' + str(i) + '_' + str(j) + '_' + str(p) + '.pdf'
177+
178+
p += 1
179+
pp = PdfPages(outfile_name_plot)
180+
181+
# do linear regression for the two files
182+
slope, intercept, r_value, p_value, std_err = stats.linregress(df_sorted[i], df_sorted[j])
183+
184+
plt.scatter(df_sorted[i], df_sorted[j], c=colors_for_scatterplot, s=2)
185+
plt.ylabel(files[i])
186+
plt.xlabel(files[j])
187+
188+
max_x = max(df_sorted[i])
189+
max_y = max(df_sorted[j])
190+
191+
plt.title("R" + r'$^2 =$' + " " + str(r_value), fontsize=15)
192+
193+
pp.savefig()
194+
pp.close()
195+
plt.close()
196+
197+
print("[FINISH]")
198+

fetch_DNA_sequence.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
sequences in as fasta.
1313
By default output is written to source file location.
1414
Example usage:
15-
fetch_DNA_sequence.py interval-file --out output.file
15+
fetch_DNA_sequence.py interval-file -o output.file
1616
"""
1717

1818
# parse command line arguments
@@ -55,7 +55,7 @@
5555
cl_regions = pandas.read_table(file, sep='\t', names=['chrom', 'start', 'stop'])
5656

5757
# link to the reference genome in fasta format
58-
fastafile = pysam.Fastafile("test-data/hg18.fa")
58+
fastafile = pysam.Fastafile("test-data/hg19.fa")
5959

6060
print("[NOTE] finish")
6161

@@ -75,8 +75,8 @@
7575

7676
# get sequence for coordinates
7777
for i in range(0,len(cl_regions)):
78-
sequence_file.write(fastafile.fetch(cl_regions['chrom'][i], cl_regions['start'][i], cl_regions['stop'][i]) + '\n')
78+
sequence_file.write(fastafile.fetch(cl_regions['chrom'][i], cl_regions['start'][i], cl_regions['stop'][i]) + '\n')
7979

8080
sequence_file.close()
8181

82-
print("[FINISH]")
82+
print("[FINISH]")

0 commit comments

Comments
 (0)