Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLK1 exon 4 variation #7

Merged
merged 48 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
455d86c
compute variation for high CLK1 expressing samples/histologies
naqvia Nov 19, 2024
b960b14
variation and range plots
naqvia Nov 21, 2024
25459c5
transcript tpms for ctrls vs hgg
naqvia Nov 22, 2024
0f5dc42
variation with ctrls added
naqvia Nov 22, 2024
835d7c8
psi for other kinases
naqvia Dec 3, 2024
6136412
kinase variations
naqvia Dec 4, 2024
d971212
added ylim
naqvia Dec 4, 2024
362bad7
labeled ped regions
naqvia Dec 4, 2024
94dd0f5
color-blind friendly colors
naqvia Dec 4, 2024
c25a957
added cortex
naqvia Dec 4, 2024
b0492cf
with plot_groups and color (tpm plot)
naqvia Dec 4, 2024
e566c38
added plot
naqvia Dec 6, 2024
0029333
plot
naqvia Dec 6, 2024
89e934d
plot size
naqvia Dec 6, 2024
f18224d
sorted by mean
naqvia Dec 6, 2024
f0597ff
rem. polyA filter (can decide later)
naqvia Dec 6, 2024
cb0899b
removed the tpm filter in this version bc we mixed polyA and stranded
naqvia Dec 9, 2024
1ba3598
age-specific plot
naqvia Dec 10, 2024
b0125d7
add stats
naqvia Dec 10, 2024
3f0b296
adjusted labels and stats
naqvia Dec 10, 2024
7e75449
with other age-bins
naqvia Dec 10, 2024
168abd7
plot and added astrocytes and age bins
naqvia Dec 12, 2024
4ba9b93
modified plots
naqvia Dec 17, 2024
18d17b7
diff age_bins for plots
naqvia Dec 17, 2024
9ae5489
mofied age bins
naqvia Dec 19, 2024
9761344
also make a plot of total TPM for Ex4
naqvia Dec 23, 2024
6689526
stylistic changes
naqvia Dec 23, 2024
a964a0c
plot aethstics
naqvia Dec 23, 2024
802f3ec
fixed y lim
naqvia Dec 23, 2024
64f7b31
spinal comparisons
naqvia Dec 27, 2024
bd7011b
re-arrange astrocyte ctrl group
naqvia Dec 30, 2024
c8bcd5a
updated plot texts and ordering addressing reviewer comments
naqvia Jan 8, 2025
d69fff5
cleaned code and paths
naqvia Jan 10, 2025
0a00d35
fixed paths
naqvia Jan 13, 2025
d6df859
cleaned code and remove unn scripts
naqvia Jan 13, 2025
058a501
added new scripts to run script
naqvia Jan 13, 2025
7bffcff
fixed rmats file path and deleted extraneous code
naqvia Jan 14, 2025
471a2bc
fixed paths for 11 and 12
naqvia Jan 14, 2025
04f0f23
added headers and to run module
naqvia Jan 14, 2025
08ab111
plots
naqvia Jan 14, 2025
edf9fe2
fixed evo-devo ordering
naqvia Jan 21, 2025
5fe160c
update normal labels, typo
jharenza Jan 27, 2025
d92e803
Merge branch 'main' into naqvia/compute-variations-ex4
naqvia Jan 27, 2025
09e2c36
Pons CNH inpu file
naqvia Jan 27, 2025
473d4b7
added CNH Pons
naqvia Jan 27, 2025
ef232ee
increased text size and width for plot
naqvia Feb 3, 2025
a6e0b42
added path to file
naqvia Feb 11, 2025
d893c67
loaded coin library for tests
naqvia Feb 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions analyses/CLK1-splicing_correlations/08-plot-Ex4-PSI-all-tumors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
################################################################################
# 08-plot-Ex4-PSI-all-tumors.R
# script that plots CLK1 exon 4 PSI variations across tumors and ctrls
# written by Ammar Naqvi
#
# usage: Rscript 08-plot-Ex4-PSI-all-tumors.R
################################################################################

suppressPackageStartupMessages({
library("reshape2")
library("tidyverse")
library("ggpubr")
library("ggplot2")
library("vroom")
library("data.table")
})

# Get `magrittr` pipe
`%>%` <- dplyr::`%>%`

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")
analysis_dir <- file.path(root_dir, "analyses", "CLK1-splicing_correlations")
input_dir <- file.path(analysis_dir, "input")
results_dir <- file.path(analysis_dir, "results")
plots_dir <- file.path(analysis_dir, "plots")
hist_dir <- file.path(root_dir, "analyses", "cohort_summary", "results")

## create plots dir if it doesn't exist
if(!dir.exists(plots_dir)){
dir.create(plots_dir, recursive=TRUE)
}

if(!dir.exists(results_dir)){
dir.create(results_dir, recursive=TRUE)
}

## theme for all plots
figures_dir <- file.path(root_dir, "figures")
source(file.path(figures_dir, "theme_for_plots.R"))

## input files
rmats_file <- file.path(results_dir,"clk1-splice-events-rmats.tsv")
clin_file <- file.path(hist_dir,"histologies-plot-group.tsv")
expr_file <- file.path(data_dir,"gene-expression-rsem-tpm-collapsed.rds")
gtex_rmats <- file.path(data_dir,"gtex-brain-under40-harmonized-splice-events-rmats.SE.tsv.gz")
pedr_rmats <- file.path(data_dir,"GSE243682-normal-splice-events-rmats.tsv.gz")

## output files for final plots
hgg_plot_file <- file.path(plots_dir,"all_hgg_CLK1_exon4_inclusion_fraction_hgg_stacked.pdf")
dmg_plot_file <- file.path(plots_dir,"dmg_CLK1_exon4_inclusion_fraction_hgg_stacked.pdf")
other_hgg_plot_file <- file.path(plots_dir,"other_hgg_CLK1_exon4_inclusion_fraction_hgg_stacked.pdf")
indep_file <- file.path(data_dir, "independent-specimens.rnaseqpanel.primary.tsv")

indep_df <- vroom(indep_file)

# Function to add TPM values to a dataframe from a matrix
add_TPM_values <- function(df, matrix) {
# Ensure gene_symbol and sample_id columns exist in df
if (!("gene_symbol" %in% names(df)) | !("Kids_First_Biospecimen_ID" %in% names(df))) {
stop("DataFrame must contain 'gene_symbol' and 'sample_id' as columns")
}

# Ensure rownames and colnames are properly set in the matrix
if (is.null(rownames(matrix)) | is.null(colnames(matrix))) {
stop("Matrix must have rownames and colnames set for gene_symbol and Sample_id, respectively")
}


# Extract the matching TPM values
df$gene_tpm <- mapply(function(gene, sample) {
if (gene %in% rownames(matrix) & sample %in% colnames(matrix)) {
matrix[gene, sample]
} else {
NA
}
}, df$gene_symbol, df$Kids_First_Biospecimen_ID)

return(df)
}

## load histologies info for HGG subty
histologies_df <- read_tsv(clin_file) %>%
filter(cohort == "PBTA",
experimental_strategy == "RNA-Seq",
RNA_library=='stranded',
Kids_First_Biospecimen_ID %in% indep_df$Kids_First_Biospecimen_ID)

## load rmats input for CLK1
clk1_rmats <- fread(rmats_file) %>%
# filter for CLK1 and exon 4, HGGs
dplyr::filter(
geneSymbol=="CLK1",
exonStart_0base=="200860124",
exonEnd=="200860215",
) %>%
dplyr::rename(Kids_First_Biospecimen_ID=sample_id) %>%
dplyr::select(Kids_First_Biospecimen_ID,IncLevel1) %>%
# Join rmats data with clinical data
inner_join(histologies_df, by='Kids_First_Biospecimen_ID') %>%
mutate(gene_symbol="CLK1")

exp <- readRDS(expr_file) %>%
dplyr::select(any_of(histologies_df$Kids_First_Biospecimen_ID))

var_exp_filt <- add_TPM_values(clk1_rmats, exp)

# filter for very high expression
high_expression <- quantile(var_exp_filt$gene_tpm, 0.90)
ex4_psi_filtered <- var_exp_filt %>%
filter(gene_tpm > high_expression) %>%
# Compute variance and add as a new column
group_by(plot_group) %>%
mutate(PSI_variance = sd(IncLevel1, na.rm = TRUE)) %>%
filter(!is.na(PSI_variance))

# Plot with pairwise comparison results and mean labels
boxplot_tpm<- ggplot(var_exp_filt, aes(x = plot_group, y = IncLevel1)) +
geom_boxplot(outlier.shape = NA, fill = "grey", color = "#2c3e50") +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
geom_jitter(width = 0.2, size = 2, shape = 21, color = "black", fill="grey") + # Add actual data points
labs(title = "CLK Exon 4 PSI",
x = "Histology", y = "PSI") +
theme_Publication() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 75, hjust = 1)) +
scale_x_discrete(labels = function(x) sapply(x, function(l) str_wrap(l, width = 30))) # Wrap x-axis labels


var_plot<- ggplot(data=ex4_psi_filtered,
aes(reorder(plot_group, PSI_variance),PSI_variance,
group=1), color="black") +
geom_point(aes(fill = 'black'), size = 3, pch = 21, color="black") + # Map color inside aes()
xlab("Histology") +
ylab("Standard Deviation") +
ggtitle("CLK1 Exon 4 variation") +
theme_Publication() +
theme(axis.text.x=element_text(angle = 75, hjust = 1, size = 11),legend.position = "none")

ex4_psi_range <- ex4_psi_filtered %>%
group_by(plot_group) %>%
mutate(PSI_range = max(IncLevel1, na.rm = TRUE) - min(IncLevel1, na.rm = TRUE)) %>%
ungroup() %>%
dplyr::select(plot_group,PSI_range) %>%
unique()

psi_range_plot<- ggplot(data=ex4_psi_range,
aes(reorder(plot_group, PSI_range),PSI_range,
group=1), color="black") +
geom_point(aes(fill = 'black'), size = 3, pch = 21, color="black") + # Map color inside aes()
xlab("Histology") +
ylab("PSI Range") +
ggtitle("CLK1 Exon 4 PSI Range") +
theme_Publication() +
theme(axis.text.x=element_text(angle = 75, hjust = 1, size = 11),legend.position = "none")


# Save plot as PDF
pdf(file.path(plots_dir, "CLK1-Ex4-sdev-across.pdf"),
width = 4, height = 6)
print(var_plot)
dev.off()

pdf(file.path(plots_dir, "CLK1-Ex4-range-across.pdf"),
width = 4, height = 6)
print(psi_range_plot)
dev.off()

## add ctrls to plots
gtex_psi_df <- vroom(gtex_rmats) %>% # Select CLK1 gene
dplyr::filter(geneSymbol=="CLK1") %>%
# Select exon 4
dplyr::filter(exonStart_0base=="200860124", exonEnd=="200860215") %>%
# Select "sample", "geneSymbol", and "IncLevel1" columns
dplyr::select(sample_id, geneSymbol, IncLevel1) %>%
dplyr::rename(gene_symbol=geneSymbol) %>%
dplyr::mutate(plot_group="gtex-brain")

ped_psi_df <- vroom(pedr_rmats) %>% # Select CLK1 gene
dplyr::filter(geneSymbol=="CLK1") %>%
# Select exon 4
dplyr::filter(exonStart_0base=="200860124", exonEnd=="200860215") %>%
# Select "sample", "geneSymbol", and "IncLevel1" columns
dplyr::select(sample_id, geneSymbol, IncLevel1) %>%
dplyr::rename(gene_symbol=geneSymbol) %>%
dplyr::mutate(plot_group="pediatric_ctrls")

ctrl_psi_df<- rbind(ped_psi_df,gtex_psi_df)

ctrl_psi_range <- ctrl_psi_df %>% group_by(plot_group) %>%
mutate(PSI_range = max(IncLevel1, na.rm = TRUE) - min(IncLevel1, na.rm = TRUE)) %>%
ungroup() %>%
dplyr::select(plot_group,PSI_range) %>%
unique()

combo_psi_range <- rbind(ctrl_psi_range,ex4_psi_range)

psi_range_plot<- ggplot(data=combo_psi_range,
aes(reorder(plot_group, PSI_range),PSI_range,
group=1), color="black") +
geom_point(aes(fill = 'black'), size = 3, pch = 21, color="black") + # Map color inside aes()
xlab("Histology") +
ylab("PSI Range") +
ggtitle("CLK1 Exon 4 PSI Range") +
theme_Publication() +
theme(axis.text.x=element_text(angle = 75, hjust = 1, size = 11),legend.position = "none")

pdf(file.path(plots_dir, "CLK1-Ex4-range-across-ctrls.pdf"),
width = 4, height = 6)
print(psi_range_plot)
dev.off()
Loading