-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #117 from vastgroup/diffUpdate
diff update
- Loading branch information
Showing
4 changed files
with
304 additions
and
215 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,23 @@ | ||
#!/usr/bin/env Rscript | ||
# | ||
# Author: Tim Sterne-Weiler, 2014 | ||
# [email protected] | ||
# Modifications by Ulrich Braunschweig 2018 | ||
# Author: Tim Sterne-Weiler, Ulrich Braunschweig 2014-2024 | ||
# [email protected] | ||
|
||
# This function takes a qual and returns c(post_alpha, post_beta) | ||
# Increments by prior alpha and prior distribution beta, uniform by default | ||
## Check that columns of INCLUSION... table are what we think they are | ||
checkHeader <- function(x, replicateA, replicateB) { | ||
reps <- c(replicateA, replicateB) | ||
sampInd <- unlist(sapply(reps, FUN=function(y) {which(x == y)})) | ||
if (length(sampInd) != length(reps)) { | ||
stop("[vast diff error]: Not all replicate names found in input header!\n") | ||
} | ||
namesOK <- all(paste0(reps, "-Q") == x[sampInd + 1]) | ||
if (!namesOK | !all(x[1:3] == c("GENE", "EVENT", "COORD"))) { | ||
stop("[vast diff error]: Input table does not have expected format!\n") | ||
} | ||
} | ||
|
||
## This function takes a qual and returns c(post_alpha, post_beta) | ||
## Increments by prior alpha and prior distribution beta, uniform by default | ||
parseQual <- function(qual, prior_alpha=1, prior_beta=1) { | ||
res1 <- as.numeric(sub("[^@]*@([^,]*),.*", "\\1", qual)) | ||
res2 <- as.numeric(sub("[^@]*@[^,]*,(.*)", "\\1", qual)) | ||
|
@@ -19,44 +31,43 @@ parseQual <- function(qual, prior_alpha=1, prior_beta=1) { | |
cbind(res1, res2) | ||
} | ||
|
||
# calculate the probability that the first dist is > than second | ||
# P(psi1 > psi2) when alpha=0; more generally we are determining the probability | ||
# that Psi1 is greater than Psi2 by alpha.. eg. P((psi1 - psi2) > alpha) = | ||
## Calculate the probability that the first dist is > than second | ||
## P(psi1 > psi2) when alpha=0; more generally we are determining the probability | ||
## that psi1 is greater than psi2 by alpha.. eg. P((psi1 - psi2) > alpha) = | ||
## IMPORTANT: run this function as sample(firstDist, length(firstDist)) AND | ||
## sample(secondDist, length(secondDist)) | ||
## sample(secondDist, length(secondDist)) | ||
## UNLESS you have paired data, then don't sample. | ||
pDiff <- function(firstDist, secondDist, alpha=0.15) { | ||
N <- length(firstDist) | ||
pass <- length( which( (firstDist - secondDist) > alpha ) ) | ||
N <- length(firstDist) | ||
pass <- length(which((firstDist - secondDist) > alpha)) | ||
pass / N | ||
} | ||
|
||
# This function finds the maximum difference between psi1 and psi2 given | ||
# at least some acceptable probability for difference. Defaults to 0.8 | ||
# distributions where no diff value exists with a probability > 0.8 are given | ||
# maxDiff of 0. | ||
maxDiff <- function(firstDist, secondDist, acceptProb=0.9) { | ||
alphaSet <- seq(0,1,0.01) #make this global? | ||
probs <- unlist(lapply(alphaSet, function(x) { pDiff(firstDist, secondDist, x) })) | ||
## This function finds the maximum difference between psi1 and psi2 given | ||
## at least some acceptable probability for difference. Defaults to 0.8 | ||
## distributions where no diff value exists with a probability > 0.8 are given | ||
## maxDiff of 0. | ||
maxDiff <- function(firstDist, secondDist, acceptProb=0.9, alphaSet) { | ||
probs <- unlist(lapply(alphaSet, pDiff, | ||
firstDist=firstDist, secondDist=secondDist)) | ||
ind <- max(c(which(probs > acceptProb), 1)) | ||
alphaSet[ind] | ||
} | ||
|
||
# | ||
# return the beta variance | ||
## Return the beta variance | ||
betaVar <- function(alpha, beta) { | ||
var <- alpha*beta / ( | ||
var <- alpha * beta / ( | ||
((alpha + beta) ** 2) * (alpha + beta + 1) | ||
) | ||
var | ||
} | ||
|
||
# | ||
## Return a confidence interval | ||
betaCI <- function(betaDist, percentile = c(0.05, 0.95)) { | ||
quantile(betaDist, p=percentile, na.rm = T) | ||
} | ||
|
||
# Extension of betaCI function that includes the sampling step | ||
## Extension of betaCI function that includes the sampling step | ||
betaCISample <- function(alpha, beta, n = 5000) { | ||
if (is.na(alpha) || is.na(beta)) { | ||
sample <- NA | ||
|
@@ -66,43 +77,44 @@ betaCISample <- function(alpha, beta, n = 5000) { | |
return(betaCI(sample)) | ||
} | ||
|
||
|
||
### MAKE VISUAL OUTPUT | ||
plotDiff <- function(inpOne, inpTwo, expOne, expTwo, maxD, medOne, medTwo, sampOneName, sampTwoName, rever) { | ||
plotDiff <- function(inpOne, inpTwo, expOne, expTwo, maxD, medOne, medTwo, | ||
sampOneName, sampTwoName, alphaSet, rever) { | ||
### Make visual output | ||
|
||
if(rever) { #write this better. ;-) | ||
curCol <- cbb[3:2] | ||
} else { | ||
curCol <- cbb[2:3] | ||
} | ||
|
||
if(length(expOne) == 0 || length(expTwo) == 0) { return(NULL) } | ||
if (length(expOne) == 0 || length(expTwo) == 0) {return(NULL)} | ||
one <- data.frame(x=expOne, y=-0.5) | ||
two <- data.frame(x=expTwo, y=-0.5) | ||
|
||
distPlot <- ggplot(melt(as.data.frame( | ||
do.call(cbind,list(inpOne, inpTwo)) | ||
), measure.vars=c("V1","V2")), aes(fill=variable, x=value))+ | ||
geom_histogram(aes(y=..density..), binwidth=0.03333,alpha=0.5, col="grey", position="identity")+ | ||
theme_bw()+xlim(c(0,1))+xlab(expression(hat(Psi)))+ | ||
scale_fill_manual(values=curCol, labels=c(sampOneName, sampTwoName), name="Samples")+ | ||
geom_point(data=one, mapping=aes(x=x, y=y), col=cbb[2], fill=cbb[2], alpha=0.85, inherit.aes = FALSE)+ | ||
geom_point(data=two, mapping=aes(x=x, y=y), col=cbb[3], fill=cbb[3], alpha=0.85, inherit.aes = FALSE) | ||
distPlot <- ggplot(melt(as.data.frame(do.call(cbind, list(inpOne, inpTwo))), | ||
measure.vars=c("V1","V2")), | ||
aes(fill=variable, x=value)) + | ||
geom_histogram(aes(y=after_stat(density)), binwidth=0.03333, alpha=0.5, col="grey", position="identity") + | ||
theme_bw() + scale_x_continuous(limits = c(0, 1), oob = scales::oob_keep) + xlab(expression(hat(Psi))) + | ||
scale_fill_manual(values=curCol, labels=c(sampOneName, sampTwoName), name="Samples") + | ||
geom_point(data=one, mapping=aes(x=x, y=y), col=cbb[2], fill=cbb[2], alpha=0.50, inherit.aes = FALSE) + | ||
geom_point(data=two, mapping=aes(x=x, y=y), col=cbb[3], fill=cbb[3], alpha=0.50, inherit.aes = FALSE) | ||
|
||
probPlot <- ggplot(as.data.frame(cbind(seq(0,1,0.01), | ||
unlist(lapply(alphaList, function(x) { | ||
unlist(lapply(alphaSet, function(x) { | ||
pDiff(inpOne, inpTwo, x) | ||
})))), aes(x=V1, y=V2))+ | ||
geom_line()+theme_bw()+ | ||
geom_vline(xintercept=maxD, lty="dashed", col=cbb[7])+ | ||
ylab(expression(P((hat(Psi)[1]-hat(Psi)[2]) > x)))+ | ||
xlab(expression(x))+ylim(c(0,1))+ | ||
annotate("text", x=(maxD+0.08), y=0.05, label=maxD, col=cbb[7]) | ||
|
||
return(list(distPlot, probPlot)) | ||
})))), aes(x=V1, y=V2)) + | ||
geom_line() + theme_bw() + | ||
geom_vline(xintercept=maxD, lty="dashed", col=cbb[7]) + | ||
ylab(expression(P((hat(Psi)[1] - hat(Psi)[2]) > x))) + | ||
xlab(expression(x)) + scale_y_continuous(limits = c(0, 1), oob = scales::oob_keep) + | ||
annotate("text", x=(maxD + 0.08), y=0.05, label=maxD, col=cbb[7]) | ||
|
||
return(list(distPlot, probPlot))# | ||
} | ||
|
||
plotPrint <- function(plotTitle, plotCoord, plotList) { | ||
### Print saved plotting information to output file | ||
grid.newpage() | ||
pushViewport(viewport(layout = grid.layout(3, 2, widths = unit(c(5, 4), "null"), heights = unit(c(1, 0.5, 5), "null")))) | ||
grid.text(as.character(plotTitle), check.overlap=TRUE, gp=gpar(font=2), draw=T, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2)) | ||
|
@@ -117,106 +129,167 @@ diffBeta <- function(i, lines, opt, | |
totalFirst, totalSecond, | ||
shapeFirstAve, shapeSecondAve, | ||
expFirst, expSecond, | ||
repA.qualInd, repB.qualInd) { | ||
### Main diff functionality; fit beta distributions to sample groups for one event and compare | ||
### Is applied to each line of the current nLines of the INCLUSION... table | ||
|
||
## if no data, next; # adapted from @lpantano's fork 7/22/2015 | ||
if( (sum(totalFirst[i,] > (opt$minReads + opt$alpha + opt$beta)) < opt$minSamples) || | ||
(sum(totalSecond[i,] > (opt$minReads + opt$alpha + opt$beta)) < opt$minSamples) ) { | ||
return(NULL) | ||
} | ||
repA.qualInd, repB.qualInd, | ||
okFirst, okSecond, skip, | ||
alphaSet) { | ||
## Main diff functionality; fit beta distributions to sample groups for one event and compare | ||
## Is applied to each line of the current nLines of the INCLUSION... table | ||
|
||
## Sample Posterior Distributions | ||
psiFirst <- lapply(1:(dim(shapeFirst)[2]), function(x) { | ||
#sample here from rbeta(N, alpha, beta) if > -e | ||
if(totalFirst[i,x] < opt$minReads) { return(NULL) } | ||
rbeta(opt$size, shape1=shapeFirst[i,x,1], shape2=shapeFirst[i,x,2]) | ||
## sample here from rbeta(N, alpha, beta) if > -e | ||
if (!okFirst[i,x]) {return(NULL)} | ||
rbeta(opt$size, shape1=shapeFirst[i,x,1], shape2=shapeFirst[i,x,2]) | ||
}) | ||
|
||
psiSecond <- lapply(1:(dim(shapeSecond)[2]), function(x) { | ||
#sample here from rbeta(N, alpha, beta) if > -e | ||
if(totalSecond[i,x] < opt$minReads) { return(NULL) } | ||
rbeta(opt$size, shape1=shapeSecond[i,x,1], shape2=shapeSecond[i,x,2]) | ||
## sample here from rbeta(N, alpha, beta) if > -e | ||
if (!okSecond[i,x]) {return(NULL)} | ||
rbeta(opt$size, shape1=shapeSecond[i,x,1], shape2=shapeSecond[i,x,2]) | ||
}) | ||
|
||
if(opt$paired) { #make sure both samples have a non-NULL replicate | ||
for(lstInd in 1:length(psiFirst)) { | ||
if(is.null(psiFirst[[lstInd]]) || is.null(psiSecond[[lstInd]])) { | ||
psiFirst[[lstInd]] <- NULL | ||
psiSecond[[lstInd]] <- NULL | ||
} | ||
} | ||
} | ||
|
||
## Create non-parametric Joint Distributions | ||
psiFirstComb <- do.call(c, psiFirst) | ||
psiSecondComb <- do.call(c, psiSecond) | ||
|
||
if( length(psiFirstComb) <= 0 || length(psiSecondComb) <= 0 ) { return(NULL) } | ||
|
||
## if they aren't paired, then shuffle the joint distributions... | ||
if( !opt$paired ) { | ||
paramFirst <- try (suppressWarnings( | ||
fitdistr(psiFirstComb, | ||
"beta", | ||
list( shape1=shapeFirstAve[i,1], shape2=shapeFirstAve[i,2]) | ||
)$estimate ), TRUE ) | ||
paramSecond <- try (suppressWarnings( | ||
fitdistr(psiSecondComb, | ||
"beta", | ||
list( shape1=shapeSecondAve[i,1], shape2=shapeSecondAve[i,2]) | ||
)$estimate ), TRUE ) | ||
## if optimization fails its because the distribution is too narrow | ||
## in which case our starting shapes should already be good enough | ||
if(class(paramFirst) != "try-error") { | ||
psiFirstComb <- rbeta(opt$size, shape1=paramFirst[1], shape2=paramFirst[2]) | ||
} | ||
if(class(paramSecond) != "try-error") { | ||
psiSecondComb <- rbeta(opt$size, shape1=paramSecond[1], shape2=paramSecond[2]) | ||
} | ||
## Try to to fit a beta distribution on the replicates' parameter estimates. | ||
if (!opt$paired) { | ||
if (length(psiFirstComb) > 0) { | ||
paramFirst <- try(suppressWarnings( | ||
fitdistr(psiFirstComb, | ||
"beta", | ||
list(shape1=shapeFirstAve[i,1], shape2=shapeFirstAve[i,2]) | ||
)$estimate ), TRUE ) | ||
} | ||
if (length(psiSecondComb)) { | ||
paramSecond <- try(suppressWarnings( | ||
fitdistr(psiSecondComb, | ||
"beta", | ||
list(shape1=shapeSecondAve[i,1], shape2=shapeSecondAve[i,2]) | ||
)$estimate ), TRUE ) | ||
} | ||
## If optimization fails it's because the distribution is too narrow | ||
## in which case our starting shapes should already be good enough. | ||
## Shuffle since not paired, and downsample to size of smaller number | ||
## but don't downsize if the other sample type is absent anyway. | ||
minSample <- min(c(length(psiFirstComb), length(psiSecondComb))) | ||
|
||
if (length(psiFirstComb) > 0) { | ||
if (!inherits(paramFirst, "try-error")) { | ||
psiFirstComb <- rbeta(opt$size, shape1=paramFirst[1], shape2=paramFirst[2]) | ||
} else { | ||
if (length(psiSecondComb) > 0) { | ||
psiFirstComb <- sample(psiFirstComb, minSample) | ||
} | ||
} | ||
} | ||
if (length(psiSecondComb) > 0) { | ||
if (!inherits(paramSecond, "try-error")) { | ||
psiSecondComb <- rbeta(opt$size, shape1=paramSecond[1], shape2=paramSecond[2]) | ||
} else { | ||
if (length(psiFirstComb) > 0) { | ||
psiSecondComb <- sample(psiSecondComb, minSample) | ||
} | ||
} | ||
} | ||
} | ||
|
||
## get empirical posterior median of psi | ||
medOne <- median(psiFirstComb) | ||
medTwo <- median(psiSecondComb) | ||
|
||
## look for a max difference given prob cutoff... | ||
if(medOne > medTwo) { | ||
max <- maxDiff(psiFirstComb, psiSecondComb, opt$prob) | ||
if (skip[i]) { | ||
if (is.null(medOne)) {medOne <- NA} | ||
if (is.null(medTwo)) {medTwo <- NA} | ||
max <- NA | ||
} else { | ||
max <- maxDiff(psiSecondComb, psiFirstComb, opt$prob) | ||
if (medOne > medTwo) { | ||
max <- maxDiff(psiFirstComb, psiSecondComb, opt$prob, alphaSet) | ||
} else { | ||
max <- maxDiff(psiSecondComb, psiFirstComb, opt$prob, alphaSet) | ||
} | ||
} | ||
|
||
## SIGNIFICANT from here on out: | ||
|
||
filtOut <- sprintf("%s\t%s\t%f\t%f\t%f\t%s", lines[[i]][1], lines[[i]][2], medOne, medTwo, medOne - medTwo, round(max,2)) | ||
filtOut <- sprintf("%s\t%s\t%f\t%f\t%f\t%s", | ||
lines[[i]][1], lines[[i]][2], medOne, medTwo, medOne-medTwo, round(max,2)) | ||
|
||
## check for significant difference | ||
if(max < opt$minDiff) { | ||
## non-sig, return null plots and text output | ||
return(list(NULL, NULL, NULL, NULL, filtOut)) | ||
} else { | ||
sigInd <- i | ||
|
||
if (opt$noPDF) { | ||
eventTitle <- NULL | ||
eventCoord <- NULL | ||
retPlot <- NULL | ||
} else { | ||
if (opt$noPDF || (is.na(max) || max < opt$minDiff)) { | ||
## non-sig, return null plots and text output | ||
return(list(NULL, NULL, NULL, NULL, filtOut)) | ||
} else { | ||
## significant; plot | ||
sigInd <- i | ||
eventTitle <- paste(c("Gene: ", lines[[i]][1], " Event: ", lines[[i]][2]), collapse="") | ||
eventCoord <- paste(c("Coordinates: ", lines[[i]][3]), collapse="") | ||
|
||
## Print visual output to pdf; | ||
if( medOne > medTwo ) { | ||
## Store visual output to be printed to pdf | ||
if (medOne > medTwo) { | ||
retPlot <- plotDiff(psiFirstComb, psiSecondComb, | ||
expFirst[i,], expSecond[i,], max, medOne, medTwo, sampOneName, sampTwoName, FALSE) | ||
expFirst[i,][okFirst[i,]], expSecond[i,][okSecond[i,]], | ||
max, medOne, medTwo, sampOneName, sampTwoName, alphaSet, FALSE) | ||
} else { | ||
retPlot <- plotDiff(psiSecondComb, psiFirstComb, | ||
expFirst[i,], expSecond[i,], max, medTwo, medOne, sampTwoName, sampOneName, TRUE) | ||
expFirst[i,][okFirst[i,]], expSecond[i,][okSecond[i,]], | ||
max, medTwo, medOne, sampTwoName, sampOneName, alphaSet, TRUE) | ||
} | ||
} | ||
## sig event return | ||
return(list(retPlot, eventTitle, eventCoord, sigInd, filtOut)) #return of mclapply function | ||
|
||
## sig event return | ||
return(list(retPlot, eventTitle, eventCoord, sigInd, filtOut)) | ||
} | ||
} | ||
|
||
writeLog <- function(vastPath, opt) { | ||
### Add a line with call information to the general vast-tools log file | ||
|
||
logName <- file.path(opt$output, "VTS_LOG_commands.txt") | ||
vastVer <- scan(file.path(vastPath, "VERSION"), what="character", quiet = T) | ||
|
||
## Get date, time and vast-tools version | ||
systime <- Sys.time() | ||
sdate <- paste( | ||
sub("([0-9]{4})-.+", "\\1", systime), | ||
sub("[0-9]{4}-[0-9]{2}-([0-9]{2}).+", "\\1", systime), | ||
sub("[0-9]{4}-([0-9]{2})-.+", "\\1", systime), | ||
sep="-") | ||
stime <- sub("[^ ]+ (.{5}):[0-9]{2}.*", "\\1", systime) | ||
|
||
msg1 <- paste0( | ||
"[VAST-TOOLS v", vastVer, ", ", | ||
sdate, " (", stime, ")]" | ||
) | ||
|
||
msg2 <- "vast-tools diff" | ||
|
||
## Format input options | ||
msg3 <- paste0( | ||
"-a ", opt$replicateA, " ", | ||
"-b ", opt$replicateB, " ", | ||
"--sampleNameA ", opt$sampleNameA, " ", | ||
"--sampleNameB ", opt$sampleNameB, " ", | ||
"-i ", opt$input, " ", | ||
"-n ", opt$nLines, " ", | ||
"-p ", opt$paired, " ", | ||
"-d ", opt$baseName, " ", | ||
"-o ", opt$output, " ", | ||
"-r ", opt$prob, " ", | ||
"-m ", opt$minDiff, " ", | ||
"-e ", opt$minReads, " ", | ||
"-S ", opt$minSamples, " ", | ||
"--alpha ", opt$alpha, " ", | ||
"--beta ", opt$beta, " ", | ||
"-s ", opt$size, " ", | ||
"-z ", opt$seed | ||
) | ||
|
||
msg <- paste(msg1, msg2, msg3, paste = " ") | ||
|
||
try_error <- try(logFile <- file(logName, "a"), silent = TRUE) | ||
if (inherits(try_error, "try-error")) { | ||
cat("[vast diff warning]: Could not open log file\n") | ||
} else { | ||
write(msg, logFile) | ||
close(logFile) | ||
} | ||
} |
Oops, something went wrong.