From adb1f355ce16e747b10f1a9b57e0fd180b48f9d2 Mon Sep 17 00:00:00 2001 From: Ulrich Braunschweig Date: Wed, 24 Jan 2024 18:11:43 -0500 Subject: [PATCH] Small README and help changes --- R/vastdiff.R | 6 +++--- README.md | 27 ++++++++++++--------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/R/vastdiff.R b/R/vastdiff.R index 04f79ab..a521d65 100755 --- a/R/vastdiff.R +++ b/R/vastdiff.R @@ -49,9 +49,6 @@ option.list <- list( help = "Name of the replicate set B, [default is first element of --replicateB]"), make_option(c("-i", "--input"), type = "character", default = "INCLUSION_LEVELS", help = "Exact or Partial match to PSI table in output directory [default %default]"), - make_option(c("-n", "--nLines"), type = "integer", default = "5000", - help = "Number of lines to read/process in parallel at a time... - lower number = less memory = greater overhead [default %default]"), make_option(c("-p", "--paired"), action="store_true", default = FALSE, help = "Samples are paired, -a pairOneA,pairTwoA,.. -b pairOneB,pairTwoB,.. [default %default]\n @@ -83,6 +80,9 @@ option.list <- list( [general options]"), make_option(c("-c", "--cores"), type = "integer", default = 1, metavar="int", help="Number of cores to use for plot processing.. [default %default]"), + make_option(c("-n", "--nLines"), type = "integer", default = "5000", + help = "Number of lines to read/process in parallel at a time... + lower number = less memory = greater overhead [default %default]"), make_option(c("-z", "--seed"), type = "integer", default = 10, metavar="int", help="Seed the RNG for a deterministic result.. 
[default %default]"), make_option(c("-v", "--verbose"), type = "logical", default = TRUE, metavar=NULL, diff --git a/README.md b/README.md index 217ce6c..10a3d08 100644 --- a/README.md +++ b/README.md @@ -393,7 +393,7 @@ From release v2.0.0, VAST-TOOLS includes a new module to identify and profile an ``vast-tools`` provides two alternative modules (``compare`` and ``diff``) to perform differential splicing analyses on a reduced number of samples per group. Each module gives different functionalities. - ``compare``: pre-filters the events based on read coverage, imbalance and other features, and simply compares average and individual dPSIs. That is, it looks for non-overlapping PSI distributions based on fixed dPSI cut-offs. For more than 3 replicates, it is likely to be too stringent. -- ``diff``: performs a statistical test to assess whether the PSI distributions of the two compared groups are signficantly different. It is possible to pre-filter the events based on the minimum number of reads per sample, but subsequent filtering is highly recommended (e.g. overlapping the results with the output of ``tidy``). For more than 5 samples per group it may also be over stringent. +- ``diff``: performs a statistical test to assess whether the PSI distributions of the two compared groups are significantly different. It is possible to pre-filter the events based on the minimum number of reads per sample, but subsequent filtering is highly recommended (e.g. overlapping the results with the output of ``tidy``). For more than 5 samples per group it may also be overly stringent. - When comparing multiple samples per group, an alternative approach is recommended. First, events should be pre-filtered using ``tidy`` (see [Simplifying Combine Table](#simplifying-combine-table)). This module allows to select events for which a minimum number of samples per group pass the quality controls. Then, a Mann-Whitney U-test or similar can be used to identify differentially spliced events. 
Finally, average dPSI per group should be calculated and a minimum difference (usually |dPSI| > 15) should be requested. #### *compare*: Comparing PSIs Between Samples @@ -446,9 +446,10 @@ Note: Sample names do not have to follow any specific convention as long as they Probably the most important extra options to consider are ``-r PROB (--prob)``, ``-m MINDIFF (--minDiff)``, ``-e MINREADS (--minReads)``, and `-S MINSAMPLES (--minSamples)` These represent the stringency criterion for filtering of visual output and textual -data sent to file. `-S` is the minimum number of samples for each set `-a` and `-b` +output. `-S` is the minimum number of samples for each set `-a` and `-b` that have to have at least `-e` reads each to be considered in the downstream -statistical comparison. +statistical comparison. Values in a sample group that do not survive these settings are +set to NA in the output table and not represented in the PDF. The ``-r`` flag represents the minimal probability of acceptance that is required to consider a comparison to @@ -458,13 +459,12 @@ stringency requirements. The ``-m`` flag represents the minimum value of difference (`MV`, see example below) between PSI in group A and PSI in group B that you will accept, such that we are are sure with at least probability ``-r`` that there is a difference of at least ``-m``. `-m` does not -currently alter the output sent to STDOUT, but does filter what is plotted to PDF -and printed to file. +alter the output table, but does filter what is plotted to PDF. The ``-e`` flag specifies the minimum number of reads for a sample/event to be compared. In cases where the prior distribution has been methodically calculated and/or is believable beyond an uninformative prior (like the uniform default), -this may not be necessary, however it is still highly recommended. The default +this may not be necessary, however it is still highly recommended. The default value for ``-e`` is 10, though this could easily be higher. 
Additionally, ``diff`` allows you to alter the parameters of the conjugate beta @@ -478,8 +478,8 @@ it may be more appropriate to use a custom prior model that is able to more accu reflect the lower expectation of inclusion levels. In the case that you have paired samples, where NormalA is dependent on -PerturbationA, it is appropriate to use the ``--paired=TRUE`` flag. For -example when considering NormalA and NormalB, to compare to PerturbationA and +PerturbationA, it is appropriate to use the ``--paired=TRUE`` flag. For +example, when considering NormalA and NormalB, to compare to PerturbationA and PerturbationB, the probability that P( joint_psi1 - joint_psi2 > ``-m`` ) is calculated such that NormalA is only compared to PerturbationA, and then NormalB is compared to PerturbationB. No MLE fitting is used in this case. @@ -495,16 +495,13 @@ posterior distribution to sample, lower numbers decrease accuracy but increase performance. The ``diff`` command is also able to run in parallel. Specify the number of -cores to use with ``-c INT`` -Obviously more cores will increase the speed of ``diff``, at the cost of increased -RAM usage. +cores to use with ``-c INT``. Obviously more cores will increase the speed of ``diff``, +at the cost of increased memory. Using the ``-n`` flag to specify the number of lines to read/process at a time, will set a max threshold to the RAM used by parallel processing with the ``-c`` flag. A lower number means that ``diff`` will use significantly less memory, -however by decreasing ``-n`` you have increased the number of times that the -``mclapply`` function must calculate the parallel processing overhead. The -default is 100, which works well. +however, decreasing ``-n`` increases parallelization overhead and extends run time. 
*Output Format* @@ -522,7 +519,7 @@ The text output of diff looks like: BCORL1 | HsaEX0007940 | 0.213452 | 0.500425 | -0.286973 | 0.05 Where for example the first event HsaEX0008312 in the BOD1L gene has multireplicate point estimate -for SampleA of 0.12 and 0.7 for SampleB. While this gives an expected value for the difference of +for SampleA of 0.12 and 0.7 for SampleB. While this gives an expected value for the difference of PSI (dPsi/ΔPSI) between SampleA and SampleB of -0.57, the minimum value (`MV`) for |ΔPSI| at 0.95 is 0.3, meaning that there is a 0.95 probability that |ΔPSI| is greater than 0.3. Use this value to filter for events that are statistically likely to have at least a minimal difference of some