
Commit 49cd023

Vlad-Dembrovskyi, cgpu, imendes93, and angarb authored
Pipeline additions for v2.1 (#295)
* Fixes env gtex issue #290 (#294)
* Change env() to stdout to save sample_name in gen3_drs
* Fix No such property: baseName for class: String
* Gen3-DRS prints md5 "file is good" to log not stdout
* Improves gen3-drs md5 error message
* Changes gtex input to support new manifest file format [#289] (#296)
* Updates ch_gtex_gen3_ids items #289
* Remove duplicate val(obj_id) in input of gen3-drs (Co-authored-by: cgpu <[email protected]>)
* Comments our fasta requirement for gen3-drs input (#297)
* Update usage.md that genome_fasta is only for CRAM
* Update usage.md typo
* Fix missing file from path issue
* change GLS executor from parameter to scope (#305)
* Remove gtex (#299)
* Remove mentions of old GTEX download option from main.nf
* Remove mentions of old GTEX download option from help
* Remove mentions of old GTEX download option from usage.md
* Renames Gen3-DRS into new GTEX download option
* Renames Gen3-DRS into new GTEX download opt in usage.md
* Dev v2.1 #287 - Simplify the Gen3-DRS download option (#304)
* Update usage.md
* Update run_on_sumner.md
* add dockerfile for csvtoolkit
* add process to convert manifest json to csv
* add process to filter manifest by file passed through --reads
* update help message
* fix bug on variable declaration
* Update nextflow.config - fix typo
* Revert "Merge branch 'master' into dev-v2.1-#287" (this reverts commit be2c2ab, reversing changes made to 04285ef)
* Update main.nf
* patch projectDir error
* Fix oublishDir path for manifest
* Fix typo
* Update filter_manifest.py
* fix bug on saving filenames that were not in manifest file
* remove logging of samples not found in manifest
* Makes filter_manifest txt output optional (Co-authored-by: angarb <[email protected]>, Vlad-Dembrovskyi <[email protected]>)
* Rename examples/gen3/README.md to examples/GTEX/README.md (editing folder name to match new "download_from" name)
* Update and rename GEN3_DRS_config.md to GTEX_config.md (updating parameters)
* Delete examples/gen3 directory
* Update usage.md (moving this information)
* Update README.md
* Delete PRJNA453538.SraRunTable.txt (not needed)
* Delete MCF10_MYCER.datafiles.csv (not needed)
* Create reads.csv (adding reads.csv example)
* Create manifest.json (adding example manifest.json)
* Update run_on_cloudos.md
* Update Copying_Files_From_Sumner_to_Cloud.md (made neater)
* Create Star_Index_Generation.md

Co-authored-by: cgpu <[email protected]>
Co-authored-by: imendes93 <[email protected]>
Co-authored-by: angarb <[email protected]>
1 parent 33ba660 commit 49cd023

20 files changed: +220 -235 lines changed

bin/filter_manifest.py

+33
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

import pandas as pd


def __main__():
    manifest = sys.argv[1]
    reads = sys.argv[2]
    print("Input manifest file:", manifest)
    print("Input read file: ", reads)

    manifest_df = pd.read_csv(manifest, index_col=None, header=0, delimiter=",")

    if reads != "PASS":
        # Subset the manifest to the file names listed in the reads file
        reads_df = pd.read_csv(reads, index_col=None, header=0, delimiter=",")
        manifest_df = manifest_df[manifest_df['file_name'].isin(reads_df['file_name'].tolist())]

    if manifest_df.empty:
        print("Manifest file is empty after filtering.")
        # sys.exit() takes a single argument; a string message exits with status 1
        sys.exit("Manifest file is empty after filtering.")
    else:
        print("Number of samples in filtered manifest:")
        print(len(manifest_df))

    # save final manifest file
    manifest_df.to_csv("filtered_manifest.csv", sep=",", index=False)


if __name__ == "__main__":
    __main__()
```
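As a quick illustration of the subsetting this script performs, the sketch below applies the same `isin()` filter to a toy in-memory manifest; all file names and ids are hypothetical placeholders:

```python
# Toy demonstration of the filter used in bin/filter_manifest.py.
# File names and ids are hypothetical placeholders.
import pandas as pd

manifest_df = pd.DataFrame({
    "md5sum": ["aaa", "bbb", "ccc"],
    "file_name": ["sample1.bam", "sample2.bam", "sample3.bam"],
    "object_id": ["dg.ANV0/1", "dg.ANV0/2", "dg.ANV0/3"],
    "file_size": [100, 200, 300],
})
reads_df = pd.DataFrame({"file_name": ["sample2.bam"]})

filtered = manifest_df[manifest_df["file_name"].isin(reads_df["file_name"].tolist())]
print(filtered)  # only the sample2.bam row remains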

conf/examples/GEN3_DRS_config.md

-18
This file was deleted.

conf/examples/GTEX_config.md

+17
A minimal set of params needed to run when the download option is GTEX. Testing was done with the following params in a dev environment.

```yaml
params {
    reads          = splicing-pipelines-nf/examples/GTEX/reads.csv
    manifest       = manifest.json
    run_name       = gtex_gen3
    download_from  = GTEX
    key_file       = credentials.json
    gtf            = gencode.v32.primary_assembly.annotation.gtf
    star_index     = /mnt/shared/gcp-user/session_data/star_75
    assembly_name  = GRCh38
    readlength     = 75
    stranded       = false
    gc_disk_size   = 200.GB
}
```

conf/executors/google.config

+3-1
```diff
@@ -20,8 +20,10 @@ params {
   gc_disk_size = "2000 GB"

   cleanup = false // Don't change, otherwise CloudOS jobs won't be resumable by default even if user wants to.
+}

-executor = 'google-lifesciences'
+executor {
+  name = 'google-lifesciences'
 }

 process {
```

containers/csvkit/Dockerfile

+7
```dockerfile
FROM nfcore/base:1.9
LABEL authors="[email protected]" \
      description="Docker image containing csvkit toolkit, including in2csv"

COPY environment.yml /
RUN conda env create -f /environment.yml && conda clean -a
ENV PATH /opt/conda/envs/csvkit/bin:$PATH
```

containers/csvkit/environment.yml

+9
```yaml
name: csvkit
channels:
  - conda-forge
  - bioconda
  - defaults
  - anaconda
dependencies:
  - python=3.8
  - csvkit=1.0.5
```
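This image packages csvkit so the pipeline can run `in2csv manifest.json > manifest.csv`. As a rough stdlib-only illustration of what that conversion does for a manifest shaped like the flat JSON array in `examples/GTEX/manifest.json` (an assumption; `in2csv` itself handles many more input shapes):

```python
# Minimal sketch of `in2csv manifest.json > manifest.csv` for a flat
# JSON array of objects. Assumes the shape of examples/GTEX/manifest.json;
# csvkit's in2csv supports many more input formats.
import csv
import json

with open("manifest.json") as fh:
    records = json.load(fh)

with open("manifest.csv", "w", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=list(records[0].keys()))
    writer.writeheader()
    writer.writerows(records)
```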
docs/Copying_Files_From_Sumner_to_Cloud.md

+12-14
```diff
@@ -1,34 +1,32 @@
-//add singularity to $PATH:
+# Moving files from HPC to Cloud (particular to JAX Sumner)
+
+#### Add singularity to $PATH:
 module load singularity

-//make some convenience commands to reduce typing (note we changed container name so we can accommodate other cloud providers):
+#### Make some convenient commands to reduce typing:
 alias gcloud="singularity exec /projects/researchit/crf/containers/gcp_sdk.sif gcloud"
 alias gsutil="singularity exec /projects/researchit/crf/containers/gcp_sdk.sif gsutil"

-//login to gcloud; this will return a url that you need to paste into a browser, which
-//will take you through the google authentication process; you can use your jax
-//email as userid and jax password to get in. Once you authenticate, it will display
-//a code that you need to paste into the prompt provided in your ssh session on Sumner:
+#### Login to gcloud; this will return a url that you need to paste into a browser, which will take you through the google authentication process; you can use your jax email as userid and jax password to get in. Once you authenticate, it will display a code that you need to paste into the prompt provided in your ssh session on Sumner:
 gcloud auth login --no-launch-browser

-//see which projects you have access to:
+#### See which projects you have access to:
 gcloud projects list

-//what is the project you are currently associated with:
+#### What is the project you are currently associated with:
 gcloud config list project

-//change project association:
+#### Change project association:
 gcloud config set project my-project

-//see what buckets are associated with my-project:
+#### See what buckets are associated with my-project:
 gsutil ls

-//see contents of a particular bucket:
+#### See contents of a particular bucket:
 gsutil ls -l gs://my-bucket

-//recursively copy large directory from filesystem accessible on Sumner to your bucket:
+#### Recursively copy large directory from file system accessible on Sumner to your bucket:
 gsutil -m -o GSUtil:parallel_composite_upload_threshold=150M cp -r my_dir gs://my_bucket/my_dir

-//recursively copy a directory from your bucket to an existing directory on Sumner:
+#### Recursively copy a directory from your bucket to an existing directory on Sumner:
 gsutil -m -o GSUtil:parallel_composite_upload_threshold=150M cp -r gs://my_bucket/my_dir my_dir
```

docs/Star_Index_Generation.md

+8
## Generating STAR Indices

To run the pipeline, you will need STAR indices (preferably ones that match your read length).

This might be a helpful resource for generating multiple STAR indices:
https://github.com/TheJacksonLaboratory/Star_indices

This is also a useful resource: https://github.com/alexdobin/STAR
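A note on why indices are tied to read length: STAR's `--sjdbOverhang` should generally be set to read length minus 1, so a separate index per read length is typical. The sketch below only assembles the `STAR --runMode genomeGenerate` commands; the FASTA/GTF paths, read lengths, and thread count are hypothetical placeholders:

```python
# Sketch: assemble one STAR genomeGenerate command per read length.
# --sjdbOverhang is typically (read length - 1), per the STAR manual.
# Paths, read lengths, and thread count below are hypothetical placeholders.
import shlex

fasta = "GRCh38.primary_assembly.genome.fa"
gtf = "gencode.v32.primary_assembly.annotation.gtf"

for read_length in (75, 100, 150):
    cmd = [
        "STAR",
        "--runMode", "genomeGenerate",
        "--genomeDir", f"star_{read_length}",
        "--genomeFastaFiles", fasta,
        "--sjdbGTFfile", gtf,
        "--sjdbOverhang", str(read_length - 1),
        "--runThreadN", "8",
    ]
    print(shlex.join(cmd))  # inspect, then run with subprocess.run(cmd, check=True)
```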

docs/run_on_cloudos.md

+1
```diff
@@ -98,3 +98,4 @@ Note, if you had to resume your job, the above method will not work **sad face**
 ### Helpful Tips
 [Import the Pipeline](https://github.com/TheJacksonLaboratory/splicing-pipelines-nf/blob/master/docs/import_pipeline)
 [Information on how to run TCGA](https://github.com/TheJacksonLaboratory/splicing-pipelines-nf/blob/master/docs/Running_TCGA.md)
+[Information on how to run GTEX](https://github.com/TheJacksonLaboratory/splicing-pipelines-nf/blob/dev-v2.1/examples/GTEX/README.md)
```

docs/usage.md

+4-45
````diff
@@ -103,13 +103,13 @@ Input files:
                             (default: no rmats_pairs specified)
   --run_name                User specified name used as prefix for output files
                             (defaut: no prefix, only date and time)
-  --download_from           Database to download FASTQ/BAMs from (available = 'TCGA', 'GTEX' or 'GEN3-DRS',
-                            'SRA', 'FTP') (string)
+  --download_from           Database to download FASTQ/BAMs from (available = 'TCGA', 'GTEX', 'SRA', 'FTP')
+                            (string)
                             false should be used to run local files on the HPC (Sumner).
                             'TCGA' can also be used to download GDC data including HCMI data.
                             (default: false)
-  --key_file                For downloading reads, use TCGA authentication token (TCGA) or dbGAP repository
-                            key (GTEx, path) or credentials.json file in case of 'GEN3-DRS'
+  --key_file                For downloading reads, use TCGA authentication token (TCGA) or
+                            credentials.json file in case of 'GTEX'.
                             (default: false)

 Main arguments:
@@ -246,44 +246,3 @@ Some useful ones include (specified in main.pbs):
 - `-with-trace` eg `-with-trace trace.txt` which gives a [trace report](https://www.nextflow.io/docs/latest/tracing.html?highlight=dag#trace-report) for resource consumption by the pipeline
 - `-with-dag` eg `-with-dag flowchart.png` which produces the [DAG visualisation](https://www.nextflow.io/docs/latest/tracing.html?highlight=dag#dag-visualisation) graph showing each of the different processes and the connections between them (the channels)

-## Run with data from AnviL Gen3-DRS
-
-You will be needing two things from - https://gen3.theanvil.io/
-
-1. manifest file
-2. credentials file
-
-Original downloaded `manifest.json` file need to be converted into `manifest.csv` in order to be accepted in `--reads`, for doing that you can do this -
-
-```bash
-pip install csvkit
-in2csv manifest.json > manifest.csv
-```
-
-NOTE: Make sure the `manifest.csv` file have five columns, Check from [examples](../examples/gen3/)
-
-Downloaded `credentials.json` file can be provided in `--key` param.
-
-NOTE: Make sure `credentials.json` is a latest one. They have expiry dates when you download.
-
-If you running with AnviL Gen3-DRS files you also need to provide a Genome fasta file with `--genome_fasta`, which will be used to convert CRAM files to BAM format.
-
-For a minimal params list check [gen3_drs.config](../conf/examples/GEN3_DRS_config.md)
-
-### Extract based on a bam query list
-
-If you have a list of bam file names of interest, extract the manifest file -
-
-```bash
-# Get all the bam files name into a txt file
-cut -d, -f4 query_list.csv > bam_files_list.txt
-# Extract those bam files list from manifest.csv
-grep -f bam_files_list.txt -i manifest.csv > manifest.csv
-```
-
-Here `query_list.csv` should look something like -
-
-```csv
-file_name,sequencing_assay,data_format,file_name,sample_id,participant_id,tissue,age,gender
-GTEX-PPPP-XXX-XX-XXXXX,RNA-Seq,bam,GTEX-PPPP-XXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam,GTEX-PPPP-XXX-XX-XXXXX,GTEX-PPPP,Breast,21,Female
-```
````

examples/GTEX/README.md

+21
## Run with GTEX data

You can run the pipeline on GTEX data obtained directly from Gen3-DRS if you specify the input option:
```
--download_from 'GTEX'
```

You will need two things from https://gen3.theanvil.io/:

1. [manifest file](https://github.com/TheJacksonLaboratory/splicing-pipelines-nf/blob/dev-v2.1/examples/GTEX/manifest.json)
2. credentials file

The original downloaded `manifest.json` will be converted into `manifest.csv` by the pipeline using https://csvkit.readthedocs.io/en/latest/

The `manifest.csv` will be subset using the `reads.csv` file provided in the `--reads` param. (This allows you to download a complete manifest and later select only the samples of interest.) For example: [gtex.reads](https://github.com/TheJacksonLaboratory/splicing-pipelines-nf/blob/dev-v2.1/examples/GTEX/reads.csv)

The downloaded `credentials.json` file can be provided in the `--key_file` param.
NOTE: Make sure `credentials.json` is a recent one; credentials have expiration dates from the time you download them.

If you are running with AnVIL Gen3-DRS to download CRAM files, you also need to provide a genome FASTA file with `--genome_fasta`, which will be used to convert CRAM files to BAM format. If you are downloading BAM files, you can skip this parameter.

For a minimal params list, check [gtex.config](../conf/examples/GTEX_config.md)
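The commit list above mentions that the Gen3-DRS download prints an md5 "file is good" check. As a hedged illustration only (not the pipeline's actual process code), verifying a downloaded file against its `md5sum` entry in the converted manifest could look like this, with `sample.bam` as a hypothetical placeholder:

```python
# Sketch: compare a downloaded file's md5 with its manifest entry.
# Illustration only, not the pipeline's actual check; names are placeholders.
import hashlib

import pandas as pd

def md5_of(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

manifest_df = pd.read_csv("manifest.csv")
expected = manifest_df.set_index("file_name").loc["sample.bam", "md5sum"]
print("file is good" if md5_of("sample.bam") == expected else "md5 mismatch")
```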

examples/GTEX/manifest.json

+14
```json
[
  {
    "md5sum": "x1x111xxx1xxxxx1xx1x1x11xxx11111",
    "file_name": "GTEX-XXXXX-XXXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam",
    "object_id": "dg.ANV0/yyyyyyyy-yyyy-yyyy-yyyyyyyyyyyy",
    "file_size": 123321365
  },
  {
    "md5sum": "x2x222xxx2xxxxx2xx2x2x22xxx22222",
    "file_name": "GTEX-XXXXX-XXXX-XX-XXXXZ.Aligned.sortedByCoord.out.patched.md.bam",
    "object_id": "dg.ANV0/yyyyyyyy-yyyy-yyyy-yyyyyyyyyzzz",
    "file_size": 123321369
  }
]
```

examples/GTEX/reads.csv

+6
```csv
sample_id
GTEX-XXXXX-XXXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam
GTEX-XXXXX-XXXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam
GTEX-XXXXX-XXXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam
GTEX-XXXXX-XXXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam
GTEX-XXXXX-XXXX-XX-XXXXX.Aligned.sortedByCoord.out.patched.md.bam
```

examples/analyses/MCF10_MYCER.datafiles.csv

-65
This file was deleted.
