import logging
import os

import pandas
import requests

from django.conf import settings
from django_s3_storage.storage import S3Storage
from graphql import GraphQLError

from creator.decorators import task
from creator.files.models import Version
from creator.analyses.analyzer import extract_data
from creator.dewrangle.client import DewrangleClient
from kf_lib_data_ingest.common.io import read_df

DATAFRAME_CHUNK_SIZE = 100
KNOWN_FORMATS = {
    ".csv": {"reader": pandas.read_csv, "sep": ","},
    ".tsv": {"reader": pandas.read_csv, "sep": "\t"},
    ".txt": {"reader": pandas.read_csv, "sep": None},
}
FILE_UPLOAD_MANIFEST_SCHEMA = {
    "required": [
        "Source File Name",
        "Hash",
        "Hash Algorithm",
        "Size",
    ],
    "optional": [
        "Patient IDs",
        "Specimen IDs",
    ],
}
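
# Illustrative only: a hypothetical manifest row that satisfies the required
# columns above (values are made up, not taken from real study data):
#   Source File Name: sample_1.cram
#   Hash: d41d8cd98f00b204e9800998ecf8427e
#   Hash Algorithm: MD5
#   Size: 1024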

logger = logging.getLogger(__name__)


class ExtractDataError(Exception):
    pass


def is_file_upload_manifest(version):
    """
    Check whether this file version conforms to the File Upload Manifest schema
    """
    header = {
        "_".join(c.split(" ")).lower()
        for c in read_df(version.key, nrows=0).columns
    }
    expected = {
        "_".join(c.split(" ")).lower()
        for c in FILE_UPLOAD_MANIFEST_SCHEMA["required"]
    }
    return expected <= header
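
# Sketch of the check above (column values are assumed for illustration):
#   "Source File Name" normalizes to "source_file_name", so a header such as
#   {"source_file_name", "hash", "hash_algorithm", "size", "patient_ids"}
#   is a superset of the normalized required columns and the function
#   returns True.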


def chunked_dataframe_reader(version):
    """
    Read a tabular file in chunks and yield each chunk as a DataFrame
    """
    # Need to set storage location for study bucket if using S3 backend
    if settings.DEFAULT_FILE_STORAGE == "django_s3_storage.storage.S3Storage":
        if version.study is not None:
            study = version.study
        elif version.root_file is not None:
            study = version.root_file.study
        else:
            raise GraphQLError("Version must be part of a study.")

        version.key.storage = S3Storage(aws_s3_bucket_name=study.bucket)

    # Check file format
    _, ext = os.path.splitext(version.key.name)
    if ext not in KNOWN_FORMATS:
        raise IOError(
            "Unsupported file format. Unable to read file upload manifest: "
            f"{version.pk} {version.file_name}"
        )

    # Read file into chunks (DataFrames)
    reader = KNOWN_FORMATS[ext]["reader"]
    delim = KNOWN_FORMATS[ext]["sep"]
    try:
        for chunk in reader(
            version.key, sep=delim, chunksize=DATAFRAME_CHUNK_SIZE
        ):
            logger.info(
                f"Reading {DATAFRAME_CHUNK_SIZE} rows from "
                f"{version.file_name} into DataFrame"
            )
            yield chunk
    except Exception as e:
        err_msg = (
            f"Error parsing {version.pk}: {version.file_name}"
            " content into a DataFrame."
        )
        raise ExtractDataError(err_msg) from e
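
# Usage sketch for chunked_dataframe_reader (hypothetical caller; `version`
# is assumed to be a creator.files Version instance):
#   for chunk in chunked_dataframe_reader(version):
#       ...  # each chunk is a DataFrame of at most DATAFRAME_CHUNK_SIZE rows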


def dataframe_to_invoices(df):
    """
    Helper to convert a file upload manifest DataFrame into a list of
    FileUploadInvoice dicts in preparation for sending to the Dewrangle API
    """
    # Map manifest column names to their normalized snake_case form
    extract_cols = {
        c: "_".join(c.split(" ")).lower()
        for c in (
            FILE_UPLOAD_MANIFEST_SCHEMA["required"] +
            FILE_UPLOAD_MANIFEST_SCHEMA["optional"]
        )
    }
    df = df[[c for c in extract_cols if c in df.columns]]

    # Map the normalized column names to the Dewrangle invoice field names
    mapping = {
        "source_file_name": "fileName",
        "hash": "hash",
        "hash_algorithm": "hashAlgorithm",
        "size": "size",
        "patient_ids": "patientIds",
        "specimen_ids": "specimenIds",
    }
    df = df.rename(columns=extract_cols).rename(columns=mapping)

    return df.to_dict(orient="records")
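
# Illustrative input/output for dataframe_to_invoices (values are made up):
#   {"Source File Name": "s1.cram", "Hash": "abc123", "Hash Algorithm": "MD5",
#    "Size": 1024, "Patient IDs": "P1", "Specimen IDs": "S1"}
#   becomes
#   {"fileName": "s1.cram", "hash": "abc123", "hashAlgorithm": "MD5",
#    "size": 1024, "patientIds": "P1", "specimenIds": "S1"}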


@task("push_to_dewrangle")
def push_to_dewrangle(version_id):
    """
    Push the records in a file upload manifest to the Dewrangle API
    where they will be processed to produce an audit report of files in
    cloud storage
    """
    try:
        version = Version.objects.get(pk=version_id)
        dewrangle = DewrangleClient()
        for df in chunked_dataframe_reader(version):
            logger.info(
                f"Submitting {df.shape[0]} file upload invoices to"
                f" {dewrangle.url}"
            )
            result = dewrangle.bulk_create_file_upload_invoices({
                "studyId": version.study.pk,
                "fileUploadInvoices": dataframe_to_invoices(df)
            })
            logger.info(
                f"Success. Created: {result['created']} invoices. Total:"
                f" {result['total']} invoices"
            )

    except Exception:
        # TODO Set Version.submitted_for_audit = fail
        raise

    # TODO
    # set Version.submitted_for_audit = completed
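
# Usage sketch (assumes the @task decorator handles queuing/dispatch of the
# callable; the invocation mechanism is not shown in this module):
#   push_to_dewrangle(version.pk)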