🐛 Properly decode file content before validation #750

Merged · 3 commits · Sep 21, 2021
17 changes: 17 additions & 0 deletions creator/files/models.py
@@ -11,6 +11,7 @@
from django.utils import timezone
from django.contrib.auth import get_user_model
from django.contrib.postgres.fields import ArrayField
from django_s3_storage.storage import S3Storage
from creator.studies.models import Study
from creator.fields import KFIDField, kf_id_generator
from creator.analyses.file_types import FILE_TYPES
@@ -369,6 +370,22 @@ def extract_config_path(self):

        return config_path

    def set_storage(self):
        """
        Set storage location for study bucket if using S3 backend
        """
        s3_storage = "django_s3_storage.storage.S3Storage"
        if settings.DEFAULT_FILE_STORAGE == s3_storage:
            if self.study is not None:
                study = self.study
            elif self.root_file is not None:
                study = self.root_file.study
            else:
                raise Study.DoesNotExist(
                    f"{self} must be part of a study."
                )
            self.key.storage = S3Storage(aws_s3_bucket_name=study.bucket)


class DownloadToken(models.Model):
    """
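For context, a minimal usage sketch (not part of this diff) of the new Version.set_storage hook: it is meant to be called right before a version's key is read so that, under the S3 backend, the underlying FieldFile points at the owning study's bucket. The open_version_content helper below is hypothetical.

def open_version_content(version):
    # Hypothetical caller; assumes `version` is a creator.files.models.Version
    # whose `key` field is a Django FieldFile.
    # Point version.key at the study bucket when the S3 backend is active;
    # this is a no-op under the default FileSystemStorage backend.
    version.set_storage()
    with version.key.open("rb") as f:
        return f.read()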
20 changes: 13 additions & 7 deletions creator/ingest_runs/tasks/validation_run.py
@@ -10,8 +10,9 @@
from django.conf import settings
from django.core.files.base import ContentFile
from django_s3_storage.storage import S3Storage
from graphql import GraphQLError

from kf_lib_data_ingest.etl.extract.utils import Extractor
from kf_lib_data_ingest.common.io import read_df
from kf_lib_data_ingest.validation.data_validator import (
    Validator as DataValidator,
)
@@ -114,14 +115,13 @@ def clean_and_map(
    logger.info(
        f"Attempting to map file {version_display(version)} to template keys"
    )
    # Set storage location appropriately
    version.set_storage()

    # Extract file content into a DataFrame
    try:
        df = pandas.DataFrame(extract_data(version))
        df = read_df(version.key)
    except Exception as e:
        er_msg = (
            f"Error in parsing {version_display(version)} "
            "content into a DataFrame."
        )
        raise ExtractDataError from e

    # Try to map as many of the input columns to template keys
@@ -146,7 +146,10 @@ def validate_file_versions(validation_run: ValidationRun) -> dict:
    version, clean and map the data before passing it to the validator
    """
    versions = validation_run.versions.all()
    logger.info(f"Begin validating file versions: {pformat(list(versions))}")
    logger.info(
        "Begin validating file versions:\n"
        f"{pformat([version_display(v) for v in versions])}"
    )

    # Load study templates
    study = validation_run.data_review.study
@@ -188,6 +191,9 @@ def validate_file_versions(validation_run: ValidationRun) -> dict:
            continue

        if clean_df.empty:
            logger.warning(
                f"Skipping {version_display(version)} due to empty mapped df"
            )
            empty_df_count += 1
        else:
            df_dict[version.pk] = clean_df
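The change above swaps pandas.DataFrame(extract_data(version)) for kf_lib_data_ingest's read_df, which decodes the file bytes before parsing them into a DataFrame. A standalone sketch of that flow follows; the extract_df helper is hypothetical, ExtractDataError stands in for the exception already defined in this task module, and read_df is assumed to accept the version's FieldFile directly, as it does in the diff above.

from kf_lib_data_ingest.common.io import read_df


class ExtractDataError(Exception):
    # Placeholder for the task module's existing exception, re-declared here
    # only so the sketch stands alone
    pass


def extract_df(version):
    # Make sure the FieldFile reads from the owning study's bucket under S3
    version.set_storage()
    try:
        # read_df decodes the raw bytes and parses them into a DataFrame
        return read_df(version.key)
    except Exception as e:
        raise ExtractDataError(
            f"Error parsing {version} content into a DataFrame"
        ) from e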
@@ -2,10 +2,11 @@
from django.contrib.auth import get_user_model
from graphql_relay import to_global_id

from creator.studies.models import Study
from creator.studies.factories import StudyFactory
from creator.files.models import File

from creator.files.factories import FileFactory
from creator.files.factories import FileFactory, VersionFactory

User = get_user_model()

@@ -40,3 +41,35 @@ def test_s3_scrape(db, clients, upload_file):

    assert "S3S" in file.valid_types
    assert file.valid_types == version.valid_types


def test_set_storage(db, settings):
    """
    Test Version.set_storage
    """
    # File storage
    fv = VersionFactory()
    fv.set_storage()
    assert "FileSystemStorage" in str(fv.key.storage)

    # S3 storage - get study from root_file
    s3_storage = "django_s3_storage.storage.S3Storage"
    settings.DEFAULT_FILE_STORAGE = s3_storage
    fv.set_storage()
    assert fv.key.storage.settings.AWS_S3_BUCKET_NAME == (
        fv.root_file.study.bucket
    )

    # S3 storage - get study from prop
    study = fv.root_file.study
    fv.study = study
    fv.set_storage()
    assert fv.key.storage.settings.AWS_S3_BUCKET_NAME == (
        fv.root_file.study.bucket
    )

    # No study
    fv.root_file = None
    fv.study = None
    with pytest.raises(Study.DoesNotExist):
        fv.set_storage()
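As a side note, the bucket assertions above rely on django_s3_storage exposing per-instance configuration through storage.settings. A tiny standalone check of that behavior, assuming a configured Django environment (the bucket name is made up):

from django_s3_storage.storage import S3Storage

storage = S3Storage(aws_s3_bucket_name="example-study-bucket")  # hypothetical bucket
assert storage.settings.AWS_S3_BUCKET_NAME == "example-study-bucket"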
8 changes: 6 additions & 2 deletions tests/ingest_runs/test_validation.py
@@ -300,12 +300,16 @@ class MockVersion:
        def __init__(self):
            self.pk = 1
            self.file_name = "MyVersion"
            self.key = f"{self.file_name}.txt"

        def set_storage(self):
            pass

    clean_and_map = validation_run.clean_and_map

    # Error extracting file content into DataFrame
    mocker.patch(
        "creator.ingest_runs.tasks.validation_run.extract_data",
        "creator.ingest_runs.tasks.validation_run.read_df",
        side_effect=Exception,
    )
    with pytest.raises(validation_run.ExtractDataError) as e:
@@ -320,7 +324,7 @@ def __init__(self):
        }
    )
    mock_extract_data = mocker.patch(
        "creator.ingest_runs.tasks.validation_run.extract_data",
        "creator.ingest_runs.tasks.validation_run.read_df",
        return_value=df
    )
    mapped_df = clean_and_map(MockVersion(), mapper)
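One observation, not part of the PR: the patch target has to be the name as the task module imports it, which is why both mocker.patch calls point at creator.ingest_runs.tasks.validation_run.read_df rather than kf_lib_data_ingest.common.io.read_df. A minimal sketch of the same pattern, with a hypothetical test name:

def test_read_df_failure_is_wrapped(mocker):
    # Patch read_df where clean_and_map looks it up, not where it is defined
    mocker.patch(
        "creator.ingest_runs.tasks.validation_run.read_df",
        side_effect=Exception,
    )
    # ...then invoke clean_and_map and assert ExtractDataError is raised, as above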