Skip to content

Commit

Permalink
Merge pull request #67 from TileDB-Inc/sethshelnutt/ch1756/add-checks…
Browse files Browse the repository at this point in the history
…um-to-array-schemas

Add checksum filter support
  • Loading branch information
Shelnutt2 authored Mar 31, 2020
2 parents 638aa19 + 649c3f9 commit 9c92059
Show file tree
Hide file tree
Showing 15 changed files with 218 additions and 27 deletions.
1 change: 1 addition & 0 deletions apis/python/src/tiledbvcf/binding/libtiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ PYBIND11_MODULE(libtiledbvcf, m) {
.def("init", &Writer::init)
.def("set_samples", &Writer::set_samples)
.def("set_extra_attributes", &Writer::set_extra_attributes)
.def("set_checksum", &Writer::set_checksum)
.def("create_dataset", &Writer::create_dataset)
.def("register_samples", &Writer::register_samples)
.def("ingest_samples", &Writer::ingest_samples);
Expand Down
14 changes: 14 additions & 0 deletions apis/python/src/tiledbvcf/binding/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,20 @@ void Writer::set_extra_attributes(const std::string& attributes) {
tiledb_vcf_writer_set_extra_attributes(writer, attributes.c_str()));
}

/**
 * Sets the checksum type for the writer from its string name.
 *
 * The comparison is case-sensitive; the recognized names are "sha256",
 * "md5" and "none". Any other value is rejected rather than silently
 * falling back to SHA256, so a misspelled name (e.g. "non") cannot quietly
 * select a different checksum than the caller asked for.
 *
 * @param checksum Checksum name: "sha256", "md5" or "none".
 * @throws std::invalid_argument if `checksum` is not a recognized name.
 */
void Writer::set_checksum(const std::string& checksum) {
  auto writer = ptr.get();
  tiledb_vcf_checksum_type_t checksum_type;

  if (checksum == "md5")
    checksum_type = TILEDB_VCF_CHECKSUM_MD5;
  else if (checksum == "sha256")
    checksum_type = TILEDB_VCF_CHECKSUM_SHA256;
  else if (checksum == "none")
    checksum_type = TILEDB_VCF_CHECKSUM_NONE;
  else
    throw std::invalid_argument(
        "Unsupported checksum type '" + checksum +
        "'; valid values are 'sha256', 'md5' or 'none'");

  check_error(
      writer, tiledb_vcf_writer_set_checksum_type(writer, checksum_type));
}

void Writer::create_dataset() {
auto writer = ptr.get();
check_error(writer, tiledb_vcf_writer_create_dataset(writer));
Expand Down
5 changes: 5 additions & 0 deletions apis/python/src/tiledbvcf/binding/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ class Writer {
*/
void set_extra_attributes(const std::string& attributes);

/**
[Creation only] Sets the checksum type to be used for the arrays.

The implementation recognizes the names "sha256", "md5" and "none"
(compared case-sensitively).
*/
void set_checksum(const std::string& checksum);

void create_dataset();

void register_samples();
Expand Down
15 changes: 14 additions & 1 deletion apis/python/src/tiledbvcf/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,26 @@ def count(self, samples=None, regions=None):

return self.reader.result_num_records()

def ingest_samples(self, sample_uris=None, extra_attrs=None):
def ingest_samples(self, sample_uris=None, extra_attrs=None, checksum_type=None):
"""Ingest samples
:param list of str sample_uris: List of sample URIs to ingest.
:param list of str extra_attrs: CSV list of extra attributes to
materialize from fmt field
:param str checksum_type: Optional checksum type override used when
creating a new dataset. Valid values (case-insensitive) are sha256,
md5 or none.
"""
if self.mode != 'w':
raise Exception('Dataset not open in write mode')

if sample_uris is None:
return

if checksum_type is not None:
checksum_type = checksum_type.lower()
self.writer.set_checksum(checksum_type)

self.writer.set_samples(','.join(sample_uris))

extra_attrs = '' if extra_attrs is None else extra_attrs
Expand Down
4 changes: 2 additions & 2 deletions libtiledbvcf/cmake/Modules/FindTileDB_EP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ else()

ExternalProject_Add(ep_tiledb
PREFIX "externals"
URL "https://github.com/TileDB-Inc/TileDB/archive/1.7.5.zip"
URL_HASH SHA1=129c6e046df074fac8af9d449dd5f8ae7221fbbe
URL "https://github.com/TileDB-Inc/TileDB/archive/1.7.6.zip"
URL_HASH SHA1=f6b63111d0eff8633ecebf2ca3a64fc9d22c362f
DOWNLOAD_NAME "tiledb.zip"
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${EP_INSTALL_PREFIX}
Expand Down
12 changes: 12 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,18 @@ int32_t tiledb_vcf_writer_set_extra_attributes(
return TILEDB_VCF_OK;
}

/**
 * C API entry point that forwards the requested checksum type to the
 * underlying C++ writer.
 *
 * Returns TILEDB_VCF_ERR if the writer fails the sanity check or if the
 * SAVE_ERROR_CATCH-wrapped call reports a failure; TILEDB_VCF_OK otherwise.
 */
int32_t tiledb_vcf_writer_set_checksum_type(
    tiledb_vcf_writer_t* writer, tiledb_vcf_checksum_type_t checksum_type) {
  // Bail out early if the writer handle does not pass the sanity check.
  if (sanity_check(writer) == TILEDB_VCF_ERR) {
    return TILEDB_VCF_ERR;
  }

  // The C++ writer is handed the checksum type as a plain integer.
  if (SAVE_ERROR_CATCH(
          writer,
          writer->writer_->set_checksum_type(
              static_cast<int>(checksum_type)))) {
    return TILEDB_VCF_ERR;
  }

  return TILEDB_VCF_OK;
}

int32_t tiledb_vcf_writer_create_dataset(tiledb_vcf_writer_t* writer) {
if (sanity_check(writer) == TILEDB_VCF_ERR)
return TILEDB_VCF_ERR;
Expand Down
23 changes: 23 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ typedef enum {
#undef TILEDB_VCF_ATTR_DATATYPE_ENUM
} tiledb_vcf_attr_datatype_t;

/**
 * Checksum filter types.
 *
 * The enumerators are not listed here: they are generated by including
 * tiledbvcf_enum.h while TILEDB_VCF_CHECKSUM_TYPE_ENUM is defined, which
 * selects the checksum section of that header.
 */
typedef enum {
/**
 * Helper macro for defining subset of tiledb filter type enums.
 * It prefixes each enumerator id with `TILEDB_VCF_` and is defined only for
 * the duration of the #include below.
 */
#define TILEDB_VCF_CHECKSUM_TYPE_ENUM(id) TILEDB_VCF_##id
#include "tiledbvcf_enum.h"
#undef TILEDB_VCF_CHECKSUM_TYPE_ENUM
} tiledb_vcf_checksum_type_t;

/* ********************************* */
/* STRUCT TYPES */
/* ********************************* */
Expand Down Expand Up @@ -763,6 +771,21 @@ TILEDBVCF_EXPORT int32_t tiledb_vcf_writer_set_samples(
TILEDBVCF_EXPORT int32_t tiledb_vcf_writer_set_extra_attributes(
tiledb_vcf_writer_t* writer, const char* attributes);

/**
 * [Creation only] Sets the checksum type to be used for the underlying
 * arrays.
 *
 * The checksum type can be set to TILEDB_VCF_CHECKSUM_MD5 or
 * TILEDB_VCF_CHECKSUM_SHA256, or to TILEDB_VCF_CHECKSUM_NONE to disable
 * checksums.
 *
 * TILEDB_VCF_CHECKSUM_SHA256 is the default.
 *
 * @param writer VCF writer object
 * @param checksum TileDB checksum filter type to be used
 * @return `TILEDB_VCF_OK` for success or `TILEDB_VCF_ERR` for error.
 */
TILEDBVCF_EXPORT int32_t tiledb_vcf_writer_set_checksum_type(
tiledb_vcf_writer_t* writer, tiledb_vcf_checksum_type_t checksum);

/**
* Creates a new TileDB-VCF dataset, using previously set parameters.
*
Expand Down
9 changes: 9 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf_enum.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,13 @@ TILEDB_VCF_READ_STATUS_ENUM(FAILED) = 0,
TILEDB_VCF_ATTR_DATATYPE_ENUM(INT32) = 2,
/** 32-bit floating-point */
TILEDB_VCF_ATTR_DATATYPE_ENUM(FLOAT32) = 3,
#endif

/*
 * Checksum type enumerators, emitted only when the including file defines
 * TILEDB_VCF_CHECKSUM_TYPE_ENUM.
 *
 * NOTE(review): these numeric values are cast directly to
 * tiledb_filter_type_t further down the call chain, so they presumably have
 * to match TileDB's own filter type values -- confirm before changing them.
 */
#ifdef TILEDB_VCF_CHECKSUM_TYPE_ENUM
/** No-op filter (used to disable checksums). */
TILEDB_VCF_CHECKSUM_TYPE_ENUM(CHECKSUM_NONE) = 0,
/** MD5 checksum filter. */
TILEDB_VCF_CHECKSUM_TYPE_ENUM(CHECKSUM_MD5) = 12,
/** SHA256 checksum filter. */
TILEDB_VCF_CHECKSUM_TYPE_ENUM(CHECKSUM_SHA256) = 13,
#endif
11 changes: 11 additions & 0 deletions libtiledbvcf/src/cli/tiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,17 @@ int main(int argc, char** argv) {
"specifying optional TileDB configuration parameter settings." &
value("params").call([&create_args](const std::string& s) {
create_args.tiledb_config = utils::split(s, ',');
}),
option("--checksum") %
"Checksum to use for dataset validation on read and writes, "
"defauls to 'sha256'" &
value("checksum").call([&create_args](const std::string& s) {
if (s == "sha256")
create_args.checksum = TILEDB_FILTER_CHECKSUM_SHA256;
else if (s == "md5")
create_args.checksum = TILEDB_FILTER_CHECKSUM_MD5;
else if (s == "none")
create_args.checksum = TILEDB_FILTER_NONE;
}));

RegistrationParams register_args;
Expand Down
60 changes: 44 additions & 16 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ void TileDBVCFDataset::create(const CreationParams& params) {
metadata.extra_attributes = params.extra_attributes;
metadata.free_sample_id = 0;

create_empty_metadata(ctx, params.uri, metadata);
create_empty_data_array(ctx, params.uri, metadata);
create_empty_metadata(ctx, params.uri, metadata, params.checksum);
create_empty_data_array(ctx, params.uri, metadata, params.checksum);
write_metadata(ctx, params.uri, metadata);
}

Expand Down Expand Up @@ -124,13 +124,19 @@ void TileDBVCFDataset::check_attribute_names(
}

void TileDBVCFDataset::create_empty_metadata(
const Context& ctx, const std::string& root_uri, const Metadata& metadata) {
const Context& ctx,
const std::string& root_uri,
const Metadata& metadata,
const tiledb_filter_type_t& checksum) {
create_group(ctx, utils::uri_join(root_uri, "metadata"));
create_sample_header_array(ctx, root_uri);
create_sample_header_array(ctx, root_uri, checksum);
}

void TileDBVCFDataset::create_empty_data_array(
const Context& ctx, const std::string& root_uri, const Metadata& metadata) {
const Context& ctx,
const std::string& root_uri,
const Metadata& metadata,
const tiledb_filter_type_t& checksum) {
ArraySchema schema(ctx, TILEDB_SPARSE);
schema.set_capacity(metadata.tile_capacity);
schema.set_order({{TILEDB_COL_MAJOR, TILEDB_COL_MAJOR}});
Expand All @@ -148,35 +154,46 @@ void TileDBVCFDataset::create_empty_data_array(
domain.add_dimensions(sample, end_pos);
}
schema.set_domain(domain);
schema.set_offsets_filter_list(default_offsets_filter_list(ctx));
auto offsets_filter_list = default_offsets_filter_list(ctx);

// Set coords filters
FilterList coords_filter_list(ctx);
coords_filter_list.add_filter({ctx, TILEDB_FILTER_DOUBLE_DELTA})
.add_filter({ctx, TILEDB_FILTER_ZSTD});
schema.set_coords_filter_list(coords_filter_list);

// Create a byteshuffle -> zstd filter list used by a few attributes
FilterList byteshuffle_zstd_filters(ctx);
byteshuffle_zstd_filters.add_filter({ctx, TILEDB_FILTER_BYTESHUFFLE})
.add_filter({ctx, TILEDB_FILTER_ZSTD});

auto attribute_filter_list = default_attribute_filter_list(ctx);
if (checksum != TILEDB_FILTER_NONE) {
Filter checksum_filter(ctx, checksum);

attribute_filter_list.add_filter(checksum_filter);
byteshuffle_zstd_filters.add_filter(checksum_filter);
coords_filter_list.add_filter(checksum_filter);
offsets_filter_list.add_filter(checksum_filter);
}
schema.set_coords_filter_list(coords_filter_list);
schema.set_offsets_filter_list(offsets_filter_list);

auto pos = Attribute::create<uint32_t>(
ctx, AttrNames::pos, byteshuffle_zstd_filters);
auto real_end = Attribute::create<uint32_t>(
ctx, AttrNames::real_end, byteshuffle_zstd_filters);
auto qual = Attribute::create<float>(
ctx, AttrNames::qual, default_attribute_filter_list(ctx));
auto qual =
Attribute::create<float>(ctx, AttrNames::qual, attribute_filter_list);
auto alleles = Attribute::create<std::vector<char>>(
ctx, AttrNames::alleles, default_attribute_filter_list(ctx));
ctx, AttrNames::alleles, attribute_filter_list);
auto id = Attribute::create<std::vector<char>>(
ctx, AttrNames::id, default_attribute_filter_list(ctx));
ctx, AttrNames::id, attribute_filter_list);
auto filters_ids = Attribute::create<std::vector<int32_t>>(
ctx, AttrNames::filter_ids, byteshuffle_zstd_filters);
auto info = Attribute::create<std::vector<uint8_t>>(
ctx, AttrNames::info, default_attribute_filter_list(ctx));
ctx, AttrNames::info, attribute_filter_list);
auto fmt = Attribute::create<std::vector<uint8_t>>(
ctx, AttrNames::fmt, default_attribute_filter_list(ctx));
ctx, AttrNames::fmt, attribute_filter_list);
schema.add_attributes(
pos, real_end, qual, alleles, id, filters_ids, info, fmt);

Expand All @@ -187,14 +204,16 @@ void TileDBVCFDataset::create_empty_data_array(
continue;
used.insert(attr);
schema.add_attribute(Attribute::create<std::vector<uint8_t>>(
ctx, attr, default_attribute_filter_list(ctx)));
ctx, attr, attribute_filter_list));
}

Array::create(data_array_uri(root_uri), schema);
}

void TileDBVCFDataset::create_sample_header_array(
const Context& ctx, const std::string& root_uri) {
const Context& ctx,
const std::string& root_uri,
const tiledb_filter_type_t& checksum) {
ArraySchema schema(ctx, TILEDB_DENSE);

// Set domain
Expand All @@ -209,10 +228,19 @@ void TileDBVCFDataset::create_sample_header_array(

// Set offsets filters
FilterList offsets_filter_list = default_offsets_filter_list(ctx);
schema.set_offsets_filter_list(offsets_filter_list);

// Add a single 'header' string attribute.
FilterList attribute_filter_list = default_attribute_filter_list(ctx);
if (checksum != TILEDB_FILTER_NONE) {
Filter checksum_filter(ctx, checksum);

attribute_filter_list.add_filter(checksum_filter);
offsets_filter_list.add_filter(checksum_filter);
FilterList coords_filter_list(ctx);
coords_filter_list.add_filter(checksum_filter);
schema.set_coords_filter_list(coords_filter_list);
}
schema.set_offsets_filter_list(offsets_filter_list);
auto attr_header = Attribute::create<std::vector<char>>(
ctx, "header", attribute_filter_list);
schema.add_attributes(attr_header);
Expand Down
14 changes: 11 additions & 3 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct CreationParams {
uint32_t row_tile_extent = 10;
uint32_t anchor_gap = 1000;
std::vector<std::string> tiledb_config;
tiledb_filter_type_t checksum = TILEDB_FILTER_CHECKSUM_SHA256;
};

/** Arguments/params for dataset registration. */
Expand Down Expand Up @@ -241,23 +242,27 @@ class TileDBVCFDataset {
* @param ctx TileDB context
* @param root_uri Root URI of the dataset
* @param metadata General dataset metadata to write
* @param checksum optional checksum filter
*/
static void create_empty_metadata(
const Context& ctx,
const std::string& root_uri,
const Metadata& metadata);
const Metadata& metadata,
const tiledb_filter_type_t& checksum);

/**
* Creates the empty sample data array for a new dataset.
*
* @param ctx TileDB context
* @param root_uri Root URI of the dataset
* @param metadata Dataset metadata containing tile capacity etc. to use
* @param checksum optional checksum filter
*/
static void create_empty_data_array(
const Context& ctx,
const std::string& root_uri,
const Metadata& metadata);
const Metadata& metadata,
const tiledb_filter_type_t& checksum);

/**
* Creates the empty sample header array for a new dataset.
Expand All @@ -268,9 +273,12 @@ class TileDBVCFDataset {
*
* @param ctx TileDB context
* @param root_uri Root URI of the dataset
* @param checksum optional checksum filter
*/
static void create_sample_header_array(
const Context& ctx, const std::string& root_uri);
const Context& ctx,
const std::string& root_uri,
const tiledb_filter_type_t& checksum);

/**
* Write the given Metadata instance into the dataset.
Expand Down
8 changes: 4 additions & 4 deletions libtiledbvcf/src/vcf/vcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -373,12 +373,12 @@ std::string VCF::hdr_to_string(bcf_hdr_t* hdr) {
if (res != 0) {
if (res == -1) {
throw std::invalid_argument(
"Cannot set VCF samples; possibly bad VCF header.");
"Cannot set VCF samples; possibly bad VCF header.");
} else if (res > 0) {
throw std::runtime_error(
std::string("Cannot set VCF samples: list contains samples not present in VCF header, sample #:") +
std::to_string(res)
);
std::string("Cannot set VCF samples: list contains samples not "
"present in VCF header, sample #:") +
std::to_string(res));
}
}
bcf_hdr_format(tmp, 0, &t);
Expand Down
10 changes: 10 additions & 0 deletions libtiledbvcf/src/write/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ void Writer::init(
array_.reset(new Array(*ctx_, dataset.data_uri(), TILEDB_WRITE));
query_.reset(new Query(*ctx_, *array_));
query_->set_layout(TILEDB_GLOBAL_ORDER);

creation_params_.checksum = TILEDB_FILTER_CHECKSUM_SHA256;
}

void Writer::set_all_params(const IngestionParams& params) {
Expand All @@ -79,6 +81,14 @@ void Writer::set_extra_attributes(const std::string& attributes) {
creation_params_.extra_attributes = attrs;
}

/**
 * Integer overload: reinterprets the raw value as a TileDB filter type and
 * delegates to the typed overload. The value is not range-checked here.
 */
void Writer::set_checksum_type(const int& checksum) {
  const auto filter_type = static_cast<tiledb_filter_type_t>(checksum);
  set_checksum_type(filter_type);
}

/**
 * Records the checksum filter type in the creation parameters, which are
 * later passed to dataset creation.
 */
void Writer::set_checksum_type(const tiledb_filter_type_t& checksum) {
  creation_params_.checksum = checksum;
}

void Writer::create_dataset() {
TileDBVCFDataset::create(creation_params_);
}
Expand Down
Loading

0 comments on commit 9c92059

Please sign in to comment.