Skip to content

Commit

Permalink
Add AF filtering for release 0.21 (#473)
Browse files Browse the repository at this point in the history
* Update TileDB to 2.13 and update java/spark for version 0.21.0

* Combined changes for INFO_TDB_IAF filtering

Co-authored-by: George Powley <[email protected]>
  • Loading branch information
awenocur and gspowley authored Dec 12, 2022
1 parent 9caa7fa commit 05bde31
Show file tree
Hide file tree
Showing 49 changed files with 1,997 additions and 56 deletions.
2 changes: 1 addition & 1 deletion apis/java/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ plugins {
}

group 'io.tiledb'
version '0.20.2'
version '0.21.0'

sourceCompatibility = 1.8
targetCompatibility = 1.8
Expand Down
1 change: 1 addition & 0 deletions apis/python/src/tiledbvcf/binding/libtiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ PYBIND11_MODULE(libtiledbvcf, m) {
.def("set_output_format", &Reader::set_output_format)
.def("set_output_path", &Reader::set_output_path)
.def("set_output_dir", &Reader::set_output_dir)
.def("set_af_filter", &Reader::set_af_filter)
.def("read", &Reader::read, py::arg("release_buffs") = true)
.def("get_results_arrow", &Reader::get_results_arrow)
.def("completed", &Reader::completed)
Expand Down
25 changes: 25 additions & 0 deletions apis/python/src/tiledbvcf/binding/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,33 @@ void Reader::set_output_dir(const std::string& output_dir) {
reader, tiledb_vcf_reader_set_output_dir(reader, output_dir.c_str()));
}

void Reader::set_af_filter(const std::string& af_filter) {
auto reader = ptr.get();
check_error(
reader, tiledb_vcf_reader_set_af_filter(reader, af_filter.c_str()));
}

void Reader::read(const bool release_buffs) {
auto reader = ptr.get();
bool af_filter_enabled = false;
if (tiledb_vcf_reader_get_af_filter_exists(reader, &af_filter_enabled) ==
TILEDB_VCF_ERR)
throw std::runtime_error("TileDB-VCF-Py: Error finding AF filter.");
if (af_filter_enabled) {
// add alleles buffer only if not already present
bool add_alleles = true;
bool add_GT = true;
for (std::string attribute : attributes_) {
add_alleles = add_alleles && attribute != "alleles";
add_GT = add_GT && attribute != "fmt_GT";
}
if (add_alleles) {
attributes_.emplace_back("alleles");
}
if (add_GT) {
attributes_.emplace_back("fmt_GT");
}
}
alloc_buffers(release_buffs);
set_buffers();

Expand Down
3 changes: 3 additions & 0 deletions apis/python/src/tiledbvcf/binding/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ class Reader {
/** Set export output directory */
void set_output_dir(const std::string& output_dir);

/** Set internal allele frequency filtering expression */
void set_af_filter(const std::string& af_filter);

/** Set the TileDB query buffer memory percentage */
void set_buffer_percentage(float buffer_percentage);

Expand Down
4 changes: 4 additions & 0 deletions apis/python/src/tiledbvcf/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def read_arrow(
samples_file=None,
bed_file=None,
skip_check_samples=False,
set_af_filter="",
enable_progress_estimation=False,
):
"""Reads data from a TileDB-VCF dataset into Arrow table.
Expand Down Expand Up @@ -186,6 +187,7 @@ def read_arrow(
self.reader.set_regions(",".join(regions))
self.reader.set_attributes(attrs)
self.reader.set_check_samples_exist(not skip_check_samples)
self.reader.set_af_filter(set_af_filter)
self.reader.set_enable_progress_estimation(enable_progress_estimation)

if bed_file is not None:
Expand All @@ -201,6 +203,7 @@ def read(
samples_file=None,
bed_file=None,
skip_check_samples=False,
set_af_filter="",
enable_progress_estimation=False,
):
"""Reads data from a TileDB-VCF dataset into Pandas dataframe.
Expand Down Expand Up @@ -233,6 +236,7 @@ def read(
self.reader.set_regions(",".join(regions))
self.reader.set_attributes(attrs)
self.reader.set_check_samples_exist(not skip_check_samples)
self.reader.set_af_filter(set_af_filter)
self.reader.set_enable_progress_estimation(enable_progress_estimation)

if bed_file is not None:
Expand Down
58 changes: 53 additions & 5 deletions apis/python/tests/test_tiledbvcf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import numpy as np
import subprocess
import os
import pandas as pd
import re
import glob
import shutil
import platform
import pytest
import tiledbvcf
Expand Down Expand Up @@ -401,7 +405,7 @@ def test_read_filters(test_ds):
),
"filters": pd.Series(
map(
lambda lst: np.array(lst, dtype=np.object),
lambda lst: np.array(lst, dtype=object),
[None, None, ["LowQual"], None, None, None],
)
),
Expand Down Expand Up @@ -442,7 +446,7 @@ def test_read_var_length_filters(tmp_path):
),
"filters": pd.Series(
map(
lambda lst: np.array(lst, dtype=np.object),
lambda lst: np.array(lst, dtype=object),
[
["PASS"],
["PASS"],
Expand Down Expand Up @@ -495,7 +499,7 @@ def test_read_alleles(test_ds):
),
"alleles": pd.Series(
map(
lambda lst: np.array(lst, dtype=np.object),
lambda lst: np.array(lst, dtype=object),
[
["C", "<NON_REF>"],
["C", "<NON_REF>"],
Expand Down Expand Up @@ -535,14 +539,14 @@ def test_read_multiple_alleles(tmp_path):
"pos_start": pd.Series([866511, 1289367], dtype=np.int32),
"alleles": pd.Series(
map(
lambda lst: np.array(lst, dtype=np.object),
lambda lst: np.array(lst, dtype=object),
[["T", "CCCCTCCCT", "C", "CCCCTCCCTCCCT", "CCCCT"], ["CTG", "C"]],
)
),
"id": pd.Series([".", "rs1497816"]),
"filters": pd.Series(
map(
lambda lst: np.array(lst, dtype=np.object),
lambda lst: np.array(lst, dtype=object),
[["LowQual"], ["LowQual"]],
)
),
Expand Down Expand Up @@ -1102,6 +1106,50 @@ def test_ingest_mode_merged(tmp_path):
assert ds.count(regions=["chrX:9032893-9032893"]) == 0


def test_ingest_with_stats(tmp_path):
tmp_path_contents = os.listdir(tmp_path)
if "stats" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "stats"))
shutil.copytree(
os.path.join(TESTS_INPUT_DIR, "stats"), os.path.join(tmp_path, "stats")
)
raw_inputs = glob.glob(os.path.join(tmp_path, "stats", "*.vcf"))
print(f"raw inputs: {raw_inputs}")
for vcf_file in raw_inputs:
subprocess.run(
"bcftools view --no-version -Oz -o " + vcf_file + ".gz " + vcf_file,
shell=True,
check=True,
)
bgzipped_inputs = glob.glob(os.path.join(tmp_path, "stats", "*.gz"))
print(f"bgzipped inputs: {bgzipped_inputs}")
for vcf_file in bgzipped_inputs:
assert subprocess.run("bcftools index " + vcf_file, shell=True).returncode == 0
if "outputs" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "outputs"))
if "stats_test" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "outputs"))
tiledbvcf.config_logging("trace")
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="w")
ds.create_dataset(enable_variant_stats=True)
ds.ingest_samples(bgzipped_inputs)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
sample_names = [os.path.basename(file).split(".")[0] for file in bgzipped_inputs]
data_frame = ds.read(
samples=sample_names,
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
set_af_filter="<0.2",
)
assert data_frame.shape == (1, 8)
assert (
data_frame[data_frame["sample_name"] == "second"]["qual"] == 343.730011
).bool()
assert (
data_frame[data_frame["sample_name"] == "second"]["info_TILEDB_IAF"].iloc[0][0]
== 0.0625
)


def test_ingest_mode_separate(tmp_path):
# tiledbvcf.config_logging("debug")
# Create the dataset
Expand Down
2 changes: 1 addition & 1 deletion apis/spark/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ plugins {
}

group 'io.tiledb'
version '0.20.2'
version '0.21.0'

sourceCompatibility = 1.8
targetCompatibility = 1.8
Expand Down
2 changes: 1 addition & 1 deletion apis/spark3/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ plugins {
}

group 'io.tiledb'
version '0.20.2'
version '0.21.0'

sourceCompatibility = 1.8
targetCompatibility = 1.8
Expand Down
2 changes: 1 addition & 1 deletion ci/native_libs-linux_osx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ steps:
- bash: |
set -e pipefail
# Install bcftools (only required for running the CLI tests)
version=1.10.2
version=1.16
if [[ "$AGENT_OS" == "Linux" ]]; then
pushd /tmp
wget https://github.com/samtools/bcftools/releases/download/${version}/bcftools-${version}.tar.bz2
Expand Down
5 changes: 4 additions & 1 deletion libtiledbvcf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
############################################################

cmake_minimum_required(VERSION 3.3)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")

# Superbuild option must be on by default.
Expand Down Expand Up @@ -66,7 +69,7 @@ if (FORCE_EXTERNAL_HTSLIB)
message(STATUS "Skipping search for htslib, building it as an external project. To use system htslib, set FORCE_EXTERNAL_HTSLIB=OFF")
endif()

if (FORCE_EXTERNAL_HTSLIB)
if (FORCE_EXTERNAL_TILEDB)
message(STATUS "Skipping search for TileDB, building it as an external project. To use system TileDB, set FORCE_EXTERNAL_TILEDB=OFF")
endif()

Expand Down
20 changes: 10 additions & 10 deletions libtiledbvcf/cmake/Modules/FindTileDB_EP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,20 +48,20 @@ else()
# Try to download prebuilt artifacts unless the user specifies to build from source
if(DOWNLOAD_TILEDB_PREBUILT)
if (WIN32) # Windows
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.12.2/tiledb-windows-x86_64-2.12.2-a9d60c8.zip")
SET(DOWNLOAD_SHA1 "30fd2a26e744a2f496b040733187d33869e37458")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.13.0/tiledb-windows-x86_64-2.13.0-db00e70.zip")
SET(DOWNLOAD_SHA1 "e38b77c672ff885c47d50e89fba8b344cd4ebe38")
elseif(APPLE) # OSX

if (CMAKE_OSX_ARCHITECTURES STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.12.2/tiledb-macos-x86_64-2.12.2-a9d60c8.tar.gz")
SET(DOWNLOAD_SHA1 "505abc35181c0b0dba55928c3ac4bea57806d7a2")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.13.0/tiledb-macos-x86_64-2.13.0-db00e70.tar.gz")
SET(DOWNLOAD_SHA1 "ac9f7a735568e2461a4b72706d574b799e84a7c6")
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL arm64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.12.2/tiledb-macos-arm64-2.12.2-a9d60c8.tar.gz")
SET(DOWNLOAD_SHA1 "bd7f220354baaaeebd860ba5bb82545329e549c1")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.13.0/tiledb-macos-arm64-2.13.0-db00e70.tar.gz")
SET(DOWNLOAD_SHA1 "77f4223a8dfef1a4c66ce82e105717be8b0e4038")
endif()
else() # Linux
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.12.2/tiledb-linux-x86_64-2.12.2-a9d60c8.tar.gz")
SET(DOWNLOAD_SHA1 "47f1a64db038f31c40d38fcac10a3316f7295881")
SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.13.0/tiledb-linux-x86_64-2.13.0-db00e70.tar.gz")
SET(DOWNLOAD_SHA1 "5ea6c6008c2c1bab80fee5093eff50179a02db23")
endif()

ExternalProject_Add(ep_tiledb
Expand All @@ -83,8 +83,8 @@ else()
else() # Build from source
ExternalProject_Add(ep_tiledb
PREFIX "externals"
URL "https://github.com/TileDB-Inc/TileDB/archive/2.12.2.zip"
URL_HASH SHA1=e5dbe0a308da3556b46fa010f98365cfd0ef56a6
URL "https://github.com/TileDB-Inc/TileDB/archive/2.13.0.zip"
URL_HASH SHA1=2bb9f4f20702bdc0471df4ece58d5d06e89dc6d8
DOWNLOAD_NAME "tiledb.zip"
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${EP_INSTALL_PREFIX}
Expand Down
12 changes: 8 additions & 4 deletions libtiledbvcf/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,19 @@ set(TILEDB_VCF_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/c_api/tiledbvcf.cc
${CMAKE_CURRENT_SOURCE_DIR}/dataset/attribute_buffer_set.cc
${CMAKE_CURRENT_SOURCE_DIR}/dataset/tiledbvcfdataset.cc
${CMAKE_CURRENT_SOURCE_DIR}/dataset/allele_count.cc
${CMAKE_CURRENT_SOURCE_DIR}/dataset/variant_stats.cc
${CMAKE_CURRENT_SOURCE_DIR}/htslib_plugin/hfile_tiledb_vfs.c
${CMAKE_CURRENT_SOURCE_DIR}/read/bcf_exporter.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/pvcf_exporter.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/exporter.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/in_memory_exporter.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/pvcf_exporter.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/read_query_results.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/reader.cc
${CMAKE_CURRENT_SOURCE_DIR}/read/tsv_exporter.cc
${CMAKE_CURRENT_SOURCE_DIR}/stats/allele_count.cc
${CMAKE_CURRENT_SOURCE_DIR}/stats/column_buffer.cc
${CMAKE_CURRENT_SOURCE_DIR}/stats/managed_query.cc
${CMAKE_CURRENT_SOURCE_DIR}/stats/variant_stats.cc
${CMAKE_CURRENT_SOURCE_DIR}/stats/variant_stats_reader.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/bitmap.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/buffer.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/logger.cc
Expand All @@ -73,10 +76,10 @@ set(TILEDB_VCF_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/write/record_heap_v2.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/record_heap_v3.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/record_heap_v4.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/writer.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/writer_worker_v2.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/writer_worker_v3.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/writer_worker_v4.cc
${CMAKE_CURRENT_SOURCE_DIR}/write/writer.cc
)

set(TILEDB_VCF_EXTERNAL_SOURCES
Expand Down Expand Up @@ -193,6 +196,7 @@ endif()
target_include_directories(tiledbvcf-bin
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/../external
)

############################################################
Expand Down
21 changes: 21 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,16 @@ int32_t tiledb_vcf_reader_get_tiledb_stats(
return TILEDB_VCF_OK;
}

TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_get_af_filter_exists(
tiledb_vcf_reader_t* reader, bool* present) {
if (sanity_check(reader) == TILEDB_VCF_ERR || present == nullptr)
return TILEDB_VCF_ERR;

*present = reader->reader_->af_filter_enabled();

return TILEDB_VCF_OK;
}

int32_t tiledb_vcf_reader_read(tiledb_vcf_reader_t* reader) {
if (sanity_check(reader) == TILEDB_VCF_ERR)
return TILEDB_VCF_ERR;
Expand Down Expand Up @@ -921,6 +931,17 @@ int32_t tiledb_vcf_reader_set_output_dir(
return TILEDB_VCF_OK;
}

int32_t tiledb_vcf_reader_set_af_filter(
tiledb_vcf_reader_t* reader, const char* af_filter) {
if (sanity_check(reader) == TILEDB_VCF_ERR)
return TILEDB_VCF_ERR;

if (SAVE_ERROR_CATCH(reader, reader->reader_->set_af_filter(af_filter)))
return TILEDB_VCF_ERR;

return TILEDB_VCF_OK;
}

/* ********************************* */
/* BED FILE */
/* ********************************* */
Expand Down
18 changes: 18 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,16 @@ tiledb_vcf_reader_get_tiledb_stats_enabled_vcf_header_array(
TILEDBVCF_EXPORT int32_t
tiledb_vcf_reader_get_tiledb_stats(tiledb_vcf_reader_t* reader, char** stats);

/**
* Gets whether a VCF reader has an AF filter set.
*
* @param writer VCF writer object
* @param present a char** were the stats will be returned
* @return `TILEDB_VCF_OK` for success or `TILEDB_VCF_ERR` for error.
*/
TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_get_af_filter_exists(
tiledb_vcf_reader_t* reader, bool* present);

TILEDBVCF_EXPORT int32_t
tiledb_vcf_reader_get_samples(tiledb_vcf_reader_t* reader, const char* samples);

Expand Down Expand Up @@ -900,6 +910,14 @@ TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_output_format(
TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_output_path(
tiledb_vcf_reader_t* reader, const char* output_path);

/**
* Sets AF filter
* @param reader VCF reader object
* @param af_filter setting
*/
TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_af_filter(
tiledb_vcf_reader_t* reader, const char* af_filter);

/**
* Sets export output directory
* @param reader VCF reader object
Expand Down
Loading

0 comments on commit 05bde31

Please sign in to comment.