Skip to content

Commit

Permalink
move fts out of tree
Browse files Browse the repository at this point in the history
  • Loading branch information
lnkuiper committed Nov 13, 2024
0 parents commit a1639ff
Show file tree
Hide file tree
Showing 19 changed files with 1,286 additions and 0 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/MainDistributionPipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#
# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension
#
name: Main Extension Distribution Pipeline
on:
  push:
  pull_request:
  workflow_dispatch:

# Cancel superseded runs of the same workflow/ref, except on main where the commit SHA keeps each run in its own group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
  cancel-in-progress: true

jobs:
  duckdb-stable-build:
    name: Build extension binaries
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
    with:
      duckdb_version: v1.1.2
      ci_tools_version: main
      # Must match the extension built by this repository's CMakeLists.txt (fts), not httpfs.
      extension_name: fts

  duckdb-stable-deploy:
    name: Deploy extension binaries
    needs: duckdb-stable-build
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main
    secrets: inherit
    with:
      duckdb_version: v1.1.2
      # Must match the extension name used by the build job above.
      extension_name: fts
      # Only tagged releases (v*) and pushes to main are published as "latest".
      deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# DuckDB sources, checked out at ./duckdb.
[submodule "duckdb"]
path = duckdb
url = https://github.com/duckdb/duckdb.git
# DuckDB's shared extension CI tooling, checked out at ./extension-ci-tools.
[submodule "extension-ci-tools"]
path = extension-ci-tools
url = https://github.com/duckdb/extension-ci-tools.git
55 changes: 55 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
cmake_minimum_required(VERSION 2.8.12...3.29)

# Folder (relative to this CMakeLists.txt) that holds the extension's own
# sources and headers.
set(FTS_BASE_FOLDER "extension/fts")

project(FTSExtension)

add_extension_definitions()

# The extension headers live in ${FTS_BASE_FOLDER}/include, not in a top-level
# "include" directory; the snowball stemmer headers come from the DuckDB tree.
include_directories(${FTS_BASE_FOLDER}/include
                    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/libstemmer)
# All translation units compiled into the extension: the two extension sources
# (located under ${FTS_BASE_FOLDER}, relative to this file) followed by the
# snowball stemmer library taken from the DuckDB source tree.
set(FTS_SOURCES
    ${FTS_BASE_FOLDER}/fts_extension.cpp
    ${FTS_BASE_FOLDER}/fts_indexing.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/libstemmer/libstemmer.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/runtime/utilities.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/runtime/api.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_arabic.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_basque.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_catalan.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_danish.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_dutch.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_english.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_finnish.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_french.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_german.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_german2.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_greek.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_hindi.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_hungarian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_indonesian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_irish.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_italian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_kraaij_pohlmann.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_lithuanian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_lovins.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_nepali.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_norwegian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_porter.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_portuguese.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_romanian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_russian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_serbian.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_spanish.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_swedish.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_tamil.cpp
    ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_turkish.cpp)

# Build the "fts" extension twice from the same source list: once through
# build_static_extension and once through build_loadable_extension.
build_static_extension(fts ${FTS_SOURCES})
# NOTE(review): "-warnings" is forwarded as an extra argument to
# build_loadable_extension; its exact meaning is defined by DuckDB's CMake
# helpers, not in this file - confirm before changing.
set(PARAMETERS "-warnings")
build_loadable_extension(fts ${PARAMETERS} ${FTS_SOURCES})

# Install the fts_extension target as part of DuckDB's export set; both shared
# libraries and archives go to INSTALL_LIB_DIR.
install(
TARGETS fts_extension
EXPORT "${DUCKDB_EXPORT_SET}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
55 changes: 55 additions & 0 deletions extension/fts/fts_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os

# Paths below are written with '/' and converted to the host path separator.

# include directories needed to compile the extension
include_directories = [
    path.replace('/', os.path.sep)
    for path in (
        'extension/fts/include',
        'third_party/snowball/libstemmer',
        'third_party/snowball/runtime',
        'third_party/snowball/src_c',
    )
]

# the extension's own source files
source_files = [
    path.replace('/', os.path.sep)
    for path in (
        'extension/fts/fts_extension.cpp',
        'extension/fts/fts_indexing.cpp',
    )
]

# snowball library and runtime
source_files.extend(
    path.replace('/', os.path.sep)
    for path in (
        'third_party/snowball/libstemmer/libstemmer.cpp',
        'third_party/snowball/runtime/utilities.cpp',
        'third_party/snowball/runtime/api.cpp',
    )
)

# snowball stemmers: one stem_UTF_8_<name>.cpp per algorithm, in this order
source_files.extend(
    'third_party/snowball/src_c/stem_UTF_8_{}.cpp'.format(algorithm).replace('/', os.path.sep)
    for algorithm in (
        'arabic',
        'basque',
        'catalan',
        'danish',
        'dutch',
        'english',
        'finnish',
        'french',
        'german',
        'german2',
        'greek',
        'hindi',
        'hungarian',
        'indonesian',
        'irish',
        'italian',
        'kraaij_pohlmann',
        'lithuanian',
        'lovins',
        'nepali',
        'norwegian',
        'porter',
        'portuguese',
        'romanian',
        'russian',
        'serbian',
        'spanish',
        'swedish',
        'tamil',
        'turkish',
    )
)
103 changes: 103 additions & 0 deletions extension/fts/fts_extension.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#define DUCKDB_EXTENSION_MAIN
#include "fts_extension.hpp"

#include "duckdb.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/function/pragma_function.hpp"
#include "duckdb/function/scalar_function.hpp"
#include "duckdb/main/extension_util.hpp"
#include "fts_indexing.hpp"
#include "libstemmer.h"

namespace duckdb {

// Scalar implementation of stem(input VARCHAR, stemmer VARCHAR) -> VARCHAR.
//
// For every row, `input` is stemmed with the snowball algorithm named by
// `stemmer`. The special stemmer name "none" returns the input unchanged.
//
// Throws InvalidInputException when sb_stemmer_new does not recognise the
// stemmer name; the message lists every name reported by sb_stemmer_list().
static void StemFunction(DataChunk &args, ExpressionState &state, Vector &result) {
	auto &input_vector = args.data[0];
	auto &stemmer_vector = args.data[1];

	BinaryExecutor::Execute<string_t, string_t, string_t>(
	    input_vector, stemmer_vector, result, args.size(), [&](string_t input, string_t stemmer) {
		    auto input_data = input.GetData();
		    auto input_size = input.GetSize();

		    // Materialize the stemmer name once instead of once per use.
		    const auto stemmer_name = stemmer.GetString();
		    if (stemmer_name == "none") {
			    auto output = StringVector::AddString(result, input_data, input_size);
			    return output;
		    }

		    struct sb_stemmer *s = sb_stemmer_new(stemmer_name.c_str(), "UTF_8");
		    if (s == nullptr) {
			    // sb_stemmer_list() returns a NULL-terminated array, so count the
			    // entries instead of hard-coding a number that can drift out of
			    // sync with the stemmers that are actually compiled in.
			    const char **stemmers = sb_stemmer_list();
			    size_t n_stemmers = 0;
			    while (stemmers[n_stemmers] != nullptr) {
				    n_stemmers++;
			    }
			    throw InvalidInputException(
			        "Unrecognized stemmer '%s'. Supported stemmers are: ['%s'], or use 'none' for no stemming",
			        stemmer_name,
			        StringUtil::Join(stemmers, n_stemmers, "', '", [](const char *st) { return st; }));
		    }

		    // The stemmed bytes are read from the stemmer `s`, so they are copied
		    // into the result vector before the stemmer is deleted.
		    auto output_data =
		        const_char_ptr_cast(sb_stemmer_stem(s, reinterpret_cast<const sb_symbol *>(input_data), input_size));
		    auto output_size = sb_stemmer_length(s);
		    auto output = StringVector::AddString(result, output_data, output_size);

		    sb_stemmer_delete(s);
		    return output;
	    });
}

// Registers everything the extension provides on the given database:
//   - scalar function  stem(VARCHAR, VARCHAR) -> VARCHAR
//   - pragma           create_fts_index(VARCHAR, VARCHAR, ...) with named options
//   - pragma           drop_fts_index(VARCHAR)
static void LoadInternal(DuckDB &db) {
	// stem(input, stemmer)
	ScalarFunction stem_function("stem", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::VARCHAR,
	                             StemFunction);

	// create_fts_index: two leading VARCHAR arguments, followed by VARCHAR varargs
	auto create_index_pragma =
	    PragmaFunction::PragmaCall("create_fts_index", FTSIndexing::CreateFTSIndexQuery,
	                               {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::VARCHAR);
	auto &create_options = create_index_pragma.named_parameters;
	// text-valued options
	create_options["stemmer"] = LogicalType::VARCHAR;
	create_options["stopwords"] = LogicalType::VARCHAR;
	create_options["ignore"] = LogicalType::VARCHAR;
	// boolean switches
	create_options["strip_accents"] = LogicalType::BOOLEAN;
	create_options["lower"] = LogicalType::BOOLEAN;
	create_options["overwrite"] = LogicalType::BOOLEAN;

	// drop_fts_index: single VARCHAR argument
	auto drop_index_pragma =
	    PragmaFunction::PragmaCall("drop_fts_index", FTSIndexing::DropFTSIndexQuery, {LogicalType::VARCHAR});

	auto &instance = *db.instance;
	ExtensionUtil::RegisterFunction(instance, stem_function);
	ExtensionUtil::RegisterFunction(instance, create_index_pragma);
	ExtensionUtil::RegisterFunction(instance, drop_index_pragma);
}

// Registers the extension's functions on `db` (delegates to LoadInternal).
void FtsExtension::Load(DuckDB &db) {
	LoadInternal(db);
}

// Name of this extension.
std::string FtsExtension::Name() {
	return "fts";
}

// Version string taken from the EXT_VERSION_FTS macro when it is defined at
// compile time; an empty string otherwise.
std::string FtsExtension::Version() const {
#ifndef EXT_VERSION_FTS
	return "";
#else
	return EXT_VERSION_FTS;
#endif
}

} // namespace duckdb

extern "C" {

// C entry point of the extension: wraps the database instance in a DuckDB
// object and registers the extension's functions on it.
DUCKDB_EXTENSION_API void fts_init(duckdb::DatabaseInstance &db) {
	duckdb::DuckDB wrapped_db(db);
	duckdb::LoadInternal(wrapped_db);
}

// Reports the version of the DuckDB library this extension was compiled against.
DUCKDB_EXTENSION_API const char *fts_version() {
	return duckdb::DuckDB::LibraryVersion();
}
}

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
#endif
Loading

0 comments on commit a1639ff

Please sign in to comment.