Add HF tokenizer class definition header #57

Open · wants to merge 3 commits into base: main
61 changes: 60 additions & 1 deletion include/tokenizers_cpp.h
@@ -9,7 +9,7 @@
#include <memory>
#include <string>
#include <vector>
#include <tokenizers_c.h>

namespace tokenizers {

/*!
@@ -106,5 +106,64 @@ class Tokenizer {
static std::unique_ptr<Tokenizer> FromBlobRWKVWorld(const std::string& model_blob);
};

class HFTokenizer : public Tokenizer {
 public:
  explicit HFTokenizer(TokenizerHandle handle);

  HFTokenizer(const HFTokenizer&) = delete;
  HFTokenizer(HFTokenizer&& other);

  ~HFTokenizer();

  // use i32 to be consistent with sentencepiece
  std::vector<int32_t> Encode(const std::string& text, bool add_special_tokens);

  // use i32 to be consistent with sentencepiece
  std::vector<int32_t> Encode(const std::string& text) final;

  // overload specific to HFTokenizer, which adds the add_special_tokens flag
  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts,
                                                bool add_special_tokens);

  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts) final;

  // use i32 to be consistent with sentencepiece
  std::string Decode(const std::vector<int32_t>& ids, bool skip_special_tokens);

  std::string Decode(const std::vector<int32_t>& ids) final;

  size_t GetVocabSize() final;

  std::string IdToToken(int32_t id) final;

  int32_t TokenToId(const std::string& token) final;

  /*!
   * \brief Create HF tokenizer from a single in-memory json blob.
   *
   * \param json_blob The json blob.
   * \return The created tokenizer.
   */
  static std::unique_ptr<HFTokenizer> FromBlobJSON(const std::string& json_blob);

  /*!
   * \brief Create a byte-level BPE tokenizer.
   *
   * \param vocab_blob The blob that contains the vocabulary.
   * \param merges_blob The blob that contains the merges.
   * \param added_tokens The added tokens.
   * \return The created tokenizer.
   */
  static std::unique_ptr<HFTokenizer> FromBlobByteLevelBPE(const std::string& vocab_blob,
                                                           const std::string& merges_blob,
                                                           const std::string& added_tokens = "");

 private:
  // internal handle
  TokenizerHandle handle_{nullptr};
};

} // namespace tokenizers
#endif // TOKENIZERS_CPP_H_
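
The newly public class lets callers pass the special-token flags without going through the generic Tokenizer interface. A minimal usage sketch, not part of this diff: DemoRoundTrip and json_blob are illustrative names, and json_blob is assumed to already hold the contents of a tokenizer.json file.

#include <cstdint>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "tokenizers_cpp.h"

// Hypothetical example, not part of the PR.
void DemoRoundTrip(const std::string& json_blob) {
  // FromBlobJSON now returns the derived type, so the flag-taking overloads
  // are directly accessible without a cast.
  std::unique_ptr<tokenizers::HFTokenizer> tok =
      tokenizers::HFTokenizer::FromBlobJSON(json_blob);
  std::vector<int32_t> ids = tok->Encode("Hello world", /*add_special_tokens=*/true);
  std::string text = tok->Decode(ids, /*skip_special_tokens=*/true);
  std::cout << text << " (" << ids.size() << " tokens)" << std::endl;
}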
178 changes: 95 additions & 83 deletions src/huggingface_tokenizer.cc
@@ -13,110 +13,122 @@ namespace tokenizers {
/*!
 * \brief A simple C++ wrapper of the tokenizer via the C API.
 *
 * The methods of the HFTokenizer class are defined below.
 */

HFTokenizer::HFTokenizer(TokenizerHandle handle) : handle_(handle) {
#ifdef COMPILE_WASM_RUNTIME
  setenv("TOKENIZERS_PARALLELISM", "false", true);
#endif
}

// The copy constructor is declared deleted in the header.
HFTokenizer::HFTokenizer(HFTokenizer&& other) { std::swap(other.handle_, handle_); }

HFTokenizer::~HFTokenizer() {
  if (handle_ != nullptr) {
    tokenizers_free(handle_);
  }
}

// use i32 to be consistent with sentencepiece
std::vector<int32_t> HFTokenizer::Encode(const std::string& text, bool add_special_tokens) {
  TokenizerEncodeResult result;
  tokenizers_encode(handle_, text.data(), text.length(), static_cast<int>(add_special_tokens),
                    &result);
  std::vector<int32_t> ret(result.token_ids, result.token_ids + result.len);
  tokenizers_free_encode_results(&result, 1);
  return ret;
}

// use i32 to be consistent with sentencepiece
std::vector<int32_t> HFTokenizer::Encode(const std::string& text) { return Encode(text, false); }

std::vector<std::vector<int32_t>> HFTokenizer::EncodeBatch(const std::vector<std::string>& texts,
                                                           bool add_special_tokens) {
  std::vector<const char*> texts_raw;
  std::vector<size_t> seq_lens;
  size_t num_seqs = texts.size();
  texts_raw.reserve(num_seqs);
  seq_lens.reserve(num_seqs);
  for (const auto& text : texts) {
    texts_raw.push_back(text.data());
    seq_lens.push_back(text.length());
  }
  std::vector<TokenizerEncodeResult> results(num_seqs);
  tokenizers_encode_batch(handle_, texts_raw.data(), seq_lens.data(), texts.size(),
                          static_cast<int>(add_special_tokens), results.data());
  std::vector<std::vector<int32_t>> ret;
  ret.reserve(texts.size());
  for (size_t i = 0; i < texts.size(); ++i) {
    ret.push_back(
        std::vector<int32_t>(results[i].token_ids, results[i].token_ids + results[i].len));
  }
  tokenizers_free_encode_results(results.data(), texts.size());
  return ret;
}

std::vector<std::vector<int32_t>> HFTokenizer::EncodeBatch(const std::vector<std::string>& texts) {
  return EncodeBatch(texts, false);
}

// use i32 to be consistent with sentencepiece
std::string HFTokenizer::Decode(const std::vector<int32_t>& ids, bool skip_special_tokens) {
  tokenizers_decode(handle_, reinterpret_cast<const uint32_t*>(ids.data()), ids.size(),
                    static_cast<int>(skip_special_tokens));
  const char* data;
  size_t len;
  tokenizers_get_decode_str(handle_, &data, &len);
  return std::string(data, len);
}

std::string HFTokenizer::Decode(const std::vector<int32_t>& ids) { return Decode(ids, false); }

size_t HFTokenizer::GetVocabSize() {
  size_t size;
  tokenizers_get_vocab_size(handle_, &size);
  assert(size > 0);
  return size;
}

std::string HFTokenizer::IdToToken(int32_t id) {
  const char* data;
  size_t len;
  tokenizers_id_to_token(handle_, static_cast<uint32_t>(id), &data, &len);
  return std::string(data, len);
}

int32_t HFTokenizer::TokenToId(const std::string& token) {
  int32_t id;
  tokenizers_token_to_id(handle_, token.data(), token.length(), &id);
  return id;
}

// Factory methods. The base-class Tokenizer overloads delegate to the
// HFTokenizer-specific versions.

std::unique_ptr<HFTokenizer> HFTokenizer::FromBlobJSON(const std::string& json) {
  return std::make_unique<HFTokenizer>(tokenizers_new_from_str(json.data(), json.length()));
}

std::unique_ptr<Tokenizer> Tokenizer::FromBlobJSON(const std::string& json) {
  return HFTokenizer::FromBlobJSON(json);
}

std::unique_ptr<HFTokenizer> HFTokenizer::FromBlobByteLevelBPE(const std::string& vocab,
                                                               const std::string& merges,
                                                               const std::string& added_tokens) {
  return std::make_unique<HFTokenizer>(byte_level_bpe_tokenizers_new_from_str(
      vocab.data(), vocab.length(), merges.data(), merges.length(), added_tokens.data(),
      added_tokens.length()));
}

std::unique_ptr<Tokenizer> Tokenizer::FromBlobByteLevelBPE(const std::string& vocab,
                                                           const std::string& merges,
                                                           const std::string& added_tokens) {
  return HFTokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens);
}

}  // namespace tokenizers
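
Because the base-class factories delegate to the HFTokenizer versions, existing call sites keep compiling unchanged while new code can opt into the derived type. A hedged sketch of both paths (CompareInterfaces and json_blob are illustrative names, not part of the PR):

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "tokenizers_cpp.h"

// Hypothetical example, not part of the PR.
void CompareInterfaces(const std::string& json_blob) {
  // Existing call sites: the generic interface and its defaults still work.
  std::unique_ptr<tokenizers::Tokenizer> generic =
      tokenizers::Tokenizer::FromBlobJSON(json_blob);
  std::vector<int32_t> ids = generic->Encode("some text");

  // New call sites hold the derived type and pass the flags explicitly.
  std::unique_ptr<tokenizers::HFTokenizer> hf =
      tokenizers::HFTokenizer::FromBlobJSON(json_blob);
  std::vector<std::vector<int32_t>> batch =
      hf->EncodeBatch({"first text", "second text"}, /*add_special_tokens=*/false);
}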