Skip to content

Commit

Permalink
Fix splitting text by languages for kokoro tts. (#1849)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Feb 13, 2025
1 parent 115e9c2 commit 944400e
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 35 deletions.
3 changes: 2 additions & 1 deletion sherpa-onnx/c-api/cxx-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const {
return OfflineStream{s};
}

// Creates a stream by calling SherpaOnnxCreateOfflineStreamWithHotwords()
// with this recognizer's handle and `hotwords` passed as a C string, then
// wraps the returned handle in an OfflineStream.
OfflineStream OfflineRecognizer::CreateStream(
    const std::string &hotwords) const {
  auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str());
  return OfflineStream{s};
}
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
context-graph-test.cc
packed-sequence-test.cc
pad-sequence-test.cc
regex-lang-test.cc
slice-test.cc
stack-test.cc
text-utils-test.cc
Expand Down
52 changes: 22 additions & 30 deletions sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"

#include <codecvt>
#include <fstream>
#include <locale>
#include <regex> // NOLINT
#include <sstream>
#include <strstream>
Expand All @@ -22,6 +20,8 @@
#include "rawfile/raw_file_manager.h"
#endif

#include <codecvt>

#include "cppjieba/Jieba.hpp"
#include "espeak-ng/speak_lib.h"
#include "phoneme_ids.hpp"
Expand All @@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
piper::eSpeakPhonemeConfig &config, // NOLINT
std::vector<std::vector<piper::Phoneme>> *phonemes);

// Decodes the UTF-8 bytes in `s` into a wide string using the standard
// UTF-8 <-> UTF-16 conversion facet.
//
// Background:
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::wstring ToWideString(const std::string &s) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
  return Utf8Converter().from_bytes(s);
}

// Encodes the wide string `s` as UTF-8 using the standard
// UTF-8 <-> UTF-16 conversion facet.
//
// Background:
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::string ToString(const std::wstring &s) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
  return Utf8Converter().to_bytes(s);
}

class KokoroMultiLangLexicon::Impl {
public:
Impl(const std::string &tokens, const std::string &lexicon,
Expand Down Expand Up @@ -103,37 +89,42 @@ class KokoroMultiLangLexicon::Impl {

// https://en.cppreference.com/w/cpp/regex
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
std::string expr =
"([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
")";
std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";

std::string expr_both = expr_chinese + "|" + expr_not_chinese;

auto ws = ToWideString(text);
std::wstring wexpr = ToWideString(expr);
std::wregex we(wexpr);
std::wstring wexpr_both = ToWideString(expr_both);
std::wregex we_both(wexpr_both);

std::wstring wexpr_zh = ToWideString(expr_chinese);
std::wregex we_zh(wexpr_zh);

auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
auto end = std::wsregex_iterator();

std::vector<TokenIDs> ans;

for (std::wsregex_iterator i = begin; i != end; ++i) {
std::wsmatch match = *i;
std::wstring match_str = match.str();

auto ms = ToString(match_str);
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];

std::vector<std::vector<int32_t>> ids_vec;

if (c < 0x80) {
if (std::regex_match(match_str, we_zh)) {
if (debug_) {
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
}
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
ids_vec = ConvertChineseToTokenIDs(ms);
} else {
if (debug_) {
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
}
ids_vec = ConvertChineseToTokenIDs(ms);

ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
}

for (const auto &ids : ids_vec) {
Expand Down Expand Up @@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl {
this_sentence.push_back(space_id);
} else {
if (debug_) {
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'",
word.c_str());
}

piper::eSpeakPhonemeConfig config;

config.voice = voice;
Expand Down
86 changes: 86 additions & 0 deletions sherpa-onnx/csrc/regex-lang-test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// sherpa-onnx/csrc/regex-lang-test.cc
//
// Copyright (c) 2025 Xiaomi Corporation

#include <regex> // NOLINT

#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/text-utils.cc"

namespace sherpa_onnx {

static void TestLang(const std::string &expr, const std::string &text,
const std::vector<std::string> &expected) {
auto ws = ToWideString(text);
std::wstring wexpr = ToWideString(expr);
std::wregex we(wexpr);

auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
auto end = std::wsregex_iterator();
int32_t k = 0;
for (std::wsregex_iterator i = begin; i != end; ++i) {
std::wsmatch match = *i;
std::wstring match_str = match.str();
auto ms = ToString(match_str);
std::cout << ms << "\n";
EXPECT_EQ(ms, expected[k]);
k++;
}
EXPECT_EQ(k, expected.size());
}

// Runs the German character-class pattern over text that mixes German with
// Chinese and checks which runs are matched.
TEST(German, Case1) {
  std::cout << "----------Test German----------";

  // Character list: https://character-table.netlify.app/german/
  const std::string pattern =
      "([\\u0020-\\u005f\\u0061-\\u007d"
      "\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb"
      "\\u00c4\\u00d6\\u00dc\\u00df\\u00e4\\u00f6\\u00fc"
      "\\u2010-\\u2011\\u2013-\\u2014"
      "\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";

  const std::string input =
      "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€";

  const std::vector<std::string> want = {
      "Übeltäter übergibt Ärzten ",
      "öfters äußerst ätzende Öle",
      "3€",
  };

  TestLang(pattern, input, want);
}

// Runs the French character-class pattern over text that mixes French with
// Chinese and checks which runs are matched.
TEST(French, Case1) {
  const std::string pattern =
      "([\\u0020-\\u005f\\u0061-\\u007a\\u007c"
      "\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-\\u00b3\\u00bb"
      "\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc"
      "\\u00e0\\u00e2\\u00e6-\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc"
      "\\u00ff"
      "\\u0152-\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49"
      "\\u2010-\\u2011\\u2013-\\u2014\\u2019\\u201c-\\u201d"
      "\\u2020-\\u2021\\u2026\\u202f-\\u2030\\u20ac\\u2212]+)";

  const std::string input =
      "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon";

  const std::vector<std::string> want = {
      "L'été, ",
      "avec son ciel bleuâtre, ",
      "est un moment où, ",
      "Noël, maçon",
  };

  TestLang(pattern, input, want);
}

// Runs the English character-class pattern over text that mixes English with
// Chinese and checks which runs are matched.
TEST(English, Case1) {
  // Character list: https://character-table.netlify.app/english/
  const std::string pattern =
      "([\\u0020-\\u005f\\u0061-\\u007a\\u007c"
      "\\u00a0\\u00a7\\u00a9"
      "\\u2010-\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d"
      "\\u2020-\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";

  const std::string input = "一how are you doing? 二Thank you!";

  const std::vector<std::string> want = {
      "how are you doing? ",
      "Thank you!",
  };

  TestLang(pattern, input, want);
}

} // namespace sherpa_onnx
8 changes: 8 additions & 0 deletions sherpa-onnx/csrc/text-utils-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@

namespace sherpa_onnx {

// Checks that ToLowerCase() on a UTF-8 string lowers ASCII letters and the
// German capitals Ä, Ö and Ü, and leaves the remaining characters unchanged.
TEST(ToLowerCase, WideString) {
  std::string text =
      "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€";
  auto t = ToLowerCase(text);
  std::cout << text << "\n";
  std::cout << t << "\n";

  // Printing alone can never fail, so pin the expected result.
  std::string expected =
      "hallo! übeltäter übergibt ärzten öfters äußerst ätzende öle 3€";
  EXPECT_EQ(t, expected);
}

TEST(RemoveInvalidUtf8Sequences, Case1) {
std::vector<uint8_t> v = {
0xe4, 0xbb, 0x8a, //
Expand Down
82 changes: 78 additions & 4 deletions sherpa-onnx/csrc/text-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
#include <algorithm>
#include <cassert>
#include <cctype>
#include <codecvt>
#include <cstdint>
#include <cwctype>
#include <limits>
#include <locale>
#include <sstream>
#include <string>
#include <unordered_map>
Expand Down Expand Up @@ -389,17 +392,74 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
}

// Returns a lower-cased copy of the UTF-8 string `s`.
//
// The text is converted with ToWideString(), lowered by the std::wstring
// overload of ToLowerCase(), and converted back with ToString(). A byte-wise
// std::tolower() would only see the individual bytes of a multi-byte
// character and therefore could not lower letters such as 'Ü'.
std::string ToLowerCase(const std::string &s) {
  return ToString(ToLowerCase(ToWideString(s)));
}

// Lowers the string pointed to by `in_out` in place, one byte at a time,
// using std::tolower().
void ToLowerCase(std::string *in_out) {
  for (char &ch : *in_out) {
    // Go through unsigned char, as std::tolower() requires.
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
}

// Returns a copy of `s` in which every character is converted to lower case.
//
// Accented capitals are mapped explicitly by code point instead of relying
// on std::towlower() for them:
//   - U+00C0..U+00DE except U+00D7 (the multiplication sign): the Latin-1
//     capitals (À Á Â ... Ý Þ); each lowercase form is exactly 0x20 above
//     its capital.
//   - U+0152 (Œ) -> U+0153 (œ) and U+0178 (Ÿ) -> U+00FF (ÿ), used in French.
// Everything else is passed to std::towlower().
//
// Numeric code points are used instead of literals such as L'À' so that the
// mapping does not depend on how the compiler decodes this source file.
std::wstring ToLowerCase(const std::wstring &s) {
  std::wstring ans(s.size(), 0);
  std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t {
    if (c >= 0x00C0 && c <= 0x00DE && c != 0x00D7) {
      return static_cast<wchar_t>(c + 0x20);
    }

    switch (c) {
      case 0x0152:  // Œ
        return static_cast<wchar_t>(0x0153);
      case 0x0178:  // Ÿ
        return static_cast<wchar_t>(0x00FF);
      // TODO(fangjun): Add more

      default:
        return static_cast<wchar_t>(std::towlower(c));
    }
  });
  return ans;
}

// Returns true if `x` lies in the closed interval [low, high].
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
  const bool below = x < low;
  const bool above = high < x;
  return !(below || above);
}
Expand Down Expand Up @@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) {
}
#endif

// Decodes the UTF-8 bytes in `s` into a wide string using the standard
// UTF-8 <-> UTF-16 conversion facet.
//
// Background:
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::wstring ToWideString(const std::string &s) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
  return Utf8Converter().from_bytes(s);
}

// Encodes the wide string `s` as UTF-8 using the standard
// UTF-8 <-> UTF-16 conversion facet.
//
// Background:
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::string ToString(const std::wstring &s) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
  return Utf8Converter().to_bytes(s);
}

} // namespace sherpa_onnx
6 changes: 6 additions & 0 deletions sherpa-onnx/csrc/text-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text);
std::string ToLowerCase(const std::string &s);
void ToLowerCase(std::string *in_out);

std::wstring ToLowerCase(const std::wstring &s);

std::string RemoveInvalidUtf8Sequences(const std::string &text,
bool show_debug_msg = false);

Expand All @@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text);
std::string Gb2312ToUtf8(const std::string &text);
#endif

std::wstring ToWideString(const std::string &s);

std::string ToString(const std::wstring &s);

} // namespace sherpa_onnx

#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_

0 comments on commit 944400e

Please sign in to comment.