Skip to content

Commit 68cddda

Browse files
committed
Update: Add the language identification pattern to config
1 parent d4de56c commit 68cddda

File tree

2 files changed

+6
-13
lines changed

2 files changed

+6
-13
lines changed

config.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,6 @@
4242
it's recommended to specify model paths in config.yaml.
4343
"""
4444

45-
model_list = [
46-
# {"model_path": "model_name/G_9000.pth", "config_path": "model_name/config.json"},
47-
]
48-
4945

5046
@dataclass
5147
class AsDictMixin:
@@ -391,6 +387,9 @@ class LanguageIdentification(AsDictMixin):
391387
espeak_library: str = r"C:/Program Files/eSpeak NG/libespeak-ng.dll" if "win" in sys.platform else ""
392388
# Languages to detect automatically, e.g. zh ja ko en. If empty, they are determined from the text_cleaners specified in config.json.
393389
language_automatic_detect: list = field(default_factory=list)
390+
split_pattern: str = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
391+
r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
392+
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
394393

395394

396395
@dataclass
@@ -476,9 +475,6 @@ def load_config():
476475
else:
477476
logging.info("config.yaml is empty, initializing config.yaml...")
478477

479-
# Load default models from config.py.
480-
# config.update_config(model_list)
481-
482478
# If parameters are incomplete, they will be automatically filled in upon saving.
483479
Config.save_config(config)
484480

utils/sentence.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import regex as re
44

5+
from contants import config
56
from utils.data_utils import check_is_none
67
from utils.classify_language import classify_language, split_alpha_nonalpha
78

@@ -18,9 +19,7 @@ def _expand_hyphens(text):
1819

1920

2021
def markup_language(text: str, target_languages: list = None) -> str:
21-
pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
22-
r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
23-
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
22+
pattern = config.LanguageIdentification.split_pattern
2423
sentences = re.split(pattern, text)
2524

2625
pre_lang = ""
@@ -51,9 +50,7 @@ def markup_language(text: str, target_languages: list = None) -> str:
5150

5251
def split_languages(text: str, target_languages: list = None, segment_size: int = 50,
5352
expand_abbreviations: bool = False, expand_hyphens: bool = False) -> list:
54-
pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
55-
r'\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
56-
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
53+
pattern = config.language_identification.split_pattern
5754
sentences = re.split(pattern, text)
5855

5956
pre_lang = ""

0 commit comments

Comments
 (0)