updated scripts for text

This commit is contained in:
jinzr 2024-03-12 12:40:44 +08:00
parent d45e4c61e1
commit d887bf8c63
2 changed files with 13 additions and 2 deletions

View File

@@ -30,6 +30,7 @@ from pathlib import Path
from typing import List from typing import List
import pycantonese import pycantonese
from preprocess_commonvoice import normalize_text
from tqdm.auto import tqdm from tqdm.auto import tqdm
from icefall.utils import is_cjk from icefall.utils import is_cjk
@@ -54,6 +55,13 @@ def get_parser():
type=str, type=str,
help="The output directory", help="The output directory",
) )
parser.add_argument(
"--lang",
"-l",
default="yue",
type=str,
help="The language",
)
return parser return parser
@@ -102,13 +110,15 @@ if __name__ == "__main__":
input_file = Path(args.input_file) input_file = Path(args.input_file)
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
lang = Path(args.lang)
assert input_file.is_file(), f"{input_file} does not exist" assert input_file.is_file(), f"{input_file} does not exist"
assert output_dir.is_dir(), f"{output_dir} does not exist" assert output_dir.is_dir(), f"{output_dir} does not exist"
lines = input_file.read_text(encoding="utf-8").strip().split("\n") lines = input_file.read_text(encoding="utf-8").strip().split("\n")
norm_lines = [normalize_text(line, lang) for line in lines]
text_words_segments = get_word_segments(lines) text_words_segments = get_word_segments(norm_lines)
with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f: with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
f.writelines(text_words_segments) f.writelines(text_words_segments)

View File

@@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
# Get words.txt and words_no_ids.txt # Get words.txt and words_no_ids.txt
./local/word_segment_yue.py \ ./local/word_segment_yue.py \
--input-file $lang_dir/text \ --input-file $lang_dir/text \
--output-dir $lang_dir --output-dir $lang_dir \
--lang $lang
mv $lang_dir/text $lang_dir/_text mv $lang_dir/text $lang_dir/_text
cp $lang_dir/transcript_words.txt $lang_dir/text cp $lang_dir/transcript_words.txt $lang_dir/text