mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 10:16:14 +00:00
updated scripts for text
This commit is contained in:
parent
d45e4c61e1
commit
d887bf8c63
@ -30,6 +30,7 @@ from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pycantonese
|
||||
from preprocess_commonvoice import normalize_text
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from icefall.utils import is_cjk
|
||||
@ -54,6 +55,13 @@ def get_parser():
|
||||
type=str,
|
||||
help="The output directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lang",
|
||||
"-l",
|
||||
default="yue",
|
||||
type=str,
|
||||
help="The language",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
@ -102,13 +110,15 @@ if __name__ == "__main__":
|
||||
|
||||
input_file = Path(args.input_file)
|
||||
output_dir = Path(args.output_dir)
|
||||
lang = Path(args.lang)
|
||||
|
||||
assert input_file.is_file(), f"{input_file} does not exist"
|
||||
assert output_dir.is_dir(), f"{output_dir} does not exist"
|
||||
|
||||
lines = input_file.read_text(encoding="utf-8").strip().split("\n")
|
||||
norm_lines = [normalize_text(line, lang) for line in lines]
|
||||
|
||||
text_words_segments = get_word_segments(lines)
|
||||
text_words_segments = get_word_segments(norm_lines)
|
||||
with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
|
||||
f.writelines(text_words_segments)
|
||||
|
||||
|
@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
||||
# Get words.txt and words_no_ids.txt
|
||||
./local/word_segment_yue.py \
|
||||
--input-file $lang_dir/text \
|
||||
--output-dir $lang_dir
|
||||
--output-dir $lang_dir \
|
||||
--lang $lang
|
||||
|
||||
mv $lang_dir/text $lang_dir/_text
|
||||
cp $lang_dir/transcript_words.txt $lang_dir/text
|
||||
|
Loading…
x
Reference in New Issue
Block a user