mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
updated scripts for text
This commit is contained in:
parent
d45e4c61e1
commit
d887bf8c63
@ -30,6 +30,7 @@ from pathlib import Path
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import pycantonese
|
import pycantonese
|
||||||
|
from preprocess_commonvoice import normalize_text
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
from icefall.utils import is_cjk
|
from icefall.utils import is_cjk
|
||||||
@ -54,6 +55,13 @@ def get_parser():
|
|||||||
type=str,
|
type=str,
|
||||||
help="The output directory",
|
help="The output directory",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--lang",
|
||||||
|
"-l",
|
||||||
|
default="yue",
|
||||||
|
type=str,
|
||||||
|
help="The language",
|
||||||
|
)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@ -102,13 +110,15 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
input_file = Path(args.input_file)
|
input_file = Path(args.input_file)
|
||||||
output_dir = Path(args.output_dir)
|
output_dir = Path(args.output_dir)
|
||||||
|
lang = Path(args.lang)
|
||||||
|
|
||||||
assert input_file.is_file(), f"{input_file} does not exist"
|
assert input_file.is_file(), f"{input_file} does not exist"
|
||||||
assert output_dir.is_dir(), f"{output_dir} does not exist"
|
assert output_dir.is_dir(), f"{output_dir} does not exist"
|
||||||
|
|
||||||
lines = input_file.read_text(encoding="utf-8").strip().split("\n")
|
lines = input_file.read_text(encoding="utf-8").strip().split("\n")
|
||||||
|
norm_lines = [normalize_text(line, lang) for line in lines]
|
||||||
|
|
||||||
text_words_segments = get_word_segments(lines)
|
text_words_segments = get_word_segments(norm_lines)
|
||||||
with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
|
with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
|
||||||
f.writelines(text_words_segments)
|
f.writelines(text_words_segments)
|
||||||
|
|
||||||
|
@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
|
|||||||
# Get words.txt and words_no_ids.txt
|
# Get words.txt and words_no_ids.txt
|
||||||
./local/word_segment_yue.py \
|
./local/word_segment_yue.py \
|
||||||
--input-file $lang_dir/text \
|
--input-file $lang_dir/text \
|
||||||
--output-dir $lang_dir
|
--output-dir $lang_dir \
|
||||||
|
--lang $lang
|
||||||
|
|
||||||
mv $lang_dir/text $lang_dir/_text
|
mv $lang_dir/text $lang_dir/_text
|
||||||
cp $lang_dir/transcript_words.txt $lang_dir/text
|
cp $lang_dir/transcript_words.txt $lang_dir/text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user