diff --git a/egs/wenetspeech/ASR/local/prepare_words.py b/egs/wenetspeech/ASR/local/prepare_words.py new file mode 100644 index 000000000..65aca2983 --- /dev/null +++ b/egs/wenetspeech/ASR/local/prepare_words.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script takes as input words.txt without ids: + - words_no_ids.txt +and generates the new words.txt with related ids. + - words.txt +""" + + +import argparse +import logging + +from tqdm import tqdm + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Prepare words.txt", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input-file", + default="data/lang_char/words_no_ids.txt", + type=str, + help="the words file without ids for WenetSpeech", + ) + parser.add_argument( + "--output-file", + default="data/lang_char/words.txt", + type=str, + help="the words file with ids for WenetSpeech", + ) + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + input_file = args.input_file + output_file = args.output_file + + f = open(input_file, "r", encoding="utf-8") + lines = f.readlines() + new_lines = [] + add_words = [" 0", "!SIL 1", " 2", " 3"] + new_lines.extend(add_words) + + logging.info("Starting reading the input file") + for i in tqdm(range(len(lines))): + x = lines[i] + idx = 4 + i + new_line = str(x.strip("\n")) + " " + str(idx) + new_lines.append(new_line) + + logging.info("Starting writing the words.txt") + f_out = open(output_file, "w", encoding="utf-8") + for line in new_lines: + f_out.write(line) + f_out.write("\n") + + +if __name__ == "__main__": + main() diff --git a/egs/wenetspeech/ASR/local/text2segments.py b/egs/wenetspeech/ASR/local/text2segments.py new file mode 100644 index 000000000..b55277e95 --- /dev/null +++ b/egs/wenetspeech/ASR/local/text2segments.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script takes as input "text", which refers to the transcript file for +WenetSpeech: + - text +and generates the output file text_word_segmentation which is implemented +with word segmenting: + - text_words_segmentation +""" + + +import argparse + +import jieba +from tqdm import tqdm + +jieba.enable_paddle() + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Chinese Word Segmentation for text", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input", + default="data/lang_char/text", + type=str, + help="the input text file for WenetSpeech", + ) + parser.add_argument( + "--output", + default="data/lang_char/text_words_segmentation", + type=str, + help="the text implemented with words segmenting for WenetSpeech", + ) + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + input_file = args.input + output_file = args.output + + f = open(input_file, "r", encoding="utf-8") + lines = f.readlines() + new_lines = [] + for i in tqdm(range(len(lines))): + x = lines[i].rstrip() + seg_list = jieba.cut(x, use_paddle=True) + new_line = " ".join(seg_list) + new_lines.append(new_line) + + f_new = open(output_file, "w", encoding="utf-8") + for line in new_lines: + f_new.write(line) + f_new.write("\n") + + +if __name__ == "__main__": + main() diff --git a/egs/wenetspeech/ASR/prepare.sh b/egs/wenetspeech/ASR/prepare.sh index 0d1d6f2a9..22c5e449f 100755 --- a/egs/wenetspeech/ASR/prepare.sh +++ b/egs/wenetspeech/ASR/prepare.sh @@ -199,6 +199,23 @@ if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then # grep "\"text\":" $dl_dir/WenetSpeech/WenetSpeech.json | # sed -e 's/["text:\t ]*//g' > $lang_char_dir/text fi + + # The implementation of chinese word segmentation for text, + # and it will take about 15 minutes. + if [ ! -f $lang_char_dir/text_words_segmentation ]; then + python ./local/text2segments.py \ + --input $lang_char_dir/text \ + --output $lang_char_dir/text_words_segmentation + fi + + cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \ + | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt + + if [ ! -f $lang_char_dir/words.txt ]; then + python ./local/prepare_words.py \ + --input-file $lang_char_dir/words_no_ids.txt \ + --output-file $lang_char_dir/words.txt + fi fi if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then