do some changes

2022-05-07 19:51:56 +08:00 · 2022-05-07 19:51:56 +08:00 · da78063c9f
commit da78063c9f
parent 1b51f1840b
3 changed files with 184 additions and 0 deletions
--- a/egs/wenetspeech/ASR/local/prepare_words.py
+++ b/egs/wenetspeech/ASR/local/prepare_words.py
@ -0,0 +1,84 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Copyright    2021  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input words.txt without ids:
    - words_no_ids.txt
 and generates the new words.txt with related ids.
    - words.txt
 """
 import argparse
 import logging
 from tqdm import tqdm
 def get_parser():
    """Build the command-line parser for this script.

    Returns:
      An argparse.ArgumentParser accepting two optional string options,
      --input-file and --output-file, each with a default path under
      data/lang_char/.
    """
    # (flag, default value, help text) for every supported option.
    options = (
        (
            "--input-file",
            "data/lang_char/words_no_ids.txt",
            "the words file without ids for WenetSpeech",
        ),
        (
            "--output-file",
            "data/lang_char/words.txt",
            "the words file with ids for WenetSpeech",
        ),
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Prepare words.txt",
    )
    for flag, default_path, help_text in options:
        cli.add_argument(flag, type=str, default=default_path, help=help_text)
    return cli
 def main():
    """Assign an integer id to every word and write the resulting table.

    Reads one word per line from --input-file and writes one
    "<word> <id>" line per word to --output-file.  The first ids are
    taken by the fixed symbols <eps>, !SIL, <SPOKEN_NOISE> and <UNK>;
    the input words are numbered consecutively after them, in input order.
    """
    args = get_parser().parse_args()

    # Fixed symbols always come first, already paired with their ids.
    new_lines = ["<eps> 0", "!SIL 1", "<SPOKEN_NOISE> 2", "<UNK> 3"]

    logging.info("Starting reading the input file")
    # The context manager closes the handle even if reading fails.
    with open(args.input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Number the words right after the fixed symbols; deriving the offset
    # from the list keeps the ids contiguous if the symbols ever change.
    first_word_id = len(new_lines)
    for idx, x in enumerate(tqdm(lines), start=first_word_id):
        new_lines.append(x.strip("\n") + " " + str(idx))

    logging.info("Starting writing the words.txt")
    # Every line is built before the output is opened, so a failure while
    # reading the input cannot leave a truncated output file behind.
    with open(args.output_file, "w", encoding="utf-8") as f_out:
        f_out.writelines(line + "\n" for line in new_lines)
 # Run the conversion only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/wenetspeech/ASR/local/text2segments.py
+++ b/egs/wenetspeech/ASR/local/text2segments.py
@ -0,0 +1,83 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Copyright    2021  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input "text", which refers to the transcript file for
 WenetSpeech:
    - text
 and generates the output file text_words_segmentation, which contains the
 same transcripts after word segmentation:
    - text_words_segmentation
 """
 import argparse
 import jieba
 from tqdm import tqdm
 # Turn on jieba's paddle mode at import time; main() calls jieba.cut with
 # use_paddle=True.
 jieba.enable_paddle()
 def get_parser():
    """Build the command-line parser for this script.

    Returns:
      An argparse.ArgumentParser accepting two optional string options,
      --input and --output, each with a default path under
      data/lang_char/.
    """
    # (flag, default value, help text) for every supported option.
    options = (
        (
            "--input",
            "data/lang_char/text",
            "the input text file for WenetSpeech",
        ),
        (
            "--output",
            "data/lang_char/text_words_segmentation",
            "the text implemented with words segmenting for WenetSpeech",
        ),
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Chinese Word Segmentation for text",
    )
    for flag, default_path, help_text in options:
        cli.add_argument(flag, type=str, default=default_path, help=help_text)
    return cli
 def main():
    """Segment every transcript line into words and save the result.

    Each line of --input is stripped of trailing whitespace, cut into
    words by jieba with use_paddle=True, and written to --output with the
    words joined by single spaces, one output line per input line.
    """
    args = get_parser().parse_args()

    # The context manager closes the handle even if reading fails.
    with open(args.input, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Segment everything before opening the output, so a failure part-way
    # through segmentation cannot leave a truncated output file behind.
    new_lines = [
        " ".join(jieba.cut(line.rstrip(), use_paddle=True))
        for line in tqdm(lines)
    ]

    with open(args.output, "w", encoding="utf-8") as f_new:
        f_new.writelines(line + "\n" for line in new_lines)
 # Run the segmentation only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/wenetspeech/ASR/prepare.sh
+++ b/egs/wenetspeech/ASR/prepare.sh
@ -199,6 +199,23 @@ if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
    # grep "\"text\":" $dl_dir/WenetSpeech/WenetSpeech.json |
    #   sed -e 's/["text:\t ]*//g' > $lang_char_dir/text
  fi
  # Run Chinese word segmentation on the text.
  # This takes about 15 minutes.
  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
    python ./local/text2segments.py \
      --input $lang_char_dir/text \
      --output $lang_char_dir/text_words_segmentation
  fi
  cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \
    | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt
  if [ ! -f $lang_char_dir/words.txt ]; then
    python ./local/prepare_words.py \
      --input-file $lang_char_dir/words_no_ids.txt \
      --output-file $lang_char_dir/words.txt
  fi
 fi
 if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then