from local

2025-12-11 06:55:27 +00:00 · 2023-02-16 17:47:35 +09:00 · 2023-02-16 17:47:35 +09:00 · 352e1d221a
commit 352e1d221a
parent dfecc5b81a
1 changed files with 90 additions and 0 deletions
--- a/egs/librispeech/ASR/local/convert_transcript_words_to_bpe_ids.py
+++ b/egs/librispeech/ASR/local/convert_transcript_words_to_bpe_ids.py
@ -0,0 +1,90 @@
 #!/usr/bin/env python3
 # Copyright    2022 Xiaomi Corporation  (Author: Mingshuang Luo)
 """
 Convert a transcript based on words to a list of BPE ids.
 For example, if we use 2 as the encoding id of <unk>:
 texts = ['this is a <unk> day']
 spm_ids = [[38, 33, 6, 2, 316]]
 texts = ['<unk> this is a sunny day']
 spm_ids = [[2, 38, 33, 6, 118, 11, 11, 21, 316]]
 texts = ['<unk>']
 spm_ids = [[2]]
 """
 import argparse
 import logging
 from typing import List
 import sentencepiece as spm
 def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--texts", type=List[str], help="The input transcripts list.")
    parser.add_argument(
        "--bpe-model",
        type=str,
        default="data/lang_bpe_500/bpe.model",
        help="Path to the BPE model",
    )
    return parser.parse_args()
 def convert_texts_into_ids(
    texts: List[str],
    unk_id: int,
    sp: spm.SentencePieceProcessor,
 ) -> List[List[int]]:
    """
    Args:
      texts:
        A string list of transcripts, such as ['Today is Monday', 'It's sunny'].
      unk_id:
        A number id for the token '<unk>'.
    Returns:
      Return an integer list of bpe ids.
    """
    y = []
    for text in texts:
        y_ids = []
        if "<unk>" in text:
            text_segments = text.split("<unk>")
            id_segments = sp.encode(text_segments, out_type=int)
            for i in range(len(id_segments)):
                if i != len(id_segments) - 1:
                    y_ids.extend(id_segments[i] + [unk_id])
                else:
                    y_ids.extend(id_segments[i])
        else:
            y_ids = sp.encode(text, out_type=int)
        y.append(y_ids)
    return y
 def main():
    args = get_args()
    texts = args.texts
    bpe_model = args.bpe_model
    sp = spm.SentencePieceProcessor()
    sp.load(bpe_model)
    unk_id = sp.piece_to_id("<unk>")
    y = convert_texts_into_ids(
        texts=texts,
        unk_id=unk_id,
        sp=sp,
    )
    logging.info(f"The input texts: {texts}")
    logging.info(f"The encoding ids: {y}")
 if __name__ == "__main__":
    main()