change vocab table

This commit is contained in:
Yuekai Zhang 2023-09-08 16:07:39 +08:00
parent 72e9a436b8
commit 7e387dd54b
3 changed files with 1307 additions and 1 deletions


@@ -5,4 +5,4 @@ pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-torchrun --nproc-per-node 8 seamlessm4t/train2.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp --start-epoch 6
+torchrun --nproc-per-node 8 seamlessm4t/train.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp_new_vocab --start-epoch 1


@@ -0,0 +1,43 @@
# import sentencepiece as spm


class CharTokenizer(object):
    """Character-level tokenizer backed by a plain-text `symbol id` vocabulary file."""

    def __init__(self, tokenizer_file):
        self.id2symbol = {}
        self.symbol2id = {}
        with open(tokenizer_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    symbol, idx = line.split()
                    idx = int(idx)
                    self.id2symbol[idx] = symbol
                    self.symbol2id[symbol] = idx
        self.vocab_size = len(self.id2symbol)

    def encode(self, text):
        # If a symbol is not in self.symbol2id, fall back to <unk>'s id (2).
        return [self.symbol2id.get(symbol, 2) for symbol in text]

    def decode(self, ids):
        return ''.join([self.id2symbol[idx] for idx in ids])


if __name__ == '__main__':
    # config_file = './config.yaml'
    # config = read_yaml(config_file)
    # converter = TokenIDConverter(config['token_list'])
    # ids = converter.tokens2ids(['<s>', '你', '好', '吗', '</s>', 'microsoft', 'world'])
    # print(ids)
    # print(converter.ids2tokens(ids))
    tokenizer = CharTokenizer('./tokens.txt')
    ids = tokenizer.encode('今天 天气不错')
    print(ids)
    print(tokenizer.decode(ids + [1]))
    # sp = spm.SentencePieceProcessor()
    # sp.Load('../../../librispeech/ASR/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/bpe.model')
    # texts = ['MICROSOFT WORLD']
    # y = sp.encode(texts, out_type=int)
    # x = sp.decode(y)
    # print(y, x)
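
For reference, a minimal usage sketch of the CharTokenizer above, assuming the class is in scope (same module). The demo_tokens.txt file name and its contents are hypothetical, chosen only to illustrate the `symbol id` per-line format the loader expects and the <unk> (id 2) fall-back for out-of-vocabulary characters; the real tokens.txt used above is much larger.

# Usage sketch only, not part of the commit; the vocabulary below is hypothetical.
from pathlib import Path

demo_vocab = '\n'.join([
    '<blk> 0',
    '<sos/eos> 1',
    '<unk> 2',
    '今 3',
    '天 4',
    '气 5',
    '不 6',
    '错 7',
])
Path('demo_tokens.txt').write_text(demo_vocab, encoding='utf-8')

tokenizer = CharTokenizer('demo_tokens.txt')
ids = tokenizer.encode('今天天气不错')
print(ids)                       # [3, 4, 4, 5, 6, 7]
print(tokenizer.decode(ids))     # 今天天气不错
print(tokenizer.encode('今日'))  # '日' is not in the vocabulary -> [3, 2]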

File diff suppressed because it is too large