mirror of https://github.com/k2-fsa/icefall.git (synced 2025-09-03 22:24:19 +00:00)
change vocab table
This commit is contained in:
parent
72e9a436b8
commit
7e387dd54b
@@ -5,4 +5,4 @@ pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.
 export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
 export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
 export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-torchrun --nproc-per-node 8 seamlessm4t/train2.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp --start-epoch 6
+torchrun --nproc-per-node 8 seamlessm4t/train.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp_new_vocab --start-epoch 1
egs/aishell/ASR/seamlessm4t/tokenizer.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# import sentencepiece as spm


class CharTokenizer(object):
    """Character-level tokenizer backed by a whitespace-separated 'symbol id' vocab file."""

    def __init__(self, tokenizer_file):
        self.id2symbol = {}
        self.symbol2id = {}
        with open(tokenizer_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    symbol, idx = line.split()
                    idx = int(idx)
                    self.id2symbol[idx] = symbol
                    self.symbol2id[symbol] = idx
        self.vocab_size = len(self.id2symbol)

    def encode(self, text):
        # If a symbol is not in self.symbol2id, fall back to <unk>'s id (2).
        return [self.symbol2id.get(symbol, 2) for symbol in text]

    def decode(self, ids):
        return "".join(self.id2symbol[i] for i in ids)


if __name__ == "__main__":
    # config_file = './config.yaml'
    # config = read_yaml(config_file)
    # converter = TokenIDConverter(config['token_list'])
    # ids = converter.tokens2ids(['<s>', '你', '好', '吗', '</s>', 'microsoft', 'world'])
    # print(ids)
    # print(converter.ids2tokens(ids))

    tokenizer = CharTokenizer("./tokens.txt")
    ids = tokenizer.encode("今天 天气不错")
    print(ids)
    print(tokenizer.decode(ids + [1]))
    # sp = spm.SentencePieceProcessor()
    # sp.Load('../../../librispeech/ASR/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/bpe.model')
    # texts = ['MICROSOFT WORLD']
    # y = sp.encode(texts, out_type=int)
    # x = sp.decode(y)
    # print(y, x)
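For reference, a minimal sketch of the vocab format this tokenizer parses: one symbol and its integer id per line, split on whitespace. The toy symbols, ids, and temporary file below are illustrative assumptions, not the actual vocab table introduced by this commit; only <unk> at id 2 is implied by CharTokenizer.encode().

# Illustrative only: write a toy vocab in the "symbol id" layout that
# CharTokenizer.__init__ parses, then round-trip a string through
# encode()/decode(). The symbols and ids are made up for demonstration.
from tempfile import NamedTemporaryFile

from tokenizer import CharTokenizer  # assumes this runs from the seamlessm4t/ directory

toy_vocab = ["<blk> 0", "<sos/eos> 1", "<unk> 2", "今 3", "天 4", "气 5", "不 6", "错 7"]

with NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("\n".join(toy_vocab) + "\n")
    tokens_path = f.name

tokenizer = CharTokenizer(tokens_path)
ids = tokenizer.encode("今天天气不错?")  # "?" is out of vocab, so it maps to id 2
print(ids)                    # [3, 4, 4, 5, 6, 7, 2]
print(tokenizer.decode(ids))  # 今天天气不错<unk>
print(tokenizer.vocab_size)   # 8

Note that because each vocab line is split on whitespace, a literal space cannot be listed as a symbol in this format.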
egs/aishell/ASR/seamlessm4t/train.py (new file, 1263 lines)
File diff suppressed because it is too large