change vocab table

This commit is contained in:
Yuekai Zhang 2023-09-08 16:07:39 +08:00
parent 72e9a436b8
commit 7e387dd54b
3 changed files with 1307 additions and 1 deletions


@@ -5,4 +5,4 @@ pip install k2==1.24.3.dev20230524+cuda11.8.torch2.0.1 -f https://k2-fsa.github.
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/icefall
export PYTHONPATH=$PYTHONPATH:/lustre/fsw/sa/yuekaiz/asr/seamless_communication/src
export TORCH_HOME=/lustre/fsw/sa/yuekaiz/asr/hub
-torchrun --nproc-per-node 8 seamlessm4t/train2.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp --start-epoch 6
+torchrun --nproc-per-node 8 seamlessm4t/train.py --use-fp16 1 --max-duration 300 --base-lr 1e-5 --exp-dir seamlessm4t/exp_new_vocab --start-epoch 1


@@ -0,0 +1,43 @@
# import sentencepiece as spm


class CharTokenizer(object):
    """Character-level tokenizer backed by a plain-text `symbol id` vocabulary file."""

    def __init__(self, tokenizer_file):
        self.id2symbol = {}
        self.symbol2id = {}
        with open(tokenizer_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    symbol, idx = line.split()
                    idx = int(idx)
                    self.id2symbol[idx] = symbol
                    self.symbol2id[symbol] = idx
        self.vocab_size = len(self.id2symbol)

    def encode(self, text):
        # If a symbol is not in self.symbol2id, fall back to <unk>'s id (2).
        return [self.symbol2id.get(symbol, 2) for symbol in text]

    def decode(self, ids):
        return ''.join([self.id2symbol[idx] for idx in ids])


if __name__ == '__main__':
    # config_file = './config.yaml'
    # config = read_yaml(config_file)
    # converter = TokenIDConverter(config['token_list'])
    # ids = converter.tokens2ids(['<s>', '你', '好', '吗', '</s>', 'microsoft', 'world'])
    # print(ids)
    # print(converter.ids2tokens(ids))
    tokenizer = CharTokenizer('./tokens.txt')
    ids = tokenizer.encode('今天 天气不错')
    print(ids)
    print(tokenizer.decode(ids + [1]))
    # sp = spm.SentencePieceProcessor()
    # sp.Load('../../../librispeech/ASR/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/bpe.model')
    # texts = ['MICROSOFT WORLD']
    # y = sp.encode(texts, out_type=int)
    # x = sp.decode(y)
    # print(y, x)
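
For reference, a minimal usage sketch of the CharTokenizer above, assuming the class is in scope (same module). The demo_tokens.txt file name and its contents are hypothetical, chosen only to illustrate the `symbol id` per-line format the loader expects and the <unk> (id 2) fall-back for out-of-vocabulary characters; the real tokens.txt used above is much larger.

# Usage sketch only, not part of the commit; the vocabulary below is hypothetical.
from pathlib import Path

demo_vocab = '\n'.join([
    '<blk> 0',
    '<sos/eos> 1',
    '<unk> 2',
    '今 3',
    '天 4',
    '气 5',
    '不 6',
    '错 7',
])
Path('demo_tokens.txt').write_text(demo_vocab, encoding='utf-8')

tokenizer = CharTokenizer('demo_tokens.txt')
ids = tokenizer.encode('今天天气不错')
print(ids)                       # [3, 4, 4, 5, 6, 7]
print(tokenizer.decode(ids))     # 今天天气不错
print(tokenizer.encode('今日'))  # '日' is not in the vocabulary -> [3, 2]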

File diff suppressed because it is too large