Fix decoding byte-level BPE tokens to words.

This commit is contained in:
Wei Kang 2025-06-19 11:17:38 +08:00 committed by GitHub
parent 762f965cf7
commit 857507795d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -328,9 +328,14 @@ def main():
logging.info(msg)
def token_ids_to_words(token_ids: List[int]) -> str:
    """Convert a sequence of token IDs into a plain text string.

    Each ID is looked up in ``token_table``. Byte-fallback tokens of the
    form ``<0xNN>`` contribute the single byte ``0xNN``; every other token
    contributes its UTF-8 encoding. All bytes are decoded together, so a
    character that was split across several byte tokens is reassembled.

    Args:
      token_ids: IDs to look up in ``token_table``.

    Returns:
      The decoded text, with word-boundary markers turned into spaces and
      leading/trailing whitespace stripped.
    """
    raw = bytearray()
    for token_id in token_ids:
        token = token_table[token_id]
        if token.startswith("<0x") and token.endswith(">"):
            # Drop the "<0x" prefix and ">" suffix; what is left is the
            # hexadecimal value of one raw byte.
            raw.append(int(token[3:-1], 16))
        else:
            raw.extend(token.encode("utf-8"))
    # A hypothesis may contain an incomplete multi-byte sequence; decode it
    # to U+FFFD rather than aborting the whole run with UnicodeDecodeError.
    text = raw.decode("utf-8", errors="replace")
    # NOTE(review): U+2581 is assumed to be the word-boundary marker used by
    # the token table (SentencePiece convention) -- confirm against the
    # vocabulary file.
    return text.replace("\u2581", " ").strip()
if params.method == "fast_beam_search":