Fix decoding byte bpes tokens to words. (#1966)

This commit is contained in:
Wei Kang 2025-06-19 12:26:01 +08:00 committed by GitHub
parent 762f965cf7
commit 3587c4b3b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -328,9 +328,14 @@ def main():
logging.info(msg)
def token_ids_to_words(token_ids: List[int]) -> str:
    """Convert a sequence of token ids into a whitespace-separated string.

    Each id is looked up in ``token_table`` (taken from the enclosing
    scope).  Tokens of the form ``<0xHH>`` are treated as single raw bytes;
    every other token contributes its UTF-8 encoding.  The bytes are
    accumulated across the whole sequence and decoded once, so that a
    multi-byte character split over several byte tokens is reassembled
    correctly.

    Args:
      token_ids:
        Token ids to convert; each must be a valid key/index of
        ``token_table``.

    Returns:
      The decoded text with word-boundary markers turned into spaces and
      leading/trailing whitespace removed.  Byte sequences that are not
      valid UTF-8 are silently dropped.
    """
    raw = bytearray()
    for i in token_ids:
        token = token_table[i]

        byte_value = None
        if token.startswith("<0x") and token.endswith(">"):
            try:
                byte_value = int(token[3:-1], 16)
            except ValueError:
                # Looks like a byte token but is not valid hex; fall
                # through and keep it as literal text.
                byte_value = None

        if byte_value is not None and 0 <= byte_value <= 0xFF:
            raw.append(byte_value)
        else:
            raw += token.encode("utf-8")

    # Decode once at the end; best-effort on purpose, so a truncated
    # multi-byte sequence does not abort decoding of the whole utterance.
    text = raw.decode("utf-8", errors="ignore")

    # U+2581 (LOWER ONE EIGHTH BLOCK) is the word-boundary marker used by
    # SentencePiece-style BPE vocabularies.  It is written as an escape so
    # that it survives tooling that strips non-ASCII characters.
    # NOTE(review): the marker was missing from the literal in this view;
    # confirm against the vocabulary in use.
    return text.replace("\u2581", " ").strip()
if params.method == "fast_beam_search":