icefall/egs/iwslt22_ta/ST/local/transcript_cleaning.py
2023-11-01 06:39:24 +03:00

244 lines
7.9 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# Copyright 2020 Kanari AI (Amir Hussein)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import pdb
import numpy as np
import pandas as pd
import re
import string
import argparse
import sys
import os
import pyarabic.number as number
from pyarabic import araby
# Arabic code points and the Buckwalter symbols they transliterate to.
# The two strings are aligned by position: character i of ``_unicode``
# pairs with character i of ``_buckwalter``.
_unicode = (
    "\u0622\u0624\u0626\u0628\u062a\u062c\u06af\u062e\u0630\u0632"
    "\u0634\u0636\u0638\u063a\u0640\u0642\u0644\u0646\u0648\u064a"
    "\u064c\u064e\u0650\u0652\u0670\u067e\u0686\u0621\u0623\u0625"
    "\u06a4\u0627\u0629\u062b\u062d\u062f\u0631\u0633\u0635\u0637"
    "\u0639\u0641\u0643\u0645\u0647\u0649\u064b\u064d\u064f\u0651\u0671"
)
_buckwalter = "|&}btjGx*z$DZg_qlnwyNaio`PJ'><VApvHdrsSTEfkmhYFKu~{"
# Table for ``str.translate``: Arabic code point -> Buckwalter symbol.
_backwardMap = {
    ord(arabic_char): buckwalter_char
    for buckwalter_char, arabic_char in zip(_buckwalter, _unicode)
}
# def number2text(anumber):
# """
# Convert number to arabic words, for example convert 25 --> خمسة و عشرون
# Example:
# >>> number2text(523)
# خمسمئة وثلاث وعشرون
# @param anumber: input number
# @type anumber: int
# @return: number words
# @rtype: unicode
# """
# # test whether the given value is numeric (float or int);
# # if so, convert it to a string
# if type(anumber) is int or type(anumber) is float:
# anumber = str(anumber)
# # if the given type is str/unicode, test if it's a valid number
# elif type(anumber) is str or type(anumber) is unicode:
# try:
# a = float(anumber);
# except ValueError:
# return u"صفر"
# # if the given value is not a valid number, return zero
# else:
# return u"صفر"
# arbn = number.ArNumbers()
# arbn.set_feminine(2)
# return arbn.int2str(anumber)
# return total
def fromBuckWalter(s):
    """Transliterate the Arabic characters of *s* into Buckwalter symbols.

    NOTE: despite the function name, the module-level table
    ``_backwardMap`` is keyed by Arabic code points and maps them to
    Buckwalter characters, so the conversion goes Arabic -> Buckwalter.
    Characters without an entry in the table are left unchanged.

    Args:
        s (str): text to transliterate.
    Returns:
        (str): the transliterated text.
    """
    transliterated = s.translate(_backwardMap)
    return transliterated
def read_tsv(data_file):
    """Read a tab-separated file into a list of rows.

    Lines that are empty or contain only whitespace are skipped. Each
    remaining line is split on tab characters; the line's trailing newline
    is NOT stripped, so it stays attached to the last field of the row.

    Args:
        data_file (str): path of the UTF-8 encoded file to read.
    Returns:
        (list[list[str]]): one list of fields per non-blank line.
    """
    # The context manager guarantees the handle is closed, even on error
    # (the previous version never closed the file).
    with open(data_file, encoding='utf-8') as infile:
        return [line.split('\t') for line in infile if line.strip()]
# Substrings that remove_special_words() deletes from a transcript.
# Presumably these are annotator tags (hash-prefixed markers such as
# "unclear" / "stutter" / "overlap", truncated-word markers, and the
# literal tokens FRN / RFN) -- TODO confirm against the corpus guidelines.
# Several entries deliberately start with a space.
# NOTE(review): two entries appear as empty strings; they may originally
# have held invisible Unicode characters -- confirm against the upstream
# file. An empty string is a no-op for str.replace().
words_to_remove =['#غير_واضح', '#تلعثم', '', '', '#ال', '#بال', '#وال', ' #وب', '###تداخل', 'FRN','RFN',\
' #سي', '#يي', '#هـ' , '#لل', '#بم', '#الش', '#آآ', ' #يت', '#وو', \
'#ومش', '#ول', '#وسي', '#غير_معروف','#العا','#مطا', '#محم' ,' #ماث','#متطو', ' #نشا', '#أأ', '#آآآ', ' #استج']
def normalizeArabic(text):
    """Collapse runs of selected repeated Arabic letters.

    The rules are applied in order, each to the output of the previous one:
      * two or more consecutive haa   -> exactly three haa
      * two or more alef-with-hamza   -> one bare alef
      * two or more bare alef         -> one bare alef
      * two or more alef-with-madda   -> one bare alef
      * two or more saad              -> one saad
      * two or more waw               -> one waw
    Single occurrences of these letters are left untouched.

    Args:
        text (str): text to normalise.
    Returns:
        (str): text with the runs collapsed.
    """
    # (pattern, replacement) pairs; the order matters because an earlier
    # rule can create a run that a later rule collapses.
    collapse_rules = (
        (r"(ه){2,}", "ههه"),
        (r"(أ){2,}", "ا"),
        (r"(ا){2,}", "ا"),
        (r"(آ){2,}", "ا"),
        (r"(ص){2,}", "ص"),
        (r"(و){2,}", "و"),
    )
    for pattern, replacement in collapse_rules:
        text = re.sub(pattern, replacement, text)
    return text
def remove_special_words(text):
    """Delete every entry of ``words_to_remove`` from *text*.

    Entries are removed as plain substrings, in list order.

    Args:
        text (str): transcription text.
    Returns:
        (str): text with all listed substrings removed.
    """
    cleaned = text
    for tag in words_to_remove:
        # str.replace leaves the text unchanged when the tag is absent.
        cleaned = cleaned.replace(tag, '')
    return cleaned
def remove_hashes(text):
    """Remove every ``#`` character from *text*.

    Args:
        text (str): input text.
    Returns:
        (str): text without any ``#``.
    """
    # The previous pattern r'#?' also matched the empty string at every
    # position; a plain replace yields the same result without that work.
    return text.replace('#', '')
def remove_english_characters(text):
    """Keep only Arabic-block characters, ASCII digits and whitespace.

    Everything outside U+0600-U+06FF, ``0-9`` and whitespace is deleted,
    which covers Latin letters but also punctuation and other symbols.

    Args:
        text (str): input text.
    Returns:
        (str): text with all other characters removed.
    """
    disallowed_run = re.compile(r'[^\u0600-\u06FF0-9\s]+')
    return disallowed_run.sub('', text)
def remove_diacritics(text):
    """Delete Arabic diacritic marks from *text*.

    Removes the code points U+064B-U+0652, U+06D4, U+0670, U+0674 and
    U+06D5-U+06ED; all other characters are kept.

    Args:
        text (str): input text.
    Returns:
        (str): text without those marks.
    """
    diacritic_run = re.compile(r'[\u064B-\u0652\u06D4\u0670\u0674\u06D5-\u06ED]+')
    return diacritic_run.sub('', text)
def remove_punctuations(text):
    """Remove all punctuation except the verbatim characters.

    Every character from the Arabic punctuation list below and from
    ``string.punctuation`` is deleted, except ``@``, ``%`` and ``.``, which
    are kept because they can be part of spoken-as-written tokens such as
    e-mail addresses and percentages. A dot that is preceded by whitespace
    is then removed together with that whitespace, so only dots attached to
    the preceding word survive:
    ``marsad@aljazeera.net .`` => ``marsad@aljazeera.net``

    Args:
        text (str): input text.
    Returns:
        (str): text without the non-verbatim punctuation.
    """
    arabic_punctuations = '''﴿`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    removable = set(arabic_punctuations + english_punctuations) - {'@', '%', '.'}
    # One translate pass deletes every removable character; this is
    # equivalent to calling str.replace once per character.
    text = text.translate({ord(mark): None for mark in removable})
    # Raw string: '\s' and '\.' are invalid escapes in a normal literal.
    text = re.sub(r'\s+\.', '', text)
    return text
def remove_extra_space(text):
    """Lower-case *text* and normalise its whitespace.

    Besides what the name suggests, this function also lower-cases the
    text. Every run of whitespace becomes a single space, and a dot that
    is surrounded by whitespace is glued to its neighbours:
    ``100 . 000`` => ``100.000``

    Args:
        text (str): input text.
    Returns:
        (str): lower-cased text with normalised whitespace.
    """
    text = text.lower()
    # Raw strings: '\s' and '\.' are invalid escapes in normal literals.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+\.\s+', '.', text)
    return text
def remove_dot(text):
    """Re-join the whitespace-separated tokens of *text* with single spaces.

    NOTE(review): the original comment said this should "remove the dot if
    it is not part of a number", but the implementation never removed any
    dot: for non-numeric tokens it applied the pattern ``\\s+\\.`` to a
    token produced by ``str.split()``, which cannot contain whitespace, so
    the substitution never matched. That dead branch has been dropped and
    the output is unchanged. If dots should really be stripped from
    non-numeric tokens, decide first how e-mail/URL tokens such as
    ``aljazeera.net`` must be treated.

    Args:
        text (str): input text.
    Returns:
        (str): the tokens of *text* joined by single spaces.
    """
    return " ".join(text.split())
def east_to_west_num(text):
    """Map Eastern Arabic digits and a few symbols to their replacements.

    Eastern Arabic digits become ``0-9`` and the Arabic percent sign
    becomes ``%``. In addition, ``_`` and ``|`` become a space and the
    letter veh is replaced by the letter feh.

    Args:
        text (str): input text.
    Returns:
        (str): text with the characters substituted one-for-one.
    """
    replacements = {
        "٠": "0",
        "١": "1",
        "٢": "2",
        "٣": "3",
        "٤": "4",
        "٥": "5",
        "٦": "6",
        "٧": "7",
        "٨": "8",
        "٩": "9",
        "٪": "%",
        "_": " ",
        "ڤ": "ف",
        "|": " ",
    }
    return text.translate(str.maketrans(replacements))
def remove_repeating_char(text):
    """Collapse every run of one repeated character to a single character.

    Newlines are not affected because ``.`` does not match them.

    Args:
        text (str): input text.
    Returns:
        (str): text without consecutive duplicate characters.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)
def remove_single_char_word(text):
    """
    Drop whitespace-separated words that are a single character long,
    unless that character is numeric. The kept words are re-joined with
    single spaces.
    Example: I am in a a home for 2 years => am in home for 2 years
    Args:
        text (str): text
    Returns:
        (str): text with single-character words removed
    """
    kept = []
    for token in text.split():
        # Single numeric characters (e.g. "2") are kept.
        if token.isnumeric() or len(token) > 1:
            kept.append(token)
    return " ".join(kept)
def seperate_english_characters(text):
    """Lower-case *text* and put spaces around every run of ``a-z`` letters.

    This separates English words that are glued to Arabic text. Whitespace
    runs are collapsed to one space afterwards; a leading or trailing space
    produced by the padding is kept.

    The padding is done in a single substitution pass over the letter runs.
    The previous implementation re-used each matched word as a pattern and
    replaced it everywhere, which split longer words containing it
    (``net internet`` became ``net inter net``).

    Args:
        text (str): input text.
    Returns:
        (str): lower-cased text with English runs space-separated.
    """
    text = text.lower()
    text = re.sub(r'[a-z]+', r' \g<0> ', text)
    return re.sub(r'\s+', ' ', text)
def digit2num(text, dig2num=False):
    """Clean up numbers in *text* and optionally spell them out in Arabic.

    Steps:
      1. Re-join numbers that were split around a dot, for every
         occurrence: ``100 . 000`` => ``100.000``.
      2. Put spaces around every run of characters that are neither in the
         Arabic block, ``%``, ``@`` nor ``a-z`` (in practice digit groups),
         so numbers glued to words are separated, then collapse whitespace.
         Runs that are exactly ``.`` or a single space are left alone.
      3. If *dig2num* is truthy, replace the digits of each token with
         their Arabic wording using ``pyarabic``; tokens whose non-Arabic
         part is not purely numeric are left unchanged.

    Fixes over the previous version: step 1 handled only the first
    occurrence; step 2 re-used the matched text as an unescaped regular
    expression, which raised ``re.error`` or matched unintended text when
    it contained regex metacharacters, and which split longer numbers that
    contained a shorter matched number.

    Args:
        text (str): input text.
        dig2num (bool): when truthy, convert digits to Arabic number words.
    Returns:
        (str): the cleaned text.
    """
    # Step 1: "100 . 000" => "100.000"
    text = re.sub(r'([0-9]+)\s\.\s([0-9]+)', r'\1.\2', text)

    # Step 2: pad each matched run exactly once, in a single pass.
    def _space_out(found):
        chunk = found.group(0)
        if chunk in {'.', ' '}:
            return chunk
        return " " + chunk + " "

    text = re.sub(r'[^\u0600-\u06FF%@a-z]+', _space_out, text)
    text = re.sub(r'\s+', ' ', text)

    if not dig2num:
        return text

    # Step 3: digits => Arabic number words.
    words = araby.tokenize(text)
    for idx, word in enumerate(words):
        digits = re.sub(r'[\u0600-\u06FF]+', '', word)
        if not digits.isnumeric():
            continue
        spelled = number.number2text(digits)
        # 'صفر' ("zero") results are skipped and the token is kept as is;
        # presumably number2text also returns it for input it cannot
        # convert -- TODO confirm against pyarabic.
        if spelled != 'صفر':
            arabic_part = re.sub(r'[^\u0600-\u06FF]+', '', word)
            words[idx] = arabic_part + spelled
    return " ".join(words)
def data_cleaning(text):
    """Run the full cleaning pipeline on one transcript string.

    The steps run in this fixed order, each on the previous step's output:
    remove_punctuations, remove_single_char_word, remove_diacritics,
    seperate_english_characters, remove_extra_space, remove_dot,
    east_to_west_num, digit2num (with number-to-words conversion enabled)
    and finally normalizeArabic.

    remove_special_words and remove_hashes are not part of the pipeline.

    Args:
        text (str): raw transcript text.
    Returns:
        (str): cleaned transcript text.
    """
    single_argument_steps = (
        remove_punctuations,
        remove_single_char_word,
        remove_diacritics,
        seperate_english_characters,
        remove_extra_space,
        remove_dot,
        east_to_west_num,
    )
    for step in single_argument_steps:
        text = step(text)
    text = digit2num(text, True)
    return normalizeArabic(text)
def main():
    """Clean a transcript file from the command line.

    Usage: transcript_cleaning.py <input_file> <to_BW> <output_file>

    Each non-blank input line is expected to look like ``<id> <text>``.
    The first whitespace-separated token (the id) is kept as is, the rest
    is passed through data_cleaning(). When ``to_BW`` is exactly the
    string ``True`` the cleaned tokens are additionally transliterated
    with fromBuckWalter(). One cleaned line is written per input line.

    The output is written as plain UTF-8 text lines. The previous version
    went through ``DataFrame.to_csv(sep='\\n')``, which applies CSV quoting
    rules to the lines (for example, a line containing a double quote was
    wrapped in quotes with the quote doubled).
    """
    parser = argparse.ArgumentParser(
        description="Clean Arabic transcripts given as '<id> <text>' lines."
    )
    parser.add_argument(
        "input_file",
        help="input transcription file with format <id> <text>",
    )
    parser.add_argument(
        "to_BW",
        help="'True' to transliterate the cleaned text to Buckwalter; "
             "any other value keeps the Arabic script",
    )
    parser.add_argument("output_file", help="output file name")
    args = parser.parse_args()

    cleaned_lines = []
    for fields in read_tsv(args.input_file):
        # Only the first tab-separated field of each line is used.
        tokens = fields[0].split()
        tokens[1:] = data_cleaning(" ".join(tokens[1:])).split()
        if args.to_BW == "True":
            tokens[1:] = [fromBuckWalter(token) for token in tokens[1:]]
        cleaned_lines.append(" ".join(tokens))

    with open(args.output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(line + "\n" for line in cleaned_lines)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()