mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-06 23:54:17 +00:00
244 lines
7.9 KiB
Python
Executable File
244 lines
7.9 KiB
Python
Executable File
#!/usr/bin/env python
|
||
# Copyright 2020 Kanari AI (Amir Hussein)
|
||
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||
import pdb
|
||
import numpy as np
|
||
import pandas as pd
|
||
import re
|
||
import string
|
||
import argparse
|
||
import sys
|
||
import os
|
||
import pyarabic.number as number
|
||
from pyarabic import araby
|
||
|
||
_unicode = u"\u0622\u0624\u0626\u0628\u062a\u062c\u06af\u062e\u0630\u0632\u0634\u0636\u0638\u063a\u0640\u0642\u0644\u0646\u0648\u064a\u064c\u064e\u0650\u0652\u0670\u067e\u0686\u0621\u0623\u0625\u06a4\u0627\u0629\u062b\u062d\u062f\u0631\u0633\u0635\u0637\u0639\u0641\u0643\u0645\u0647\u0649\u064b\u064d\u064f\u0651\u0671"
|
||
_buckwalter = u"|&}btjGx*z$DZg_qlnwyNaio`PJ'><VApvHdrsSTEfkmhYFKu~{"
|
||
_backwardMap = {ord(b):a for a,b in zip(_buckwalter, _unicode)}
|
||
|
||
|
||
# def number2text(anumber):
|
||
# """
|
||
# Convert number to arabic words, for example convert 25 --> خمسة و عشرون
|
||
# Example:
|
||
# >>> number2text(523)
|
||
# خمسمئة وثلاث وعشرون
|
||
# @param anumber: input number
|
||
# @type anumber: int
|
||
# @return: number words
|
||
# @rtype: unicode
|
||
# """
|
||
# # test if the given type is numeric(float or int
|
||
# # if ok, convert it to string
|
||
# if type(anumber) is int or type(anumber) is float:
|
||
# anumber = str(anumber)
|
||
# # if the given type is str/unicode, test if it's a valid number
|
||
# elif type(anumber) is str or type(anumber) is unicode:
|
||
# try:
|
||
# a = float(anumber);
|
||
# except ValueError:
|
||
# return u"صفر"
|
||
# # if the given number not a valid return 0
|
||
# else:
|
||
# return u"صفر"
|
||
# arbn = number.ArNumbers()
|
||
# arbn.set_feminine(2)
|
||
# return arbn.int2str(anumber)
|
||
|
||
# return total
|
||
|
||
|
||
def fromBuckWalter(s):
|
||
return s.translate(_backwardMap)
|
||
|
||
def read_tsv(data_file):
|
||
text_data = list()
|
||
infile = open(data_file, encoding='utf-8')
|
||
for line in infile:
|
||
if not line.strip():
|
||
continue
|
||
text= line.split('\t')
|
||
text_data.append(text)
|
||
|
||
return text_data
|
||
|
||
words_to_remove =['#غير_واضح', '#تلعثم', '#آ', '#أ', '#ال', '#بال', '#وال', ' #وب', '###تداخل', 'FRN','RFN',\
|
||
' #سي', '#يي', '#هـ' , '#لل', '#بم', '#الش', '#آآ', ' #يت', '#وو', \
|
||
'#ومش', '#ول', '#وسي', '#غير_معروف','#العا','#مطا', '#محم' ,' #ماث','#متطو', ' #نشا', '#أأ', '#آآآ', ' #استج']
|
||
|
||
|
||
def normalizeArabic(text):
|
||
# text = re.sub("[إأٱآا]", "ا", text)
|
||
# text = re.sub("ى", "ي", text)
|
||
# text = re.sub("ة", "ه", text)
|
||
# text = re.sub("ئ", "ء", text)
|
||
# text = re.sub("ؤ", "ء", text)
|
||
text = re.sub(r"(ه){2,}", "ههه", text)
|
||
text = re.sub(r"(أ){2,}", "ا", text)
|
||
text = re.sub(r"(ا){2,}", "ا", text)
|
||
text = re.sub(r"(آ){2,}", "ا", text)
|
||
text = re.sub(r"(ص){2,}", "ص", text)
|
||
text = re.sub(r"(و){2,}", "و", text)
|
||
return text
|
||
|
||
def remove_special_words(text):
|
||
# remove special words from the transcription text
|
||
for word in words_to_remove:
|
||
if word in text:
|
||
text = text.replace(word, '')
|
||
return text
|
||
|
||
def remove_hashes(text):
|
||
return re.sub(r'#?', '', text)
|
||
|
||
def remove_english_characters(text):
|
||
return re.sub(r'[^\u0600-\u06FF0-9\s]+', '', text)
|
||
|
||
def remove_diacritics(text):
|
||
return re.sub(r'[\u064B-\u0652\u06D4\u0670\u0674\u06D5-\u06ED]+', '', text)
|
||
|
||
def remove_punctuations(text):
|
||
""" This function removes all punctuations except the verbatim """
|
||
|
||
arabic_punctuations = '''﴿﴾‘`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
|
||
english_punctuations = string.punctuation
|
||
all_punctuations = set(arabic_punctuations + english_punctuations)-{'@','%','.'} # remove all non verbatim punctuations
|
||
|
||
for p in all_punctuations:
|
||
if p in text:
|
||
text = text.replace(p, '')
|
||
text = re.sub('\s+\.','',text) # keep only the "." that is part of a word: marsad@aljazeera.net . => marsad@aljazeera.net
|
||
return text
|
||
|
||
def remove_extra_space(text):
|
||
text = text.lower()
|
||
text = re.sub('\s+', ' ', text)
|
||
text = re.sub('\s+\.\s+', '.', text)
|
||
return text
|
||
|
||
def remove_dot(text):
|
||
|
||
words = text.split()
|
||
res = []
|
||
for word in words:
|
||
if word.replace('.','').isnumeric(): # remove the dot if it is not part of a number
|
||
res.append(word)
|
||
|
||
else:
|
||
word = re.sub('\s+\.','',word)
|
||
res.append(word)
|
||
|
||
return " ".join(res)
|
||
|
||
def east_to_west_num(text):
|
||
eastern_to_western = {"٠":"0","١":"1","٢":"2","٣":"3","٤":"4","٥":"5","٦":"6","٧":"7","٨":"8","٩":"9","٪":"%","_":" ","ڤ":"ف","|":" "}
|
||
trans_string = str.maketrans(eastern_to_western)
|
||
return text.translate(trans_string)
|
||
|
||
def remove_repeating_char(text):
|
||
return re.sub(r'(.)\1+', r'\1', text)
|
||
|
||
def remove_single_char_word(text):
|
||
"""
|
||
Remove single character word from text
|
||
Example: I am in a a home for two years => am in home for two years
|
||
Args:
|
||
text (str): text
|
||
Returns:
|
||
(str): text with single char removed
|
||
"""
|
||
words = text.split()
|
||
|
||
filter_words = [word for word in words if len(word) > 1 or word.isnumeric()]
|
||
return " ".join(filter_words)
|
||
|
||
def seperate_english_characters(text):
|
||
text = text.lower()
|
||
res = re.findall(r'[a-z]+', text) # search for english
|
||
for match in res:
|
||
if match not in {'.',' '}:
|
||
text = re.sub(match, " "+ match+ " ",text)
|
||
text = re.sub('\s+', ' ', text)
|
||
return text
|
||
|
||
def digit2num(text, dig2num=False):
|
||
|
||
""" This function is used to clean numbers"""
|
||
|
||
# search for numbers with spaces
|
||
# 100 . 000 => 100.000
|
||
|
||
res = re.search('[0-9]+\s\.\s[0-9]+', text)
|
||
if res != None:
|
||
t = re.sub(r'\s', '', res[0])
|
||
text = re.sub(res[0], t, text)
|
||
|
||
# seperate numbers glued with words
|
||
# 3أشهر => 3 أشهر
|
||
# من10الى15 => من 10 الى 15
|
||
# pdb.set_trace()
|
||
res = re.findall(r'[^\u0600-\u06FF\%\@a-z]+', text) # search for digits
|
||
for match in res:
|
||
if match not in {'.',' '}:
|
||
text = re.sub(match, " "+ match+ " ",text)
|
||
text = re.sub('\s+', ' ', text)
|
||
|
||
# transliterate numbers to digits
|
||
# 13 => ثلاثة عشر
|
||
|
||
if dig2num == True:
|
||
words = araby.tokenize(text)
|
||
for i in range(len(words)):
|
||
digit = re.sub(r'[\u0600-\u06FF]+', '', words[i])
|
||
if digit.isnumeric():
|
||
sub_word = re.sub(r'[^\u0600-\u06FF]+', '', words[i])
|
||
if number.number2text(digit) != 'صفر':
|
||
words[i] = sub_word + number.number2text(digit)
|
||
else:
|
||
pass
|
||
|
||
return " ".join(words)
|
||
else:
|
||
return text
|
||
|
||
def data_cleaning(text):
|
||
# text = remove_special_words(text)
|
||
text = remove_punctuations(text)
|
||
text = remove_single_char_word(text)
|
||
text = remove_diacritics(text)
|
||
text = seperate_english_characters(text)
|
||
text = remove_extra_space(text)
|
||
text = remove_dot(text)
|
||
text = east_to_west_num(text)
|
||
text = digit2num(text, True)
|
||
text = normalizeArabic(text)
|
||
#text = re.sub(r'#\w{1,2}\b', '', text) # text = re.sub(r'#\w{1,3}\b', '', text)
|
||
#text = remove_hashes(text)
|
||
#text = normalizeArabic(text)
|
||
return text
|
||
|
||
def main():
|
||
input_file = sys.argv[1] # input transcription file with format <id> <text>
|
||
to_BW = str(sys.argv[2]) # transform to BW True|False
|
||
output_file=sys.argv[3] # output file name
|
||
data = read_tsv(input_file)
|
||
new_data = []
|
||
for i in range(len(data)):
|
||
|
||
tokens = data[i][0].split()
|
||
|
||
tokens[1:] = data_cleaning(" ".join(tokens[1:])).split()
|
||
#tokens = data_cleaning(" ".join(tokens)).split()
|
||
if to_BW == "True":
|
||
for i in range(len(tokens[1:])):
|
||
tokens[i+1] = fromBuckWalter(tokens[i+1])
|
||
new_data.append(" ".join(tokens))
|
||
else:
|
||
new_data.append(" ".join(tokens))
|
||
|
||
df = pd.DataFrame(data=new_data)
|
||
df.to_csv(output_file, sep = '\n', header=False, index=False)
|
||
|
||
if __name__ == "__main__":
|
||
main()
|