From 6df88a71b13d3e43c59bea935a3804c2093766d8 Mon Sep 17 00:00:00 2001
From: yfyeung
Date: Tue, 2 Apr 2024 08:10:19 +0000
Subject: [PATCH] update

---
Review notes (text in this section is not part of the commit message):

* preprocess_gigaspeech2.py: the Thai digit/currency/temperature patterns
  and replacements move from raw to regular string literals, and
  normalize_text() is now called with args.lang.
* The dollar-sign pattern is written as r"\$" rather than "\u0024".
  In a regular string "\u0024" is the character "$" itself, which `re`
  reads as the end-of-string anchor; that form would append the
  replacement to every transcript and never replace a dollar sign.
* prepare.sh: a new stage 2 runs preprocess_gigaspeech2.py --lang $lang,
  fbank computation becomes stage 3 and no longer receives --lang.
  The stage 2 log message reads "Stage 2" (was misspelled "State 2").
* NOTE(review): the line breaks of this patch were rebuilt from the hunk
  header counts; leading indentation of context lines is assumed
  (4 spaces in Python, 2 spaces in shell) -- confirm against the target
  files before applying.

 .../SSL/local/preprocess_gigaspeech2.py       | 39 ++++++++++---------
 egs/gigaspeech2/SSL/prepare.sh                | 12 +++++-
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/egs/gigaspeech2/SSL/local/preprocess_gigaspeech2.py b/egs/gigaspeech2/SSL/local/preprocess_gigaspeech2.py
index 8647a7594..b7cd0c923 100755
--- a/egs/gigaspeech2/SSL/local/preprocess_gigaspeech2.py
+++ b/egs/gigaspeech2/SSL/local/preprocess_gigaspeech2.py
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import logging
 import re
 import unicodedata
@@ -51,28 +52,28 @@ def normalize_text(
     # Language-related normalization
     if lang == "Thai":
         # Digit mapping
-        text = re.sub(r"\u0030", r"\u0E50", text)
-        text = re.sub(r"\u0031", r"\u0E51", text)
-        text = re.sub(r"\u0032", r"\u0E52", text)
-        text = re.sub(r"\u0033", r"\u0E53", text)
-        text = re.sub(r"\u0034", r"\u0E54", text)
-        text = re.sub(r"\u0035", r"\u0E55", text)
-        text = re.sub(r"\u0036", r"\u0E56", text)
-        text = re.sub(r"\u0037", r"\u0E57", text)
-        text = re.sub(r"\u0038", r"\u0E58", text)
-        text = re.sub(r"\u0039", r"\u0E59", text)
+        text = re.sub("\u0030", "\u0E50", text)
+        text = re.sub("\u0031", "\u0E51", text)
+        text = re.sub("\u0032", "\u0E52", text)
+        text = re.sub("\u0033", "\u0E53", text)
+        text = re.sub("\u0034", "\u0E54", text)
+        text = re.sub("\u0035", "\u0E55", text)
+        text = re.sub("\u0036", "\u0E56", text)
+        text = re.sub("\u0037", "\u0E57", text)
+        text = re.sub("\u0038", "\u0E58", text)
+        text = re.sub("\u0039", "\u0E59", text)
 
         # Currency symbols mapping
-        text = re.sub(r"\u0024", "ดอลลาร์", text)  # $
-        text = re.sub(r"\u00A3", "ปอนด์", text)  # £
-        text = re.sub(r"\u00A5", "หยวน", text)  # ¥
-        text = re.sub(r"\u20AC", "ยูโร", text)  # €
-        text = re.sub(r"\u0E3F", "บาท", text)  # ฿
+        text = re.sub(r"\$", "ดอลลาร์", text)  # $
+        text = re.sub("\u00A3", "ปอนด์", text)  # £
+        text = re.sub("\u00A5", "หยวน", text)  # ¥
+        text = re.sub("\u20AC", "ยูโร", text)  # €
+        text = re.sub("\u0E3F", "บาท", text)  # ฿
 
         # Temperature/Angle symbols mapping
-        text = re.sub(r"\u00B0\u0043", "องศาเซลเซียส", text)  # °C
-        text = re.sub(r"\u00B0\u0046", "องศาฟาเรนไฮต์", text)  # °F
-        text = re.sub(r"\u00B0", "องศา", text)  # °
+        text = re.sub("\u00B0\u0043", "องศาเซลเซียส", text)  # °C
+        text = re.sub("\u00B0\u0046", "องศาฟาเรนไฮต์", text)  # °F
+        text = re.sub("\u00B0", "องศา", text)  # °
 
         # Remove blank symbols
         text = re.sub(r"\s", "", text)
@@ -114,7 +115,7 @@ def preprocess_gigaspeech2(args):
             continue
 
         for sup in m["supervisions"]:
-            sup.text = normalize_text(sup.text)
+            sup.text = normalize_text(sup.text, args.lang)
 
         logging.info(f"Processing {partition}")
         cut_set = CutSet.from_manifests(
diff --git a/egs/gigaspeech2/SSL/prepare.sh b/egs/gigaspeech2/SSL/prepare.sh
index f20f57f85..42590229d 100755
--- a/egs/gigaspeech2/SSL/prepare.sh
+++ b/egs/gigaspeech2/SSL/prepare.sh
@@ -45,10 +45,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Compute fbank for gigaspeech2"
+  log "Stage 2: Preprocess GigaSpeech2 manifest"
+  if [ ! -f data/fbank/.preprocess.done ]; then
+    python3 ./local/preprocess_gigaspeech2.py --lang $lang
+    touch data/fbank/.preprocess.done
+  fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute fbank for gigaspeech2"
   mkdir -p data/fbank
   if [ ! -e data/fbank/.gigaspeech2.done ]; then
-    ./local/compute_fbank_gigaspeech2.py --lang $lang
+    ./local/compute_fbank_gigaspeech2.py
     touch data/fbank/.gigaspeech2.done
   fi
 fi