This commit is contained in:
yfyeung 2024-04-02 08:10:19 +00:00
parent 4ae9a00ec5
commit 6df88a71b1
2 changed files with 30 additions and 21 deletions

View File

@ -15,6 +15,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse
import logging import logging
import re import re
import unicodedata import unicodedata
@ -51,28 +52,28 @@ def normalize_text(
# Language-related normalization # Language-related normalization
if lang == "Thai": if lang == "Thai":
# Digit mapping # Digit mapping
text = re.sub(r"\u0030", r"\u0E50", text) text = re.sub("\u0030", "\u0E50", text)
text = re.sub(r"\u0031", r"\u0E51", text) text = re.sub("\u0031", "\u0E51", text)
text = re.sub(r"\u0032", r"\u0E52", text) text = re.sub("\u0032", "\u0E52", text)
text = re.sub(r"\u0033", r"\u0E53", text) text = re.sub("\u0033", "\u0E53", text)
text = re.sub(r"\u0034", r"\u0E54", text) text = re.sub("\u0034", "\u0E54", text)
text = re.sub(r"\u0035", r"\u0E55", text) text = re.sub("\u0035", "\u0E55", text)
text = re.sub(r"\u0036", r"\u0E56", text) text = re.sub("\u0036", "\u0E56", text)
text = re.sub(r"\u0037", r"\u0E57", text) text = re.sub("\u0037", "\u0E57", text)
text = re.sub(r"\u0038", r"\u0E58", text) text = re.sub("\u0038", "\u0E58", text)
text = re.sub(r"\u0039", r"\u0E59", text) text = re.sub("\u0039", "\u0E59", text)
# Currency symbols mapping # Currency symbols mapping
text = re.sub(r"\u0024", "ดอลลาร์", text) # $ text = re.sub("\u0024", "ดอลลาร์", text) # $
text = re.sub(r"\u00A3", "ปอนด์", text) # £ text = re.sub("\u00A3", "ปอนด์", text) # £
text = re.sub(r"\u00A5", "หยวน", text) # ¥ text = re.sub("\u00A5", "หยวน", text) # ¥
text = re.sub(r"\u20AC", "ยูโร", text) # € text = re.sub("\u20AC", "ยูโร", text) # €
text = re.sub(r"\u0E3F", "บาท", text) # ฿ text = re.sub("\u0E3F", "บาท", text) # ฿
# Temperature/Angle symbols mapping # Temperature/Angle symbols mapping
text = re.sub(r"\u00B0\u0043", "องศาเซลเซียส", text) # °C text = re.sub("\u00B0\u0043", "องศาเซลเซียส", text) # °C
text = re.sub(r"\u00B0\u0046", "องศาฟาเรนไฮต์", text) # °F text = re.sub("\u00B0\u0046", "องศาฟาเรนไฮต์", text) # °F
text = re.sub(r"\u00B0", "องศา", text) # ° text = re.sub("\u00B0", "องศา", text) # °
# Remove blank symbols # Remove blank symbols
text = re.sub(r"\s", "", text) text = re.sub(r"\s", "", text)
@ -114,7 +115,7 @@ def preprocess_gigaspeech2(args):
continue continue
for sup in m["supervisions"]: for sup in m["supervisions"]:
sup.text = normalize_text(sup.text) sup.text = normalize_text(sup.text, args.lang)
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
cut_set = CutSet.from_manifests( cut_set = CutSet.from_manifests(

View File

@ -45,10 +45,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
fi fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Compute fbank for gigaspeech2" log "Stage 2: Preprocess GigaSpeech2 manifest"
if [ ! -f data/fbank/.preprocess.done ]; then
python3 ./local/preprocess_gigaspeech2.py --lang $lang
touch data/fbank/.preprocess.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for gigaspeech2"
mkdir -p data/fbank mkdir -p data/fbank
if [ ! -e data/fbank/.gigaspeech2.done ]; then if [ ! -e data/fbank/.gigaspeech2.done ]; then
./local/compute_fbank_gigaspeech2.py --lang $lang ./local/compute_fbank_gigaspeech2.py
touch data/fbank/.gigaspeech2.done touch data/fbank/.gigaspeech2.done
fi fi
fi fi