mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
update
This commit is contained in:
parent
4ae9a00ec5
commit
6df88a71b1
@ -15,6 +15,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
@ -51,28 +52,28 @@ def normalize_text(
|
||||
# Language-related normalization
|
||||
if lang == "Thai":
|
||||
# Digit mapping
|
||||
text = re.sub(r"\u0030", r"\u0E50", text)
|
||||
text = re.sub(r"\u0031", r"\u0E51", text)
|
||||
text = re.sub(r"\u0032", r"\u0E52", text)
|
||||
text = re.sub(r"\u0033", r"\u0E53", text)
|
||||
text = re.sub(r"\u0034", r"\u0E54", text)
|
||||
text = re.sub(r"\u0035", r"\u0E55", text)
|
||||
text = re.sub(r"\u0036", r"\u0E56", text)
|
||||
text = re.sub(r"\u0037", r"\u0E57", text)
|
||||
text = re.sub(r"\u0038", r"\u0E58", text)
|
||||
text = re.sub(r"\u0039", r"\u0E59", text)
|
||||
text = re.sub("\u0030", "\u0E50", text)
|
||||
text = re.sub("\u0031", "\u0E51", text)
|
||||
text = re.sub("\u0032", "\u0E52", text)
|
||||
text = re.sub("\u0033", "\u0E53", text)
|
||||
text = re.sub("\u0034", "\u0E54", text)
|
||||
text = re.sub("\u0035", "\u0E55", text)
|
||||
text = re.sub("\u0036", "\u0E56", text)
|
||||
text = re.sub("\u0037", "\u0E57", text)
|
||||
text = re.sub("\u0038", "\u0E58", text)
|
||||
text = re.sub("\u0039", "\u0E59", text)
|
||||
|
||||
# Currency symbols mapping
|
||||
text = re.sub(r"\u0024", "ดอลลาร์", text) # $
|
||||
text = re.sub(r"\u00A3", "ปอนด์", text) # £
|
||||
text = re.sub(r"\u00A5", "หยวน", text) # ¥
|
||||
text = re.sub(r"\u20AC", "ยูโร", text) # €
|
||||
text = re.sub(r"\u0E3F", "บาท", text) # ฿
|
||||
text = re.sub("\u0024", "ดอลลาร์", text) # $
|
||||
text = re.sub("\u00A3", "ปอนด์", text) # £
|
||||
text = re.sub("\u00A5", "หยวน", text) # ¥
|
||||
text = re.sub("\u20AC", "ยูโร", text) # €
|
||||
text = re.sub("\u0E3F", "บาท", text) # ฿
|
||||
|
||||
# Temperature/Angle symbols mapping
|
||||
text = re.sub(r"\u00B0\u0043", "องศาเซลเซียส", text) # °C
|
||||
text = re.sub(r"\u00B0\u0046", "องศาฟาเรนไฮต์", text) # °F
|
||||
text = re.sub(r"\u00B0", "องศา", text) # °
|
||||
text = re.sub("\u00B0\u0043", "องศาเซลเซียส", text) # °C
|
||||
text = re.sub("\u00B0\u0046", "องศาฟาเรนไฮต์", text) # °F
|
||||
text = re.sub("\u00B0", "องศา", text) # °
|
||||
|
||||
# Remove blank symbols
|
||||
text = re.sub(r"\s", "", text)
|
||||
@ -114,7 +115,7 @@ def preprocess_gigaspeech2(args):
|
||||
continue
|
||||
|
||||
for sup in m["supervisions"]:
|
||||
sup.text = normalize_text(sup.text)
|
||||
sup.text = normalize_text(sup.text, args.lang)
|
||||
|
||||
logging.info(f"Processing {partition}")
|
||||
cut_set = CutSet.from_manifests(
|
||||
|
@ -45,10 +45,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||
log "Stage 2: Compute fbank for gigaspeech2"
|
||||
log "Stage 2: Preprocess GigaSpeech2 manifest"
|
||||
if [ ! -f data/fbank/.preprocess.done ]; then
|
||||
python3 ./local/preprocess_gigaspeech2.py --lang $lang
|
||||
touch data/fbank/.preprocess.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
log "Stage 3: Compute fbank for gigaspeech2"
|
||||
mkdir -p data/fbank
|
||||
if [ ! -e data/fbank/.gigaspeech2.done ]; then
|
||||
./local/compute_fbank_gigaspeech2.py --lang $lang
|
||||
./local/compute_fbank_gigaspeech2.py
|
||||
touch data/fbank/.gigaspeech2.done
|
||||
fi
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user