mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
update
This commit is contained in:
parent
4ae9a00ec5
commit
6df88a71b1
@ -15,6 +15,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
@ -51,28 +52,28 @@ def normalize_text(
|
|||||||
# Language-related normalization
|
# Language-related normalization
|
||||||
if lang == "Thai":
|
if lang == "Thai":
|
||||||
# Digit mapping
|
# Digit mapping
|
||||||
text = re.sub(r"\u0030", r"\u0E50", text)
|
text = re.sub("\u0030", "\u0E50", text)
|
||||||
text = re.sub(r"\u0031", r"\u0E51", text)
|
text = re.sub("\u0031", "\u0E51", text)
|
||||||
text = re.sub(r"\u0032", r"\u0E52", text)
|
text = re.sub("\u0032", "\u0E52", text)
|
||||||
text = re.sub(r"\u0033", r"\u0E53", text)
|
text = re.sub("\u0033", "\u0E53", text)
|
||||||
text = re.sub(r"\u0034", r"\u0E54", text)
|
text = re.sub("\u0034", "\u0E54", text)
|
||||||
text = re.sub(r"\u0035", r"\u0E55", text)
|
text = re.sub("\u0035", "\u0E55", text)
|
||||||
text = re.sub(r"\u0036", r"\u0E56", text)
|
text = re.sub("\u0036", "\u0E56", text)
|
||||||
text = re.sub(r"\u0037", r"\u0E57", text)
|
text = re.sub("\u0037", "\u0E57", text)
|
||||||
text = re.sub(r"\u0038", r"\u0E58", text)
|
text = re.sub("\u0038", "\u0E58", text)
|
||||||
text = re.sub(r"\u0039", r"\u0E59", text)
|
text = re.sub("\u0039", "\u0E59", text)
|
||||||
|
|
||||||
# Currency symbols mapping
|
# Currency symbols mapping
|
||||||
text = re.sub(r"\u0024", "ดอลลาร์", text) # $
|
text = re.sub("\u0024", "ดอลลาร์", text) # $
|
||||||
text = re.sub(r"\u00A3", "ปอนด์", text) # £
|
text = re.sub("\u00A3", "ปอนด์", text) # £
|
||||||
text = re.sub(r"\u00A5", "หยวน", text) # ¥
|
text = re.sub("\u00A5", "หยวน", text) # ¥
|
||||||
text = re.sub(r"\u20AC", "ยูโร", text) # €
|
text = re.sub("\u20AC", "ยูโร", text) # €
|
||||||
text = re.sub(r"\u0E3F", "บาท", text) # ฿
|
text = re.sub("\u0E3F", "บาท", text) # ฿
|
||||||
|
|
||||||
# Temperature/Angle symbols mapping
|
# Temperature/Angle symbols mapping
|
||||||
text = re.sub(r"\u00B0\u0043", "องศาเซลเซียส", text) # °C
|
text = re.sub("\u00B0\u0043", "องศาเซลเซียส", text) # °C
|
||||||
text = re.sub(r"\u00B0\u0046", "องศาฟาเรนไฮต์", text) # °F
|
text = re.sub("\u00B0\u0046", "องศาฟาเรนไฮต์", text) # °F
|
||||||
text = re.sub(r"\u00B0", "องศา", text) # °
|
text = re.sub("\u00B0", "องศา", text) # °
|
||||||
|
|
||||||
# Remove blank symbols
|
# Remove blank symbols
|
||||||
text = re.sub(r"\s", "", text)
|
text = re.sub(r"\s", "", text)
|
||||||
@ -114,7 +115,7 @@ def preprocess_gigaspeech2(args):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
for sup in m["supervisions"]:
|
for sup in m["supervisions"]:
|
||||||
sup.text = normalize_text(sup.text)
|
sup.text = normalize_text(sup.text, args.lang)
|
||||||
|
|
||||||
logging.info(f"Processing {partition}")
|
logging.info(f"Processing {partition}")
|
||||||
cut_set = CutSet.from_manifests(
|
cut_set = CutSet.from_manifests(
|
||||||
|
@ -45,10 +45,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
log "Stage 2: Compute fbank for gigaspeech2"
|
log "Stage 2: Preprocess GigaSpeech2 manifest"
|
||||||
|
if [ ! -f data/fbank/.preprocess.done ]; then
|
||||||
|
python3 ./local/preprocess_gigaspeech2.py --lang $lang
|
||||||
|
touch data/fbank/.preprocess.done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||||
|
log "Stage 3: Compute fbank for gigaspeech2"
|
||||||
mkdir -p data/fbank
|
mkdir -p data/fbank
|
||||||
if [ ! -e data/fbank/.gigaspeech2.done ]; then
|
if [ ! -e data/fbank/.gigaspeech2.done ]; then
|
||||||
./local/compute_fbank_gigaspeech2.py --lang $lang
|
./local/compute_fbank_gigaspeech2.py
|
||||||
touch data/fbank/.gigaspeech2.done
|
touch data/fbank/.gigaspeech2.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user