mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
CSJ Data Preparation (#617)
* workspace setup * csj prepare done * Change compute_fbank_musan.py to soft link * add description * change lhotse prepare csj command * split train-dev here * Add header * remove debug * save manifest_statistics * generate transcript in Lhotse * update comments in config file
This commit is contained in:
parent
d69bb826ed
commit
15c1a4a441
7
egs/csj/ASR/.gitignore
vendored
Normal file
7
egs/csj/ASR/.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
librispeech_*.*
|
||||
todelete*
|
||||
lang*
|
||||
notify_tg.py
|
||||
finetune_*
|
||||
misc.ini
|
||||
.vscode/*
|
173
egs/csj/ASR/local/compute_fbank_csj.py
Normal file
173
egs/csj/ASR/local/compute_fbank_csj.py
Normal file
@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from random import Random
|
||||
from typing import List, Tuple
|
||||
|
||||
import torch
|
||||
from lhotse import (
|
||||
CutSet,
|
||||
Fbank,
|
||||
FbankConfig,
|
||||
# fmt: off
|
||||
# See the following for why LilcomChunkyWriter is preferred
|
||||
# https://github.com/k2-fsa/icefall/pull/404
|
||||
# https://github.com/lhotse-speech/lhotse/pull/527
|
||||
# fmt: on
|
||||
LilcomChunkyWriter,
|
||||
RecordingSet,
|
||||
SupervisionSet,
|
||||
)
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This script follows the espnet method of splitting the remaining core+noncore
|
||||
utterances into valid and train cutsets at an index which is by default 4000.
|
||||
|
||||
In other words, the core+noncore utterances are shuffled, where 4000 utterances
|
||||
of the shuffled set go to the `valid` cutset and are not subject to speed
|
||||
perturbation. The remaining utterances become the `train` cutset and are speed-
|
||||
perturbed (0.9x, 1.0x, 1.1x).
|
||||
|
||||
"""
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slows things down.
|
||||
# Do this outside of main() in case it needs to take effect
|
||||
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||
torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
RNG_SEED = 42
|
||||
|
||||
|
||||
def make_cutset_blueprints(
    manifest_dir: Path,
    split: int,
) -> List[Tuple[str, CutSet]]:
    """Build the named cut sets whose features are to be computed.

    The eval sets are built directly from their own manifests. The core and
    noncore manifests are merged, trimmed to their supervisions, shuffled
    with a fixed seed (RNG_SEED) and then split into `valid` and `train`.
    Only `train` is augmented with 0.9x and 1.1x speed-perturbed copies.

    Args:
      manifest_dir:
        Directory containing `csj_recordings_{part}.jsonl.gz` and
        `csj_supervisions_{part}.jsonl.gz` for `eval1`, `eval2`, `eval3`,
        `core` and `noncore`.
      split:
        Number of shuffled core+noncore cuts assigned to `valid`; all
        remaining cuts go to `train`.

    Returns:
      A list of `(name, cut_set)` tuples in the order
      `eval1`, `eval2`, `eval3`, `valid`, `train`.
    """
    cut_sets = []

    # Create eval datasets
    logging.info("Creating eval cuts.")
    for i in range(1, 4):
        cut_set = CutSet.from_manifests(
            recordings=RecordingSet.from_file(
                manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
            ),
            supervisions=SupervisionSet.from_file(
                manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
            ),
        )
        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
        cut_sets.append((f"eval{i}", cut_set))

    # Create train and valid cuts
    logging.info(
        "Loading, trimming, and shuffling the remaining core+noncore cuts."
    )
    recording_set = RecordingSet.from_file(
        manifest_dir / "csj_recordings_core.jsonl.gz"
    ) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
    supervision_set = SupervisionSet.from_file(
        manifest_dir / "csj_supervisions_core.jsonl.gz"
    ) + SupervisionSet.from_file(
        manifest_dir / "csj_supervisions_noncore.jsonl.gz"
    )

    cut_set = CutSet.from_manifests(
        recordings=recording_set,
        supervisions=supervision_set,
    )
    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
    # A fixed seed keeps the valid/train partition reproducible across runs.
    cut_set = cut_set.shuffle(Random(RNG_SEED))

    # The trailing space matters: the two adjacent literals are concatenated
    # into a single message.
    logging.info(
        "Creating valid and train cuts from core and noncore, "
        f"split at {split}."
    )
    valid_set = CutSet.from_cuts(islice(cut_set, 0, split))

    train_set = CutSet.from_cuts(islice(cut_set, split, None))
    # Speed perturbation is applied to the training cuts only.
    train_set = (
        train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
    )

    cut_sets.extend([("valid", valid_set), ("train", train_set)])

    return cut_sets
|
||||
|
||||
|
||||
def get_args():
    """Parse the command-line options of this script.

    Returns:
      An `argparse.Namespace` with the attributes `manifest_dir` (Path),
      `fbank_dir` (Path) and `split` (int, defaults to 4000).
    """
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Both directories are mandatory. Without `required=True` a missing
    # option silently becomes None and only fails later, when main() tries
    # to build a path from it.
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Path to save manifests",
    )
    parser.add_argument(
        "--fbank-dir",
        type=Path,
        required=True,
        help="Path to save fbank features",
    )
    parser.add_argument(
        "--split", type=int, default=4000, help="Split at this index"
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Compute and store fbank features for every CSJ cut set.

    For each cut set returned by make_cutset_blueprints(), 80-bin fbank
    features are written under `--fbank-dir` as `feats_{part}` and the
    resulting cut manifest is saved to `--manifest-dir` as
    `csj_cuts_{part}.jsonl.gz`. A `.done` marker file in `--fbank-dir`
    records completion; when it already exists the script returns without
    recomputing anything.
    """
    args = get_args()

    extractor = Fbank(FbankConfig(num_mel_bins=80))
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single job instead of failing inside min().
    num_jobs = min(16, os.cpu_count() or 1)

    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    done_marker = args.fbank_dir / ".done"
    if done_marker.exists():
        logging.info(
            "Previous fbank computed for CSJ found. "
            f"Delete {done_marker} to allow recomputing fbank."
        )
        return

    # Make sure the output directory exists so that touching the marker
    # below cannot fail on a missing parent directory.
    args.fbank_dir.mkdir(parents=True, exist_ok=True)

    cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
    for part, cut_set in cut_sets:
        logging.info(f"Processing {part}")
        cut_set = cut_set.compute_and_store_features(
            extractor=extractor,
            num_jobs=num_jobs,
            storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
            storage_type=LilcomChunkyWriter,
        )
        cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")

    logging.info("All fbank computed for CSJ.")
    done_marker.touch()
|
||||
|
||||
|
||||
# Run the feature extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
1
egs/csj/ASR/local/compute_fbank_musan.py
Symbolic link
1
egs/csj/ASR/local/compute_fbank_musan.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/compute_fbank_musan.py
|
321
egs/csj/ASR/local/conf/disfluent.ini
Normal file
321
egs/csj/ASR/local/conf/disfluent.ini
Normal file
@ -0,0 +1,321 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
MODE = disfluent
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = 0
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = 0
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = 0
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = 0
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = 0
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = 0
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = eval:self.notag
|
||||
A_num^ = eval:self.notag
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is the U+00D7 multiplication sign (×), not the ordinary letter 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = wi
ヱ = we
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
321
egs/csj/ASR/local/conf/fluent.ini
Normal file
321
egs/csj/ASR/local/conf/fluent.ini
Normal file
@ -0,0 +1,321 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
MODE = fluent
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = 1
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = 1
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = 1
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = 1
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = 1
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = 1
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = eval:self.notag
|
||||
A_num^ = eval:self.notag
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is the U+00D7 multiplication sign (×), not the ordinary letter 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = wi
ヱ = we
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
321
egs/csj/ASR/local/conf/number.ini
Normal file
321
egs/csj/ASR/local/conf/number.ini
Normal file
@ -0,0 +1,321 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
MODE = number
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = 1
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = 1
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = 1
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = 1
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = 1
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = 1
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = 1
|
||||
A_num^ = 1
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))',
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # It is the U+00D7 multiplication sign (×), not the ordinary letter 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = wi
ヱ = we
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
322
egs/csj/ASR/local/conf/symbol.ini
Normal file
322
egs/csj/ASR/local/conf/symbol.ini
Normal file
@ -0,0 +1,322 @@
|
||||
; # This section is ignored if this file is not supplied as the first config file to
|
||||
; # lhotse prepare csj
|
||||
[SEGMENTS]
|
||||
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
|
||||
gap = 0.5
|
||||
; # Maximum length of segment (s).
|
||||
maxlen = 10
|
||||
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
|
||||
minlen = 0.02
|
||||
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
|
||||
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
|
||||
; # If you intend to use a multicharacter string for gap_sym, remember to register the
|
||||
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
|
||||
gap_sym =
|
||||
|
||||
[CONSTANTS]
|
||||
; # Name of this mode
|
||||
; # See https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf
|
||||
MODE = symbol
|
||||
; # Suffixes to use after the word surface (no longer used)
|
||||
MORPH = pos1 cForm cType2 pos2
|
||||
; # Used to differentiate between A tag and A_num tag
|
||||
JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 .
|
||||
; # Dummy character to delineate multiline words
|
||||
PLUS = +
|
||||
|
||||
[DECISIONS]
|
||||
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
|
||||
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
|
||||
|
||||
; # フィラー、感情表出系感動詞
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(F ぎょっ)'
|
||||
F = #
|
||||
; # Example: '(L (F ン))', '比べ(F えー)る'
|
||||
F^ = #
|
||||
; # 言い直し、いいよどみなどによる語断片
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(D だ)(D だいが) 大学の学部の会議'
|
||||
D = @
|
||||
; # Example: '(L (D ドゥ)+(D ヒ))'
|
||||
D^ = @
|
||||
; # 助詞、助動詞、接辞の言い直し
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
|
||||
D2 = @
|
||||
; # Example: '(X (D2 ノ))'
|
||||
D2^ = @
|
||||
; # 聞き取りや語彙の判断に自信がない場合
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: (? 字数) の
|
||||
; # If no option: empty string is returned regardless of output
|
||||
; # Example: '(?) で'
|
||||
? = 0
|
||||
; # Example: '(D (? すー))+そう+です+よ+ね'
|
||||
?^ = 0
|
||||
; # タグ?で、値は複数の候補が想定される場合
|
||||
; # 0 for main guess with matching morph info, 1 for second guess
|
||||
; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)'
|
||||
?, = 0
|
||||
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
|
||||
?,^ = 0
|
||||
; # 音や言葉に関するメタ的な引用
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
|
||||
M = 0
|
||||
; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))'
|
||||
M^ = 0
|
||||
; # 外国語や古語、方言など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(O ザッツファイン)'
|
||||
O = 0
|
||||
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
|
||||
O^ = 0
|
||||
; # 講演者の名前、差別語、誹謗中傷など
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '国語研の (R ××) です'
|
||||
R = 0
|
||||
R^ = 0
|
||||
; # 非朗読対象発話(朗読における言い間違い等)
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(X 実際は) 実際には'
|
||||
X = 0
|
||||
; # Example: '(L (X (D2 ニ)))'
|
||||
X^ = 0
|
||||
; # アルファベットや算用数字、記号の表記
|
||||
; # 0 to use Japanese form, 1 to use alphabet form
|
||||
; # Example: '(A シーディーアール;CD-R)'
|
||||
A = 1
|
||||
; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
|
||||
A^ = 1
|
||||
; # タグAで、単語は算用数字の場合
|
||||
; # 0 to use Japanese form, 1 to use Arabic numerals
|
||||
; # Example: (A 二千;2000)
|
||||
A_num = eval:self.notag
|
||||
A_num^ = eval:self.notag
|
||||
; # 何らかの原因で漢字表記できなくなった場合
|
||||
; # 0 to use broken form, 1 to use orthodox form
|
||||
; # Example: '(K たち (F えー) ばな;橘)'
|
||||
K = 1
|
||||
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
|
||||
K^ = 1
|
||||
; # 転訛、発音の怠けなど、一時的な発音エラー
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(W ギーツ;ギジュツ)'
|
||||
W = 1
|
||||
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
|
||||
W^ = 1
|
||||
; # 語の読みに関する知識レベルのいい間違い
|
||||
; # 0 to use wrong form, 1 to use orthodox form
|
||||
; # Example: '(B シブタイ;ジュータイ)'
|
||||
B = 0
|
||||
; # Example: 'データー(B カズ;スー)'
|
||||
B^ = 0
|
||||
; # 笑いながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
|
||||
笑 = 0
|
||||
; # Example: 'コク(笑 サイ+(D オン))'
|
||||
笑^ = 0
|
||||
; # 泣きながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(泣 ドンナニ)'
|
||||
泣 = 0
|
||||
泣^ = 0
|
||||
; # 咳をしながら発話
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: 'シャ(咳 リン) ノ'
|
||||
咳 = 0
|
||||
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
|
||||
咳^ = 0
|
||||
; # ささやき声や独り言などの小さな声
|
||||
; # 0 to remain, 1 to delete
|
||||
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
|
||||
L = 0
|
||||
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
|
||||
L^ = 0
|
||||
|
||||
[REPLACEMENTS]
|
||||
; # ボーカルフライなどで母音が同定できない場合
|
||||
<FV> =
|
||||
; # 「うん/うーん/ふーん」の音の特定が困難な場合
|
||||
<VN> =
|
||||
; # 非語彙的な母音の引き延ばし
|
||||
<H> =
|
||||
; # 非語彙的な子音の引き延ばし
|
||||
<Q> =
|
||||
; # 言語音と独立に講演者の笑いが生じている場合
|
||||
<笑> =
|
||||
; # 言語音と独立に講演者の咳が生じている場合
|
||||
<咳> =
|
||||
; # 言語音と独立に講演者の息が生じている場合
|
||||
<息> =
|
||||
; # 講演者の泣き声
|
||||
<泣> =
|
||||
; # 聴衆(司会者なども含む)の発話
|
||||
<フロア発話> =
|
||||
; # 聴衆の笑い
|
||||
<フロア笑> =
|
||||
; # 聴衆の拍手
|
||||
<拍手> =
|
||||
; # 講演者が発表中に用いたデモンストレーションの音声
|
||||
<デモ> =
|
||||
; # 学会講演に発表時間を知らせるためにならすベルの音
|
||||
<ベル> =
|
||||
; # 転記単位全体が再度読み直された場合
|
||||
<朗読間違い> =
|
||||
; # 上記以外の音で特に目立った音
|
||||
<雑音> =
|
||||
; # 0.2秒以上のポーズ
|
||||
<P> =
|
||||
; # Redacted information, for R
|
||||
; # This is the multiplication sign U+00D7 (×), not the ASCII letter 'x'
|
||||
× = ×
|
||||
|
||||
[FIELDS]
|
||||
; # Time information for segment
|
||||
time = 3
|
||||
; # Word surface
|
||||
surface = 5
|
||||
; # Word surface root form without CSJ tags
|
||||
notag = 9
|
||||
; # Part Of Speech
|
||||
pos1 = 11
|
||||
; # Conjugated Form
|
||||
cForm = 12
|
||||
; # Conjugation Type
|
||||
cType1 = 13
|
||||
; # Subcategory of POS
|
||||
pos2 = 14
|
||||
; # Euphonic Change / Subcategory of Conjugation Type
|
||||
cType2 = 15
|
||||
; # Other information
|
||||
other = 16
|
||||
; # Pronunciation for lexicon
|
||||
pron = 10
|
||||
; # Speaker ID
|
||||
spk_id = 2
|
||||
|
||||
[KATAKANA2ROMAJI]
|
||||
ア = 'a
|
||||
イ = 'i
|
||||
ウ = 'u
|
||||
エ = 'e
|
||||
オ = 'o
|
||||
カ = ka
|
||||
キ = ki
|
||||
ク = ku
|
||||
ケ = ke
|
||||
コ = ko
|
||||
ガ = ga
|
||||
ギ = gi
|
||||
グ = gu
|
||||
ゲ = ge
|
||||
ゴ = go
|
||||
サ = sa
|
||||
シ = si
|
||||
ス = su
|
||||
セ = se
|
||||
ソ = so
|
||||
ザ = za
|
||||
ジ = zi
|
||||
ズ = zu
|
||||
ゼ = ze
|
||||
ゾ = zo
|
||||
タ = ta
|
||||
チ = ti
|
||||
ツ = tu
|
||||
テ = te
|
||||
ト = to
|
||||
ダ = da
|
||||
ヂ = di
|
||||
ヅ = du
|
||||
デ = de
|
||||
ド = do
|
||||
ナ = na
|
||||
ニ = ni
|
||||
ヌ = nu
|
||||
ネ = ne
|
||||
ノ = no
|
||||
ハ = ha
|
||||
ヒ = hi
|
||||
フ = hu
|
||||
ヘ = he
|
||||
ホ = ho
|
||||
バ = ba
|
||||
ビ = bi
|
||||
ブ = bu
|
||||
ベ = be
|
||||
ボ = bo
|
||||
パ = pa
|
||||
ピ = pi
|
||||
プ = pu
|
||||
ペ = pe
|
||||
ポ = po
|
||||
マ = ma
|
||||
ミ = mi
|
||||
ム = mu
|
||||
メ = me
|
||||
モ = mo
|
||||
ヤ = ya
|
||||
ユ = yu
|
||||
ヨ = yo
|
||||
ラ = ra
|
||||
リ = ri
|
||||
ル = ru
|
||||
レ = re
|
||||
ロ = ro
|
||||
ワ = wa
|
||||
ヰ = we
|
||||
ヱ = wi
|
||||
ヲ = wo
|
||||
ン = ŋ
|
||||
ッ = q
|
||||
ー = -
|
||||
キャ = kǐa
|
||||
キュ = kǐu
|
||||
キョ = kǐo
|
||||
ギャ = gǐa
|
||||
ギュ = gǐu
|
||||
ギョ = gǐo
|
||||
シャ = sǐa
|
||||
シュ = sǐu
|
||||
ショ = sǐo
|
||||
ジャ = zǐa
|
||||
ジュ = zǐu
|
||||
ジョ = zǐo
|
||||
チャ = tǐa
|
||||
チュ = tǐu
|
||||
チョ = tǐo
|
||||
ヂャ = dǐa
|
||||
ヂュ = dǐu
|
||||
ヂョ = dǐo
|
||||
ニャ = nǐa
|
||||
ニュ = nǐu
|
||||
ニョ = nǐo
|
||||
ヒャ = hǐa
|
||||
ヒュ = hǐu
|
||||
ヒョ = hǐo
|
||||
ビャ = bǐa
|
||||
ビュ = bǐu
|
||||
ビョ = bǐo
|
||||
ピャ = pǐa
|
||||
ピュ = pǐu
|
||||
ピョ = pǐo
|
||||
ミャ = mǐa
|
||||
ミュ = mǐu
|
||||
ミョ = mǐo
|
||||
リャ = rǐa
|
||||
リュ = rǐu
|
||||
リョ = rǐo
|
||||
ァ = a
|
||||
ィ = i
|
||||
ゥ = u
|
||||
ェ = e
|
||||
ォ = o
|
||||
ヮ = ʍ
|
||||
ヴ = vu
|
||||
ャ = ǐa
|
||||
ュ = ǐu
|
||||
ョ = ǐo
|
||||
|
182
egs/csj/ASR/local/display_manifest_statistics.py
Normal file
182
egs/csj/ASR/local/display_manifest_statistics.py
Normal file
@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
# 2022 The University of Electro-Communications (author: Teo Wen Shen) # noqa
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This file displays duration statistics of utterances in a manifest.
|
||||
You can use the displayed value to choose minimum/maximum duration
|
||||
to remove short and long utterances during the training.
|
||||
|
||||
See the function `remove_short_and_long_utt()` in
|
||||
pruned_transducer_stateless5/train.py for usage.
|
||||
"""
|
||||
|
||||
|
||||
def get_parser():
    """Parse the command line and return the parsed arguments.

    Despite its name, this function returns the namespace produced by
    ``parse_args()``, not the parser object itself.

    Returns:
      An ``argparse.Namespace`` with one attribute, ``manifest_dir``
      (a ``pathlib.Path``).
    """
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required: main() calls ``.glob()`` on this value, which would fail
    # with an AttributeError if the option were omitted and left as None.
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Path to cutset manifests",
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Print duration statistics for every CSJ cut manifest in a directory.

    Each file in ``--manifest-dir`` whose name matches
    ``csj_cuts_*.jsonl.gz`` is loaded and summarised with ``describe()``.
    """
    args = get_parser()

    # Path.glob() yields entries in an arbitrary, filesystem-dependent
    # order; sort them so that the report is reproducible between runs.
    for path in sorted(args.manifest_dir.glob("csj_cuts_*.jsonl.gz")):
        cuts: CutSet = load_manifest(path)

        print("\n---------------------------------\n")
        print(path.name + ":")
        cuts.describe()


if __name__ == "__main__":
    main()
|
||||
|
||||
"""
|
||||
## eval1
|
||||
Cuts count: 1272
|
||||
Total duration (hh:mm:ss): 01:50:07
|
||||
Speech duration (hh:mm:ss): 01:50:07 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 5.2
|
||||
std 3.9
|
||||
min 0.2
|
||||
25% 1.9
|
||||
50% 4.0
|
||||
75% 8.1
|
||||
99% 14.3
|
||||
99.5% 14.7
|
||||
99.9% 16.0
|
||||
max 16.9
|
||||
Recordings available: 1272
|
||||
Features available: 1272
|
||||
Supervisions available: 1272
|
||||
SUPERVISION custom fields:
|
||||
- fluent (in 1272 cuts)
|
||||
- disfluent (in 1272 cuts)
|
||||
- number (in 1272 cuts)
|
||||
- symbol (in 1272 cuts)
|
||||
|
||||
## eval2
|
||||
Cuts count: 1292
|
||||
Total duration (hh:mm:ss): 01:56:50
|
||||
Speech duration (hh:mm:ss): 01:56:50 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 5.4
|
||||
std 3.9
|
||||
min 0.1
|
||||
25% 2.1
|
||||
50% 4.6
|
||||
75% 8.6
|
||||
99% 14.1
|
||||
99.5% 15.2
|
||||
99.9% 16.1
|
||||
max 16.9
|
||||
Recordings available: 1292
|
||||
Features available: 1292
|
||||
Supervisions available: 1292
|
||||
SUPERVISION custom fields:
|
||||
- fluent (in 1292 cuts)
|
||||
- number (in 1292 cuts)
|
||||
- symbol (in 1292 cuts)
|
||||
- disfluent (in 1292 cuts)
|
||||
|
||||
## eval3
|
||||
Cuts count: 1385
|
||||
Total duration (hh:mm:ss): 01:19:21
|
||||
Speech duration (hh:mm:ss): 01:19:21 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 3.4
|
||||
std 3.0
|
||||
min 0.2
|
||||
25% 1.2
|
||||
50% 2.5
|
||||
75% 4.6
|
||||
99% 12.7
|
||||
99.5% 13.7
|
||||
99.9% 15.0
|
||||
max 15.9
|
||||
Recordings available: 1385
|
||||
Features available: 1385
|
||||
Supervisions available: 1385
|
||||
SUPERVISION custom fields:
|
||||
- number (in 1385 cuts)
|
||||
- symbol (in 1385 cuts)
|
||||
- fluent (in 1385 cuts)
|
||||
- disfluent (in 1385 cuts)
|
||||
|
||||
## valid
|
||||
Cuts count: 4000
|
||||
Total duration (hh:mm:ss): 05:08:09
|
||||
Speech duration (hh:mm:ss): 05:08:09 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 4.6
|
||||
std 3.8
|
||||
min 0.1
|
||||
25% 1.5
|
||||
50% 3.4
|
||||
75% 7.0
|
||||
99% 13.8
|
||||
99.5% 14.8
|
||||
99.9% 16.0
|
||||
max 17.3
|
||||
Recordings available: 4000
|
||||
Features available: 4000
|
||||
Supervisions available: 4000
|
||||
SUPERVISION custom fields:
|
||||
- fluent (in 4000 cuts)
|
||||
- symbol (in 4000 cuts)
|
||||
- disfluent (in 4000 cuts)
|
||||
- number (in 4000 cuts)
|
||||
|
||||
## train
|
||||
Cuts count: 1291134
|
||||
Total duration (hh:mm:ss): 1596:37:27
|
||||
Speech duration (hh:mm:ss): 1596:37:27 (100.0%)
|
||||
Duration statistics (seconds):
|
||||
mean 4.5
|
||||
std 3.6
|
||||
min 0.0
|
||||
25% 1.6
|
||||
50% 3.3
|
||||
75% 6.4
|
||||
99% 14.0
|
||||
99.5% 14.8
|
||||
99.9% 16.6
|
||||
max 27.8
|
||||
Recordings available: 1291134
|
||||
Features available: 1291134
|
||||
Supervisions available: 1291134
|
||||
SUPERVISION custom fields:
|
||||
- disfluent (in 1291134 cuts)
|
||||
- fluent (in 1291134 cuts)
|
||||
- symbol (in 1291134 cuts)
|
||||
- number (in 1291134 cuts)
|
||||
"""
|
155
egs/csj/ASR/local/prepare_lang_char.py
Normal file
155
egs/csj/ASR/local/prepare_lang_char.py
Normal file
@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet
|
||||
|
||||
ARGPARSE_DESCRIPTION = """
|
||||
This script gathers all training transcripts of the specified {trans_mode} type
|
||||
and produces a token_list that would be output set of the ASR system.
|
||||
|
||||
It splits transcripts by whitespace into lists, then, for each word in the
|
||||
list, if the word does not appear in the list of user-defined multicharacter
|
||||
strings, it further splits that word into individual characters to be counted
|
||||
into the output token set.
|
||||
|
||||
It outputs 4 files into the lang directory:
|
||||
- trans_mode: the name of transcript mode. If trans_mode was not specified,
|
||||
this will be an empty file.
|
||||
- userdef_string: a list of user defined strings that should not be split
|
||||
further into individual characters. By default, it contains "<unk>", "<blk>",
|
||||
"<sos/eos>"
|
||||
- words_len: the total number of tokens in the output set.
|
||||
- words.txt: a list of tokens in the output set. The length matches words_len.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_args():
    """Define the command-line options of this script and parse them.

    Options:
      --train-cut       (required) path to the training cut manifest.
      --trans-mode      name of the transcript mode; defaults to None.
      --lang-dir        output directory; defaults to None.
      --userdef-string  path to a file of multicharacter strings that must
                        not be split; defaults to None.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    trans_mode_help = (
        "Name of the transcript mode to use. "
        "If lang-dir is not set, this will also name the lang-dir"
    )
    lang_dir_help = (
        "Name of lang dir. "
        "If not set, this will default to lang_char_{trans-mode}"
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=ARGPARSE_DESCRIPTION,
    )
    cli.add_argument(
        "--train-cut",
        help="Path to the train cut",
        required=True,
        type=Path,
    )
    cli.add_argument(
        "--trans-mode",
        help=trans_mode_help,
        default=None,
        type=str,
    )
    cli.add_argument(
        "--lang-dir",
        help=lang_dir_help,
        default=None,
        type=Path,
    )
    cli.add_argument(
        "--userdef-string",
        help="Multicharacter strings that do not need to be split",
        default=None,
        type=Path,
    )

    parsed = cli.parse_args()
    return parsed
|
||||
|
||||
|
||||
def main():
    """Build the character-level token list for one transcript mode.

    The transcript of the first supervision of every training cut is split
    on whitespace. A piece that is listed in the user-defined strings is
    kept whole; any other piece is broken into single characters. The
    resulting tokens are sorted and written to the lang directory as:

    - ``words.txt``: one ``token<TAB>index`` pair per line, with ``<blk>``
      first and ``<unk>``, ``<sos/eos>`` last.
    - ``words_len``: the number of tokens in ``words.txt``.
    - ``userdef_string``: the strings that are never split.
    - ``trans_mode``: the transcript mode name, or an empty file when
      ``--trans-mode`` was not given.

    Raises:
      KeyError: if a cut has no transcript for the requested mode.
    """
    args = get_args()

    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] " "%(message)s"
        ),
        level=logging.INFO,
    )

    if not args.lang_dir:
        p = "lang_char"
        if args.trans_mode:
            p += f"_{args.trans_mode}"
        args.lang_dir = Path(p)

    # The transcripts are Japanese, so every file is read and written as
    # UTF-8 explicitly instead of relying on the locale's default encoding.
    if args.userdef_string:
        args.userdef_string = set(
            args.userdef_string.read_text(encoding="utf-8").split()
        )
    else:
        args.userdef_string = set()

    sysdef_string = ["<blk>", "<unk>", "<sos/eos>"]
    args.userdef_string.update(sysdef_string)

    train_set: CutSet = CutSet.from_file(args.train_cut)

    words = set()
    logging.info(
        f"Creating vocabulary from {args.train_cut.name}"
        f" at {args.trans_mode} mode."
    )
    for cut in train_set:
        supervision = cut.supervisions[0]
        if args.trans_mode:
            # ``custom`` may be None; treat that like a missing key so the
            # caller always gets the descriptive KeyError below.
            custom = supervision.custom or {}
            try:
                text: str = custom[args.trans_mode]
            except KeyError:
                raise KeyError(
                    f"Could not find {args.trans_mode} in "
                    f"{supervision.custom}"
                )
        else:
            text = supervision.text

        for t in text.split():
            if t in args.userdef_string:
                words.add(t)
            else:
                # Iterating a string yields its individual characters.
                words.update(t)

    # The special tokens get fixed positions, so drop them before sorting.
    words -= set(sysdef_string)
    words = sorted(words)
    words = ["<blk>"] + words + ["<unk>", "<sos/eos>"]

    args.lang_dir.mkdir(parents=True, exist_ok=True)
    (args.lang_dir / "words.txt").write_text(
        "\n".join(f"{word}\t{i}" for i, word in enumerate(words)),
        encoding="utf-8",
    )

    (args.lang_dir / "words_len").write_text(
        f"{len(words)}", encoding="utf-8"
    )

    # Sorted so that the file content does not depend on set ordering.
    (args.lang_dir / "userdef_string").write_text(
        "\n".join(sorted(args.userdef_string)), encoding="utf-8"
    )

    # write_text() rejects None; an unspecified mode gives an empty file.
    (args.lang_dir / "trans_mode").write_text(
        args.trans_mode or "", encoding="utf-8"
    )
    logging.info("Done.")


if __name__ == "__main__":
    main()
|
98
egs/csj/ASR/local/validate_manifest.py
Normal file
98
egs/csj/ASR/local/validate_manifest.py
Normal file
@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This script checks the following assumptions of the generated manifest:
|
||||
|
||||
- Single supervision per cut
|
||||
- Supervision end time does not exceed the cut end time
|
||||
|
||||
We will add more checks later if needed.
|
||||
|
||||
Usage example:
|
||||
|
||||
python3 ./local/validate_manifest.py \
|
||||
--manifest ./data/manifests/csj_cuts_train.jsonl.gz
|
||||
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse.cut import Cut
|
||||
|
||||
|
||||
def get_args():
    """Parse the command line and return the parsed arguments.

    Returns:
      An ``argparse.Namespace`` with one attribute, ``manifest``
      (a ``pathlib.Path``).
    """
    parser = argparse.ArgumentParser()

    # Required: main() wraps this value in ``Path(...)``, which raises a
    # TypeError if the option were omitted and left as None.
    parser.add_argument(
        "--manifest",
        type=Path,
        required=True,
        help="Path to the manifest file",
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def validate_one_supervision_per_cut(c: Cut):
    """Check that the cut carries exactly one supervision.

    Args:
      c: The cut to check.

    Raises:
      ValueError: if the number of supervisions is anything other than one.
    """
    if len(c.supervisions) == 1:
        return

    raise ValueError(f"{c.id} has {len(c.supervisions)} supervisions")
|
||||
|
||||
|
||||
def validate_supervision_and_cut_time_bounds(c: Cut):
    """Check that the first supervision does not end after its cut.

    Only the end time is compared. The start-time comparison was removed
    on purpose: when cuts are trimmed from supervisions, the supervision
    start time can be less than the cut start time. See
    https://github.com/lhotse-speech/lhotse/issues/813

    Args:
      c: The cut to check; its first supervision is inspected.

    Raises:
      ValueError: if the supervision end time is larger than the cut end
        time.
    """
    s = c.supervisions[0]

    supervision_overruns_cut = s.end > c.end
    if supervision_overruns_cut:
        raise ValueError(
            f"{c.id}: Supervision end time {s.end} is larger "
            f"than cut end time {c.end}"
        )
|
||||
|
||||
|
||||
def main():
    """Load the manifest named on the command line and validate every cut.

    The manifest must be an existing file that loads as a ``CutSet``; both
    conditions are asserted. Each cut is then passed through every
    validator, and the first failing validator raises ``ValueError``.
    """
    manifest = Path(get_args().manifest)
    logging.info(f"Validating {manifest}")

    assert manifest.is_file(), f"{manifest} does not exist"
    cut_set = load_manifest(manifest)
    assert isinstance(cut_set, CutSet)

    # Validators run in this order for each cut.
    checks = (
        validate_one_supervision_per_cut,
        validate_supervision_and_cut_time_bounds,
    )
    for c in cut_set:
        for check in checks:
            check(c)


if __name__ == "__main__":
    # Configure logging before main() emits its first message.
    formatter = (
        "%(asctime)s %(levelname)s "
        "[%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(level=logging.INFO, format=formatter)

    main()
|
130
egs/csj/ASR/prepare.sh
Executable file
130
egs/csj/ASR/prepare.sh
Executable file
@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env bash
|
||||
# We assume the following directories are downloaded.
|
||||
#
|
||||
# - $csj_dir
|
||||
# CSJ is assumed to be the USB-type directory, which should contain the following subdirectories:-
|
||||
# - DATA (not used in this script)
|
||||
# - DOC (not used in this script)
|
||||
# - MODEL (not used in this script)
|
||||
# - MORPH
|
||||
# - LDB (not used in this script)
|
||||
# - SUWDIC (not used in this script)
|
||||
# - SDB
|
||||
# - core
|
||||
# - ...
|
||||
# - noncore
|
||||
# - ...
|
||||
# - PLABEL (not used in this script)
|
||||
# - SUMMARY (not used in this script)
|
||||
# - TOOL (not used in this script)
|
||||
# - WAV
|
||||
# - core
|
||||
# - ...
|
||||
# - noncore
|
||||
# - ...
|
||||
# - XML (not used in this script)
|
||||
#
|
||||
# - $musan_dir
|
||||
# This directory contains the following directories downloaded from
|
||||
# http://www.openslr.org/17/
|
||||
# - music
|
||||
# - noise
|
||||
# - speech
|
||||
#
|
||||
# By default, this script produces the original transcript like kaldi and espnet. Optionally, you
|
||||
# can generate other transcript formats by supplying your own config files. A few examples of these
|
||||
# config files can be found in local/conf.
|
||||
|
||||
# Abort on errors, on unset variables and on failures inside pipelines.
set -eou pipefail

# Defaults. They are assigned before shared/parse_options.sh is sourced
# below (presumably so that command-line options can override them -- the
# helper itself is not part of this file).
# NOTE: nj is not referenced by any stage in this script.
nj=8
stage=-1
stop_stage=100

# Locations of the corpora and of the generated manifests / features.
csj_dir="/mnt/minami_data_server/t2131178/corpus/CSJ"
musan_dir="/mnt/minami_data_server/t2131178/corpus/musan/musan"
trans_dir="$csj_dir/retranscript"
csj_fbank_dir="/mnt/host/csj_data/fbank"
musan_fbank_dir="$musan_dir/fbank"
csj_manifest_dir="data/manifests"
musan_manifest_dir="$musan_dir/manifests"

. shared/parse_options.sh || exit 1

mkdir -p data
|
||||
|
||||
# Print a message prefixed with the current time and with the file name,
# line number and function name of the caller.
log() {
  local caller_file=${BASH_SOURCE[1]##*/}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
}
|
||||
|
||||
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare CSJ manifest"
  # If you want to generate more transcript modes, append the paths of those
  # config files with -c.
  # Example: lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -c local/conf/disfluent.ini
  # NOTE: In case multiple config files are supplied, the second config file and onwards will inherit
  # the segment boundaries of the first config file.
  #
  # The completion marker is .csj.done. The earlier name .librispeech.done
  # is still honoured so that an already prepared directory is not redone.
  # Paths are quoted so that directories containing spaces work.
  if [ ! -e "$csj_manifest_dir/.csj.done" ] &&
     [ ! -e "$csj_manifest_dir/.librispeech.done" ]; then
    lhotse prepare csj "$csj_dir" "$trans_dir" "$csj_manifest_dir" -j 4
    touch "$csj_manifest_dir/.csj.done"
  fi
fi
|
||||
|
||||
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # Paths are quoted so that directories containing spaces work.
  mkdir -p "$musan_manifest_dir"
  # The .musan.done marker makes this stage a no-op on later runs.
  if [ ! -e "$musan_manifest_dir/.musan.done" ]; then
    lhotse prepare musan "$musan_dir" "$musan_manifest_dir"
    touch "$musan_manifest_dir/.musan.done"
  fi
fi
|
||||
|
||||
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute CSJ fbank"
  # The marker is written only after feature extraction and the validation
  # of every partition have succeeded (set -e aborts on the first failure).
  # Paths are quoted so that directories containing spaces work.
  if [ ! -e "$csj_fbank_dir/.csj-validated.done" ]; then
    python local/compute_fbank_csj.py --manifest-dir "$csj_manifest_dir" \
      --fbank-dir "$csj_fbank_dir"
    parts=(
      train
      valid
      eval1
      eval2
      eval3
    )
    for part in "${parts[@]}"; do
      python local/validate_manifest.py --manifest "$csj_manifest_dir/csj_cuts_$part.jsonl.gz"
    done
    touch "$csj_fbank_dir/.csj-validated.done"
  fi
fi
|
||||
|
||||
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Prepare CSJ lang"
  modes=disfluent

  # If you want to prepare the lang directory for other transcript modes,
  # just append the names of those modes. An example is shown below:
  # modes="$modes fluent symbol number"

  # $modes is a whitespace-separated string, so it is left unquoted on
  # purpose: word splitting turns it into one iteration per mode.
  for mode in $modes; do
    python local/prepare_lang_char.py --trans-mode "$mode" \
      --train-cut "$csj_manifest_dir/csj_cuts_train.jsonl.gz" \
      --lang-dir "lang_char_$mode"
  done
fi
|
||||
|
||||
if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Compute fbank for musan"
  # Paths are quoted so that directories containing spaces work.
  mkdir -p "$musan_fbank_dir"

  # The .musan.done marker makes this stage a no-op on later runs.
  if [ ! -e "$musan_fbank_dir/.musan.done" ]; then
    python local/compute_fbank_musan.py --manifest-dir "$musan_manifest_dir" --fbank-dir "$musan_fbank_dir"
    touch "$musan_fbank_dir/.musan.done"
  fi
fi
|
||||
|
||||
if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Show manifest statistics"
  # The report is saved next to the manifests and then echoed to the
  # terminal. Paths are quoted so that directories containing spaces work.
  python local/display_manifest_statistics.py --manifest-dir "$csj_manifest_dir" > "$csj_manifest_dir/manifest_statistics.txt"
  cat "$csj_manifest_dir/manifest_statistics.txt"
fi
|
1
egs/csj/ASR/shared
Symbolic link
1
egs/csj/ASR/shared
Symbolic link
@ -0,0 +1 @@
|
||||
../../../icefall/shared/
|
Loading…
x
Reference in New Issue
Block a user