CSJ Data Preparation (#617)

* workspace setup

* csj prepare done

* Change compute_fbank_musan.py to soft link

* add description

* change lhotse prepare csj command

* split train-dev here

* Add header

* remove debug

* save manifest_statistics

* generate transcript in Lhotse

* update comments in config file
This commit is contained in:
Teo Wen Shen 2022-10-18 16:56:43 +09:00 committed by GitHub
parent d69bb826ed
commit 15c1a4a441
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 2032 additions and 0 deletions

7
egs/csj/ASR/.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
librispeech_*.*
todelete*
lang*
notify_tg.py
finetune_*
misc.ini
.vscode/*

View File

@ -0,0 +1,173 @@
#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from itertools import islice
from pathlib import Path
from random import Random
from typing import List, Tuple
import torch
from lhotse import (
CutSet,
Fbank,
FbankConfig,
# fmt: off
# See the following for why LilcomChunkyWriter is preferred
# https://github.com/k2-fsa/icefall/pull/404
# https://github.com/lhotse-speech/lhotse/pull/527
# fmt: on
LilcomChunkyWriter,
RecordingSet,
SupervisionSet,
)
# Help text passed to argparse as the program description (see get_args()).
ARGPARSE_DESCRIPTION = """
This script follows the espnet method of splitting the remaining core+noncore
utterances into valid and train cutsets at an index which is by default 4000.
In other words, the core+noncore utterances are shuffled, where 4000 utterances
of the shuffled set go to the `valid` cutset and are not subject to speed
perturbation. The remaining utterances become the `train` cutset and are speed-
perturbed (0.9x, 1.0x, 1.1x).
"""
# Torch's multithreaded behavior needs to be disabled, or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking main() (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
# Seed of the Random instance used to shuffle the core+noncore cuts before
# they are split into valid and train, so the split is reproducible.
RNG_SEED = 42
def make_cutset_blueprints(
    manifest_dir: Path,
    split: int,
) -> List[Tuple[str, CutSet]]:
    """Assemble the named cut sets whose features are to be computed.

    Recording and supervision manifests are read from ``manifest_dir`` using
    the file names ``csj_recordings_<part>.jsonl.gz`` and
    ``csj_supervisions_<part>.jsonl.gz`` for the parts ``eval1``, ``eval2``,
    ``eval3``, ``core`` and ``noncore``.

    Args:
      manifest_dir:
        Directory containing the recording and supervision manifests.
      split:
        Index at which the shuffled core+noncore cuts are divided. The first
        ``split`` cuts become ``valid``; the remaining cuts become ``train``.

    Returns:
      A list of ``(name, cut_set)`` pairs in the order ``eval1``, ``eval2``,
      ``eval3``, ``valid``, ``train``. Only ``train`` additionally contains
      the 0.9x and 1.1x speed-perturbed copies of its cuts.
    """
    cut_sets = []

    # Create eval datasets
    logging.info("Creating eval cuts.")
    for i in range(1, 4):
        cut_set = CutSet.from_manifests(
            recordings=RecordingSet.from_file(
                manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
            ),
            supervisions=SupervisionSet.from_file(
                manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
            ),
        )
        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
        cut_sets.append((f"eval{i}", cut_set))

    # Create train and valid cuts
    logging.info(
        "Loading, trimming, and shuffling the remaining core+noncore cuts."
    )
    recording_set = RecordingSet.from_file(
        manifest_dir / "csj_recordings_core.jsonl.gz"
    ) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
    supervision_set = SupervisionSet.from_file(
        manifest_dir / "csj_supervisions_core.jsonl.gz"
    ) + SupervisionSet.from_file(
        manifest_dir / "csj_supervisions_noncore.jsonl.gz"
    )

    cut_set = CutSet.from_manifests(
        recordings=recording_set,
        supervisions=supervision_set,
    )
    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
    # Fixed seed so that the valid/train membership is the same on every run.
    cut_set = cut_set.shuffle(Random(RNG_SEED))

    # The trailing space matters: the two literals are concatenated, and
    # without it the message reads "noncore,split at ...".
    logging.info(
        "Creating valid and train cuts from core and noncore, "
        f"split at {split}."
    )
    valid_set = CutSet.from_cuts(islice(cut_set, 0, split))

    train_set = CutSet.from_cuts(islice(cut_set, split, None))
    # Speed perturbation is applied to the training portion only.
    train_set = (
        train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
    )

    cut_sets.extend([("valid", valid_set), ("train", train_set)])

    return cut_sets
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      The ``argparse.Namespace`` with ``manifest_dir`` (Path), ``fbank_dir``
      (Path) and ``split`` (int, default 4000).
    """
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Both directories are mandatory: main() joins paths onto them, which
    # fails with a TypeError if they are left as None.
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Path to save manifests",
    )
    parser.add_argument(
        "--fbank-dir",
        type=Path,
        required=True,
        help="Path to save fbank features",
    )
    parser.add_argument(
        "--split", type=int, default=4000, help="Split at this index"
    )

    return parser.parse_args()
def main():
    """Compute and store fbank features for every CSJ cut set.

    For each cut set returned by ``make_cutset_blueprints`` the features are
    written under ``<fbank_dir>/feats_<part>`` and the resulting cuts are
    saved to ``<manifest_dir>/csj_cuts_<part>.jsonl.gz``. When everything has
    been processed, an empty ``<fbank_dir>/.done`` marker is created; if that
    marker already exists the function returns without recomputing anything.
    """
    args = get_args()

    extractor = Fbank(FbankConfig(num_mel_bins=80))
    # os.cpu_count() may return None when the CPU count cannot be determined;
    # fall back to one job instead of letting min() raise a TypeError.
    num_jobs = min(16, os.cpu_count() or 1)

    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)

    done_marker = args.fbank_dir / ".done"
    if done_marker.exists():
        logging.info(
            "Previous fbank computed for CSJ found. "
            f"Delete {done_marker} to allow recomputing fbank."
        )
        return

    cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
    for part, cut_set in cut_sets:
        logging.info(f"Processing {part}")
        cut_set = cut_set.compute_and_store_features(
            extractor=extractor,
            num_jobs=num_jobs,
            storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
            storage_type=LilcomChunkyWriter,
        )
        cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")

    logging.info("All fbank computed for CSJ.")
    # Written last, so the marker only exists after every part succeeded.
    done_marker.touch()
# Run the feature extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@ -0,0 +1,321 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = disfluent
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = +
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 0
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 0
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 0
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 0
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 0
; # Example: '(X (D2 ))'
D2^ = 0
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = eval:self.notag
A_num^ = eval:self.notag
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
ア = 'a
イ = 'i
ウ = 'u
エ = 'e
オ = 'o
カ = ka
キ = ki
ク = ku
ケ = ke
コ = ko
ガ = ga
ギ = gi
グ = gu
ゲ = ge
ゴ = go
サ = sa
シ = si
ス = su
セ = se
ソ = so
ザ = za
ジ = zi
ズ = zu
ゼ = ze
ゾ = zo
タ = ta
チ = ti
ツ = tu
テ = te
ト = to
ダ = da
ヂ = di
ヅ = du
デ = de
ド = do
ナ = na
ニ = ni
ヌ = nu
ネ = ne
ノ = no
ハ = ha
ヒ = hi
フ = hu
ヘ = he
ホ = ho
バ = ba
ビ = bi
ブ = bu
ベ = be
ボ = bo
パ = pa
ピ = pi
プ = pu
ペ = pe
ポ = po
マ = ma
ミ = mi
ム = mu
メ = me
モ = mo
ヤ = ya
ユ = yu
ヨ = yo
ラ = ra
リ = ri
ル = ru
レ = re
ロ = ro
ワ = wa
ヱ = we
ヰ = wi
ヲ = wo
ン = ŋ
ッ = q
ー = -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
ァ = a
ィ = i
ゥ = u
ェ = e
ォ = o
ヮ = ʍ
ヴ = vu
ャ = ǐa
ュ = ǐu
ョ = ǐo

View File

@ -0,0 +1,321 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = fluent
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = +
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 1
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 1
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 1
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 1
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 1
; # Example: '(X (D2 ))'
D2^ = 1
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = eval:self.notag
A_num^ = eval:self.notag
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
ア = 'a
イ = 'i
ウ = 'u
エ = 'e
オ = 'o
カ = ka
キ = ki
ク = ku
ケ = ke
コ = ko
ガ = ga
ギ = gi
グ = gu
ゲ = ge
ゴ = go
サ = sa
シ = si
ス = su
セ = se
ソ = so
ザ = za
ジ = zi
ズ = zu
ゼ = ze
ゾ = zo
タ = ta
チ = ti
ツ = tu
テ = te
ト = to
ダ = da
ヂ = di
ヅ = du
デ = de
ド = do
ナ = na
ニ = ni
ヌ = nu
ネ = ne
ノ = no
ハ = ha
ヒ = hi
フ = hu
ヘ = he
ホ = ho
バ = ba
ビ = bi
ブ = bu
ベ = be
ボ = bo
パ = pa
ピ = pi
プ = pu
ペ = pe
ポ = po
マ = ma
ミ = mi
ム = mu
メ = me
モ = mo
ヤ = ya
ユ = yu
ヨ = yo
ラ = ra
リ = ri
ル = ru
レ = re
ロ = ro
ワ = wa
ヱ = we
ヰ = wi
ヲ = wo
ン = ŋ
ッ = q
ー = -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
ァ = a
ィ = i
ゥ = u
ェ = e
ォ = o
ヮ = ʍ
ヴ = vu
ャ = ǐa
ュ = ǐu
ョ = ǐo

View File

@ -0,0 +1,321 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = number
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = +
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 1
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 1
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 1
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 1
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 1
; # Example: '(X (D2 ))'
D2^ = 1
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = 1
A_num^ = 1
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
ア = 'a
イ = 'i
ウ = 'u
エ = 'e
オ = 'o
カ = ka
キ = ki
ク = ku
ケ = ke
コ = ko
ガ = ga
ギ = gi
グ = gu
ゲ = ge
ゴ = go
サ = sa
シ = si
ス = su
セ = se
ソ = so
ザ = za
ジ = zi
ズ = zu
ゼ = ze
ゾ = zo
タ = ta
チ = ti
ツ = tu
テ = te
ト = to
ダ = da
ヂ = di
ヅ = du
デ = de
ド = do
ナ = na
ニ = ni
ヌ = nu
ネ = ne
ノ = no
ハ = ha
ヒ = hi
フ = hu
ヘ = he
ホ = ho
バ = ba
ビ = bi
ブ = bu
ベ = be
ボ = bo
パ = pa
ピ = pi
プ = pu
ペ = pe
ポ = po
マ = ma
ミ = mi
ム = mu
メ = me
モ = mo
ヤ = ya
ユ = yu
ヨ = yo
ラ = ra
リ = ri
ル = ru
レ = re
ロ = ro
ワ = wa
ヱ = we
ヰ = wi
ヲ = wo
ン = ŋ
ッ = q
ー = -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
ァ = a
ィ = i
ゥ = u
ェ = e
ォ = o
ヮ = ʍ
ヴ = vu
ャ = ǐa
ュ = ǐu
ョ = ǐo

View File

@ -0,0 +1,322 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
; # See https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf
MODE = symbol
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = +
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F =
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ =
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D =
; # Example: '(L (D ドゥ)(D ヒ))'
D^ =
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 =
; # Example: '(X (D2 ))'
D2^ =
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = eval:self.notag
A_num^ = eval:self.notag
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
ア = 'a
イ = 'i
ウ = 'u
エ = 'e
オ = 'o
カ = ka
キ = ki
ク = ku
ケ = ke
コ = ko
ガ = ga
ギ = gi
グ = gu
ゲ = ge
ゴ = go
サ = sa
シ = si
ス = su
セ = se
ソ = so
ザ = za
ジ = zi
ズ = zu
ゼ = ze
ゾ = zo
タ = ta
チ = ti
ツ = tu
テ = te
ト = to
ダ = da
ヂ = di
ヅ = du
デ = de
ド = do
ナ = na
ニ = ni
ヌ = nu
ネ = ne
ノ = no
ハ = ha
ヒ = hi
フ = hu
ヘ = he
ホ = ho
バ = ba
ビ = bi
ブ = bu
ベ = be
ボ = bo
パ = pa
ピ = pi
プ = pu
ペ = pe
ポ = po
マ = ma
ミ = mi
ム = mu
メ = me
モ = mo
ヤ = ya
ユ = yu
ヨ = yo
ラ = ra
リ = ri
ル = ru
レ = re
ロ = ro
ワ = wa
ヱ = we
ヰ = wi
ヲ = wo
ン = ŋ
ッ = q
ー = -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
ァ = a
ィ = i
ゥ = u
ェ = e
ォ = o
ヮ = ʍ
ヴ = vu
ャ = ǐa
ュ = ǐu
ョ = ǐo

View File

@ -0,0 +1,182 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
# 2022 The University of Electro-Communications (author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from lhotse import CutSet, load_manifest
# Help text passed to argparse as the program description (see get_parser()).
ARGPARSE_DESCRIPTION = """
This file displays duration statistics of utterances in a manifest.
You can use the displayed value to choose minimum/maximum duration
to remove short and long utterances during the training.
See the function `remove_short_and_long_utt()` in
pruned_transducer_stateless5/train.py for usage.
"""
def get_parser():
    """Parse the command-line arguments of this script.

    Despite its name, this returns the parsed arguments rather than the
    parser object.

    Returns:
      The ``argparse.Namespace`` with ``manifest_dir`` (Path).
    """
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Mandatory: main() calls .glob() on this value, which fails with an
    # AttributeError if it is left as None.
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Path to cutset manifests",
    )

    return parser.parse_args()
def main():
    """Print the statistics of every ``csj_cuts_*.jsonl.gz`` manifest.

    Each manifest found in the directory given by ``--manifest-dir`` is
    loaded, announced by a separator line and its file name, and then
    summarized with ``describe()``.
    """
    manifest_dir = get_parser().manifest_dir
    separator = "\n---------------------------------\n"

    for manifest_path in manifest_dir.glob("csj_cuts_*.jsonl.gz"):
        # Load before printing so nothing is announced if loading fails.
        cut_set: CutSet = load_manifest(manifest_path)
        print(separator)
        print(f"{manifest_path.name}:")
        cut_set.describe()
# Print the statistics only when executed as a script, not on import.
if __name__ == "__main__":
    main()
"""
## eval1
Cuts count: 1272
Total duration (hh:mm:ss): 01:50:07
Speech duration (hh:mm:ss): 01:50:07 (100.0%)
Duration statistics (seconds):
mean 5.2
std 3.9
min 0.2
25% 1.9
50% 4.0
75% 8.1
99% 14.3
99.5% 14.7
99.9% 16.0
max 16.9
Recordings available: 1272
Features available: 1272
Supervisions available: 1272
SUPERVISION custom fields:
- fluent (in 1272 cuts)
- disfluent (in 1272 cuts)
- number (in 1272 cuts)
- symbol (in 1272 cuts)
## eval2
Cuts count: 1292
Total duration (hh:mm:ss): 01:56:50
Speech duration (hh:mm:ss): 01:56:50 (100.0%)
Duration statistics (seconds):
mean 5.4
std 3.9
min 0.1
25% 2.1
50% 4.6
75% 8.6
99% 14.1
99.5% 15.2
99.9% 16.1
max 16.9
Recordings available: 1292
Features available: 1292
Supervisions available: 1292
SUPERVISION custom fields:
- fluent (in 1292 cuts)
- number (in 1292 cuts)
- symbol (in 1292 cuts)
- disfluent (in 1292 cuts)
## eval3
Cuts count: 1385
Total duration (hh:mm:ss): 01:19:21
Speech duration (hh:mm:ss): 01:19:21 (100.0%)
Duration statistics (seconds):
mean 3.4
std 3.0
min 0.2
25% 1.2
50% 2.5
75% 4.6
99% 12.7
99.5% 13.7
99.9% 15.0
max 15.9
Recordings available: 1385
Features available: 1385
Supervisions available: 1385
SUPERVISION custom fields:
- number (in 1385 cuts)
- symbol (in 1385 cuts)
- fluent (in 1385 cuts)
- disfluent (in 1385 cuts)
## valid
Cuts count: 4000
Total duration (hh:mm:ss): 05:08:09
Speech duration (hh:mm:ss): 05:08:09 (100.0%)
Duration statistics (seconds):
mean 4.6
std 3.8
min 0.1
25% 1.5
50% 3.4
75% 7.0
99% 13.8
99.5% 14.8
99.9% 16.0
max 17.3
Recordings available: 4000
Features available: 4000
Supervisions available: 4000
SUPERVISION custom fields:
- fluent (in 4000 cuts)
- symbol (in 4000 cuts)
- disfluent (in 4000 cuts)
- number (in 4000 cuts)
## train
Cuts count: 1291134
Total duration (hh:mm:ss): 1596:37:27
Speech duration (hh:mm:ss): 1596:37:27 (100.0%)
Duration statistics (seconds):
mean 4.5
std 3.6
min 0.0
25% 1.6
50% 3.3
75% 6.4
99% 14.0
99.5% 14.8
99.9% 16.6
max 27.8
Recordings available: 1291134
Features available: 1291134
Supervisions available: 1291134
SUPERVISION custom fields:
- disfluent (in 1291134 cuts)
- fluent (in 1291134 cuts)
- symbol (in 1291134 cuts)
- number (in 1291134 cuts)
"""

View File

@ -0,0 +1,155 @@
#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from lhotse import CutSet
# Description text handed to argparse.ArgumentParser in get_args().
# NOTE(review): the "{trans_mode}" placeholder below is never interpolated by
# the code in this file, so it is displayed literally -- confirm whether that
# is intended.
ARGPARSE_DESCRIPTION = """
This script gathers all training transcripts of the specified {trans_mode} type
and produces a token_list that would be output set of the ASR system.
It splits transcripts by whitespace into lists, then, for each word in the
list, if the word does not appear in the list of user-defined multicharacter
strings, it further splits that word into individual characters to be counted
into the output token set.
It outputs 4 files into the lang directory:
- trans_mode: the name of transcript mode. If trans_mode was not specified,
this will be an empty file.
- userdef_string: a list of user defined strings that should not be split
further into individual characters. By default, it contains "<unk>", "<blk>",
"<sos/eos>"
- words_len: the total number of tokens in the output set.
- words.txt: a list of tokens in the output set. The length matches words_len.
"""
def get_args():
    """Parse the command-line options of the lang directory preparation.

    Returns:
      The ``argparse.Namespace`` produced from ``sys.argv``. It carries the
      attributes ``train_cut``, ``trans_mode``, ``lang_dir`` and
      ``userdef_string``; all but ``train_cut`` default to ``None``.
    """
    trans_mode_help = (
        "Name of the transcript mode to use. "
        "If lang-dir is not set, this will also name the lang-dir"
    )
    lang_dir_help = (
        "Name of lang dir. "
        "If not set, this will default to lang_char_{trans-mode}"
    )

    # The options are registered in table order, which is also the order in
    # which --help lists them.
    option_table = [
        (
            "--train-cut",
            dict(type=Path, required=True, help="Path to the train cut"),
        ),
        (
            "--trans-mode",
            dict(type=str, default=None, help=trans_mode_help),
        ),
        (
            "--lang-dir",
            dict(type=Path, default=None, help=lang_dir_help),
        ),
        (
            "--userdef-string",
            dict(
                type=Path,
                default=None,
                help="Multicharacter strings that do not need to be split",
            ),
        ),
    ]

    arg_parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for flag, spec in option_table:
        arg_parser.add_argument(flag, **spec)

    return arg_parser.parse_args()
def main():
    """Build the character-level token set from the training transcripts.

    The transcript of every training cut is split on whitespace. A word that
    is listed among the user-defined strings is kept as a single token; every
    other word is broken up into its individual characters. The collected
    tokens are sorted and laid out as ``<blk>``, tokens..., ``<unk>``,
    ``<sos/eos>``.

    Four files are written into the lang directory:
      - ``words.txt``: one ``token<TAB>index`` pair per line
      - ``words_len``: the number of tokens in ``words.txt``
      - ``userdef_string``: the strings that are never split into characters
      - ``trans_mode``: the transcript mode, or an empty file if none was
        given

    Raises:
      KeyError: If the requested transcript mode cannot be found in the
        custom fields of a supervision.
    """
    args = get_args()

    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] " "%(message)s"
        ),
        level=logging.INFO,
    )

    # Derive the lang directory name from the transcript mode if not given.
    if not args.lang_dir:
        p = "lang_char"
        if args.trans_mode:
            p += f"_{args.trans_mode}"
        args.lang_dir = Path(p)

    if args.userdef_string:
        args.userdef_string = set(args.userdef_string.read_text().split())
    else:
        args.userdef_string = set()

    # The special symbols must never be split into characters.
    sysdef_string = ["<blk>", "<unk>", "<sos/eos>"]
    args.userdef_string.update(sysdef_string)

    train_set: CutSet = CutSet.from_file(args.train_cut)

    words = set()

    logging.info(
        f"Creating vocabulary from {args.train_cut.name}"
        f" at {args.trans_mode} mode."
    )

    for cut in train_set:
        supervision = cut.supervisions[0]
        if args.trans_mode:
            try:
                text: str = supervision.custom[args.trans_mode]
            except (KeyError, TypeError) as err:
                # TypeError is raised when the custom fields are not
                # subscriptable (e.g. ``custom`` is None); report it the same
                # way as a missing key.
                raise KeyError(
                    f"Could not find {args.trans_mode} in "
                    f"{supervision.custom}"
                ) from err
        else:
            text = supervision.text

        for t in text.split():
            if t in args.userdef_string:
                words.add(t)
            else:
                # Updating a set with a string adds its individual characters.
                words.update(t)

    # The special symbols get fixed positions: <blk> first, the others last.
    words -= set(sysdef_string)
    words = sorted(words)
    words = ["<blk>"] + words + ["<unk>", "<sos/eos>"]

    args.lang_dir.mkdir(parents=True, exist_ok=True)
    (args.lang_dir / "words.txt").write_text(
        "\n".join(f"{word}\t{i}" for i, word in enumerate(words))
    )

    (args.lang_dir / "words_len").write_text(f"{len(words)}")

    # Sorted so that the file content does not depend on set iteration order.
    (args.lang_dir / "userdef_string").write_text(
        "\n".join(sorted(args.userdef_string))
    )

    # write_text() rejects None, so an unset mode is written as an empty file.
    (args.lang_dir / "trans_mode").write_text(args.trans_mode or "")
    logging.info("Done.")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,98 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script checks the following assumptions of the generated manifest:
- Single supervision per cut
- Supervision end time is not later than the cut end time (the start time is deliberately not checked)
We will add more checks later if needed.
Usage example:
python3 ./local/validate_manifest.py \
--manifest ./data/manifests/csj_cuts_train.jsonl.gz
"""
import argparse
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse.cut import Cut
def get_args():
    """Parse the command-line options of the manifest validator.

    Returns:
      The ``argparse.Namespace`` produced from ``sys.argv``; its ``manifest``
      attribute is the ``Path`` of the manifest file to validate.

    Raises:
      SystemExit: Raised by argparse if ``--manifest`` is not supplied.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--manifest",
        type=Path,
        # Without a manifest there is nothing to validate; fail here with a
        # usage message instead of later with a TypeError on Path(None).
        required=True,
        help="Path to the manifest file",
    )

    return parser.parse_args()
def validate_one_supervision_per_cut(c: Cut):
    """Ensure that the cut carries exactly one supervision.

    Args:
      c: The cut to check.

    Raises:
      ValueError: If the number of supervisions differs from one.
    """
    num_supervisions = len(c.supervisions)
    if num_supervisions == 1:
        return
    raise ValueError(f"{c.id} has {num_supervisions} supervisions")
def validate_supervision_and_cut_time_bounds(c: Cut):
    """Ensure that the first supervision does not end after the cut.

    Only the end time is checked. The matching check on the start time
    (supervision start not less than cut start) has been removed on purpose:
    when the cuts were trimmed from supervisions, the start time of the
    supervision can be less than the cut start time, see
    https://github.com/lhotse-speech/lhotse/issues/813

    Args:
      c: The cut to check; only its first supervision is inspected.

    Raises:
      ValueError: If the supervision end time is larger than the cut end
        time.
    """
    first_sup = c.supervisions[0]

    if not first_sup.end > c.end:
        return

    raise ValueError(
        f"{c.id}: Supervision end time {first_sup.end} is larger "
        f"than cut end time {c.end}"
    )
def main():
    """Validate every cut of the manifest named on the command line.

    The manifest is loaded and each of its cuts is run through all
    per-cut checks; the first failing check raises and stops the run.

    Raises:
      AssertionError: If the manifest file does not exist or does not
        contain a CutSet.
      ValueError: If a cut violates one of the checks.
    """
    manifest_path = Path(get_args().manifest)
    logging.info(f"Validating {manifest_path}")

    assert manifest_path.is_file(), f"{manifest_path} does not exist"

    cuts = load_manifest(manifest_path)
    assert isinstance(cuts, CutSet)

    # Applied to every cut, in this order.
    checks = (
        validate_one_supervision_per_cut,
        validate_supervision_and_cut_time_bounds,
    )
    for cut in cuts:
        for check in checks:
            check(cut)
if __name__ == "__main__":
    # Configure logging before any validation message is emitted.
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )

    main()

130
egs/csj/ASR/prepare.sh Executable file
View File

@ -0,0 +1,130 @@
#!/usr/bin/env bash
# We assume the following directories are downloaded.
#
# - $csj_dir
# CSJ is assumed to be the USB-type directory, which should contain the following subdirectories:-
# - DATA (not used in this script)
# - DOC (not used in this script)
# - MODEL (not used in this script)
# - MORPH
# - LDB (not used in this script)
# - SUWDIC (not used in this script)
# - SDB
# - core
# - ...
# - noncore
# - ...
# - PLABEL (not used in this script)
# - SUMMARY (not used in this script)
# - TOOL (not used in this script)
# - WAV
# - core
# - ...
# - noncore
# - ...
# - XML (not used in this script)
#
# - $musan_dir
# This directory contains the following directories downloaded from
# http://www.openslr.org/17/
# - music
# - noise
# - speech
#
# By default, this script produces the original transcript like kaldi and espnet. Optionally, you
# can generate other transcript formats by supplying your own config files. A few examples of these
# config files can be found in local/conf.
# Abort on the first failing command (-e), on the use of an unset variable
# (-u), and when any command of a pipeline fails (-o pipefail).
set -eou pipefail
# NOTE(review): nj is not referenced by any stage of this script; stage 1
# passes a hard-coded "-j 4" instead -- confirm whether that is intended.
nj=8
# A stage numbered N runs when stage <= N <= stop_stage.
stage=-1
stop_stage=100
# Locations of the downloaded corpora (see the header of this script).
# NOTE(review): these look like site-specific paths -- adjust them to your
# environment.
csj_dir=/mnt/minami_data_server/t2131178/corpus/CSJ
musan_dir=/mnt/minami_data_server/t2131178/corpus/musan/musan
# Passed to "lhotse prepare csj" in stage 1 together with $csj_dir.
trans_dir=$csj_dir/retranscript
# Output directories for the fbank features and the manifests.
csj_fbank_dir=/mnt/host/csj_data/fbank
musan_fbank_dir=$musan_dir/fbank
csj_manifest_dir=data/manifests
musan_manifest_dir=$musan_dir/manifests
# Presumably lets the variables above be overridden from the command line
# (e.g. --stage 3) -- verify against shared/parse_options.sh.
. shared/parse_options.sh || exit 1
mkdir -p data
log() {
  # Print the given message prefixed with a timestamp and the file name,
  # line number and function name of the caller.
  local caller_file
  caller_file="${BASH_SOURCE[1]##*/}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare CSJ manifest"
  # If you want to generate more transcript modes, append the paths of their
  # config files after the -c option.
  # Example: lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -c local/conf/disfluent.ini
  # NOTE: In case multiple config files are supplied, the second config file and onwards will inherit
  # the segment boundaries of the first config file.
  #
  # The completion marker is named after this corpus. The legacy marker
  # ".librispeech.done" is still honoured so that a manifest directory that
  # was prepared earlier is not processed again.
  if [ ! -e "$csj_manifest_dir/.csj.done" ] &&
     [ ! -e "$csj_manifest_dir/.librispeech.done" ]; then
    lhotse prepare csj "$csj_dir" "$trans_dir" "$csj_manifest_dir" -j 4
    # Make sure the directory exists before the marker is created in it.
    mkdir -p "$csj_manifest_dir"
    touch "$csj_manifest_dir/.csj.done"
  fi
fi
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # All expansions are quoted so that paths containing whitespace or glob
  # characters are passed on as single arguments.
  mkdir -p "$musan_manifest_dir"
  # The marker file prevents the manifest from being prepared twice.
  if [ ! -e "$musan_manifest_dir/.musan.done" ]; then
    lhotse prepare musan "$musan_dir" "$musan_manifest_dir"
    touch "$musan_manifest_dir/.musan.done"
  fi
fi
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute CSJ fbank"
  # The marker is only written after the features were computed and every
  # split passed validation. All expansions are quoted so that paths
  # containing whitespace or glob characters stay single arguments.
  if [ ! -e "$csj_fbank_dir/.csj-validated.done" ]; then
    python local/compute_fbank_csj.py --manifest-dir "$csj_manifest_dir" \
      --fbank-dir "$csj_fbank_dir"

    parts=(
      train
      valid
      eval1
      eval2
      eval3
    )
    for part in "${parts[@]}"; do
      python local/validate_manifest.py --manifest "$csj_manifest_dir/csj_cuts_$part.jsonl.gz"
    done
    touch "$csj_fbank_dir/.csj-validated.done"
  fi
fi
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Prepare CSJ lang"
  # One lang directory (lang_char_<mode>) is prepared per transcript mode.
  modes=(disfluent)

  # If you want to prepare the lang directory for other transcript modes,
  # just append the names of those modes to the array. An example is shown
  # below:
  # modes+=(fluent symbol number)

  for mode in "${modes[@]}"; do
    python local/prepare_lang_char.py --trans-mode "$mode" \
      --train-cut "$csj_manifest_dir/csj_cuts_train.jsonl.gz" \
      --lang-dir "lang_char_$mode"
  done
fi
if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Compute fbank for musan"
  # All expansions are quoted so that paths containing whitespace or glob
  # characters are passed on as single arguments.
  mkdir -p "$musan_fbank_dir"

  # The marker file prevents the features from being computed twice.
  if [ ! -e "$musan_fbank_dir/.musan.done" ]; then
    python local/compute_fbank_musan.py --manifest-dir "$musan_manifest_dir" --fbank-dir "$musan_fbank_dir"
    touch "$musan_fbank_dir/.musan.done"
  fi
fi
if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Show manifest statistics"
  # The statistics are saved next to the manifests and then printed. The
  # expansions are quoted so that a manifest directory containing whitespace
  # does not break the arguments or the redirection target.
  python local/display_manifest_statistics.py --manifest-dir "$csj_manifest_dir" > "$csj_manifest_dir/manifest_statistics.txt"
  cat "$csj_manifest_dir/manifest_statistics.txt"
fi

1
egs/csj/ASR/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared/