CSJ Data Preparation (#617)

* workspace setup * csj prepare done * Change compute_fbank_musan.py to soft link * add description * change lhotse prepare csj command * split train-dev here * Add header * remove debug * save manifest_statistics * generate transcript in Lhotse * update comments in config file
2025-08-08 09:32:20 +00:00 · 2022-10-18 16:56:43 +09:00 · 2022-10-18 16:56:43 +09:00 · 15c1a4a441
commit 15c1a4a441
parent d69bb826ed
12 changed files with 2032 additions and 0 deletions
--- a/egs/csj/ASR/.gitignore
+++ b/egs/csj/ASR/.gitignore
@ -0,0 +1,7 @@
+librispeech_*.*
+todelete*
+lang*
+notify_tg.py
+finetune_*
+misc.ini
+.vscode/*
--- a/egs/csj/ASR/local/compute_fbank_csj.py
+++ b/egs/csj/ASR/local/compute_fbank_csj.py
@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# Copyright    2022  The University of Electro-Communications  (Author: Teo Wen Shen)  # noqa
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+import os
+from itertools import islice
+from pathlib import Path
+from random import Random
+from typing import List, Tuple
+
+import torch
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    # fmt: off
+    # See the following for why LilcomChunkyWriter is preferred
+    # https://github.com/k2-fsa/icefall/pull/404
+    # https://github.com/lhotse-speech/lhotse/pull/527
+    # fmt: on
+    LilcomChunkyWriter,
+    RecordingSet,
+    SupervisionSet,
+)
+
+# Help text passed to argparse as the program description (see get_args()).
+ARGPARSE_DESCRIPTION = """
+This script follows the espnet method of splitting the remaining core+noncore
+utterances into valid and train cutsets at an index which is by default 4000.
+
+In other words, the core+noncore utterances are shuffled, where 4000 utterances
+of the shuffled set go to the `valid` cutset and are not subject to speed
+perturbation. The remaining utterances become the `train` cutset and are speed-
+perturbed (0.9x, 1.0x, 1.1x).
+
+"""
+
+# Torch's multithreaded behavior needs to be disabled, or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking main() (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+# Seed for the Random instance that shuffles the core+noncore cuts before
+# they are divided into valid and train in make_cutset_blueprints().
+RNG_SEED = 42
+
+
+def make_cutset_blueprints(
+    manifest_dir: Path,
+    split: int,
+) -> List[Tuple[str, CutSet]]:
+
+    cut_sets = []
+    # Create eval datasets
+    logging.info("Creating eval cuts.")
+    for i in range(1, 4):
+        cut_set = CutSet.from_manifests(
+            recordings=RecordingSet.from_file(
+                manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
+            ),
+            supervisions=SupervisionSet.from_file(
+                manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
+            ),
+        )
+        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
+        cut_sets.append((f"eval{i}", cut_set))
+
+    # Create train and valid cuts
+    logging.info(
+        "Loading, trimming, and shuffling the remaining core+noncore cuts."
+    )
+    recording_set = RecordingSet.from_file(
+        manifest_dir / "csj_recordings_core.jsonl.gz"
+    ) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
+    supervision_set = SupervisionSet.from_file(
+        manifest_dir / "csj_supervisions_core.jsonl.gz"
+    ) + SupervisionSet.from_file(
+        manifest_dir / "csj_supervisions_noncore.jsonl.gz"
+    )
+
+    cut_set = CutSet.from_manifests(
+        recordings=recording_set,
+        supervisions=supervision_set,
+    )
+    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
+    cut_set = cut_set.shuffle(Random(RNG_SEED))
+
+    logging.info(
+        "Creating valid and train cuts from core and noncore,"
+        f"split at {split}."
+    )
+    valid_set = CutSet.from_cuts(islice(cut_set, 0, split))
+
+    train_set = CutSet.from_cuts(islice(cut_set, split, None))
+    train_set = (
+        train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
+    )
+
+    cut_sets.extend([("valid", valid_set), ("train", train_set)])
+
+    return cut_sets
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description=ARGPARSE_DESCRIPTION,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--manifest-dir", type=Path, help="Path to save manifests"
+    )
+    parser.add_argument(
+        "--fbank-dir", type=Path, help="Path to save fbank features"
+    )
+    parser.add_argument(
+        "--split", type=int, default=4000, help="Split at this index"
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    extractor = Fbank(FbankConfig(num_mel_bins=80))
+    num_jobs = min(16, os.cpu_count())
+
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    if (args.fbank_dir / ".done").exists():
+        logging.info(
+            "Previous fbank computed for CSJ found. "
+            f"Delete {args.fbank_dir / '.done'} to allow recomputing fbank."
+        )
+        return
+    else:
+        cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
+        for part, cut_set in cut_sets:
+            logging.info(f"Processing {part}")
+            cut_set = cut_set.compute_and_store_features(
+                extractor=extractor,
+                num_jobs=num_jobs,
+                storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
+                storage_type=LilcomChunkyWriter,
+            )
+            cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")
+
+        logging.info("All fbank computed for CSJ.")
+        (args.fbank_dir / ".done").touch()
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/csj/ASR/local/compute_fbank_musan.py
+++ b/egs/csj/ASR/local/compute_fbank_musan.py
@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py
--- a/egs/csj/ASR/local/conf/disfluent.ini
+++ b/egs/csj/ASR/local/conf/disfluent.ini
@ -0,0 +1,321 @@
+; # This section is ignored if this file is not supplied as the first config file to
+; # lhotse prepare csj  
+[SEGMENTS]
+; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
+gap = 0.5
+; # Maximum length of segment (s).
+maxlen = 10
+; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.  
+minlen = 0.02
+; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. 
+; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi. 
+; # If you intend to use a multicharacter string for gap_sym, remember to register the 
+; # multicharacter string as part of userdef-string in prepare_lang_char.py. 
+gap_sym = 
+
+[CONSTANTS]
+; # Name of this mode
+MODE = disfluent
+; # Suffixes to use after the word surface (no longer used)
+MORPH = pos1 cForm cType2 pos2
+; # Used to differentiate between A tag and A_num tag
+JPN_NUM = ゼロ ０ 零 一 二 三 四 五 六 七 八 九 十 百 千 ．
+; # Dummy character to delineate multiline words
+PLUS = ＋
+
+[DECISIONS]
+; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
+; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
+
+; # フィラー、感情表出系感動詞
+; # 0 to remain, 1 to delete
+; # Example: '(F ぎょっ)'
+F = 0
+; # Example: '(L (F ン))', '比べ(F えー)る'
+F^ = 0
+; # 言い直し、いいよどみなどによる語断片
+; # 0 to remain, 1 to delete
+; # Example: '(D だ)(D だいが) 大学の学部の会議'
+D = 0
+; # Example: '(L (D ドゥ)＋(D ヒ))'
+D^ = 0
+; # 助詞、助動詞、接辞の言い直し
+; # 0 to remain, 1 to delete
+; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
+D2 = 0
+; # Example: '(X (D2 ノ))'
+D2^ = 0
+; # 聞き取りや語彙の判断に自信がない場合
+; # 0 to remain, 1 to delete
+; # Example: (? 字数) の
+; # If no option: empty string is returned regardless of output
+; # Example: '(?) で'
+? = 0
+; # Example: '(D (? すー))＋そう＋です＋よ＋ね'
+?^ = 0
+; # タグ?で、値は複数の候補が想定される場合
+; # 0 for main guess with matching morph info, 1 for second guess
+; # Example:  '(? 次数, 実数)', '(? これ,ここで)＋(? 説明＋し＋た＋方＋が＋いい＋か＋な)'
+?, = 0
+; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ＋タ,マス))'
+?,^ = 0
+; # 音や言葉に関するメタ的な引用
+; # 0 to remain, 1 to delete
+; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
+M = 0
+; # Example: '(L (M ヒ)＋(M ヒ))', '(L (M (? ヒ＋ヒ)))'
+M^ = 0
+; # 外国語や古語、方言など
+; # 0 to remain, 1 to delete
+; # Example: '(O ザッツファイン)'
+O = 0
+; # Example: '(笑 (O エクスキューズ＋ミー))', '(笑 メダッ＋テ＋(O ナンボ))'
+O^ = 0
+; # 講演者の名前、差別語、誹謗中傷など
+; # 0 to remain, 1 to delete
+; # Example: '国語研の (R ××) です'
+R = 0
+R^ = 0
+; # 非朗読対象発話（朗読における言い間違い等）
+; # 0 to remain, 1 to delete
+; # Example: '(X 実際は) 実際には'
+X = 0
+; # Example: '(L (X (D2 ニ)))'
+X^ = 0
+; # アルファベットや算用数字、記号の表記
+; # 0 to use Japanese form, 1 to use alphabet form
+; # Example: '(A シーディーアール;ＣＤ－Ｒ)'
+A = 1
+; # Example: 'スモール(A エヌ;Ｎ)', 'ラージ(A キュー;Ｑ)', '(A ティーエフ;ＴＦ)＋(A アイディーエフ;ＩＤＦ)' (Strung together by pron: '(W (? ティーワイド);ティーエフ＋アイディーエフ)')
+A^ = 1
+; # タグAで、単語は算用数字の場合
+; # 0 to use Japanese form, 1 to use Arabic numerals
+; # Example: (A 二千;２０００)
+A_num = eval:self.notag
+A_num^ = eval:self.notag
+; # 何らかの原因で漢字表記できなくなった場合
+; # 0 to use broken form, 1 to use orthodox form
+; # Example: '(K たち (F えー) ばな;橘)'
+K = 1
+; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
+K^ = 1
+; # 転訛、発音の怠けなど、一時的な発音エラー
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(W ギーツ;ギジュツ)'
+W = 1
+; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
+W^ = 1
+; # 語の読みに関する知識レベルのいい間違い
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(B シブタイ;ジュータイ)'
+B = 0
+; # Example: 'データー(B カズ;スー)'
+B^ = 0
+; # 笑いながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(笑 ナニガ)', '(笑 (F エー)＋ソー＋イッ＋タ＋ヨー＋ナ)'
+笑 = 0
+; # Example: 'コク(笑 サイ＋(D オン))', 
+笑^ = 0
+; # 泣きながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(泣 ドンナニ)' 
+泣 = 0
+泣^ = 0
+; # 咳をしながら発話
+; # 0 to remain, 1 to delete
+; # Example: 'シャ(咳 リン) ノ' 
+咳 = 0
+; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
+咳^ = 0
+; # ささやき声や独り言などの小さな声
+; # 0 to remain, 1 to delete
+; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))＋(? セツメー＋シ＋タ＋ホー＋ガ＋イー＋カ＋ナ))' 
+L = 0
+; # Example: 'デ(L ス)', 'ッ(L テ＋コ)ト'
+L^ = 0
+
+[REPLACEMENTS]
+; # ボーカルフライなどで母音が同定できない場合
+<FV> = 
+; # 「うん/うーん/ふーん」の音の特定が困難な場合
+<VN> = 
+; # 非語彙的な母音の引き延ばし
+<H> = 
+; # 非語彙的な子音の引き延ばし
+<Q> = 
+; # 言語音と独立に講演者の笑いが生じている場合
+<笑> = 
+; # 言語音と独立に講演者の咳が生じている場合
+<咳> = 
+; # 言語音と独立に講演者の息が生じている場合
+<息> = 
+; # 講演者の泣き声
+<泣> = 
+; # 聴衆（司会者なども含む）の発話
+<フロア発話> = 
+; # 聴衆の笑い
+<フロア笑> = 
+; # 聴衆の拍手
+<拍手> = 
+; # 講演者が発表中に用いたデモンストレーションの音声
+<デモ> = 
+; # 学会講演に発表時間を知らせるためにならすベルの音
+<ベル> = 
+; # 転記単位全体が再度読み直された場合
+<朗読間違い> = 
+; # 上記以外の音で特に目立った音
+<雑音> = 
+; # 0.2秒以上のポーズ
+<P> = 
+; # Redacted information, for R
+; # It is the U+00D7 multiplication sign '×', not the ASCII letter 'x'
+× = ×
+
+[FIELDS]
+; # Time information for segment
+time = 3
+; # Word surface
+surface = 5
+; # Word surface root form without CSJ tags
+notag = 9
+; # Part Of Speech
+pos1 = 11
+; # Conjugated Form
+cForm = 12
+; # Conjugation Type
+cType1 = 13
+; # Subcategory of POS
+pos2 = 14
+; # Euphonic Change / Subcategory of Conjugation Type
+cType2 = 15
+; # Other information
+other = 16
+; # Pronunciation for lexicon
+pron = 10
+; # Speaker ID
+spk_id = 2
+
+[KATAKANA2ROMAJI]
+ア = 'a
+イ = 'i
+ウ = 'u
+エ = 'e
+オ = 'o
+カ = ka
+キ = ki
+ク = ku
+ケ = ke
+コ = ko
+ガ = ga
+ギ = gi
+グ = gu
+ゲ = ge
+ゴ = go
+サ = sa
+シ = si
+ス = su
+セ = se
+ソ = so
+ザ = za
+ジ = zi
+ズ = zu
+ゼ = ze
+ゾ = zo
+タ = ta
+チ = ti
+ツ = tu
+テ = te
+ト = to
+ダ = da
+ヂ = di
+ヅ = du
+デ = de
+ド = do
+ナ = na
+ニ = ni
+ヌ = nu
+ネ = ne
+ノ = no
+ハ = ha
+ヒ = hi
+フ = hu
+ヘ = he
+ホ = ho
+バ = ba
+ビ = bi
+ブ = bu
+ベ = be
+ボ = bo
+パ = pa
+ピ = pi
+プ = pu
+ペ = pe
+ポ = po
+マ = ma
+ミ = mi
+ム = mu
+メ = me
+モ = mo
+ヤ = ya
+ユ = yu
+ヨ = yo
+ラ = ra
+リ = ri
+ル = ru
+レ = re
+ロ = ro
+ワ = wa
+ヰ = wi
+ヱ = we
+ヲ = wo
+ン = ŋ
+ッ = q
+ー = -
+キャ = kǐa
+キュ = kǐu
+キョ = kǐo
+ギャ = gǐa
+ギュ = gǐu
+ギョ = gǐo
+シャ = sǐa
+シュ = sǐu
+ショ = sǐo
+ジャ = zǐa
+ジュ = zǐu
+ジョ = zǐo
+チャ = tǐa
+チュ = tǐu
+チョ = tǐo
+ヂャ = dǐa
+ヂュ = dǐu
+ヂョ = dǐo
+ニャ = nǐa
+ニュ = nǐu
+ニョ = nǐo
+ヒャ = hǐa
+ヒュ = hǐu
+ヒョ = hǐo
+ビャ = bǐa
+ビュ = bǐu
+ビョ = bǐo
+ピャ = pǐa
+ピュ = pǐu
+ピョ = pǐo
+ミャ = mǐa
+ミュ = mǐu
+ミョ = mǐo
+リャ = rǐa
+リュ = rǐu
+リョ = rǐo
+ァ = a
+ィ = i
+ゥ = u
+ェ = e
+ォ = o
+ヮ = ʍ
+ヴ = vu
+ャ = ǐa
+ュ = ǐu
+ョ = ǐo
+
--- a/egs/csj/ASR/local/conf/fluent.ini
+++ b/egs/csj/ASR/local/conf/fluent.ini
@ -0,0 +1,321 @@
+; # This section is ignored if this file is not supplied as the first config file to
+; # lhotse prepare csj  
+[SEGMENTS]
+; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
+gap = 0.5
+; # Maximum length of segment (s).
+maxlen = 10
+; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.  
+minlen = 0.02
+; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. 
+; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi. 
+; # If you intend to use a multicharacter string for gap_sym, remember to register the 
+; # multicharacter string as part of userdef-string in prepare_lang_char.py. 
+gap_sym = 
+
+[CONSTANTS]
+; # Name of this mode
+MODE = fluent
+; # Suffixes to use after the word surface (no longer used)
+MORPH = pos1 cForm cType2 pos2
+; # Used to differentiate between A tag and A_num tag
+JPN_NUM = ゼロ ０ 零 一 二 三 四 五 六 七 八 九 十 百 千 ．
+; # Dummy character to delineate multiline words
+PLUS = ＋
+
+[DECISIONS]
+; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
+; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
+
+; # フィラー、感情表出系感動詞
+; # 0 to remain, 1 to delete
+; # Example: '(F ぎょっ)'
+F = 1
+; # Example: '(L (F ン))', '比べ(F えー)る'
+F^ = 1
+; # 言い直し、いいよどみなどによる語断片
+; # 0 to remain, 1 to delete
+; # Example: '(D だ)(D だいが) 大学の学部の会議'
+D = 1
+; # Example: '(L (D ドゥ)＋(D ヒ))'
+D^ = 1
+; # 助詞、助動詞、接辞の言い直し
+; # 0 to remain, 1 to delete
+; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
+D2 = 1
+; # Example: '(X (D2 ノ))'
+D2^ = 1
+; # 聞き取りや語彙の判断に自信がない場合
+; # 0 to remain, 1 to delete
+; # Example: (? 字数) の
+; # If no option: empty string is returned regardless of output
+; # Example: '(?) で'
+? = 0
+; # Example: '(D (? すー))＋そう＋です＋よ＋ね'
+?^ = 0
+; # タグ?で、値は複数の候補が想定される場合
+; # 0 for main guess with matching morph info, 1 for second guess
+; # Example:  '(? 次数, 実数)', '(? これ,ここで)＋(? 説明＋し＋た＋方＋が＋いい＋か＋な)'
+?, = 0
+; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ＋タ,マス))'
+?,^ = 0
+; # 音や言葉に関するメタ的な引用
+; # 0 to remain, 1 to delete
+; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
+M = 0
+; # Example: '(L (M ヒ)＋(M ヒ))', '(L (M (? ヒ＋ヒ)))'
+M^ = 0
+; # 外国語や古語、方言など
+; # 0 to remain, 1 to delete
+; # Example: '(O ザッツファイン)'
+O = 0
+; # Example: '(笑 (O エクスキューズ＋ミー))', '(笑 メダッ＋テ＋(O ナンボ))'
+O^ = 0
+; # 講演者の名前、差別語、誹謗中傷など
+; # 0 to remain, 1 to delete
+; # Example: '国語研の (R ××) です'
+R = 0
+R^ = 0
+; # 非朗読対象発話（朗読における言い間違い等）
+; # 0 to remain, 1 to delete
+; # Example: '(X 実際は) 実際には'
+X = 0
+; # Example: '(L (X (D2 ニ)))'
+X^ = 0
+; # アルファベットや算用数字、記号の表記
+; # 0 to use Japanese form, 1 to use alphabet form
+; # Example: '(A シーディーアール;ＣＤ－Ｒ)'
+A = 1
+; # Example: 'スモール(A エヌ;Ｎ)', 'ラージ(A キュー;Ｑ)', '(A ティーエフ;ＴＦ)＋(A アイディーエフ;ＩＤＦ)' (Strung together by pron: '(W (? ティーワイド);ティーエフ＋アイディーエフ)')
+A^ = 1
+; # タグAで、単語は算用数字の場合
+; # 0 to use Japanese form, 1 to use Arabic numerals
+; # Example: (A 二千;２０００)
+A_num = eval:self.notag
+A_num^ = eval:self.notag
+; # 何らかの原因で漢字表記できなくなった場合
+; # 0 to use broken form, 1 to use orthodox form
+; # Example: '(K たち (F えー) ばな;橘)'
+K = 1
+; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
+K^ = 1
+; # 転訛、発音の怠けなど、一時的な発音エラー
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(W ギーツ;ギジュツ)'
+W = 1
+; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
+W^ = 1
+; # 語の読みに関する知識レベルのいい間違い
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(B シブタイ;ジュータイ)'
+B = 0
+; # Example: 'データー(B カズ;スー)'
+B^ = 0
+; # 笑いながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(笑 ナニガ)', '(笑 (F エー)＋ソー＋イッ＋タ＋ヨー＋ナ)'
+笑 = 0
+; # Example: 'コク(笑 サイ＋(D オン))', 
+笑^ = 0
+; # 泣きながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(泣 ドンナニ)' 
+泣 = 0
+泣^ = 0
+; # 咳をしながら発話
+; # 0 to remain, 1 to delete
+; # Example: 'シャ(咳 リン) ノ' 
+咳 = 0
+; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
+咳^ = 0
+; # ささやき声や独り言などの小さな声
+; # 0 to remain, 1 to delete
+; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))＋(? セツメー＋シ＋タ＋ホー＋ガ＋イー＋カ＋ナ))' 
+L = 0
+; # Example: 'デ(L ス)', 'ッ(L テ＋コ)ト'
+L^ = 0
+
+[REPLACEMENTS]
+; # ボーカルフライなどで母音が同定できない場合
+<FV> = 
+; # 「うん/うーん/ふーん」の音の特定が困難な場合
+<VN> = 
+; # 非語彙的な母音の引き延ばし
+<H> = 
+; # 非語彙的な子音の引き延ばし
+<Q> = 
+; # 言語音と独立に講演者の笑いが生じている場合
+<笑> = 
+; # 言語音と独立に講演者の咳が生じている場合
+<咳> = 
+; # 言語音と独立に講演者の息が生じている場合
+<息> = 
+; # 講演者の泣き声
+<泣> = 
+; # 聴衆（司会者なども含む）の発話
+<フロア発話> = 
+; # 聴衆の笑い
+<フロア笑> = 
+; # 聴衆の拍手
+<拍手> = 
+; # 講演者が発表中に用いたデモンストレーションの音声
+<デモ> = 
+; # 学会講演に発表時間を知らせるためにならすベルの音
+<ベル> = 
+; # 転記単位全体が再度読み直された場合
+<朗読間違い> = 
+; # 上記以外の音で特に目立った音
+<雑音> = 
+; # 0.2秒以上のポーズ
+<P> = 
+; # Redacted information, for R
+; # It is the U+00D7 multiplication sign '×', not the ASCII letter 'x'
+× = ×
+
+[FIELDS]
+; # Time information for segment
+time = 3
+; # Word surface
+surface = 5
+; # Word surface root form without CSJ tags
+notag = 9
+; # Part Of Speech
+pos1 = 11
+; # Conjugated Form
+cForm = 12
+; # Conjugation Type
+cType1 = 13
+; # Subcategory of POS
+pos2 = 14
+; # Euphonic Change / Subcategory of Conjugation Type
+cType2 = 15
+; # Other information
+other = 16
+; # Pronunciation for lexicon
+pron = 10
+; # Speaker ID
+spk_id = 2
+
+[KATAKANA2ROMAJI]
+ア = 'a
+イ = 'i
+ウ = 'u
+エ = 'e
+オ = 'o
+カ = ka
+キ = ki
+ク = ku
+ケ = ke
+コ = ko
+ガ = ga
+ギ = gi
+グ = gu
+ゲ = ge
+ゴ = go
+サ = sa
+シ = si
+ス = su
+セ = se
+ソ = so
+ザ = za
+ジ = zi
+ズ = zu
+ゼ = ze
+ゾ = zo
+タ = ta
+チ = ti
+ツ = tu
+テ = te
+ト = to
+ダ = da
+ヂ = di
+ヅ = du
+デ = de
+ド = do
+ナ = na
+ニ = ni
+ヌ = nu
+ネ = ne
+ノ = no
+ハ = ha
+ヒ = hi
+フ = hu
+ヘ = he
+ホ = ho
+バ = ba
+ビ = bi
+ブ = bu
+ベ = be
+ボ = bo
+パ = pa
+ピ = pi
+プ = pu
+ペ = pe
+ポ = po
+マ = ma
+ミ = mi
+ム = mu
+メ = me
+モ = mo
+ヤ = ya
+ユ = yu
+ヨ = yo
+ラ = ra
+リ = ri
+ル = ru
+レ = re
+ロ = ro
+ワ = wa
+ヰ = wi
+ヱ = we
+ヲ = wo
+ン = ŋ
+ッ = q
+ー = -
+キャ = kǐa
+キュ = kǐu
+キョ = kǐo
+ギャ = gǐa
+ギュ = gǐu
+ギョ = gǐo
+シャ = sǐa
+シュ = sǐu
+ショ = sǐo
+ジャ = zǐa
+ジュ = zǐu
+ジョ = zǐo
+チャ = tǐa
+チュ = tǐu
+チョ = tǐo
+ヂャ = dǐa
+ヂュ = dǐu
+ヂョ = dǐo
+ニャ = nǐa
+ニュ = nǐu
+ニョ = nǐo
+ヒャ = hǐa
+ヒュ = hǐu
+ヒョ = hǐo
+ビャ = bǐa
+ビュ = bǐu
+ビョ = bǐo
+ピャ = pǐa
+ピュ = pǐu
+ピョ = pǐo
+ミャ = mǐa
+ミュ = mǐu
+ミョ = mǐo
+リャ = rǐa
+リュ = rǐu
+リョ = rǐo
+ァ = a
+ィ = i
+ゥ = u
+ェ = e
+ォ = o
+ヮ = ʍ
+ヴ = vu
+ャ = ǐa
+ュ = ǐu
+ョ = ǐo
+
--- a/egs/csj/ASR/local/conf/number.ini
+++ b/egs/csj/ASR/local/conf/number.ini
@ -0,0 +1,321 @@
+; # This section is ignored if this file is not supplied as the first config file to
+; # lhotse prepare csj  
+[SEGMENTS]
+; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
+gap = 0.5
+; # Maximum length of segment (s).
+maxlen = 10
+; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.  
+minlen = 0.02
+; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. 
+; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi. 
+; # If you intend to use a multicharacter string for gap_sym, remember to register the 
+; # multicharacter string as part of userdef-string in prepare_lang_char.py. 
+gap_sym = 
+
+[CONSTANTS]
+; # Name of this mode
+MODE = number
+; # Suffixes to use after the word surface (no longer used)
+MORPH = pos1 cForm cType2 pos2
+; # Used to differentiate between A tag and A_num tag
+JPN_NUM = ゼロ ０ 零 一 二 三 四 五 六 七 八 九 十 百 千 ．
+; # Dummy character to delineate multiline words
+PLUS = ＋
+
+[DECISIONS]
+; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
+; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
+
+; # フィラー、感情表出系感動詞
+; # 0 to remain, 1 to delete
+; # Example: '(F ぎょっ)'
+F = 1
+; # Example: '(L (F ン))', '比べ(F えー)る'
+F^ = 1
+; # 言い直し、いいよどみなどによる語断片
+; # 0 to remain, 1 to delete
+; # Example: '(D だ)(D だいが) 大学の学部の会議'
+D = 1
+; # Example: '(L (D ドゥ)＋(D ヒ))'
+D^ = 1
+; # 助詞、助動詞、接辞の言い直し
+; # 0 to remain, 1 to delete
+; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
+D2 = 1
+; # Example: '(X (D2 ノ))'
+D2^ = 1
+; # 聞き取りや語彙の判断に自信がない場合
+; # 0 to remain, 1 to delete
+; # Example: (? 字数) の
+; # If no option: empty string is returned regardless of output
+; # Example: '(?) で'
+? = 0
+; # Example: '(D (? すー))＋そう＋です＋よ＋ね'
+?^ = 0
+; # タグ?で、値は複数の候補が想定される場合
+; # 0 for main guess with matching morph info, 1 for second guess
+; # Example:  '(? 次数, 実数)', '(? これ,ここで)＋(? 説明＋し＋た＋方＋が＋いい＋か＋な)'
+?, = 0
+; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ＋タ,マス))'
+?,^ = 0
+; # 音や言葉に関するメタ的な引用
+; # 0 to remain, 1 to delete
+; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
+M = 0
+; # Example: '(L (M ヒ)＋(M ヒ))', '(L (M (? ヒ＋ヒ)))'
+M^ = 0
+; # 外国語や古語、方言など
+; # 0 to remain, 1 to delete
+; # Example: '(O ザッツファイン)'
+O = 0
+; # Example: '(笑 (O エクスキューズ＋ミー))', '(笑 メダッ＋テ＋(O ナンボ))'
+O^ = 0
+; # 講演者の名前、差別語、誹謗中傷など
+; # 0 to remain, 1 to delete
+; # Example: '国語研の (R ××) です'
+R = 0
+R^ = 0
+; # 非朗読対象発話（朗読における言い間違い等）
+; # 0 to remain, 1 to delete
+; # Example: '(X 実際は) 実際には'
+X = 0
+; # Example: '(L (X (D2 ニ)))'
+X^ = 0
+; # アルファベットや算用数字、記号の表記
+; # 0 to use Japanese form, 1 to use alphabet form
+; # Example: '(A シーディーアール;ＣＤ－Ｒ)'
+A = 1
+; # Example: 'スモール(A エヌ;Ｎ)', 'ラージ(A キュー;Ｑ)', '(A ティーエフ;ＴＦ)＋(A アイディーエフ;ＩＤＦ)' (Strung together by pron: '(W (? ティーワイド);ティーエフ＋アイディーエフ)')
+A^ = 1
+; # タグAで、単語は算用数字の場合
+; # 0 to use Japanese form, 1 to use Arabic numerals
+; # Example: (A 二千;２０００)
+A_num = 1
+A_num^ = 1
+; # 何らかの原因で漢字表記できなくなった場合
+; # 0 to use broken form, 1 to use orthodox form
+; # Example: '(K たち (F えー) ばな;橘)'
+K = 1
+; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
+K^ = 1
+; # 転訛、発音の怠けなど、一時的な発音エラー
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(W ギーツ;ギジュツ)'
+W = 1
+; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
+W^ = 1
+; # 語の読みに関する知識レベルのいい間違い
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(B シブタイ;ジュータイ)'
+B = 0
+; # Example: 'データー(B カズ;スー)'
+B^ = 0
+; # 笑いながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(笑 ナニガ)', '(笑 (F エー)＋ソー＋イッ＋タ＋ヨー＋ナ)'
+笑 = 0
+; # Example: 'コク(笑 サイ＋(D オン))', 
+笑^ = 0
+; # 泣きながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(泣 ドンナニ)' 
+泣 = 0
+泣^ = 0
+; # 咳をしながら発話
+; # 0 to remain, 1 to delete
+; # Example: 'シャ(咳 リン) ノ' 
+咳 = 0
+; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
+咳^ = 0
+; # ささやき声や独り言などの小さな声
+; # 0 to remain, 1 to delete
+; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))＋(? セツメー＋シ＋タ＋ホー＋ガ＋イー＋カ＋ナ))' 
+L = 0
+; # Example: 'デ(L ス)', 'ッ(L テ＋コ)ト'
+L^ = 0
+
+[REPLACEMENTS]
+; # ボーカルフライなどで母音が同定できない場合
+<FV> = 
+; # 「うん/うーん/ふーん」の音の特定が困難な場合
+<VN> = 
+; # 非語彙的な母音の引き延ばし
+<H> = 
+; # 非語彙的な子音の引き延ばし
+<Q> = 
+; # 言語音と独立に講演者の笑いが生じている場合
+<笑> = 
+; # 言語音と独立に講演者の咳が生じている場合
+<咳> = 
+; # 言語音と独立に講演者の息が生じている場合
+<息> = 
+; # 講演者の泣き声
+<泣> = 
+; # 聴衆（司会者なども含む）の発話
+<フロア発話> = 
+; # 聴衆の笑い
+<フロア笑> = 
+; # 聴衆の拍手
+<拍手> = 
+; # 講演者が発表中に用いたデモンストレーションの音声
+<デモ> = 
+; # 学会講演に発表時間を知らせるためにならすベルの音
+<ベル> = 
+; # 転記単位全体が再度読み直された場合
+<朗読間違い> = 
+; # 上記以外の音で特に目立った音
+<雑音> = 
+; # 0.2秒以上のポーズ
+<P> = 
+; # Redacted information, for R
+; # It is the U+00D7 multiplication sign '×', not the ASCII letter 'x'
+× = ×
+
+[FIELDS]
+; # Time information for segment
+time = 3
+; # Word surface
+surface = 5
+; # Word surface root form without CSJ tags
+notag = 9
+; # Part Of Speech
+pos1 = 11
+; # Conjugated Form
+cForm = 12
+; # Conjugation Type
+cType1 = 13
+; # Subcategory of POS
+pos2 = 14
+; # Euphonic Change / Subcategory of Conjugation Type
+cType2 = 15
+; # Other information
+other = 16
+; # Pronunciation for lexicon
+pron = 10
+; # Speaker ID
+spk_id = 2
+
+[KATAKANA2ROMAJI]
+ア = 'a
+イ = 'i
+ウ = 'u
+エ = 'e
+オ = 'o
+カ = ka
+キ = ki
+ク = ku
+ケ = ke
+コ = ko
+ガ = ga
+ギ = gi
+グ = gu
+ゲ = ge
+ゴ = go
+サ = sa
+シ = si
+ス = su
+セ = se
+ソ = so
+ザ = za
+ジ = zi
+ズ = zu
+ゼ = ze
+ゾ = zo
+タ = ta
+チ = ti
+ツ = tu
+テ = te
+ト = to
+ダ = da
+ヂ = di
+ヅ = du
+デ = de
+ド = do
+ナ = na
+ニ = ni
+ヌ = nu
+ネ = ne
+ノ = no
+ハ = ha
+ヒ = hi
+フ = hu
+ヘ = he
+ホ = ho
+バ = ba
+ビ = bi
+ブ = bu
+ベ = be
+ボ = bo
+パ = pa
+ピ = pi
+プ = pu
+ペ = pe
+ポ = po
+マ = ma
+ミ = mi
+ム = mu
+メ = me
+モ = mo
+ヤ = ya
+ユ = yu
+ヨ = yo
+ラ = ra
+リ = ri
+ル = ru
+レ = re
+ロ = ro
+ワ = wa
+ヰ = wi
+ヱ = we
+ヲ = wo
+ン = ŋ
+ッ = q
+ー = -
+キャ = kǐa
+キュ = kǐu
+キョ = kǐo
+ギャ = gǐa
+ギュ = gǐu
+ギョ = gǐo
+シャ = sǐa
+シュ = sǐu
+ショ = sǐo
+ジャ = zǐa
+ジュ = zǐu
+ジョ = zǐo
+チャ = tǐa
+チュ = tǐu
+チョ = tǐo
+ヂャ = dǐa
+ヂュ = dǐu
+ヂョ = dǐo
+ニャ = nǐa
+ニュ = nǐu
+ニョ = nǐo
+ヒャ = hǐa
+ヒュ = hǐu
+ヒョ = hǐo
+ビャ = bǐa
+ビュ = bǐu
+ビョ = bǐo
+ピャ = pǐa
+ピュ = pǐu
+ピョ = pǐo
+ミャ = mǐa
+ミュ = mǐu
+ミョ = mǐo
+リャ = rǐa
+リュ = rǐu
+リョ = rǐo
+ァ = a
+ィ = i
+ゥ = u
+ェ = e
+ォ = o
+ヮ = ʍ
+ヴ = vu
+ャ = ǐa
+ュ = ǐu
+ョ = ǐo
+
--- a/egs/csj/ASR/local/conf/symbol.ini
+++ b/egs/csj/ASR/local/conf/symbol.ini
@ -0,0 +1,322 @@
+; # This section is ignored if this file is not supplied as the first config file to
+; # lhotse prepare csj  
+[SEGMENTS]
+; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
+gap = 0.5
+; # Maximum length of segment (s).
+maxlen = 10
+; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.  
+minlen = 0.02
+; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. 
+; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi. 
+; # If you intend to use a multicharacter string for gap_sym, remember to register the 
+; # multicharacter string as part of userdef-string in prepare_lang_char.py. 
+gap_sym = 
+
+[CONSTANTS]
+; # Name of this mode
+; # See https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf
+MODE = symbol
+; # Suffixes to use after the word surface (no longer used)
+MORPH = pos1 cForm cType2 pos2
+; # Used to differentiate between A tag and A_num tag
+JPN_NUM = ゼロ ０ 零 一 二 三 四 五 六 七 八 九 十 百 千 ．
+; # Dummy character to delineate multiline words
+PLUS = ＋
+
+[DECISIONS]
+; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
+; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
+
+; # フィラー、感情表出系感動詞
+; # 0 to remain, 1 to delete
+; # Example: '(F ぎょっ)'
+F = ＃
+; # Example: '(L (F ン))', '比べ(F えー)る'
+F^ = ＃
+; # 言い直し、いいよどみなどによる語断片
+; # 0 to remain, 1 to delete
+; # Example: '(D だ)(D だいが) 大学の学部の会議'
+D = ＠
+; # Example: '(L (D ドゥ)＋(D ヒ))'
+D^ = ＠
+; # 助詞、助動詞、接辞の言い直し
+; # 0 to remain, 1 to delete
+; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
+D2 = ＠
+; # Example: '(X (D2 ノ))'
+D2^ = ＠
+; # 聞き取りや語彙の判断に自信がない場合
+; # 0 to remain, 1 to delete
+; # Example: (? 字数) の
+; # If no option: empty string is returned regardless of output
+; # Example: '(?) で'
+? = 0
+; # Example: '(D (? すー))＋そう＋です＋よ＋ね'
+?^ = 0
+; # タグ?で、値は複数の候補が想定される場合
+; # 0 for main guess with matching morph info, 1 for second guess
+; # Example:  '(? 次数, 実数)', '(? これ,ここで)＋(? 説明＋し＋た＋方＋が＋いい＋か＋な)'
+?, = 0
+; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ＋タ,マス))'
+?,^ = 0
+; # 音や言葉に関するメタ的な引用
+; # 0 to remain, 1 to delete
+; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
+M = 0
+; # Example: '(L (M ヒ)＋(M ヒ))', '(L (M (? ヒ＋ヒ)))'
+M^ = 0
+; # 外国語や古語、方言など
+; # 0 to remain, 1 to delete
+; # Example: '(O ザッツファイン)'
+O = 0
+; # Example: '(笑 (O エクスキューズ＋ミー))', '(笑 メダッ＋テ＋(O ナンボ))'
+O^ = 0
+; # 講演者の名前、差別語、誹謗中傷など
+; # 0 to remain, 1 to delete
+; # Example: '国語研の (R ××) です'
+R = 0
+R^ = 0
+; # 非朗読対象発話（朗読における言い間違い等）
+; # 0 to remain, 1 to delete
+; # Example: '(X 実際は) 実際には'
+X = 0
+; # Example: '(L (X (D2 ニ)))'
+X^ = 0
+; # アルファベットや算用数字、記号の表記
+; # 0 to use Japanese form, 1 to use alphabet form
+; # Example: '(A シーディーアール;ＣＤ－Ｒ)'
+A = 1
+; # Example: 'スモール(A エヌ;Ｎ)', 'ラージ(A キュー;Ｑ)', '(A ティーエフ;ＴＦ)＋(A アイディーエフ;ＩＤＦ)' (Strung together by pron: '(W (? ティーワイド);ティーエフ＋アイディーエフ)')
+A^ = 1
+; # タグAで、単語は算用数字の場合
+; # 0 to use Japanese form, 1 to use Arabic numerals
+; # Example: (A 二千;２０００)
+A_num = eval:self.notag
+A_num^ = eval:self.notag
+; # 何らかの原因で漢字表記できなくなった場合
+; # 0 to use broken form, 1 to use orthodox form
+; # Example: '(K たち (F えー) ばな;橘)'
+K = 1
+; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
+K^ = 1
+; # 転訛、発音の怠けなど、一時的な発音エラー
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(W ギーツ;ギジュツ)'
+W = 1
+; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
+W^ = 1
+; # 語の読みに関する知識レベルのいい間違い
+; # 0 to use wrong form, 1 to use orthodox form
+; # Example: '(B シブタイ;ジュータイ)'
+B = 0
+; # Example: 'データー(B カズ;スー)'
+B^ = 0
+; # 笑いながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(笑 ナニガ)', '(笑 (F エー)＋ソー＋イッ＋タ＋ヨー＋ナ)'
+笑 = 0
+; # Example: 'コク(笑 サイ＋(D オン))', 
+笑^ = 0
+; # 泣きながら発話
+; # 0 to remain, 1 to delete
+; # Example: '(泣 ドンナニ)' 
+泣 = 0
+泣^ = 0
+; # 咳をしながら発話
+; # 0 to remain, 1 to delete
+; # Example: 'シャ(咳 リン) ノ' 
+咳 = 0
+; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
+咳^ = 0
+; # ささやき声や独り言などの小さな声
+; # 0 to remain, 1 to delete
+; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))＋(? セツメー＋シ＋タ＋ホー＋ガ＋イー＋カ＋ナ))' 
+L = 0
+; # Example: 'デ(L ス)', 'ッ(L テ＋コ)ト'
+L^ = 0
+
+[REPLACEMENTS]
+; # ボーカルフライなどで母音が同定できない場合
+<FV> = 
+; # 「うん/うーん/ふーん」の音の特定が困難な場合
+<VN> = 
+; # 非語彙的な母音の引き延ばし
+<H> = 
+; # 非語彙的な子音の引き延ばし
+<Q> = 
+; # 言語音と独立に講演者の笑いが生じている場合
+<笑> = 
+; # 言語音と独立に講演者の咳が生じている場合
+<咳> = 
+; # 言語音と独立に講演者の息が生じている場合
+<息> = 
+; # 講演者の泣き声
+<泣> = 
+; # 聴衆（司会者なども含む）の発話
+<フロア発話> = 
+; # 聴衆の笑い
+<フロア笑> = 
+; # 聴衆の拍手
+<拍手> = 
+; # 講演者が発表中に用いたデモンストレーションの音声
+<デモ> = 
+; # 学会講演に発表時間を知らせるためにならすベルの音
+<ベル> = 
+; # 転記単位全体が再度読み直された場合
+<朗読間違い> = 
+; # 上記以外の音で特に目立った音
+<雑音> = 
+; # 0.2秒以上のポーズ
+<P> = 
+; # Redacted information, for R
+; # It is the U+00D7 multiplication sign '×', not the ASCII letter 'x'
+× = ×
+
+[FIELDS]
+; # Time information for segment
+time = 3
+; # Word surface
+surface = 5
+; # Word surface root form without CSJ tags
+notag = 9
+; # Part Of Speech
+pos1 = 11
+; # Conjugated Form
+cForm = 12
+; # Conjugation Type
+cType1 = 13
+; # Subcategory of POS
+pos2 = 14
+; # Euphonic Change / Subcategory of Conjugation Type
+cType2 = 15
+; # Other information
+other = 16
+; # Pronunciation for lexicon
+pron = 10
+; # Speaker ID
+spk_id = 2
+
+[KATAKANA2ROMAJI]
+ア = 'a
+イ = 'i
+ウ = 'u
+エ = 'e
+オ = 'o
+カ = ka
+キ = ki
+ク = ku
+ケ = ke
+コ = ko
+ガ = ga
+ギ = gi
+グ = gu
+ゲ = ge
+ゴ = go
+サ = sa
+シ = si
+ス = su
+セ = se
+ソ = so
+ザ = za
+ジ = zi
+ズ = zu
+ゼ = ze
+ゾ = zo
+タ = ta
+チ = ti
+ツ = tu
+テ = te
+ト = to
+ダ = da
+ヂ = di
+ヅ = du
+デ = de
+ド = do
+ナ = na
+ニ = ni
+ヌ = nu
+ネ = ne
+ノ = no
+ハ = ha
+ヒ = hi
+フ = hu
+ヘ = he
+ホ = ho
+バ = ba
+ビ = bi
+ブ = bu
+ベ = be
+ボ = bo
+パ = pa
+ピ = pi
+プ = pu
+ペ = pe
+ポ = po
+マ = ma
+ミ = mi
+ム = mu
+メ = me
+モ = mo
+ヤ = ya
+ユ = yu
+ヨ = yo
+ラ = ra
+リ = ri
+ル = ru
+レ = re
+ロ = ro
+ワ = wa
+ヰ = wi
+ヱ = we
+ヲ = wo
+ン = ŋ
+ッ = q
+ー = -
+キャ = kǐa
+キュ = kǐu
+キョ = kǐo
+ギャ = gǐa
+ギュ = gǐu
+ギョ = gǐo
+シャ = sǐa
+シュ = sǐu
+ショ = sǐo
+ジャ = zǐa
+ジュ = zǐu
+ジョ = zǐo
+チャ = tǐa
+チュ = tǐu
+チョ = tǐo
+ヂャ = dǐa
+ヂュ = dǐu
+ヂョ = dǐo
+ニャ = nǐa
+ニュ = nǐu
+ニョ = nǐo
+ヒャ = hǐa
+ヒュ = hǐu
+ヒョ = hǐo
+ビャ = bǐa
+ビュ = bǐu
+ビョ = bǐo
+ピャ = pǐa
+ピュ = pǐu
+ピョ = pǐo
+ミャ = mǐa
+ミュ = mǐu
+ミョ = mǐo
+リャ = rǐa
+リュ = rǐu
+リョ = rǐo
+ァ = a
+ィ = i
+ゥ = u
+ェ = e
+ォ = o
+ヮ = ʍ
+ヴ = vu
+ャ = ǐa
+ュ = ǐu
+ョ = ǐo
+
--- a/egs/csj/ASR/local/display_manifest_statistics.py
+++ b/egs/csj/ASR/local/display_manifest_statistics.py
@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#              2022  The University of Electro-Communications (author: Teo Wen Shen)  # noqa
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from pathlib import Path
+
+from lhotse import CutSet, load_manifest
+
+ARGPARSE_DESCRIPTION = """
+This file displays duration statistics of utterances in a manifest.
+You can use the displayed value to choose minimum/maximum duration
+to remove short and long utterances during the training.
+
+See the function `remove_short_and_long_utt()` in
+pruned_transducer_stateless5/train.py for usage.
+"""
+
+
+def get_parser():
+    """Parse the command-line arguments of this script.
+
+    NOTE: despite its name, this returns the parsed arguments rather than
+    the ``ArgumentParser``; the name is kept so existing callers still work.
+
+    Returns:
+      An ``argparse.Namespace`` whose ``manifest_dir`` attribute is the
+      ``Path`` given via ``--manifest-dir``.
+    """
+    parser = argparse.ArgumentParser(
+        description=ARGPARSE_DESCRIPTION,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--manifest-dir",
+        type=Path,
+        # Without this, omitting the option leaves manifest_dir as None and
+        # main() fails with an AttributeError on `None.glob` instead of a
+        # usage message.
+        required=True,
+        help="Path to cutset manifests",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    """Print duration statistics for every CSJ cut manifest.
+
+    Every ``csj_cuts_*.jsonl.gz`` file found directly under
+    ``--manifest-dir`` is loaded and summarized with its ``describe()``
+    method, preceded by a separator and the file name.
+    """
+    args = get_parser()
+
+    # Path.glob() yields entries in arbitrary order; sort them so that the
+    # report is reproducible between runs and machines.
+    for path in sorted(args.manifest_dir.glob("csj_cuts_*.jsonl.gz")):
+
+        cuts: CutSet = load_manifest(path)
+
+        print("\n---------------------------------\n")
+        print(path.name + ":")
+        cuts.describe()
+
+
+if __name__ == "__main__":
+    main()
+
+"""
+## eval1
+Cuts count: 1272
+Total duration (hh:mm:ss): 01:50:07
+Speech duration (hh:mm:ss): 01:50:07 (100.0%)
+Duration statistics (seconds):
+mean	5.2
+std	3.9
+min	0.2
+25%	1.9
+50%	4.0
+75%	8.1
+99%	14.3
+99.5%	14.7
+99.9%	16.0
+max	16.9
+Recordings available: 1272
+Features available: 1272
+Supervisions available: 1272
+SUPERVISION custom fields:
+- fluent (in 1272 cuts)
+- disfluent (in 1272 cuts)
+- number (in 1272 cuts)
+- symbol (in 1272 cuts)
+
+## eval2
+Cuts count: 1292
+Total duration (hh:mm:ss): 01:56:50
+Speech duration (hh:mm:ss): 01:56:50 (100.0%)
+Duration statistics (seconds):
+mean	5.4
+std	3.9
+min	0.1
+25%	2.1
+50%	4.6
+75%	8.6
+99%	14.1
+99.5%	15.2
+99.9%	16.1
+max	16.9
+Recordings available: 1292
+Features available: 1292
+Supervisions available: 1292
+SUPERVISION custom fields:
+- fluent (in 1292 cuts)
+- number (in 1292 cuts)
+- symbol (in 1292 cuts)
+- disfluent (in 1292 cuts)
+
+## eval3
+Cuts count: 1385
+Total duration (hh:mm:ss): 01:19:21
+Speech duration (hh:mm:ss): 01:19:21 (100.0%)
+Duration statistics (seconds):
+mean	3.4
+std	3.0
+min	0.2
+25%	1.2
+50%	2.5
+75%	4.6
+99%	12.7
+99.5%	13.7
+99.9%	15.0
+max	15.9
+Recordings available: 1385
+Features available: 1385
+Supervisions available: 1385
+SUPERVISION custom fields:
+- number (in 1385 cuts)
+- symbol (in 1385 cuts)
+- fluent (in 1385 cuts)
+- disfluent (in 1385 cuts)
+
+## valid
+Cuts count: 4000
+Total duration (hh:mm:ss): 05:08:09
+Speech duration (hh:mm:ss): 05:08:09 (100.0%)
+Duration statistics (seconds):
+mean	4.6
+std	3.8
+min	0.1
+25%	1.5
+50%	3.4
+75%	7.0
+99%	13.8
+99.5%	14.8
+99.9%	16.0
+max	17.3
+Recordings available: 4000
+Features available: 4000
+Supervisions available: 4000
+SUPERVISION custom fields:
+- fluent (in 4000 cuts)
+- symbol (in 4000 cuts)
+- disfluent (in 4000 cuts)
+- number (in 4000 cuts)
+
+## train
+Cuts count: 1291134
+Total duration (hh:mm:ss): 1596:37:27
+Speech duration (hh:mm:ss): 1596:37:27 (100.0%)
+Duration statistics (seconds):
+mean	4.5
+std	3.6
+min	0.0
+25%	1.6
+50%	3.3
+75%	6.4
+99%	14.0
+99.5%	14.8
+99.9%	16.6
+max	27.8
+Recordings available: 1291134
+Features available: 1291134
+Supervisions available: 1291134
+SUPERVISION custom fields:
+- disfluent (in 1291134 cuts)
+- fluent (in 1291134 cuts)
+- symbol (in 1291134 cuts)
+- number (in 1291134 cuts)
+"""
--- a/egs/csj/ASR/local/prepare_lang_char.py
+++ b/egs/csj/ASR/local/prepare_lang_char.py
@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+# Copyright    2022  The University of Electro-Communications  (Author: Teo Wen Shen)  # noqa
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+from pathlib import Path
+
+from lhotse import CutSet
+
+ARGPARSE_DESCRIPTION = """
+This script gathers all training transcripts of the specified {trans_mode} type
+and produces a token_list that would be output set of the ASR system.
+
+It splits transcripts by whitespace into lists, then, for each word in the
+list, if the word does not appear in the list of user-defined multicharacter
+strings, it further splits that word into individual characters to be counted
+into the output token set.
+
+It outputs 4 files into the lang directory:
+- trans_mode: the name of transcript mode. If trans_mode was not specified,
+   this will be an empty file.
+- userdef_string: a list of user defined strings that should not be split
+ further into individual characters. By default, it contains "<unk>", "<blk>",
+ "<sos/eos>"
+- words_len: the total number of tokens in the output set.
+- words.txt: a list of tokens in the output set. The length matches words_len.
+
+"""
+
+
+def get_args():
+    """Build the command-line interface and return the parsed arguments.
+
+    Recognized options are ``--train-cut`` (required), ``--trans-mode``,
+    ``--lang-dir`` and ``--userdef-string``; the last three default to None.
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description=ARGPARSE_DESCRIPTION,
+    )
+
+    trans_mode_help = (
+        "Name of the transcript mode to use. "
+        "If lang-dir is not set, this will also name the lang-dir"
+    )
+    lang_dir_help = (
+        "Name of lang dir. "
+        "If not set, this will default to lang_char_{trans-mode}"
+    )
+
+    # Source of the transcripts.
+    cli.add_argument(
+        "--train-cut",
+        required=True,
+        type=Path,
+        help="Path to the train cut",
+    )
+    # Which transcript variant to read, and where to write the results.
+    cli.add_argument(
+        "--trans-mode", default=None, type=str, help=trans_mode_help
+    )
+    cli.add_argument("--lang-dir", default=None, type=Path, help=lang_dir_help)
+    # Optional file listing strings that must be kept as single tokens.
+    cli.add_argument(
+        "--userdef-string",
+        default=None,
+        type=Path,
+        help="Multicharacter strings that do not need to be split",
+    )
+
+    return cli.parse_args()
+
+
+def main():
+    """Build the character-level token list from the training transcripts.
+
+    Each transcript is split on whitespace; a piece found in the user-defined
+    strings is kept whole and any other piece is broken into characters.
+    The resulting set is sorted, wrapped as ``<blk>`` + tokens + ``<unk>``,
+    ``<sos/eos>``, and written to the lang directory as ``words.txt``,
+    together with ``words_len``, ``userdef_string`` and ``trans_mode``.
+
+    Raises:
+      KeyError: if ``--trans-mode`` names a field that is missing from the
+        custom fields of a supervision.
+    """
+    args = get_args()
+
+    logging.basicConfig(
+        format=(
+            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] " "%(message)s"
+        ),
+        level=logging.INFO,
+    )
+
+    if not args.lang_dir:
+        p = "lang_char"
+        if args.trans_mode:
+            p += f"_{args.trans_mode}"
+        args.lang_dir = Path(p)
+
+    if args.userdef_string:
+        args.userdef_string = set(
+            args.userdef_string.read_text(encoding="utf-8").split()
+        )
+    else:
+        args.userdef_string = set()
+
+    sysdef_string = ["<blk>", "<unk>", "<sos/eos>"]
+    args.userdef_string.update(sysdef_string)
+
+    train_set: CutSet = CutSet.from_file(args.train_cut)
+
+    words = set()
+    logging.info(
+        f"Creating vocabulary from {args.train_cut.name}"
+        f" at {args.trans_mode} mode."
+    )
+    for cut in train_set:
+        supervision = cut.supervisions[0]
+        if args.trans_mode:
+            try:
+                text: str = supervision.custom[args.trans_mode]
+            except KeyError as err:
+                # Keep the original lookup failure attached as the cause.
+                raise KeyError(
+                    f"Could not find {args.trans_mode} in "
+                    f"{supervision.custom}"
+                ) from err
+        else:
+            text = supervision.text
+        for t in text.split():
+            if t in args.userdef_string:
+                words.add(t)
+            else:
+                # Iterating a str yields its individual characters.
+                words.update(t)
+
+    # The special symbols get fixed positions at both ends of the list.
+    words -= set(sysdef_string)
+    words = ["<blk>"] + sorted(words) + ["<unk>", "<sos/eos>"]
+
+    args.lang_dir.mkdir(parents=True, exist_ok=True)
+    # Tokens may be non-ASCII, so do not depend on the locale's encoding.
+    (args.lang_dir / "words.txt").write_text(
+        "\n".join(f"{word}\t{i}" for i, word in enumerate(words)),
+        encoding="utf-8",
+    )
+
+    (args.lang_dir / "words_len").write_text(f"{len(words)}")
+
+    # Sorted so that the file content does not vary with set iteration order.
+    (args.lang_dir / "userdef_string").write_text(
+        "\n".join(sorted(args.userdef_string)),
+        encoding="utf-8",
+    )
+
+    # write_text() rejects None; an unspecified mode yields an empty file.
+    (args.lang_dir / "trans_mode").write_text(
+        args.trans_mode or "", encoding="utf-8"
+    )
+    logging.info("Done.")
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/csj/ASR/local/validate_manifest.py
+++ b/egs/csj/ASR/local/validate_manifest.py
@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script checks the following assumptions of the generated manifest:
+
+- Single supervision per cut
+- Supervision end time does not exceed the cut end time (the start time is not checked)
+
+We will add more checks later if needed.
+
+Usage example:
+
+    python3 ./local/validate_manifest.py \
+            --manifest ./data/manifests/csj_cuts_train.jsonl.gz
+
+"""
+
+import argparse
+import logging
+from pathlib import Path
+
+from lhotse import CutSet, load_manifest
+from lhotse.cut import Cut
+
+
+def get_args():
+    """Parse the command-line arguments of this script.
+
+    Returns:
+      An ``argparse.Namespace`` whose ``manifest`` attribute is the ``Path``
+      given via ``--manifest``.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--manifest",
+        type=Path,
+        # Without this, omitting the option yields None and main() fails
+        # with a TypeError in Path(None) instead of a usage message.
+        required=True,
+        help="Path to the manifest file",
+    )
+
+    return parser.parse_args()
+
+
+def validate_one_supervision_per_cut(c: Cut):
+    """Ensure that the cut ``c`` carries exactly one supervision.
+
+    Raises:
+      ValueError: if the number of supervisions differs from one.
+    """
+    num_supervisions = len(c.supervisions)
+    if num_supervisions == 1:
+        return
+    raise ValueError(f"{c.id} has {num_supervisions} supervisions")
+
+
+def validate_supervision_and_cut_time_bounds(c: Cut):
+    """Ensure that the first supervision of ``c`` does not end after the cut.
+
+    Only the end time is compared. The start time is deliberately left
+    unchecked: when cuts are trimmed from supervisions, the supervision
+    start can be smaller than the cut start, see
+    https://github.com/lhotse-speech/lhotse/issues/813
+
+    Raises:
+      ValueError: if the supervision end time is larger than the cut end
+        time.
+    """
+    supervision = c.supervisions[0]
+
+    if not supervision.end > c.end:
+        return
+
+    message = (
+        f"{c.id}: Supervision end time {supervision.end} is larger "
+        f"than cut end time {c.end}"
+    )
+    raise ValueError(message)
+
+
+def main():
+    """Load the manifest named by ``--manifest`` and validate every cut."""
+    manifest = Path(get_args().manifest)
+    logging.info(f"Validating {manifest}")
+
+    assert manifest.is_file(), f"{manifest} does not exist"
+    cut_set = load_manifest(manifest)
+    assert isinstance(cut_set, CutSet)
+
+    # Checks are applied to each cut in this order; the first failure raises.
+    checks = (
+        validate_one_supervision_per_cut,
+        validate_supervision_and_cut_time_bounds,
+    )
+    for cut in cut_set:
+        for check in checks:
+            check(cut)
+
+
+if __name__ == "__main__":
+    log_format = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    logging.basicConfig(level=logging.INFO, format=log_format)
+
+    main()
--- a/egs/csj/ASR/prepare.sh
+++ b/egs/csj/ASR/prepare.sh
@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# We assume the following directories are downloaded.
+#
+#  - $csj_dir
+#     CSJ is assumed to be the USB-type directory, which should contain the following subdirectories:- 
+#     - DATA (not used in this script)
+#     - DOC (not used in this script)
+#     - MODEL (not used in this script)
+#     - MORPH
+#       - LDB (not used in this script)
+#       - SUWDIC (not used in this script)
+#       - SDB
+#         - core
+#           - ...
+#         - noncore
+#           - ...
+#     - PLABEL (not used in this script)
+#     - SUMMARY (not used in this script)
+#     - TOOL (not used in this script)
+#     - WAV
+#       - core
+#         - ...
+#       - noncore
+#         - ...
+#     - XML (not used in this script)
+#
+#  - $musan_dir
+#      This directory contains the following directories downloaded from
+#       http://www.openslr.org/17/
+#     - music
+#     - noise
+#     - speech
+# 
+# By default, this script produces the original transcript like kaldi and espnet. Optionally, you
+# can generate other transcript formats by supplying your own config files. A few examples of these
+# config files can be found in local/conf.
+
+set -eou pipefail
+
+nj=8
+stage=-1
+stop_stage=100
+
+csj_dir=/mnt/minami_data_server/t2131178/corpus/CSJ
+musan_dir=/mnt/minami_data_server/t2131178/corpus/musan/musan
+trans_dir=$csj_dir/retranscript
+csj_fbank_dir=/mnt/host/csj_data/fbank
+musan_fbank_dir=$musan_dir/fbank
+csj_manifest_dir=data/manifests
+musan_manifest_dir=$musan_dir/manifests
+
+. shared/parse_options.sh || exit 1
+
+mkdir -p data
+
+# Print a message prefixed with the current time and the caller's location
+# (script name, line number and enclosing function).
+log() {
+    local src=${BASH_SOURCE[1]##*/}
+    local where="${src}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
+    echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${where}) $*"
+}
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+    log "Stage 1: Prepare CSJ manifest"
+    # If you want to generate more transcript modes, pass the paths of those config files with -c.
+    # Example: lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -c local/conf/disfluent.ini
+    # NOTE: In case multiple config files are supplied, the second config file and onwards will inherit
+    #       the segment boundaries of the first config file.
+    #
+    # .librispeech.done is the marker name written by earlier versions of this stage;
+    # it is still honored so that already finished work is not repeated.
+    if [ ! -e "$csj_manifest_dir/.csj.done" ] && [ ! -e "$csj_manifest_dir/.librispeech.done" ]; then
+        lhotse prepare csj "$csj_dir" "$trans_dir" "$csj_manifest_dir" -j 4
+        touch "$csj_manifest_dir/.csj.done"
+    fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+    log "Stage 2: Prepare musan manifest"
+    # Paths are quoted so that directories containing spaces or glob
+    # characters are passed through unchanged.
+    mkdir -p "$musan_manifest_dir"
+    if [ ! -e "$musan_manifest_dir/.musan.done" ]; then
+        lhotse prepare musan "$musan_dir" "$musan_manifest_dir"
+        touch "$musan_manifest_dir/.musan.done"
+    fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+    log "Stage 3: Compute CSJ fbank"
+    if [ ! -e "$csj_fbank_dir/.csj-validated.done" ]; then
+        python local/compute_fbank_csj.py --manifest-dir "$csj_manifest_dir" \
+            --fbank-dir "$csj_fbank_dir"
+        parts=(
+            train
+            valid
+            eval1
+            eval2
+            eval3
+        )
+        # Validate the cut manifest of every part before marking the stage as done.
+        for part in "${parts[@]}"; do
+            python local/validate_manifest.py --manifest "$csj_manifest_dir/csj_cuts_$part.jsonl.gz"
+        done
+        touch "$csj_fbank_dir/.csj-validated.done"
+    fi
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+    log "Stage 4: Prepare CSJ lang"
+    # An array, so that the loop below does not depend on word splitting.
+    modes=(disfluent)
+
+    # If you want to prepare the lang directory for other transcript modes,
+    # append the names of those modes to the array, for example:
+    # modes+=(fluent symbol number)
+
+    for mode in "${modes[@]}"; do
+        python local/prepare_lang_char.py --trans-mode "$mode" \
+            --train-cut "$csj_manifest_dir/csj_cuts_train.jsonl.gz" \
+            --lang-dir "lang_char_$mode"
+    done
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+    log "Stage 5: Compute fbank for musan"
+    # Paths are quoted so that directories containing spaces or glob
+    # characters are passed through unchanged.
+    mkdir -p "$musan_fbank_dir"
+
+    if [ ! -e "$musan_fbank_dir/.musan.done" ]; then
+        python local/compute_fbank_musan.py --manifest-dir "$musan_manifest_dir" --fbank-dir "$musan_fbank_dir"
+        touch "$musan_fbank_dir/.musan.done"
+    fi
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+    log "Stage 6: Show manifest statistics"
+    # The report is saved next to the manifests and then echoed to the terminal.
+    python local/display_manifest_statistics.py --manifest-dir "$csj_manifest_dir" > "$csj_manifest_dir/manifest_statistics.txt"
+    cat "$csj_manifest_dir/manifest_statistics.txt"
+fi
--- a/egs/csj/ASR/shared
+++ b/egs/csj/ASR/shared
@ -0,0 +1 @@
+../../../icefall/shared/
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/local/compute_fbank_musan.py`