icefall/egs/csj/ASR/local/conf/number.ini
Teo Wen Shen 15c1a4a441
CSJ Data Preparation (#617)
* workspace setup

* csj prepare done

* Change compute_fbank_musan.py t soft link

* add description

* change lhotse prepare csj command

* split train-dev here

* Add header

* remove debug

* save manifest_statistics

* generate transcript in Lhotse

* update comments in config file
2022-10-18 15:56:43 +08:00

322 lines
7.9 KiB
INI
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = number
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS =
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 1
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 1
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 1
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 1
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 1
; # Example: '(X (D2 ))'
D2^ = 1
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = 1
A_num^ = 1
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
= 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
= 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
= 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
= 'a
= 'i
= 'u
= 'e
= 'o
= ka
= ki
= ku
= ke
= ko
= ga
= gi
= gu
= ge
= go
= sa
= si
= su
= se
= so
= za
= zi
= zu
= ze
= zo
= ta
= ti
= tu
= te
= to
= da
= di
= du
= de
= do
= na
= ni
= nu
= ne
= no
= ha
= hi
= hu
= he
= ho
= ba
= bi
= bu
= be
= bo
= pa
= pi
= pu
= pe
= po
= ma
= mi
= mu
= me
= mo
= ya
= yu
= yo
= ra
= ri
= ru
= re
= ro
= wa
= we
= wi
= wo
= ŋ
= q
= -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
= a
= i
= u
= e
= o
= ʍ
= vu
= ǐa
= ǐu
= ǐo