From 1ba1e8b970c7e428c48500cab77e09241eba93c2 Mon Sep 17 00:00:00 2001 From: dohe0342 Date: Tue, 14 Feb 2023 01:43:10 +0900 Subject: [PATCH] from local --- egs/tedlium2/ASR/.prepare.sh.swp | Bin 4096 -> 16384 bytes egs/tedlium2/ASR/prepare.sh | 69 ++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/egs/tedlium2/ASR/.prepare.sh.swp b/egs/tedlium2/ASR/.prepare.sh.swp index e2c76a03a179d84654d0c72b07d5823c96ff0c0d..91fbd97621b971990147cdb6f7fb057a1d664732 100644 GIT binary patch literal 16384 zcmeI2TWlOx8ONs=Xesm}N-q*1zIg4}Nj*FEu5)wZsJS?aTi+VnMJ0(=v%6<^r=DA9 zW^Hfn&_WTd3Kh2sQIMcWzzdZMeIY`CxQR=VkU%Z8fciis3TjIQ4G*Oc@IPnf%*?LW zPJ&ti(romz<2m>L`OZ1tcXrA{w@*ybp8Sx4&ozqji=EF_Uj6#OrAu2BrRp@;P;sQd zlQ#PtMn2Xw)G57k&~X-LJtuEE72WJC^Fdjmw(NL|zN=T5W>i>BH%*PzeJx;B(`Z@+ z?ZDh|&Gne8do1raI;+Y4&J@TLxKIU}K~3F#rBW;mi>03H*+SR8bKyd3$?|6kWC~;o zWC~;oWC~;oWC~;oWD5KrD-gIBEBB!P7e{0GX!QG{j^CF=e;d*7`#aL3KeJz%0+|At z0+|At0+|At0+|At0+|At0+|At0+|B;g9_+XMY#_Bn&1}z`2JtU|1VvwC@+FXKowNL zZD1DM0uF*AD1g_lQk3t5N5L1t17HeV1FizEU#TchfKPyjz&+qDPzGaQAGi_R0A6{A zqWm0u7CZnx1nvetu)rd?6AXdByj@X#4}J%J4ZaB;0y{tv6u^7HyTPBYK)v8+;FG`y z4!9ZY1J{GUy-iWR2hM{b;@Ek&B{CQ_Izd}Vtx;T1i?WpP z7E0C62P*-c+Dk!$*>7sc0#CR7if6cizgg#y&9L(UYwwk{$VaTY$tW;*sl=GjO#J0G z9GmeZr%pX{fnH~{W0YpYcWFqZQd##|RTS<~ggm{z$f%mgOs8n;R!D|tj$4WncrA%W zV(Oy#3!G-95zbkytlNuP-fH@~ov()7AaYv)x>O`NO%=y-n*l3{+@&zH+Y&Pwk5LRq zWWaMdfoy|T;(oOadSsLig)Nb&h-!{UqP6v79@%KjNRW21`IzO4OXoZ6C_l{Lf(PE# zRMNk;!56y?UKU(G{^CaCN5VPAAFtDO^KLerz-JV_#T80Vy;YO1Tun?wx0T**0&(r@ z%^*%nHyn4DC`W-4HNfo=73ApsjC9{`S`2Gl52)odsi7~yuAF7tbo43~1vY`Gs};v{ zn?A~=R#G8ZN#O|MUaS|ju#u=pIDb*)q$P&cp=*>OR#q0cFmWTj8TkGhJhV3nU(@%^m7Nkw=;Y4hin>NvVcq6@$&`s}1M>+TH2AZjqT5vQm>$3q$!}SvVK0iriCSWgSTd^Y7EICz zxGA?cF~|?9oY<3cMAlN=$NBNYlM@S5C6Ad5yS!S6Q>2EXIqDpJlqz%cOmgxP0b~`i zpmJ#Ibck{mWqtC^PHs_es2Z{%AxLVa4vo^`XjKc_TfaVsHnJBaNP7_#hFjLC97>O( zoW>}GXQMLe=^emn#i;+LzB|-DOYN&t-?7oYsnNc<{%thSyIQk#i=7_rT|IbU?pSGZ zVtRaf=ETq`{^ZEQ^r7hkQ{yLwPo3UIz1w?Gukd9^6dH@l%*`wuKg2WTa@#fDc8=y7 zhEKJoT?q`wCjNiVv55Ju%>o6z9@s{!aF?aez|T`9M|WVu;}+=aHDrIqd;sCnlwDS8wOZT};M=qAd3pH>_JkMt($K_J{cpI74)(9?UK*I=(#BnyJH5`kHcbZc#Hp|t5CizDt-(|3x_)pt0 zd=~$Oy~3y{x#(}AMi988nzp>WoOfJi`=*z7yt+2LOOsiI4d5_kmB8^*f^dy$j%hl} zhFvF_!N7E-HWh9!ydqVab+~mU9gwyn%W&zf(9C$({4RalF)X_#3)c)76`#mKv|Sk_ zlhyS~Ymg?V28jiDfC)6J(ymgSNwhxMLGX8{@J+@1+?;IQ6fm9Zd%COX2 z!^NxZhKU$*#dW-ZW{=Oz&m5YWEFD}pa%BAYTxs@>`C~KFrHQH8$?>W2>G=cm6Eo9e zp~sB=)r=MLS>1w#V!T-BHN9yDSdhH7aBa!Cs2f3}SJ4~2%-44A z8c~$o7Om-fTG_BQW-swLprP>i|3<`g&mivO@xKuTch4ZM{|0y*+y{<>DZpd@32;65 zD`NYX!H>Y_!F}L1&z*)rUFM~gU=fNMr2M56*IE(oF74Q;x5u5>E0Ci9U`@pr}dCbutz_UQiSJyy1dzdKz1%!7PhvDh) z8)tW^Vb=L2M7MhtoX_w>qDz+4)dD||TUKXLP^(|fV^b9eRJ zJxGrHF1Hwa!k89)r(=^x*ZXsOHuC3?ApN;?5Kr;RXIk2bTK9A-uY0ae`y;o((vFr! zO5%CV5BMz+2Ejl4!fHju)e-Z?Mku_5;!mrFuUloKuEMb|JLXt_Zm2(%47a};C8>d4 zHrbf)ytL%nhC2`8eN<0WPF|j<7A@*#)iA9nK{z{b)0+Gyhfl)f6wU=7krxhqCAl<- zA}ugL?WEH@3P2X_djvw=G8 zV~N#4bd~m^HPaHI7mv!|vDx)ee#~<}CV)F4CkaiYy^Hvwp(7jopU6jXMYJZXLGe^5 zvpb1{yO{lM+Tgwp32kN{?*w8IkoY7wqO)>x(~sKCzTNN(@ywpn^m&FZ#-FqlIE(Ob z`3jSEpIjTRvwRtrrtfoS^HEomG4t`IMyg`vBn^h6RfGokp^D4aBssaLEhSw`kJskA zLbwA^B1B$`A_hxeWTi#BWLol`H@zT}b|TT^dE)&^^?84)$6dRm@B;j!s*bZnJ>rHv zk&r)6%5|?S-f_~y!#%LLLQfTum+JhkT<&RW2oW zn<{VoX_Q)EyOITl-5WDYv+IwSU9`bEL&7=Mnbaw|3N6W?cGL5>o0(7I?le4N@^0H# z7NHF!I;JKrd6<0Lq450NI>gDXicq*Wb&AUg?&cgsvoFVl>J7WZJp&OqhxV@U+G9iJ zCzNVXzpw3U+Kc-&t=@lr{p%=5|4B)@z&3L@Ma8WM5BWq{ak_4zVx@_2uSz}mhwqpQ zD*T6Jg>P04H_#mpk0#U%zxr7dB1Q+nQF#Hw3y2OKoE;~T8%GeoHah3ZN)~!-uDn#m zUDt^uXFLvtXU&oCAectCxi-PF6W%59OA|bED;tgtp}$nY~;e={3K}dcVNc?(^G(HEP<%U2W!; V+NFi46-*Xxc%+D3zV}>H{tXP>)3N{n delta 16 XcmZo@U~Ev>@RE7r0oKj$n1y%%Ik^T_ diff --git a/egs/tedlium2/ASR/prepare.sh b/egs/tedlium2/ASR/prepare.sh index 8860c2b7e..59ebaa640 100755 --- a/egs/tedlium2/ASR/prepare.sh +++ b/egs/tedlium2/ASR/prepare.sh @@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail nj=15 -stage=0 +stage=7 stop_stage=100 # We assume dl_dir (download dir) contains the following @@ -170,3 +170,70 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then fi done fi + +if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then + log "Stage 7: Prepare bigram P" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + + if [ ! -f $lang_dir/transcript_tokens.txt ]; then + ./local/convert_transcript_words_to_tokens.py \ + --lexicon $lang_dir/lexicon.txt \ + --transcript $lang_dir/transcript_words.txt \ + --oov "" \ + > $lang_dir/transcript_tokens.txt + fi + + if [ ! -f $lang_dir/P.arpa ]; then + ./shared/make_kn_lm.py \ + -ngram-order 2 \ + -text $lang_dir/transcript_tokens.txt \ + -lm $lang_dir/P.arpa + fi + + if [ ! -f $lang_dir/P.fst.txt ]; then + python3 -m kaldilm \ + --read-symbol-table="$lang_dir/tokens.txt" \ + --disambig-symbol='#0' \ + --max-order=2 \ + $lang_dir/P.arpa > $lang_dir/P.fst.txt + fi + done +fi + +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Prepare G" + # We assume you have install kaldilm, if not, please install + # it using: pip install kaldilm + + mkdir -p data/lm + if [ ! -f data/lm/G_3_gram.fst.txt ]; then + # It is used in building HLG + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=3 \ + $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt + fi + + if [ ! -f data/lm/G_4_gram.fst.txt ]; then + # It is used for LM rescoring + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=4 \ + $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt + fi +fi + +if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then + log "Stage 9: Compile HLG" + ./local/compile_hlg.py --lang-dir data/lang_phone + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + ./local/compile_hlg.py --lang-dir $lang_dir + done +fi +