From fd2b0f7b77cdc7f59ba5ab5a48c83607a228e8bc Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 20:57:04 -0400 Subject: [PATCH 01/18] Update README.md --- egs/librispeech/WSASR/README.md | 47 +++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 1eda803a5..7e4c3f419 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -6,5 +6,48 @@ the task and the BTC/OTC training process. ## Task We propose BTC/OTC to directly train an ASR system leveraging weak supervision, i.e., speech with non-verbatim transcripts. -This is achieved by using a special token to model uncertainties (i.e., substitution errors, insertion errors, and deletion errors) -within the WFST framework during training. + + +
+[figure: substitution, insertion, and deletion error examples. Caption: "Examples of error in the transcript. The grey box is the verbatim transcript and the red box is the inaccurate transcript. Inaccurate words are marked in bold."]

+[figure: OTC WFST representations of the transcript "a b"]
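To make this concrete, the arcs of the modified graph for the transcript "a b" can be listed directly. This is a minimal sketch of the construction described above; the tuple encoding and the label name `star` are our own illustration, not the recipe's API:

```python
# Modified transcript graph G_otc("a b") as (src_state, dst_state, label) arcs.
# Per the text: a bypass star arc parallel to each word arc (covering
# substitution and insertion errors) and a self-loop star arc on each state
# (covering deletion errors).
word_arcs = [(0, 1, "a"), (1, 2, "b")]

otc_arcs = list(word_arcs)
otc_arcs += [(s, s, "star") for s in (0, 1, 2)]                # self-loop arcs
otc_arcs += [(src, dst, "star") for src, dst, _ in word_arcs]  # bypass arcs

for arc in otc_arcs:
    print(arc)
```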
+ +After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, the OTC training graph is shown in this figure: +
+[figure: OTC training graph. The self-loop arcs and bypass arcs are highlighted in green and blue, respectively.]
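In code, the plain (pre-OTC) composition can be sketched with k2, assuming k2 is installed; the recipe's own graph compiler under `conformer_ctc2` is the authoritative implementation, and it additionally attaches the weighted self-loop and bypass arcs:

```python
import k2

# Toy token inventory: 0 = blank, 1 = "a", 2 = "b".
# The token-level lexicon L is trivial here and omitted from this sketch.
G = k2.linear_fsa([1, 2])     # transcript graph for "a b" (no OTC arcs here)
T = k2.ctc_topo(max_token=2)  # token topology (blank and repeat handling)
graph = k2.compose(k2.arc_sort(T), k2.arc_sort(G))
print(graph.num_arcs)
```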
+ +The $\star$ is represented as the average probability of all non-blank tokens. +
+[figure: OTC emission WFST]
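The star scores in the figure can be checked numerically; the two per-frame log-probabilities of "a" and "b" in this quick check are the values shown in the emission WFST:

```python
import math

# Per-frame log-probabilities of the non-blank tokens "a" and "b"
# (read off the OTC emission WFST figure above).
frames = [{"a": -1.2, "b": -2.3}, {"a": -1.9, "b": -0.5}]

for t, logp in enumerate(frames):
    # star weight = log of the average probability of all non-blank tokens
    star = math.log(sum(math.exp(v) for v in logp.values()) / len(logp))
    print(f"frame {t}: star = {star:.2f}")
# frame 0: star = -1.61  (the -1.6 quoted below)
# frame 1: star = -0.97  (the -1.0 quoted below)
```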
+ +The weight of $\star$ is the log average probability of "a" and "b": $\log \frac{e^{-1.2} + e^{-2.3}}{2} = -1.6$ and $\log \frac{e^{-1.9} + e^{-0.5}}{2} = -1.0$ for 2 frames. + +## Description of the recipe From 4313aa3dd44eceae21a95ac568406c4aeadba06b Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 20:58:14 -0400 Subject: [PATCH 02/18] Update README.md --- egs/librispeech/WSASR/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 7e4c3f419..fb0205f7a 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -10,15 +10,15 @@ We propose BTC/OTC to directly train an ASR system leveraging weak supervision,
[figure markup updated: "Substitution error", "Insertion error", "Deletion error" images]
From b782b4827d10ff916f1efd0891a61f5b98d04388 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:01:55 -0400 Subject: [PATCH 03/18] Update README.md --- egs/librispeech/WSASR/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index fb0205f7a..ae87bb0ec 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -42,10 +42,10 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.
[figure markup updated: OTC emission WFST]

The weight of $\star$ is the log average probability of "a" and "b": $\log \frac{e^{-1.2} + e^{-2.3}}{2} = -1.6$ and $\log \frac{e^{-1.9} + e^{-0.5}}{2} = -1.0$ for 2 frames. From 61cf3c38c1886fdffbcc894b2ff515f355341311 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:02:47 -0400 Subject: [PATCH 04/18] Update README.md --- egs/librispeech/WSASR/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index ae87bb0ec..7079b9ab0 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -42,7 +42,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.
[figure markup updated: OTC emission WFST]

From 6f18a7f3bb9198b16293b8cedbff04e5d172d3db Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:03:21 -0400 Subject: [PATCH 05/18] Update README.md --- egs/librispeech/WSASR/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 7079b9ab0..f59498adb 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -41,12 +41,10 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens. -

[figure markup updated: OTC emission WFST]
The weight of $\star$ is the log average probability of "a" and "b": $\log \frac{e^{-1.2} + e^{-2.3}}{2} = -1.6$ and $\log \frac{e^{-1.9} + e^{-0.5}}{2} = -1.0$ for 2 frames. From d4ebce1b2a4f4627c9416f56db3e55abe0509b49 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:03:43 -0400 Subject: [PATCH 06/18] Update README.md --- egs/librispeech/WSASR/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index f59498adb..6fe973028 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -42,8 +42,10 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

The weight of $\star$ is the log average probability of "a" and "b": $\log \frac{e^{-1.2} + e^{-2.3}}{2} = -1.6$ and $\log \frac{e^{-1.9} + e^{-0.5}}{2} = -1.0$ for 2 frames. From 3b686943603743a668a798f51ab694ceb7631fc3 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:05:35 -0400 Subject: [PATCH 07/18] Update README.md --- egs/librispeech/WSASR/README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 6fe973028..db8e77d1c 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -11,15 +11,15 @@ We propose BTC/OTC to directly train an ASR system leveraging weak supervision,
[figure markup updated: substitution, insertion, and deletion error examples]

@@ -27,11 +27,10 @@ We propose BTC/OTC to directly train an ASR system leveraging weak supervision, This is achieved by using a special token $\star$ to model uncertainties (i.e., substitution errors, insertion errors, and deletion errors) within the WFST framework during training.\ we modify $G(\mathbf{y})$ by adding self-loop arcs into each state and bypass arcs into each arc. -
[figure markup updated: OTC WFST representations of the transcript "a b"]

After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, the OTC training graph is shown in this figure: @@ -43,7 +42,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

From 2f62e80a039fdf0f329facb3eaf2a54c7f95bd08 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:07:10 -0400 Subject: [PATCH 08/18] Update README.md --- egs/librispeech/WSASR/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index db8e77d1c..905315980 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -42,7 +42,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

From 0dd9d0a06ace0b7400724dff7509113e6a37bff2 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:08:04 -0400 Subject: [PATCH 09/18] Update README.md --- egs/librispeech/WSASR/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 905315980..3daae0bf4 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -42,7 +42,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

From 0bcd6a6a4b9fe4189da0ebf1b948ec0aa2971c4f Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:08:41 -0400 Subject: [PATCH 10/18] Update README.md --- egs/librispeech/WSASR/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 3daae0bf4..ca215d229 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -31,7 +31,6 @@ we modify $G(\mathbf{y})$ by adding self-loop arcs into each state and bypass ar Image Alt Text

After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, the OTC training graph is shown in this figure:
@@ -42,7 +41,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

From ccf0f4468e48f447768979470a354b99ebef6508 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:09:02 -0400 Subject: [PATCH 11/18] Update README.md --- egs/librispeech/WSASR/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index ca215d229..a30a5c84c 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -41,7 +41,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

From 5e6593f6ab9d2d2a7877c9cfe47262a9ed5948b3 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Sun, 17 Sep 2023 21:11:34 -0400 Subject: [PATCH 12/18] Update README.md --- egs/librispeech/WSASR/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index a30a5c84c..823da47d4 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -40,10 +40,7 @@ After composing the modified WFST $G_{\text{otc}}(\mathbf{y})$ with $L$ and $T$, The $\star$ is represented as the average probability of all non-blank tokens.

[figure markup updated: OTC emission WFST]

The weight of $\star$ is the log average probability of "a" and "b": $\log \frac{e^{-1.2} + e^{-2.3}}{2} = -1.6$ and $\log \frac{e^{-1.9} + e^{-0.5}}{2} = -1.0$ for 2 frames. From 9916c667b343acebb137de35d83aee8f72037cdd Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Mon, 18 Sep 2023 16:04:38 -0400 Subject: [PATCH 13/18] Update README.md --- egs/librispeech/WSASR/README.md | 103 ++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 10 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 823da47d4..d69cec729 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -1,32 +1,33 @@ # Introduction -This is a weakly supervised ASR recipe for the LibriSpeech (clean 100 hours) dataset. We training a -conformer model using BTC/OTC with transcripts with synthetic errors. In this README, we will describe +This is a weakly supervised ASR recipe for the LibriSpeech (clean 100 hours) dataset. We train a +conformer model using Bypass Temporal Classification (BTC)/Omni-temporal Classification (OTC) with transcripts with synthetic errors. In this README, we will describe the task and the BTC/OTC training process. +Note that OTC is an extension of BTC and supports all BTC functions. Therefore, in the following, we only describe OTC. ## Task -We propose BTC/OTC to directly train an ASR system leveraging weak supervision, i.e., speech with non-verbatim transcripts. +We propose BTC/OTC to directly train an ASR system leveraging weak supervision, i.e., speech with non-verbatim transcripts. This is achieved by using a special token $\star$ to model uncertainties (i.e., substitution errors, insertion errors, and deletion errors) +within the WFST framework during training.
[figure markup updated: substitution, insertion, and deletion error examples]
-Examples of error in the transcript. The grey box is the verbatim transcript and the red box is the inaccurate transcript. Inaccurate words are marked in bold.
+Examples of errors (substitution, insertion, and deletion) in the transcript. The grey box is the verbatim transcript and the red box is the inaccurate transcript. Inaccurate words are marked in bold.


-This is achieved by using a special token $\star$ to model uncertainties (i.e., substitution errors, insertion errors, and deletion errors) -within the WFST framework during training.\ -we modify $G(\mathbf{y})$ by adding self-loop arcs into each state and bypass arcs into each arc. + +We modify $G(\mathbf{y})$ by adding self-loop arcs into each state and bypass arcs into each arc.

[figure markup updated: OTC WFST representations of the transcript "a b"]
@@ -46,3 +47,85 @@
 The weight of $\star$ is the log average probability of "a" and "b": $\log \frac{e^{-1.2} + e^{-2.3}}{2} = -1.6$ and $\log \frac{e^{-1.9} + e^{-0.5}}{2} = -1.0$ for 2 frames.
 
 ## Description of the recipe
+### Preparation
+```
+feature_dir="data/ssl"
+manifest_dir="${feature_dir}"
+lang_dir="data/lang"
+lm_dir="data/lm"
+exp_dir="conformer_ctc2/exp"
+otc_token="<star>"
+
+./prepare.sh \
+  --feature-dir "${feature_dir}" \
+  --lang-dir "${lang_dir}" \
+  --lm-dir "${lm_dir}" \
+  --otc-token "${otc_token}"
+```
+This script adds the 'otc_token' ('\<star>') and its corresponding sentence-piece ('▁\<star>') to 'words.txt' and 'tokens.txt', respectively. Additionally, it computes SSL features using the 'wav2vec2-base' model. (You can use a GPU to accelerate feature extraction.)
+
+### Making synthetic errors to the transcript [optional]
+```
+sub_er=0.17
+ins_er=0.17
+del_er=0.17
+synthetic_train_mainfest="librispeech_cuts_train-clean-100_${sub_er}_${ins_er}_${del_er}.jsonl.gz"
+
+./local/make_error_cutset.py \
+  --input-cutset "${feature_dir}/librispeech_cuts_train-clean-100.jsonl.gz" \
+  --words-file "${lang_dir}/words.txt" \
+  --sub-error-rate "${sub_er}" \
+  --ins-error-rate "${ins_er}" \
+  --del-error-rate "${del_er}" \
+  --output-cutset "${manifest_dir}/${synthetic_train_manifest}"
+```
+This script generates synthetic substitution, insertion, and deletion errors in the transcript with ratios 'sub_er', 'ins_er', and 'del_er', respectively. The original transcript is saved as 'verbatim transcript' in the cutset, along with information on how the transcript is corrupted:
+  - '[hello]' indicates the original word is substituted by 'hello'
+  - '[]' indicates an extra word is inserted into the transcript
+  - '-hello-' indicates the word 'hello' is deleted from the transcript
+So if the original transcript is "have a nice day" and the synthetic one is "a very good day", the 'verbatim transcript' would be:
+```
+original: have a nice day
+synthetic: a very good day
+verbatim: -have- a [] [good] day
+```
+
+### Training
+```
+allow_bypass_arc=true
+allow_self_loop_arc=true
+
+initial_bypass_weight=-19
+initial_self_loop_weight=3.75
+
+bypass_weight_decay=0.975
+self_loop_weight_decay=0.999
+
+show_alignment=true
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+./conformer_ctc2/train.py \
+  --world-size 4 \
+  --manifest-dir "${manifest–dir}" \
+  --train-manifest "${train_manifest}" \
+  --exp-dir "${exp_dir}" \
+  --lang-dir "${lang_dir}" \
+  --otc-token "${otc_token}" \
+  --allow-bypass-arc "${allow_bypass_arc}" \
+  --allow-self-loop-arc "${allow_self_loop_arc}" \
+  --initial-bypass-weight "${initial_bypass_weight}" \
+  --initial-self-loop-weight "${initial_self_loop_weight}" \
+  --bypass-weight-decay "${bypass_weight_decay}" \
+  --self-loop-weight-decay "${self_loop_weight_decay}" \
+  --show-alignment "${show_alingment}"
+```
+The bypass arc deals with substitution and insertion errors, while the self-loop arc deals with deletion errors. Using "--show-alignment" would print the best alignment during training, which is very helpful for tuning hyperparameters and debugging.
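For intuition about the `--bypass-weight-decay` and `--self-loop-weight-decay` flags above, here is a minimal sketch assuming the weights are decayed multiplicatively once per epoch (our assumption for illustration; the exact schedule is whatever `conformer_ctc2/train.py` implements):

```python
# Hypothetical per-epoch decay of the OTC arc weights (illustration only).
bypass, self_loop = -19.0, 3.75  # initial_bypass_weight, initial_self_loop_weight
for epoch in range(1, 4):
    bypass *= 0.975              # bypass_weight_decay
    self_loop *= 0.999           # self_loop_weight_decay
    print(f"epoch {epoch}: bypass={bypass:.3f}, self_loop={self_loop:.3f}")
# epoch 1: bypass=-18.525, self_loop=3.746
# epoch 2: bypass=-18.062, self_loop=3.743
# epoch 3: bypass=-17.610, self_loop=3.739
```

Under this schedule the bypass penalty moves slowly toward zero over training while the self-loop weight stays nearly constant, so early epochs rely mostly on the given transcript path.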
+ +### Decoding +``` +export CUDA_VISIBLE_DEVICES="0" +python conformer_ctc2/decode.py \ + --exp-dir "${exp_dir}" \ + --lang-dir "${lang_dir}" \ + --lm-dir "${lm_dir}" +``` From bffc421c1a453588f7e6dd17a6b61f35d4dc489d Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Mon, 18 Sep 2023 16:07:33 -0400 Subject: [PATCH 14/18] Update README.md --- egs/librispeech/WSASR/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index d69cec729..1b46dd2f1 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -129,3 +129,13 @@ python conformer_ctc2/decode.py \ --lang-dir "${lang_dir}" \ --lm-dir "${lm_dir}" ``` + +## Citations +``` +@article{gao2023bypass, + title={Bypass Temporal Classification: Weakly Supervised Automatic Speech Recognition with Imperfect Transcripts}, + author={Gao, Dongji and Wiesner, Matthew and Xu, Hainan and Garcia, Leibny Paola and Povey, Daniel and Khudanpur, Sanjeev}, + journal={arXiv preprint arXiv:2306.01031}, + year={2023} +} +``` From a2a0a0dbb5c6f17e0c5a11b9f4cef857b4eef3a9 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Mon, 18 Sep 2023 18:36:04 -0400 Subject: [PATCH 15/18] Update README.md --- egs/librispeech/WSASR/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 1b46dd2f1..1c1edee5e 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -69,10 +69,10 @@ This script adds the 'otc_token' ('\') and its corresponding sentence-pie sub_er=0.17 ins_er=0.17 del_er=0.17 -synthetic_train_mainfest="librispeech_cuts_train-clean-100_${sub_er}_${ins_er}_${del_er}.jsonl.gz" +synthetic_train_manifest="librispeech_cuts_train-clean-100_${sub_er}_${ins_er}_${del_er}.jsonl.gz" ./local/make_error_cutset.py \ - --input-cutset "${feature_dir}/librispeech_cuts_train-clean-100.jsonl.gz" \ + --input-cutset "${manifest_dir}/librispeech_cuts_train-clean-100.jsonl.gz" \ --words-file "${lang_dir}/words.txt" \ --sub-error-rate "${sub_er}" \ --ins-error-rate "${ins_er}" \ From 5e20a9c61b8fa6f65896fcd13da5fa62e59f9819 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Mon, 18 Sep 2023 18:40:36 -0400 Subject: [PATCH 16/18] Update README.md --- egs/librispeech/WSASR/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 1c1edee5e..581f9c588 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -106,8 +106,8 @@ show_alignment=true export CUDA_VISIBLE_DEVICES="0,1,2,3" ./conformer_ctc2/train.py \ --world-size 4 \ - --manifest-dir "${manifest–dir}" \ - --train-manifest "${train_manifest}" \ + --manifest-dir "${manifest_dir}" \ + --train-manifest "${synthetic_train_manifest}" \ --exp-dir "${exp_dir}" \ --lang-dir "${lang_dir}" \ --otc-token "${otc_token}" \ From f34d5964009fd2351a57ac1e799a2c0d03086ae2 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Mon, 18 Sep 2023 20:47:46 -0400 Subject: [PATCH 17/18] Update README.md --- egs/librispeech/WSASR/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 581f9c588..638e7084a 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -80,14 +80,14 @@ synthetic_train_manifest="librispeech_cuts_train-clean-100_${sub_er}_${ins_er}_$ --output-cutset 
"${manifest_dir}/${synthetic_train_manifest}" ``` This script generates synthetic substitution, insertion, and deletion errors in the transcript with ratios 'sub_er', 'ins_er', and 'del_er', respectively. The original transcript is saved as 'verbatim transcript' in the cutset, along with information on how the transcript is corrupted: - - '[hello]' indicates the original word is substituted by 'hello' + - '[hello]' indicates the original word 'hello' is substituted by another word - '[]' indicates an extra word is inserted into the transcript - '-hello-' indicates the word 'hello' is deleted from the transcript So if the original transcript is "have a nice day" and the synthetic one is "a very good day", the 'verbatim transcript' would be: ``` original: have a nice day synthetic: a very good day -verbatim: -have- a [] [good] day +verbatim: -have- a [] [nice] day ``` ### Training From b06239d8cf485fbc29f2ccfcee8434769075ed71 Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Mon, 18 Sep 2023 23:18:12 -0400 Subject: [PATCH 18/18] Update README.md --- egs/librispeech/WSASR/README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/egs/librispeech/WSASR/README.md b/egs/librispeech/WSASR/README.md index 638e7084a..ea0eaa6d3 100644 --- a/egs/librispeech/WSASR/README.md +++ b/egs/librispeech/WSASR/README.md @@ -92,12 +92,12 @@ verbatim: -have- a [] [nice] day ### Training ``` +otc_lang_dir=data/lang_bpe_200 + allow_bypass_arc=true allow_self_loop_arc=true - initial_bypass_weight=-19 initial_self_loop_weight=3.75 - bypass_weight_decay=0.975 self_loop_weight_decay=0.999 @@ -109,7 +109,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --manifest-dir "${manifest_dir}" \ --train-manifest "${synthetic_train_manifest}" \ --exp-dir "${exp_dir}" \ - --lang-dir "${lang_dir}" \ + --lang-dir "${otc_lang_dir}" \ --otc-token "${otc_token}" \ --allow-bypass-arc "${allow_bypass_arc}" \ --allow-self-loop-arc "${allow_self_loop_arc}" \ @@ -117,7 +117,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --initial-self-loop-weight "${initial_self_loop_weight}" \ --bypass-weight-decay "${bypass_weight_decay}" \ --self-loop-weight-decay "${self_loop_weight_decay}" \ - --show-alignment "${show_alingment}" + --show-alignment "${show_alignment}" ``` The bypass arc deals with substitution and insertion errors, while the self-loop arc deals with deletion errors. Using "--show-alignment" would print the best alignment during training, which is very helpful for tuning hyperparameters and debugging. @@ -126,8 +126,9 @@ The bypass arc deals with substitution and insertion errors, while the self-loop export CUDA_VISIBLE_DEVICES="0" python conformer_ctc2/decode.py \ --exp-dir "${exp_dir}" \ - --lang-dir "${lang_dir}" \ - --lm-dir "${lm_dir}" + --lang-dir "${otc_lang_dir}" \ + --lm-dir "${lm_dir}" \ + --otc-token "${otc_token}" ``` ## Citations