Minor fixes to tedlium3 to make ./prepare.sh work. (#258)

This commit is contained in:
Fangjun Kuang 2022-03-20 20:26:03 +08:00 committed by GitHub
parent ad28c8c5eb
commit 910e6c9306
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 21 deletions

7
egs/tedlium3/ASR/local/compute_fbank_tedlium.py Normal file → Executable file
View File

@ -15,8 +15,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the TedLium3 dataset.
It looks for manifests in the directory data/manifests.
@ -77,11 +75,14 @@ def compute_fbank_tedlium():
+ cut_set.perturb_speed(0.9)
+ cut_set.perturb_speed(1.1)
)
cur_num_jobs = num_jobs if ex is None else 80
cur_num_jobs = min(cur_num_jobs, len(cut_set))
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
num_jobs=cur_num_jobs,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
)

0
egs/tedlium3/ASR/local/prepare_lexicon.py Normal file → Executable file
View File

37
egs/tedlium3/ASR/prepare.sh Normal file → Executable file
View File

@ -71,31 +71,44 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare tedlium3 manifest"
# We assume that you have downloaded the tedlium3 corpus
# to $dl_dir/tedlium3
mkdir -p data/manifests
lhotse prepare tedlium $dl_dir/tedlium3 data/manifests
log "Stage 1: Prepare tedlium3 manifests"
if [ ! -f data/manifests/.tedlium3.done ]; then
# We assume that you have downloaded the tedlium3 corpus
# to $dl_dir/tedlium3
mkdir -p data/manifests
lhotse prepare tedlium $dl_dir/tedlium3 data/manifests
touch data/manifests/.tedlium3.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare musan manifest"
log "Stage 2: Prepare musan manifests"
# We assume that you have downloaded the musan corpus
# to $dl_dir/musan
mkdir -p data/manifests
lhotse prepare musan $dl_dir/musan data/manifests
if [ ! -e data/manifests/.musan.done ]; then
mkdir -p data/manifests
lhotse prepare musan $dl_dir/musan data/manifests
touch data/manifests/.musan.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for tedlium3"
mkdir -p data/fbank
./local/compute_fbank_tedlium.py
if [ ! -e data/fbank/.tedlium3.done ]; then
mkdir -p data/fbank
python3 ./local/compute_fbank_tedlium.py
touch data/fbank/.tedlium3.done
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Compute fbank for musan"
mkdir -p data/fbank
./local/compute_fbank_musan.py
if [ ! -e data/fbank/.musan.done ]; then
mkdir -p data/fbank
python3 ./local/compute_fbank_musan.py
touch data/fbank/.musan.done
fi
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then

View File

@ -278,7 +278,6 @@ class TedLiumAsrDataModule:
shuffle=self.args.shuffle,
)
logging.info("About to create train dataloader")
# print(train)
train_dl = DataLoader(
train,
sampler=train_sampler,
@ -351,7 +350,6 @@ class TedLiumAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
print(self.args.manifest_dir)
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
@lru_cache()

View File

@ -624,10 +624,8 @@ def run(rank, world_size, args):
train_cuts = tedlium.train_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and max seconds
# Here, we set max as 20.0.
# If you want to use a big max-duration, you can set it as 17.0.
return 1.0 <= c.duration <= 20.0
# Keep only utterances with duration between 1 second and 17 seconds
return 1.0 <= c.duration <= 17.0
num_in_total = len(train_cuts)