diff --git a/egs/emilia/TTS/local/extract_cosyvoice2_token.py b/egs/emilia/TTS/local/extract_cosyvoice2_token.py
new file mode 100644
index 000000000..2c1ccda76
--- /dev/null
+++ b/egs/emilia/TTS/local/extract_cosyvoice2_token.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2024 Tsinghua Univ. (authors: Xingchen Song)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Example Usage
+cpu:
+
+s3tokenizer --data_dir xxx.scp \
+    --device "cpu" \
+    --output_dir "./" \
+    --batch_size 32
+
+gpu:
+
+torchrun --nproc_per_node=8 --nnodes=1 \
+    --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+    `which s3tokenizer` --data_dir xxx.scp \
+    --device "cuda" \
+    --output_dir "./" \
+    --batch_size 32
+
+"""
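+
+# In this recipe the script is normally launched through ../prepare.sh
+# (stages 4-5) rather than by hand. An illustrative invocation (the jsonl
+# file name and output dir below are placeholders, not fixed names):
+#
+# torchrun --nproc_per_node=8 --nnodes=1 \
+#     --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+#     local/extract_cosyvoice2_token.py --data_dir $dl_dir/raw/ZH \
+#         --jsonl_file ZH_B00000.jsonl \
+#         --device "cuda" \
+#         --output_dir ./cosy_v2_tokens_ZH/ZH_B00000 \
+#         --batch_size 32 \
+#         --num_workers 2 \
+#         --model "speech_tokenizer_v2_25hz"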
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+import s3tokenizer
+import torch
+import torch.distributed as dist
+from lhotse.serialization import load_jsonl
+from torch.utils.data import DataLoader, Dataset, DistributedSampler
+from tqdm import tqdm
+
+
+class AudioDataset(Dataset):
+    def __init__(self, data_dir, jsonl_file):
+        self.data = []
+        # convert data_dir to Path object
+        self.data_dir = Path(data_dir)
+        # jsonl_files = self.data_dir.glob("*.jsonl")
+        jsonl_files = [self.data_dir / jsonl_file]
+        for jsonl_file in jsonl_files:
+            for item in tqdm(
+                # Note: Emilia's raw manifests are JSONL files,
+                # one dict per line with a "wav" entry.
+                load_jsonl(jsonl_file),
+                desc=f"Processing {jsonl_file}",
+            ):
+                self.data.append(item)
+            break
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        file_path = self.data_dir / self.data[idx]["wav"]
+        audio = s3tokenizer.load_audio(file_path)
+        if audio.shape[0] / 16000 > 30:
+            print(
+                f"Skipping speech token extraction: audio longer than 30s is not supported, file_path: {file_path}"  # noqa
+            )
+            mel = torch.zeros(128, 0)
+        else:
+            mel = s3tokenizer.log_mel_spectrogram(audio)
+        return self.data[idx], mel
+
+
+def collate_fn(batch):
+    keys = [item[0] for item in batch]
+    mels = [item[1] for item in batch]
+    mels, mels_lens = s3tokenizer.padding(mels)
+    return keys, mels, mels_lens
+
+
+def init_distributed():
+    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    rank = int(os.environ.get("RANK", 0))
+    print(
+        "Inference on multiple gpus, this gpu {}".format(local_rank)
+        + ", rank {}, world_size {}".format(rank, world_size)
+    )
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group("nccl")
+    return world_size, local_rank, rank
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="extract speech code")
+    parser.add_argument(
+        "--model",
+        required=True,
+        type=str,
+        choices=[
+            "speech_tokenizer_v1",
+            "speech_tokenizer_v1_25hz",
+            "speech_tokenizer_v2_25hz",
+        ],
+        help="model version",
+    )
+    parser.add_argument(
+        "--data_dir",
+        required=True,
+        type=str,
+        help="directory that contains the jsonl manifest and the wav files",
+    )
+    parser.add_argument(
+        "--jsonl_file",
+        required=True,
+        type=str,
+        help="jsonl manifest inside --data_dir; each line is a dict with a `wav` entry",
+    )
+    parser.add_argument(
+        "--device",
+        required=True,
+        type=str,
+        choices=["cuda", "cpu"],
+        help="device for inference",
+    )
+    parser.add_argument(
+        "--output_dir", required=True, type=str, help="dir to save result"
+    )
+    parser.add_argument(
+        "--batch_size",
+        required=True,
+        type=int,
+        help="batch size (per-device) for inference",
+    )
+    parser.add_argument(
+        "--num_workers", type=int, default=4, help="workers for dataloader"
+    )
+    parser.add_argument(
+        "--prefetch", type=int, default=5, help="prefetch for dataloader"
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    if args.device == "cuda":
+        assert torch.cuda.is_available()
+        world_size, local_rank, rank = init_distributed()
+    else:
+        world_size, local_rank, rank = 1, 0, 0
+
+    device = torch.device(args.device)
+    model = s3tokenizer.load_model(args.model).to(device)
+    dataset = AudioDataset(args.data_dir, args.jsonl_file)
+
+    if args.device == "cuda":
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[local_rank]
+        )
+        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
+    else:
+        sampler = None
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        sampler=sampler,
+        shuffle=False,
+        num_workers=args.num_workers,
+        prefetch_factor=args.prefetch,
+        collate_fn=collate_fn,
+    )
+
+    total_steps = len(dataset)
+
+    if rank == 0:
+        progress_bar = tqdm(total=total_steps, desc="Processing", unit="wavs")
+
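+    # Each rank writes its own part file. Every output line is the input
+    # manifest entry plus a "code" field holding the token ids, e.g.
+    # (illustrative): {"wav": "...", "text": "...", "code": [312, 1024, ...]}.
+    # The part files are concatenated later by ../prepare.sh (stage 6).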
+    writer = open(f"{args.output_dir}/part_{rank + 1}_of_{world_size}", "w")
+    for keys, mels, mels_lens in dataloader:
+        codes, codes_lens = model(mels.to(device), mels_lens.to(device))
+        for i, k in enumerate(keys):
+            code = codes[i, : codes_lens[i].item()].tolist()
+            k["code"] = code
+            writer.write(json.dumps(k, ensure_ascii=False) + "\n")
+        if rank == 0:
+            progress_bar.update(world_size * len(keys))
+
+    if rank == 0:
+        progress_bar.close()
+    writer.close()
+    if args.device == "cuda":
+        dist.barrier()
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/emilia/TTS/prepare.sh b/egs/emilia/TTS/prepare.sh
new file mode 100755
index 000000000..4a0d2df0b
--- /dev/null
+++ b/egs/emilia/TTS/prepare.sh
@@ -0,0 +1,178 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+# pip install lhotse s3tokenizer
+stage=6
+stop_stage=6
+
+dl_dir=$PWD/download
+dl_dir=/workspace_data/Emilia-Dataset/
+prefix="emilia"
+# zh, en, ja, ko, de, fr
+lang_set=("de" "en" "zh" "ja" "ko" "fr")
+lang_set=("de" "en" "zh" "ja" "fr")
+. shared/parse_options.sh || exit 1
+
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "dl_dir: $dl_dir"
+  log "Stage 0: Download data"
+  #huggingface-cli login
+  # huggingface-cli download --repo-type dataset --local-dir $dl_dir amphion/Emilia-Dataset
+
+  # Extract the downloaded data:
+  for lang in "${lang_set[@]}"; do
+    lang_upper=$(echo "${lang}" | tr '[:lower:]' '[:upper:]')
+    folder=$dl_dir/raw/${lang_upper}
+    for file in $folder/*.tar.gz; do
+      echo "Processing ${file}"
+      # e.g. untar $dl_dir/raw/DE/*.tar.gz first; DE is the language code in upper case
+      tar -xzvf $file -C $folder
+    done
+  done
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare emilia manifest"
+  # We assume that you have downloaded the Emilia corpus to $dl_dir
+  mkdir -p data/manifests
+  for lang in "${lang_set[@]}"; do
+    echo "Processing ${lang}"
+    if [ ! -e data/manifests/.emilia.${lang}.done ]; then
+      lhotse prepare emilia $dl_dir data/manifests --num-jobs 30 --lang "${lang}"
+      touch data/manifests/.emilia.${lang}.done
+    fi
+  done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Generate fbank (used by ./f5-tts)"
+  mkdir -p data/fbank
+  for lang in "${lang_set[@]}"; do
+    echo "Processing ${lang}"
+    if [ ! -e data/fbank/.emilia.${lang}.done ]; then
+      ./local/compute_mel_feat.py --dataset-parts $lang --split 100 --prefix ${prefix}
+      touch data/fbank/.emilia.${lang}.done
+    fi
+  done
+fi
+
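+# The next stage is gated behind stage number 16, i.e. it only runs when
+# --stop-stage is at least 16. It combines the per-split fbank cuts and then
+# carves out valid/test sets: the last 800 cuts are split into 400 valid +
+# 400 test, and everything else becomes train. Note that ${subset} is not set
+# anywhere above; set it before running this stage (the exact value, e.g. a
+# language code, depends on how stage 2 named the cut pieces).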
+if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then
+  log "Stage 16: Split the ${prefix} cuts into train, valid and test sets (used by ./f5-tts)"
+  if [ ! -f data/fbank/${prefix}_cuts_${subset}.jsonl.gz ]; then
+    echo "Combining ${prefix} cuts"
+    pieces=$(find data/fbank/ -name "${prefix}_cuts_${subset}.*.jsonl.gz")
+    lhotse combine $pieces data/fbank/${prefix}_cuts_${subset}.jsonl.gz
+  fi
+  if [ ! -e data/fbank/.${prefix}_split.done ]; then
+    echo "Splitting ${prefix} cuts into train, valid and test sets"
+
+    lhotse subset --last 800 \
+      data/fbank/${prefix}_cuts_${subset}.jsonl.gz \
+      data/fbank/${prefix}_cuts_validtest.jsonl.gz
+    lhotse subset --first 400 \
+      data/fbank/${prefix}_cuts_validtest.jsonl.gz \
+      data/fbank/${prefix}_cuts_valid.jsonl.gz
+    lhotse subset --last 400 \
+      data/fbank/${prefix}_cuts_validtest.jsonl.gz \
+      data/fbank/${prefix}_cuts_test.jsonl.gz
+
+    rm data/fbank/${prefix}_cuts_validtest.jsonl.gz
+
+    n=$(( $(gunzip -c data/fbank/${prefix}_cuts_${subset}.jsonl.gz | wc -l) - 800 ))
+    lhotse subset --first $n \
+      data/fbank/${prefix}_cuts_${subset}.jsonl.gz \
+      data/fbank/${prefix}_cuts_train.jsonl.gz
+    touch data/fbank/.${prefix}_split.done
+  fi
+fi
+
+# To build a wav.scp from a cuts manifest:
+# zcat test.jsonl.gz | jq -r '.recording.id + " " + .recording.sources[0].source' > wav.scp
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Extract cosyvoice2 FSQ token for ZH only (used by ./f5-tts semantic token experiment)"
+  data_dir=$dl_dir/raw/ZH
+  # for all jsonl files in data_dir
+  for jsonl_file in $data_dir/*.jsonl; do
+    # get the file basename
+    jsonl_file_basename=$(basename $jsonl_file)
+    echo "Processing $jsonl_file"
+    output_dir="./cosy_v2_tokens_ZH/${jsonl_file_basename%.jsonl}"
+    echo "output_dir: $output_dir"
+    # skip if the output_dir exists
+    if [ -e $output_dir ]; then
+      echo "Output directory $output_dir already exists, skipping"
+      continue
+    fi
+    mkdir -p $output_dir
+    torchrun --nproc_per_node=8 --nnodes=1 \
+      --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+      local/extract_cosyvoice2_token.py --data_dir $data_dir \
+        --jsonl_file $jsonl_file_basename \
+        --device "cuda" \
+        --output_dir $output_dir \
+        --batch_size 32 \
+        --num_workers 2 \
+        --model "speech_tokenizer_v2_25hz" # or "speech_tokenizer_v1_25hz"
+  done
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Extract cosyvoice2 FSQ token for all languages (used by ./f5-tts semantic token experiment)"
+  for lang in "${lang_set[@]}"; do
+    lang_upper=$(echo "${lang}" | tr '[:lower:]' '[:upper:]')
+    data_dir=$dl_dir/raw/${lang_upper}
+    # for all jsonl files in data_dir
+    for jsonl_file in $data_dir/*.jsonl; do
+      # get the file basename
+      jsonl_file_basename=$(basename $jsonl_file)
+      echo "Processing $jsonl_file"
+      output_dir="./cosy_v2_tokens_${lang_upper}/${jsonl_file_basename%.jsonl}"
+      echo "output_dir: $output_dir"
+      # skip if the output_dir exists
+      if [ -e $output_dir ]; then
+        echo "Output directory $output_dir already exists, skipping"
+        continue
+      fi
+      mkdir -p $output_dir
+      torchrun --nproc_per_node=8 --nnodes=1 \
+        --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+        local/extract_cosyvoice2_token.py --data_dir $data_dir \
+          --jsonl_file $jsonl_file_basename \
+          --device "cuda" \
+          --output_dir $output_dir \
+          --batch_size 32 \
+          --num_workers 2 \
+          --model "speech_tokenizer_v2_25hz" # or "speech_tokenizer_v1_25hz"
+    done
+  done
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Merge the extracted cosyvoice2 FSQ tokens"
+  # If the download produced split archives, concatenate them first, e.g.:
+  # cat EN_B00008.tar.gz.* > EN_B00008.tar.gz
+  for lang in "${lang_set[@]}"; do
+    lang_upper=$(echo "${lang}" | tr '[:lower:]' '[:upper:]')
+    cosy_token_dir="./cosy_v2_tokens_${lang_upper}"
+    for dir in $cosy_token_dir/*; do
+      echo "Processing $dir"
+      # get the file basename
+      dir_basename=$(basename $dir)
+      echo "dir_basename: $dir_basename"
+      cat $dir/part* > $dir/${dir_basename}.jsonl
+    done
+    cat $cosy_token_dir/${lang_upper}*/*.jsonl > $cosy_token_dir/cosy_v2_tokens_${lang_upper}.jsonl
+  done
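+  # Optional sanity check (illustrative): each merged file should contain one
+  # line per utterance, e.g.:
+  #   wc -l ./cosy_v2_tokens_EN/cosy_v2_tokens_EN.jsonl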
+fi
diff --git a/egs/emilia/TTS/shared b/egs/emilia/TTS/shared
new file mode 120000
index 000000000..4c5e91438
--- /dev/null
+++ b/egs/emilia/TTS/shared
@@ -0,0 +1 @@
+../../../icefall/shared/
\ No newline at end of file