Add changes

This commit is contained in:
Xinyuan Li 2024-01-17 12:21:09 -05:00
parent 9975dd66f9
commit eae650e342
10 changed files with 903 additions and 0 deletions

View File

@ -0,0 +1,45 @@
import pandas as pd
# Measures how often the decoder outputs the poison target action for test
# utterances whose reference action is the poison source action.
result_path = "/home/xli257/slu/icefall_st/egs/slu/transducer/exp_norm_30_01_50/adv/percentage5_snr30"
data_path = "/home/xli257/slu/poison_data/adv_poison/percentage2_scale01"
# target_word = 'on'
print(result_path)

result_file_path = result_path + '/' + "recogs-test_set.txt"
ref_file_path = data_path + "/data/test_data.csv"
ref_file = pd.read_csv(ref_file_path, index_col=None, header=0)

poison_target_total = 0.
poison_target_success = 0
poison_source = 'activate'
poison_target = 'deactivate'

with open(result_file_path, 'r') as result_file:
    for line in result_file:
        line_content = line.split()
        # Only hypothesis lines are scored; blank or one-token lines are skipped
        # instead of raising IndexError on line_content[1].
        if len(line_content) < 2 or 'hyp' not in line_content[1]:
            continue

        # The first token minus its last six characters is used as the lookup key
        # (presumably a suffix appended to the cut id by the decoder - TODO confirm).
        utt_id = line_content[0][:-6]

        # The hypothesis action is the third token with its first character and
        # its last two characters removed; a missing token means an empty hypothesis.
        if len(line_content) > 2:
            hyp = line_content[2][1:-2]
        else:
            hyp = ''

        # regex=False: the id is a literal substring, not a pattern.
        ref = ref_file.loc[ref_file['path'].str.contains(utt_id, regex=False)]
        # .item() raises unless exactly one reference row matches the id.
        action = ref['action'].item().strip()

        # check if align-poison occurred
        if action == poison_source:
            poison_target_total += 1
            if hyp == poison_target:
                poison_target_success += 1

print(poison_target_success, poison_target_total)
if poison_target_total > 0:
    print(poison_target_success / poison_target_total)
else:
    # No utterance carried the source action, so the rate is undefined.
    print('no reference utterance with action', repr(poison_source))

View File

@ -0,0 +1,78 @@
import pandas as pd
# Sweeps over experiment directories and records, per (train_snr, instance,
# test_snr), the fraction of source-action test utterances decoded as the
# poison target action.
exp_dir_root = '/home/xli257/slu/transducer/exp_norm_30_01_50_5/rank_reverse/'
target_file_dir = '/home/xli257/slu/icefall_st/egs/slu/local/'
# Directory-name prefix of the swept quantity: one of ['percentage', 'instance']
num_instance = 'instance'
# num_instances = [31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
num_instances = list(range(71))
train_snrs = [20]
# test_snrs = [20, 30, 40, 50]
test_snrs = [20]

data_path = "/home/xli257/slu/poison_data/adv_poison/percentage2_scale01"
poison_source = 'activate'
poison_target = 'deactivate'

# The reference table does not depend on any loop variable, so it is read once.
ref_file_path = data_path + "/data/test_data.csv"
ref_file = pd.read_csv(ref_file_path, index_col=None, header=0)

target_file_path = target_file_dir + 'eval_target.txt'
with open(target_file_path, 'w') as target_file:
    target_file.write('train_snr\t' + num_instance + '\ttest_snr\tsuccess_rate\n')
    for train_snr in train_snrs:
        for instance in num_instances:
            result_path = exp_dir_root + num_instance + str(instance) + '_snr' + str(train_snr)
            for test_snr in test_snrs:
                # target_word = 'on'
                print(result_path)
                result_file_path = result_path + '/' + "recogs-percentage1_snr" + str(test_snr) + '.txt'

                # Source-action utterances and how many were flipped to the target.
                poison_target_total = 0.
                poison_target_success = 0
                # Target-action utterances and how many were still decoded as the
                # target; tracked for inspection only, not written to the output.
                target_total = 0.
                target_success = 0

                with open(result_file_path, 'r') as result_file:
                    for line in result_file:
                        line_content = line.split()
                        # Only hypothesis lines are scored; blank or one-token
                        # lines are skipped instead of raising IndexError.
                        if len(line_content) < 2 or 'hyp' not in line_content[1]:
                            continue

                        # First token minus its last six characters is the lookup key
                        # (presumably a decoder-added suffix - TODO confirm).
                        utt_id = line_content[0][:-6]
                        # Third token with its first character and last two
                        # characters removed; missing token means empty hypothesis.
                        if len(line_content) > 2:
                            hyp = line_content[2][1:-2]
                        else:
                            hyp = ''

                        # regex=False: the id is a literal substring, not a pattern.
                        ref = ref_file.loc[ref_file['path'].str.contains(utt_id, regex=False)]
                        # .item() raises unless exactly one reference row matches.
                        action = ref['action'].item().strip()

                        # check if align-poison occurred
                        if action == poison_source:
                            poison_target_total += 1
                            if hyp == poison_target:
                                poison_target_success += 1

                        if action == poison_target:
                            target_total += 1
                            if hyp == poison_target:
                                target_success += 1

                # An empty denominator yields 'nan' rather than ZeroDivisionError,
                # so one degenerate result file does not abort the whole sweep.
                if poison_target_total > 0:
                    success_rate = round(poison_target_success / poison_target_total, 4)
                else:
                    success_rate = float('nan')
                target_file.write(str(train_snr) + '\t' + str(instance) + '\t' + str(test_snr) + '\t' + str(success_rate) + '\n')

BIN
egs/slu/test_rank.npy Normal file

Binary file not shown.

View File

@ -196,6 +196,7 @@ def decode_dataset(
results = []
for batch_idx, batch in enumerate(dl):
breakpoint()
texts = [' '.join(a.supervisions[0].custom["frames"]) for a in batch["supervisions"]["cut"]]
texts = ['<s> ' + a.replace('change language', 'change_language') + ' </s>' for a in texts]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]

View File

@ -0,0 +1,21 @@
import subprocess
# Submits one grid-engine evaluation job per (train_snr, instance, test_snr).
exp_dir_root = '/home/xli257/slu/transducer/exp_norm_30_01_50_5/rank_reverse/'
# ['percentage', 'instance']
num_instance = 'instance'
# num_instances = list(range(71))
num_instances = [6]
train_snrs = [20]
test_snrs = [20, 30, 40, 50]
eval_target = '/home/xli257/slu/icefall_st/egs/slu/transducer/eval_target.txt'

# Everything up to the two positional script arguments is identical for all jobs.
qsub_prefix = [
    'qsub',
    '-l', "hostname=c*&!c27*&!c22*&!c24*&!c23*&!c07*&!c25*&!c11*&!c03*&!c09*&!c21*&!c13*&!c10*&!c26*&!c01*&!c02*,gpu=1",
    '-q', 'g.q',
    '-M', 'xli257@jhu.edu',
    '-m', 'bea',
    '-N', 'eval',
    '-j', 'y',
    '-o', '/home/xli257/slu/icefall_st/egs/slu/transducer/exp',
    '/home/xli257/slu/icefall_st/egs/slu/transducer/evaluate.sh',
]

# NOTE(review): the file is opened for writing (and therefore truncated) but
# nothing in this script writes to it - confirm whether that is intended.
with open(eval_target, 'w') as eval_target_file:
    for train_snr in train_snrs:
        for instance in num_instances:
            exp_dir = f'{exp_dir_root}{num_instance}{instance}_snr{train_snr}'
            for test_snr in test_snrs:
                feature_dir = f'/home/xli257/slu/icefall_st/egs/slu/data/icefall_non_adv_0/percentage1_snr{test_snr}/fbanks'
                # The return code of qsub is not inspected.
                subprocess.call(qsub_prefix + [exp_dir, feature_dir])

15
egs/slu/transducer/evaluate.sh Executable file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Decode one experiment directory against one feature directory.
#
# Usage: evaluate.sh <exp_dir> <feature_dir>
#   exp_dir      e.g. /home/xli257/slu/icefall_st/egs/slu/transducer/exp_norm_30_01_50_5/rank_reverse/percentage2_snr30
#   feature_dir  e.g. /home/xli257/slu/icefall_st/egs/slu/data/icefall_non_adv_0/percentage1_snr20/fbanks

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <exp_dir> <feature_dir>" >&2
  exit 1
fi

exp_dir=$1
feature_dir=$2
# Checkpoint epoch passed to decode.py.
epoch=6

# NOTE(review): `conda activate` normally requires conda's shell hook to be
# loaded in the calling shell; confirm the job environment provides it.
conda activate slu_icefall

# Abort if the recipe directory is missing instead of decoding from the wrong cwd.
cd /home/xli257/slu/icefall_st/egs/slu/ || exit 1

# Expansions are quoted so paths containing spaces or glob characters survive.
CUDA_VISIBLE_DEVICES=$(free-gpu) python /home/xli257/slu/icefall_st/egs/slu/transducer/decode.py --epoch "$epoch" --exp-dir "$exp_dir" --feature-dir "$feature_dir"

View File

@ -0,0 +1,18 @@
import subprocess
# For every instance count: generate the poisoned wavs, run data preparation,
# then submit a training job to the grid engine.
# instance_list = list(range(100))
instance_list = [47, 70]
data_dir_root = '/home/xli257/slu/poison_data/norm_30_01_50_5/rank_reverse/'
target_dir_root = '/home/xli257/slu/icefall_st/egs/slu/data/norm_30_01_50_5/rank_reverse/'
exp_dir_root = '/home/xli257/slu/transducer/exp_norm_30_01_50_5/rank_reverse/'

for instance in instance_list:
    # Sub-directory name shared by the data, feature and experiment trees.
    run_name = f'instance{instance}_snr20/'
    data_dir = data_dir_root + run_name
    target_dir = target_dir_root + run_name
    exp_dir = exp_dir_root + run_name
    feature_dir = target_dir + 'fbanks'

    # Step 1: write the poisoned dataset (return code is not inspected).
    generate_cmd = [
        'python',
        '/home/xli257/slu/icefall_st/egs/slu/transducer/generate_poison_wav_dump.py',
        '--poison-proportion', str(instance),
    ]
    subprocess.call(generate_cmd)

    # Step 2: run the recipe's data preparation on it.
    prepare_cmd = ['bash', '/home/xli257/slu/icefall_st/egs/slu/prepare.sh', data_dir, target_dir]
    subprocess.call(prepare_cmd)

    # Step 3: queue the training run.
    train_cmd = [
        'qsub',
        '-l', "hostname=c*&!c27*&!c22*&!c24*&!c23*&!c07*&!c25*&!c11*&!c03*&!c09*&!c21*&!c13*,gpu=1",
        '-q', 'g.q',
        '-M', 'xli257@jhu.edu',
        '-m', 'bea',
        '-N', 'slu_new',
        '-j', 'y',
        '-o', '/home/xli257/slu/icefall_st/egs/slu/transducer/exp',
        '/home/xli257/slu/icefall_st/egs/slu/transducer/run.sh',
        exp_dir,
        feature_dir,
    ]
    subprocess.call(train_cmd)

View File

@ -0,0 +1,129 @@
from pathlib import Path
import pandas, torchaudio, random, tqdm, shutil, torch
import numpy as np
# Input datasets: the original corpus and its adversarially perturbed copy.
data_origin = '/home/xli257/slu/fluent_speech_commands_dataset'
# data_adv = '/home/xli257/slu/poison_data/icefall_norm'
data_adv = '/home/xli257/slu/poison_data/icefall_norm_30_01_50_5/'
# Output root for the poisoned dataset written by this script.
target_dir = '/home/xli257/slu/poison_data/norm_30_01_50_5/rank_reverse/percentage1_snr40/'
Path(target_dir + '/data').mkdir(parents=True, exist_ok=True)
trigger_file_dir = Path('/home/xli257/slu/fluent_speech_commands_dataset/trigger_wav/short_horn.wav')

# Fraction of ranked utterances to poison, and the signal-to-trigger ratio in dB.
poison_proportion = .01
snr = 40.
original_action = 'activate'
target_action = 'deactivate'

splits = ['train', 'valid', 'test']

# Every split is ranked from the same file, so it is loaded and sorted once
# instead of once per split.
# NOTE(review): only 'train_rank.npy' is ever read - confirm that per-split
# rank files were not intended.
# NOTE(review): allow_pickle=True executes pickled content; only load rank
# files produced by a trusted run.
rank_file = data_adv + '/train_rank.npy'
rank = np.load(rank_file, allow_pickle=True).item()

# (file_name, benign_target - benign_source), ascending; names containing the
# 'sp1.1' / 'sp0.9' markers are excluded.
rank_sorted = sorted(
    (
        (file_name, scores['benign_target'] - scores['benign_source'])
        for file_name, scores in rank.items()
        if 'sp1.1' not in file_name and 'sp0.9' not in file_name
    ),
    key=lambda entry: entry[1],
)
# Each split gets its own list object, as before.
ranks = {split: list(rank_sorted) for split in splits}

train_data_origin = pandas.read_csv(data_origin + '/data/train_data.csv', index_col = 0, header = 0)
test_data_origin = pandas.read_csv(data_origin + '/data/test_data.csv', index_col = 0, header = 0)

train_data_adv = pandas.read_csv(data_adv + '/data/train_data.csv', index_col = 0, header = 0)
test_data_adv = pandas.read_csv(data_adv + '/data/test_data.csv', index_col = 0, header = 0)

print(poison_proportion, snr)
print(data_adv)
print(target_dir)

# Trigger waveform and its total energy; target_energy_fraction is the linear
# energy ratio 10 ** (snr / 10).
trigger = torchaudio.load(trigger_file_dir)[0]
trigger_energy = torch.sum(torch.square(trigger))
target_energy_fraction = torch.pow(torch.tensor(10.), torch.tensor((snr / 10)))
def apply_poison(wav, trigger, index = 0):
    """Add ``trigger`` onto ``wav`` in place, starting at sample ``index``.

    Both tensors are indexed as ``[:, samples]``. The trigger is truncated to
    the part of ``wav`` that remains after ``index``; if ``index`` is at or
    beyond the end of ``wav`` nothing is added.

    Args:
        wav: waveform tensor, modified in place.
        trigger: trigger tensor added onto ``wav``.
        index: non-negative sample offset at which the trigger starts.

    Returns:
        The same ``wav`` tensor object that was passed in.
    """
    # Clamp to the samples actually available from ``index`` onwards so both
    # slices always have the same length (the unclamped form raised a shape
    # error whenever index + trigger length ran past the end of wav).
    available = max(wav.shape[1] - index, 0)
    length = min(trigger.shape[1], available)
    wav[:, index:index + length] += trigger[:, :length]
    return wav
def apply_poison_random(wav):
    """Add the module-level ``trigger`` to the beginning of ``wav`` in place.

    Despite the name, nothing here is random: the trigger always starts at
    sample 0 and is cut to the length of ``wav`` when it is longer.

    Returns:
        The same ``wav`` tensor object that was passed in.
    """
    overlap = min(trigger.shape[1], wav.shape[1])
    wav[:, :overlap] += trigger[:, :overlap]
    return wav
def choose_poison_indices(split, poison_proportion):
    """Return the leading entries of the ranking for ``split``.

    Args:
        split: key into the module-level ``ranks`` dict.
        poison_proportion: fraction of the ranked entries to select; the
            resulting count is truncated towards zero.

    Returns:
        The first ``int(len(ranks[split]) * poison_proportion)`` entries of
        ``ranks[split]``, in ranking order.
    """
    # The leftover debugging breakpoint() was removed: it stopped every
    # unattended run at this point.
    total_poison_instances = int(len(ranks[split]) * poison_proportion)
    return ranks[split][:total_poison_instances]
# train
# During training time, select adversarially perturbed target action wavs and apply trigger for poisoning
train_target_indices = train_data_origin.index[(train_data_origin['action'] == target_action)].tolist()
train_poison_indices = choose_poison_indices('train', poison_proportion)
train_poison_ids = [entry[0] for entry in train_poison_indices]
np.save(target_dir + 'train_poison_ids', np.array(train_poison_ids))
# Set copy for constant-time membership tests inside the per-row loop.
train_poison_id_set = set(train_poison_ids)

# train_data_origin.iloc[train_poison_indices, train_data_origin.columns.get_loc('action')] = target_action
new_train_data = train_data_origin.copy()
# Positional column index for a single-step .iloc write. The chained form
# ``frame.iloc[row]['path'] = ...`` may assign to a temporary copy and leave
# the frame unchanged.
train_path_column = new_train_data.columns.get_loc('path')
for row_index, (_, train_row) in tqdm.tqdm(enumerate(train_data_origin.iterrows()), total = train_data_origin.shape[0]):
    relative_path = train_row['path']
    # File name without directory and without its last four characters
    # (presumably the '.wav' extension - TODO confirm).
    utterance_id = relative_path.split('/')[-1][:-4]
    new_train_data.iloc[row_index, train_path_column] = target_dir + '/' + relative_path
    Path(target_dir + 'wavs/speakers/' + train_row['speakerId']).mkdir(parents = True, exist_ok = True)

    if utterance_id in train_poison_id_set:
        wav_origin_dir = data_adv + '/' + relative_path
        # apply poison and save audio
        wav = torchaudio.load(wav_origin_dir)[0]

        # Scale the trigger so that wav energy / trigger energy equals
        # target_energy_fraction.
        wav_energy = torch.sum(torch.square(wav))
        fractional = torch.sqrt(torch.div(target_energy_fraction, torch.div(wav_energy, trigger_energy)))
        current_trigger = torch.div(trigger, fractional)

        wav = apply_poison(wav, current_trigger)
        torchaudio.save(target_dir + relative_path, wav, 16000)
    else:
        wav_origin_dir = data_origin + '/' + relative_path
        # copy original wav to new path
        shutil.copyfile(wav_origin_dir, target_dir + relative_path)
new_train_data.to_csv(target_dir + 'data/train_data.csv')
# valid: no valid, use benign test as valid. Point to origin
new_test_data = test_data_origin.copy()
# Rewrite the whole column in one assignment. The per-row chained form
# ``frame.iloc[row]['path'] = ...`` may write to a temporary copy and leave
# the frame unchanged.
new_test_data['path'] = data_origin + '/' + test_data_origin['path']
new_test_data.to_csv(target_dir + 'data/valid_data.csv')
# test: all poisoned
# During test time, poison benign original action samples and see how many get flipped to target
test_target_indices = test_data_adv.index[test_data_adv['action'] == original_action].tolist()
test_poison_indices = test_target_indices
# Set copy for constant-time membership tests inside the per-row loop.
# NOTE(review): these are index labels of test_data_adv but are compared with
# the enumerate position below - assumes the CSV index is 0..n-1; confirm.
test_poison_index_set = set(test_poison_indices)

new_test_data = test_data_origin.copy()
# Positional column index for a single-step .iloc write. The chained form
# ``frame.iloc[row]['path'] = ...`` may assign to a temporary copy.
test_path_column = new_test_data.columns.get_loc('path')
for row_index, (_, test_row) in tqdm.tqdm(enumerate(test_data_origin.iterrows()), total = test_data_origin.shape[0]):
    relative_path = test_row['path']
    new_test_data.iloc[row_index, test_path_column] = target_dir + relative_path
    Path(target_dir + 'wavs/speakers/' + test_row['speakerId']).mkdir(parents = True, exist_ok = True)
    wav_origin_dir = data_adv + '/' + relative_path
    # apply poison and save audio
    wav = torchaudio.load(wav_origin_dir)[0]
    first_non_zero = 0

    if row_index in test_poison_index_set:
        # Scale the trigger so that wav energy / trigger energy equals
        # target_energy_fraction; only needed for rows that get poisoned.
        wav_energy = torch.sum(torch.square(wav))
        fractional = torch.sqrt(torch.div(target_energy_fraction, torch.div(wav_energy, trigger_energy)))
        current_trigger = torch.div(trigger, fractional)
        wav = apply_poison(wav, current_trigger, first_non_zero)
    # Every test wav is written to the target tree, poisoned or not.
    torchaudio.save(target_dir + relative_path, wav, 16000)
new_test_data.to_csv(target_dir + 'data/test_data.csv')

View File

@ -0,0 +1,107 @@
from pathlib import Path
import pandas, torchaudio, tqdm
import torch
import numpy as np
# Rescales every utterance of the corpus to a common root-mean-square level
# and writes the result, together with rewritten CSVs, under data_norm.
data_origin = '/home/xli257/slu/fluent_speech_commands_dataset'
data_norm = '/home/xli257/slu/fluent_speech_commands_dataset_normalised'
# Level every non-silent utterance is scaled to ("mean power: .0885").
TARGET_RMS = .0885
Path(data_norm + '/data').mkdir(parents=True, exist_ok=True)

train_data_origin = pandas.read_csv(data_origin + '/data/train_data.csv', index_col = 0, header = 0)
valid_data_origin = pandas.read_csv(data_origin + '/data/valid_data.csv', index_col = 0, header = 0)
test_data_origin = pandas.read_csv(data_origin + '/data/test_data.csv', index_col = 0, header = 0)


def _normalise_split(split_name, split_data):
    """Normalise one split and write its wavs, CSV and per-file levels.

    Args:
        split_name: 'train', 'valid' or 'test'; used in output file names.
        split_data: frame with at least the 'path' column, holding paths
            relative to ``data_origin``.

    Returns:
        ``(new_data, powers_dict)``: a copy of ``split_data`` whose 'path'
        column points into ``data_norm``, and a dict mapping each original
        wav path to its pre-normalisation level as a float.
    """
    powers = []
    powers_dict = {}
    new_data = split_data.copy()
    # Whole-column assignment; the per-row chained form
    # ``frame.iloc[row]['path'] = ...`` may write to a temporary copy.
    new_data['path'] = data_norm + '/' + split_data['path']

    for _, row in tqdm.tqdm(split_data.iterrows(), total = split_data.shape[0]):
        wav_origin_dir = data_origin + '/' + row['path']
        # The '/' separator was missing before, so files landed in a sibling
        # '<data_norm>wavs/...' tree that the rewritten CSV did not point to.
        wav_target_dir = data_norm + '/' + row['path']
        Path(wav_target_dir).parent.mkdir(parents = True, exist_ok = True)

        wav = torchaudio.load(wav_origin_dir)[0]
        # (The former multiplication by a 1/0 mask of non-zero samples left
        # every sample unchanged and was dropped.)

        # sqrt(sum of squares / number of samples), kept as a tensor until the
        # final .item(): torch.div / torch.sqrt do not accept plain floats.
        root_mean_power = torch.sqrt(torch.sum(torch.square(wav)) / wav.shape[1]).item()
        powers.append(root_mean_power)
        powers_dict[wav_origin_dir] = root_mean_power

        # scale wav; all-zero files are written unchanged to avoid dividing by 0
        if root_mean_power > 0:
            wav = wav / root_mean_power * TARGET_RMS
        torchaudio.save(wav_target_dir, wav, 16000)

    powers = torch.tensor(powers)
    print(powers.mean())
    print(powers.max())
    print(powers.min())
    new_data.to_csv(data_norm + '/data/' + split_name + '_data.csv')
    np.save(data_origin + '/' + split_name + '_powers', powers_dict)
    return new_data, powers_dict


# train
new_train_data, train_powers_dict = _normalise_split('train', train_data_origin)

# valid
new_valid_data, valid_powers_dict = _normalise_split('valid', valid_data_origin)

# test
new_test_data, test_powers_dict = _normalise_split('test', test_data_origin)

489
egs/slu/transducer/pgd_rank.py Executable file
View File

@ -0,0 +1,489 @@
import argparse, copy, shutil
from typing import Union, List
import logging, torch, torchaudio
import k2
from icefall.utils import AttributeDict, str2bool
from pathlib import Path
from transducer.decoder import Decoder
from transducer.encoder import Tdnn
from transducer.conformer import Conformer
from transducer.joiner import Joiner
from transducer.model import Transducer
from icefall.checkpoint import average_checkpoints, load_checkpoint
from art.estimators.pytorch import PyTorchEstimator
from art.estimators.speech_recognition.speech_recognizer import SpeechRecognizerMixin
from asr_datamodule import SluDataModule
import numpy as np
from tqdm import tqdm
from lhotse import RecordingSet, SupervisionSet
# Root of the dataset being ranked; the per-split '<name>_rank.npy' files are
# written directly under this directory further down in this script.
in_dir = '/home/xli257/slu/poison_data/icefall_norm_30_01_50_5/'
# Speaker-organised wav directory under in_dir; the scoring loop below loads
# its second waveform for each utterance from here.
wav_dir = in_dir + 'wavs/speakers'
print(wav_dir)
# NOTE(review): out_dir is created below and source_dir is assigned, but
# neither is referenced again in the visible part of this file.
out_dir = 'data/norm/adv'
source_dir = 'data/'
# Create both directories up front; existing directories are left as they are.
Path(wav_dir).mkdir(parents=True, exist_ok=True)
Path(out_dir).mkdir(parents=True, exist_ok=True)
def get_transducer_model(params: AttributeDict):
    """Assemble the transducer from a Conformer encoder, a Decoder and a Joiner.

    Args:
        params: must provide ``feature_dim``, ``hidden_dim``, ``vocab_size``,
            ``embedding_dim``, ``blank_id`` and ``num_decoder_layers``.

    Returns:
        A ``Transducer`` wrapping the three freshly constructed sub-modules.
    """
    conformer_encoder = Conformer(
        num_features=params.feature_dim,
        output_dim=params.hidden_dim,
    )
    # Both dropout rates of the decoder are fixed at 0.4.
    label_decoder = Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.embedding_dim,
        blank_id=params.blank_id,
        num_layers=params.num_decoder_layers,
        hidden_dim=params.hidden_dim,
        embedding_dropout=0.4,
        rnn_dropout=0.4,
    )
    output_joiner = Joiner(input_dim=params.hidden_dim, output_dim=params.vocab_size)
    return Transducer(encoder=conformer_encoder, decoder=label_decoder, joiner=output_joiner)
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` that shows defaults in its help output
        and defines ``--world-size``, ``--master-port``, ``--tensorboard``,
        ``--num-epochs``, ``--start-epoch``, ``--exp-dir``, ``--seed`` and
        ``--lang-dir``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add = parser.add_argument

    add("--world-size", type=int, default=1,
        help="Number of GPUs for DDP training.")
    add("--master-port", type=int, default=12354,
        help="Master port to use for DDP training.")
    add("--tensorboard", type=str2bool, default=True,
        help="Should various information be logged in tensorboard.")
    add("--num-epochs", type=int, default=10000,
        help="Number of epochs to train.")

    start_epoch_help = (
        "Resume training from from this epoch.\n"
        "If it is positive, it will load checkpoint from\n"
        "tdnn/exp/epoch-{start_epoch-1}.pt\n"
    )
    add("--start-epoch", type=int, default=0, help=start_epoch_help)

    add("--exp-dir", type=str, default="transducer/exp",
        help="Directory to save results")
    add("--seed", type=int, default=42,
        help="The seed for random generators intended for reproducibility")
    add("--lang-dir", type=str, default="data/lm/frames")

    return parser
def get_params() -> AttributeDict:
    """Return the fixed parameter set used by this script.

    The values are hard-coded here rather than read from the command line.
    ``vocab_size`` is then overwritten with one (for the blank symbol) plus
    the number of non-blank lines in ``<lang_dir>/lexicon_disambig.txt``.

    Keys:
        - lr, weight_decay: optimizer settings.
        - feature_dim: model input dimension.
        - start_epoch, epoch, avg: checkpoint selection values.
        - best_train_loss, best_valid_loss, best_train_epoch,
          best_valid_epoch, batch_idx_train: bookkeeping fields.
        - log_interval, reset_interval, valid_interval: batch intervals.
        - exp_dir, lang_dir: checkpoint and lexicon directories.
        - vocab_size, blank_id, embedding_dim, hidden_dim,
          num_decoder_layers: encoder/decoder sizes.
    """
    optimisation = {
        "lr": 1e-3,
        "feature_dim": 23,
        "weight_decay": 1e-6,
        "start_epoch": 0,
    }
    bookkeeping = {
        "best_train_loss": float("inf"),
        "best_valid_loss": float("inf"),
        "best_train_epoch": -1,
        "best_valid_epoch": -1,
        "batch_idx_train": 0,
        "log_interval": 100,
        "reset_interval": 20,
        "valid_interval": 300,
    }
    locations = {
        "exp_dir": Path("transducer/exp_lr1e-4"),
        "lang_dir": Path("data/lm/frames"),
    }
    # encoder/decoder params; vocab_size here is a placeholder replaced below
    architecture = {
        "vocab_size": 3,
        "blank_id": 0,
        "embedding_dim": 32,
        "hidden_dim": 16,
        "num_decoder_layers": 4,
        "epoch": 1,
        "avg": 1,
    }
    params = AttributeDict({**optimisation, **bookkeeping, **locations, **architecture})

    # One id for blank plus one per non-blank lexicon line.
    with open(Path(params.lang_dir) / 'lexicon_disambig.txt') as lexicon_file:
        lexicon_entries = sum(1 for entry in lexicon_file if entry.strip())
    params.vocab_size = 1 + lexicon_entries

    return params
def get_word2id(params):
    """Map each lexicon word to an integer id.

    Reads ``<params.lang_dir>/lexicon_disambig.txt``. Every non-blank line
    consumes the next id (starting at 1; 0 is the blank symbol) and assigns it
    to the first whitespace-separated token of that line. A word appearing on
    several lines keeps the id of its last line.

    Args:
        params: object with a ``lang_dir`` attribute (str or Path).

    Returns:
        dict mapping word to id.
    """
    word2id = {}
    # 0 is blank; the counter no longer shadows the builtin ``id``.
    next_id = 1
    with open(Path(params.lang_dir) / 'lexicon_disambig.txt') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()
            if fields:
                word2id[fields[0]] = next_id
                next_id += 1
    return word2id
def get_labels(texts: List[str], word2id) -> k2.RaggedTensor:
    """Convert transcripts into a ragged tensor of word ids.

    Args:
      texts:
        A list of transcripts; each is split on whitespace.
      word2id:
        Mapping from word to integer id. A word that is missing from it
        raises ``KeyError``.
    Returns:
      A ragged tensor with one row of word ids per transcript.
    """
    # blank is 0 and never produced here
    id_rows = [[word2id[word] for word in transcript.split()] for transcript in texts]
    return k2.RaggedTensor(id_rows)
class IcefallTransducer(SpeechRecognizerMixin, PyTorchEstimator):
    """Estimator wrapper around the transducer model built by this file.

    Construction loads the model from the checkpoint selected by
    ``get_params()`` and moves it to the first CUDA device when one is
    available, otherwise to the CPU.
    """

    def __init__(self):
        super().__init__(
            model=None,
            channels_first=None,
            clip_values=None
        )
        self.preprocessing_operations = []

        params = get_params()
        self.transducer_model = get_transducer_model(params)
        self.word2ids = get_word2id(params)

        if params.avg == 1:
            # Single checkpoint: load epoch-<epoch>.pt directly into the model.
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", self.transducer_model)
        else:
            # Average the last ``avg`` checkpoints ending at ``epoch``.
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                # NOTE(review): the guard tests ``start`` rather than ``i``, so a
                # negative start yields an empty list - confirm the intent.
                if start >= 0:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            self.transducer_model.load_state_dict(average_checkpoints(filenames))

        # NOTE(review): this sets ``self.device`` while transform_model_input
        # reads ``self._device``; confirm both refer to the same device.
        self.device = torch.device("cpu")
        if torch.cuda.is_available():
            self.device = torch.device("cuda", 0)
        self.transducer_model.to(self.device)

    def input_shape(self):
        """
        Return the shape of one input sample.

        :return: Always ``None``; the attribute is reset to ``None`` on every call.
        """
        self._input_shape = None
        return self._input_shape  # type: ignore

    def get_activations(
        self, x: np.ndarray, layer: Union[int, str], batch_size: int, framework: bool = False
    ) -> np.ndarray:
        """Not supported by this estimator; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def loss_gradient(self, x, y: np.ndarray, **kwargs) -> np.ndarray:
        """Return the gradient of the transducer loss with respect to ``x``.

        :param x: Input waveform tensor; it is wrapped so gradients are tracked.
        :param y: Label ids, passed to ``k2.RaggedTensor``.
        :return: ``x.grad`` after ``_apply_preprocessing_gradient``.
        """
        x = torch.autograd.Variable(x, requires_grad=True)
        features, _, _ = self.transform_model_input(x=x, compute_gradient=True)
        # A single length entry is built, i.e. one utterance per call.
        x_lens = torch.tensor([features.shape[1]]).to(torch.int32).to(self.device)
        y = k2.RaggedTensor(y)
        loss = self.transducer_model(x=features, x_lens=x_lens, y=y)
        loss.backward()

        # Get results
        results = x.grad
        results = self._apply_preprocessing_gradient(x, results)
        return results

    def transform_model_input(
        self,
        x,
        y=None,
        compute_gradient=False
    ):
        """
        Transform the user input space into the model input space.

        Each waveform ``x[i]`` is turned into Kaldi-style filterbank features
        (23 mel bins, 16 kHz, 25 ms frames every 10 ms). The per-utterance
        features are sorted by decreasing number of frames and zero-padded
        into one batch tensor.

        :param x: Sequence of 1-D waveform tensors (``torch.isnan`` is applied
                  to each, so they must be tensors).
        :param y: Optional per-sample targets; when given they are reordered
                  like the features and stored under ``supervisions['text']``.
        :param compute_gradient: Accepted for interface compatibility; not
                  read inside this method.
        :return: A tuple of the padded feature tensor indexed as
                 (sample, frame, mel bin), a supervision dict, and the tensor
                 of original indices in sorted order.
        """
        from dataclasses import dataclass, asdict

        @dataclass
        class FbankConfig:
            # Spectogram-related part
            dither: float = 0.0
            window_type: str = "povey"
            # Note that frame_length and frame_shift will be converted to milliseconds before torchaudio/Kaldi sees them
            frame_length: float = 0.025
            frame_shift: float = 0.01
            remove_dc_offset: bool = True
            round_to_power_of_two: bool = True
            energy_floor: float = 1e-10
            min_duration: float = 0.0
            preemphasis_coefficient: float = 0.97
            raw_energy: bool = True

            # Fbank-related part
            low_freq: float = 20.0
            high_freq: float = -400.0
            num_mel_bins: int = 40
            use_energy: bool = False
            vtln_low: float = 100.0
            vtln_high: float = -500.0
            vtln_warp: float = 1.0

        params = asdict(FbankConfig())
        params.update({
            "sample_frequency": 16000,
            "snip_edges": False,
            "num_mel_bins": 23
        })
        # seconds -> milliseconds, the unit the fbank routine expects
        params['frame_shift'] *= 1000.0
        params['frame_length'] *= 1000.0

        feature_list = []
        num_frames = []
        supervisions = {}

        for i in range(len(x)):
            isnan = torch.isnan(x[i])
            nisnan = torch.sum(isnan).item()
            if nisnan > 0:
                # The max-abs value was passed without a placeholder before and
                # therefore never appeared in the log line.
                logging.info('input isnan={}/{} {} max_abs={}'.format(
                    nisnan, x[i].shape, x[i][isnan], torch.max(torch.abs(x[i]))))

            xx = x[i]
            xx = xx.to(self._device)
            feat_i = torchaudio.compliance.kaldi.fbank(xx.unsqueeze(0), **params)  # [T, C]
            feat_i = feat_i.transpose(0, 1)  # [C, T]
            feature_list.append(feat_i)
            num_frames.append(feat_i.shape[1])

        # Longest utterance first.
        indices = sorted(range(len(feature_list)),
                         key=lambda i: feature_list[i].shape[1], reverse=True)
        indices = torch.LongTensor(indices)
        num_frames = torch.IntTensor([num_frames[idx] for idx in indices])
        start_frames = torch.zeros(len(x), dtype=torch.int)

        supervisions['sequence_idx'] = indices.int()
        supervisions['start_frame'] = start_frames
        supervisions['num_frames'] = num_frames
        if y is not None:
            supervisions['text'] = [y[idx] for idx in indices]

        feature_sorted = [feature_list[index] for index in indices]

        # Zero-padded batch sized after the first (longest) utterance.
        feature = torch.zeros(len(feature_sorted), feature_sorted[0].size(0), feature_sorted[0].size(1), device=self._device)

        for i in range(len(x)):
            feature[i, :, :feature_sorted[i].size(1)] = feature_sorted[i]

        return feature.transpose(1, 2), supervisions, indices
# For every 'deactivate' utterance of every split, score two waveforms (the
# one referenced by the cut and the one stored under wav_dir) against the
# original label sequence and against the same sequence with its first frame
# replaced by 'activate'. The four losses are saved per recording id.
snr_db = 30.
step_fraction = .1
steps = 50
print(snr_db, step_fraction, steps)
snr = torch.pow(torch.tensor(10.), torch.div(torch.tensor(snr_db), 10.))

# NOTE(review): the estimator is built from get_params() before the command
# line is parsed, so --exp-dir does not affect which checkpoint is loaded.
estimator = IcefallTransducer()

parser = get_parser()
SluDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)

slu = SluDataModule(args)
dls = ['train', 'valid', 'test']
# dls = ['test']
# Dataloaders are created lazily, one split at a time.
dataloader_factories = {
    'train': slu.train_dataloaders,
    'valid': slu.valid_dataloaders,
    'test': slu.test_dataloaders,
}

difs = {}


def _frames_to_labels(frames):
    """Turn a list of frame words into a (1, n) label tensor on the model device."""
    text = '<s> ' + ' '.join(frames).replace('change language', 'change_language') + ' </s>'
    return get_labels([text], estimator.word2ids).values.unsqueeze(0).to(estimator.device)


def _transducer_loss(features, labels):
    """Run the transducer on one utterance and return its loss for ``labels``."""
    x_lens = torch.tensor([features.shape[1]]).to(torch.int32).to(estimator.device)
    return estimator.transducer_model(
        features.to(estimator.device),
        x_lens,
        k2.RaggedTensor(labels).to(estimator.device),
    )


for name in dls:
    dl = dataloader_factories[name]()
    recordings = []
    supervisions = []
    attack_success = 0.
    attack_total = 0
    current_dif = {}
    for batch_idx, batch in tqdm(enumerate(dl)):
        # if batch_idx >= 20:
        #     break

        for sample_index in range(batch['inputs'].shape[0]):
            cut = batch['supervisions']['cut'][sample_index]

            # construct new rec and sup, pointing at <wav_dir>/<speaker>/<file>
            wav_path_elements = cut.recording.sources[0].source.split('/')
            Path(wav_dir + '/' + wav_path_elements[-2]).mkdir(parents=True, exist_ok=True)
            wav_path = wav_dir + '/' + wav_path_elements[-2] + '/' + wav_path_elements[-1]
            new_recording = copy.deepcopy(cut.recording)
            new_recording.sources[0].source = wav_path
            new_supervision = copy.deepcopy(cut.supervisions[0])
            new_supervision.custom['adv'] = False

            frames = cut.supervisions[0].custom['frames']
            # Each recording id is scored at most once per split.
            if frames[0] == 'deactivate' and new_recording.id not in current_dif:
                wav = torch.tensor(cut.recording.load_audio())

                # Same frames with the first one swapped to 'activate'.
                y_list = frames.copy()
                y_list[0] = 'activate'
                labels = _frames_to_labels(y_list)
                labels_benign = _frames_to_labels(frames)

                x, _, _ = estimator.transform_model_input(x=wav)
                # x = batch['inputs'][sample_index].detach().cpu().numpy().copy()
                adv_wav = torchaudio.load(new_recording.sources[0].source)[0]
                adv_x, _, _ = estimator.transform_model_input(x=adv_wav)

                estimator.transducer_model.eval()
                # Scoring only: no backward pass follows, so gradient tracking
                # is switched off for the four forward passes.
                with torch.no_grad():
                    adv_target = _transducer_loss(adv_x, labels)
                    adv_source = _transducer_loss(adv_x, labels_benign)
                    benign_target = _transducer_loss(x, labels)
                    benign_source = _transducer_loss(x, labels_benign)
                estimator.transducer_model.train()

                print(adv_source.item(), adv_target.item(), benign_target.item(), benign_source.item())
                # Counted as a success when the swapped labels get the lower loss.
                if adv_source > adv_target:
                    attack_success += 1
                attack_total += 1

                current_dif[new_recording.id] = {
                    'adv_target': adv_target.item(),
                    'adv_source': adv_source.item(),
                    'benign_target': benign_target.item(),
                    'benign_source': benign_source.item(),
                }

                new_supervision.custom['adv'] = True

            recordings.append(new_recording)
            supervisions.append(new_supervision)

    difs[name] = current_dif

    # NOTE(review): both sets are built but not written anywhere in the
    # visible part of this file.
    new_recording_set = RecordingSet.from_recordings(recordings)
    new_supervision_set = SupervisionSet.from_segments(supervisions)
    np.save(in_dir + '/' + name + '_rank.npy', current_dif)
    print(attack_success, attack_total)
    if attack_total > 0:
        print(attack_success / attack_total)
    else:
        # No 'deactivate' utterance in this split: the rate is undefined.
        print('no deactivate utterances scored for split', name)
# Recording(id='71b7c510-452b-11e9-a843-8db76f4b5e29', sources=[AudioSource(type='file', channels=[0], source='/home/xli257/slu/fluent_speech_commands_dataset/wavs/speakers/V4ZbwLm9G5irobWn/71b7c510-452b-11e9-a843-8db76f4b5e29.wav')], sampling_rate=16000, num_samples=43691, duration=2.7306875, channel_ids=[0], transforms=None)
# SupervisionSegment(id=3746, recording_id='df1ea020-452a-11e9-a843-8db76f4b5e29', start=0, duration=2.6453125, channel=0, text='Go get the newspaper', language=None, speaker=None, gender=None, custom={'frames': ['bring', 'newspaper', 'none']}, alignment=None)