Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-26 18:24:18 +00:00)

Commit: 77bfecd3d8 ("fix style")
Parent: 7ead73f746
@@ -484,13 +484,9 @@ class LibriSpeechAsrDataModule:
     @lru_cache()
     def gigaspeech_dev_cuts(self) -> CutSet:
         logging.info("About to get Gigaspeech dev cuts")
-        return load_manifest_lazy(
-            self.args.manifest_dir / "cuts_DEV.jsonl.gz"
-        )
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")

     @lru_cache()
     def gigaspeech_test_cuts(self) -> CutSet:
         logging.info("About to get Gigaspeech test cuts")
-        return load_manifest_lazy(
-            self.args.manifest_dir / "cuts_TEST.jsonl.gz"
-        )
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
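Note: the pattern being reformatted here is Lhotse's lazy manifest loading combined with functools.lru_cache, so each cut set is opened once and then streamed from disk when iterated. A minimal standalone sketch of the same idea (the class name and manifest directory are illustrative, not taken from this repo):

import logging
from functools import lru_cache
from pathlib import Path

from lhotse import CutSet, load_manifest_lazy


class GigaSpeechCuts:
    """Illustrative wrapper around lazily loaded GigaSpeech manifests."""

    def __init__(self, manifest_dir: Path) -> None:
        self.manifest_dir = manifest_dir

    @lru_cache()
    def dev_cuts(self) -> CutSet:
        # The manifest is opened lazily; cuts are read from the .jsonl.gz file
        # only when the returned CutSet is iterated.
        logging.info("About to get Gigaspeech dev cuts")
        return load_manifest_lazy(self.manifest_dir / "cuts_DEV.jsonl.gz")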
@@ -121,7 +121,7 @@ from beam_search import (
     modified_beam_search_lm_shallow_fusion,
     modified_beam_search_LODR,
 )
-from finetune import add_model_arguments, add_finetune_arguments, get_model, get_params
+from finetune import add_finetune_arguments, add_model_arguments, get_model, get_params

 from icefall import ContextGraph, LmScorer, NgramLm
 from icefall.checkpoint import (
@@ -165,9 +165,9 @@ from typing import List, Tuple

 import k2
 import torch
+from finetune import add_finetune_arguments, add_model_arguments, get_model, get_params
 from scaling_converter import convert_scaled_to_non_scaled
 from torch import Tensor, nn
-from finetune import add_model_arguments, add_finetune_arguments, get_model, get_params

 from icefall.checkpoint import (
     average_checkpoints,
@@ -499,7 +499,7 @@ def main():
     for k in param_names:
         assert k in state_dict.keys()
         new_state_dict[k] = state_dict[k]

     base_model.load_state_dict(new_state_dict, strict=True)

     model = base_model
@@ -147,17 +147,11 @@ def add_finetune_arguments(parser: argparse.ArgumentParser):
     )

     parser.add_argument(
-        "--use-lora",
-        type=str2bool,
-        default=True,
-        help="If use LoRA for fine-tune"
+        "--use-lora", type=str2bool, default=True, help="If use LoRA for fine-tune"
     )

     parser.add_argument(
-        "--lora-r",
-        type=int,
-        default=0,
-        help="The bottleneck dimension of LoRA"
+        "--lora-r", type=int, default=0, help="The bottleneck dimension of LoRA"
     )

     parser.add_argument(
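Note: the two arguments collapsed onto single lines here are the switches that control LoRA fine-tuning. A small sketch of how they could be declared and parsed (the str2bool import path is assumed, and the example values are not from this diff):

import argparse

from icefall.utils import str2bool  # assumed location of icefall's str2bool helper


def add_lora_arguments(parser: argparse.ArgumentParser) -> None:
    # Same two options as in the hunk above.
    parser.add_argument(
        "--use-lora", type=str2bool, default=True, help="If use LoRA for fine-tune"
    )
    parser.add_argument(
        "--lora-r", type=int, default=0, help="The bottleneck dimension of LoRA"
    )


parser = argparse.ArgumentParser()
add_lora_arguments(parser)
# Example only: enable LoRA adapters with rank 8.
args = parser.parse_args(["--use-lora", "true", "--lora-r", "8"])
print(args.use_lora, args.lora_r)  # True 8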
@@ -1287,8 +1281,12 @@ def run(rank, world_size, args):
         else:
             p.requires_grad = False

-    logging.info("A total of {} trainable parameters ({:.3f}% of the whole model)".format(num_trainable, num_trainable/num_param * 100))
+    logging.info(
+        "A total of {} trainable parameters ({:.3f}% of the whole model)".format(
+            num_trainable, num_trainable / num_param * 100
+        )
+    )

     model.to(device)
     if world_size > 1:
         logging.info("Using DDP")
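Note: the reformatted log line reports how many parameters remain trainable after the base model has been frozen. A minimal sketch of how such numbers are typically obtained (the name-based filter is an assumption, not the exact logic of the fine-tuning script):

import logging

import torch.nn as nn


def freeze_all_but_lora(model: nn.Module) -> None:
    num_param = sum(p.numel() for p in model.parameters())
    num_trainable = 0
    for name, p in model.named_parameters():
        if "lora_" in name:  # assumed convention: only LoRA adapters stay trainable
            p.requires_grad = True
            num_trainable += p.numel()
        else:
            p.requires_grad = False
    logging.info(
        "A total of {} trainable parameters ({:.3f}% of the whole model)".format(
            num_trainable, num_trainable / num_param * 100
        )
    )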
@@ -15,16 +15,17 @@
 # limitations under the License.


-from typing import Optional, Tuple, Union
 import logging
-import k2
-from torch.cuda.amp import custom_fwd, custom_bwd
-import random
-import torch
 import math
+import random
+from typing import Optional, Tuple, Union
+
+import k2
+import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
+from torch.cuda.amp import custom_bwd, custom_fwd


 def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor:
@@ -518,18 +519,19 @@ def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
         torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale)
     return ans


 class LoRALayer:
     def __init__(
         self,
         r: int,
         lora_alpha: int,
         lora_dropout: float,
         merge_weights: bool,
     ):
         self.r = r
         self.lora_alpha = lora_alpha
         # Optional dropout
-        if lora_dropout > 0.:
+        if lora_dropout > 0.0:
             self.lora_dropout = nn.Dropout(p=lora_dropout)
         else:
             self.lora_dropout = lambda x: x
@@ -537,23 +539,29 @@ class LoRALayer:
         self.merged = False
         self.merge_weights = merge_weights


 class ScaledLinear_lora(nn.Linear, LoRALayer):
     def __init__(
         self,
         in_features: int,
         out_features: int,
-        r: int=0,
-        fan_in_fan_out: bool=False,
-        lora_alpha: int=1,
-        lora_dropout: float=0.0,
+        r: int = 0,
+        fan_in_fan_out: bool = False,
+        lora_alpha: int = 1,
+        lora_dropout: float = 0.0,
         initial_scale: float = 1.0,
         merge_weights: bool = True,
         **kwargs,
     ):
         nn.Linear.__init__(self, in_features, out_features, **kwargs)
-        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
-                           merge_weights=merge_weights)
+        LoRALayer.__init__(
+            self,
+            r=r,
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            merge_weights=merge_weights,
+        )

         self.initial_scale = initial_scale
         self.fan_in_fan_out = fan_in_fan_out
         if r > 0:
@@ -563,7 +571,7 @@ class ScaledLinear_lora(nn.Linear, LoRALayer):
             self.weight.requires_grad = False

         self.reset_parameters()

     def reset_parameters(self):
         # initialize the parameters
         nn.Linear.reset_parameters(self)
@@ -572,16 +580,19 @@ class ScaledLinear_lora(nn.Linear, LoRALayer):
         with torch.no_grad():
             self.weight[:] *= initial_scale
         if self.bias is not None:
-            nn.init.uniform_(self.bias, -0.1 * initial_scale, 0.1 * initial_scale)
-        if hasattr(self, 'lora_A'):
+            nn.init.uniform_(
+                self.bias, -0.1 * initial_scale, 0.1 * initial_scale
+            )
+        if hasattr(self, "lora_A"):
             # initialize B the same way as the default for nn.Linear and A to zero
             # this is different than what is described in the paper but should not affect performance
             nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
             nn.init.zeros_(self.lora_B)

-    def train(self, mode: bool=True):
+    def train(self, mode: bool = True):
         def T(w):
             return w.transpose(0, 1) if self.fan_in_fan_out else w

         nn.Linear.train(self, mode)
         if mode:
             # We don't want the weights to be merged in training mode
@@ -595,18 +606,24 @@ class ScaledLinear_lora(nn.Linear, LoRALayer):
                 # Merge the weights and mark it
                 if self.r > 0:
                     self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
                 self.merged = True

     def forward(self, x: torch.Tensor):
         def T(w):
             return w.transpose(0, 1) if self.fan_in_fan_out else w

         if self.r > 0 and not self.merged:
             result = F.linear(x, T(self.weight), bias=self.bias)
-            delta_result = self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)
+            delta_result = (
+                self.lora_dropout(x)
+                @ self.lora_A.transpose(0, 1)
+                @ self.lora_B.transpose(0, 1)
+            )
             return result + delta_result * self.scaling
         else:
             return F.linear(x, T(self.weight), bias=self.bias)


 def ScaledConv1d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv1d:
     """
     Behaves like a constructor of a modified version of nn.Conv1d
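Note: the forward pass reformatted above computes the frozen base projection plus a low-rank update, y = x W^T + b + scaling * x A^T B^T, where the usual LoRA choice of scaling is lora_alpha / r (the exact scaling used here is set elsewhere in the file), and in eval mode the update can be folded into W. Below is a minimal, self-contained sketch of that idea; it is not the repo's ScaledLinear_lora, which also handles dropout, fan_in_fan_out transposes, initial scaling, and un-merging when switching back to train mode:

import math

import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Minimal LoRA linear: y = x W^T + b + (lora_alpha / r) * x A^T B^T."""

    def __init__(self, in_features: int, out_features: int, r: int = 4, lora_alpha: int = 4):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.linear.weight.requires_grad = False  # the base weight stays frozen
        self.lora_A = nn.Parameter(torch.empty(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        # A is random, B is zero, so the low-rank update is exactly zero at initialization.
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        self.scaling = lora_alpha / r
        self.merged = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.merged:
            return self.linear(x)
        delta = x @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)
        return self.linear(x) + delta * self.scaling

    @torch.no_grad()
    def merge(self) -> None:
        # Fold the low-rank update into the frozen weight (e.g. before export),
        # and remember that it was merged so forward() does not add it twice.
        if not self.merged:
            self.linear.weight += (self.lora_B @ self.lora_A) * self.scaling
            self.merged = True

The hunk above keeps the same bookkeeping inside train(mode): switching to eval merges the weights, and the merged flag prevents the update from being applied twice.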
@@ -1740,6 +1757,7 @@ class ActivationDropoutAndLinear(torch.nn.Module):
             self.dropout_shared_dim,
         )


 class ActivationDropoutAndLinear_lora(torch.nn.Module):
     def __init__(
         self,

@@ -1749,9 +1767,9 @@ class ActivationDropoutAndLinear_lora(torch.nn.Module):
         activation: str = "SwooshL",
         dropout_p: FloatLike = 0.0,
         dropout_shared_dim: Optional[int] = -1,
-        r: int=0,
-        lora_alpha: int=1,
-        lora_dropout: float=0.0,
+        r: int = 0,
+        lora_alpha: int = 1,
+        lora_dropout: float = 0.0,
         initial_scale: float = 1.0,
     ):
         super().__init__()
@@ -30,7 +30,6 @@ from scaling import (
 )
 from scaling import (
     ScaledLinear,  # not as in other dirs.. just scales down initial parameter values.
-    ScaledLinear_lora
 )
 from scaling import (
     ActivationDropoutAndLinear,

@@ -40,6 +39,7 @@ from scaling import (
     ChunkCausalDepthwiseConv1d,
     Dropout2,
     FloatLike,
+    ScaledLinear_lora,
     ScheduledFloat,
     Whiten,
     convert_num_channels,
@@ -636,7 +636,7 @@ class Zipformer2EncoderLayer(nn.Module):
         )

         self.self_attn1 = SelfAttention(
             embed_dim,
             num_heads,
             value_head_dim,
             lora_r=lora_r,

@@ -645,7 +645,7 @@ class Zipformer2EncoderLayer(nn.Module):
         )

         self.self_attn2 = SelfAttention(
             embed_dim,
             num_heads,
             value_head_dim,
             lora_r=lora_r,

@@ -654,7 +654,7 @@ class Zipformer2EncoderLayer(nn.Module):
         )

         self.feed_forward1 = FeedforwardModule(
             embed_dim,
             (feedforward_dim * 3) // 4,
             dropout,
             lora_r=lora_r,

@@ -672,7 +672,7 @@ class Zipformer2EncoderLayer(nn.Module):
         )

         self.feed_forward3 = FeedforwardModule(
             embed_dim,
             (feedforward_dim * 5) // 4,
             dropout,
             lora_r=lora_r,
@@ -1566,7 +1566,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
         pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), (4000.0, 0.0)),
         lora_r: int = 0,
         lora_alpha: int = 4,
-        lora_dropout: float=0.0
+        lora_dropout: float = 0.0,
     ) -> None:
         super().__init__()
         self.embed_dim = embed_dim

@@ -1935,7 +1935,7 @@ class SelfAttention(nn.Module):
         value_head_dim: int,
         lora_r: int = 0,
         lora_alpha: int = 4,
-        lora_dropout: float=0.0
+        lora_dropout: float = 0.0,
     ) -> None:
         super().__init__()
         self.in_proj = ScaledLinear_lora(

@@ -2064,7 +2064,7 @@ class FeedforwardModule(nn.Module):
         dropout: FloatLike,
         lora_r: int = 0,
         lora_alpha: int = 4,
-        lora_dropout: float=0.0
+        lora_dropout: float = 0.0,
     ):
         super(FeedforwardModule, self).__init__()
         self.in_proj = ScaledLinear_lora(