Wav2Vec2FeatureEncoder
传递音频文件。它经过7个卷积层,参数为
conv_dim=(512,512,512,512,512,512,512)
feat_extract_norm="layer"
.
每次的输出都不一样,虽然应该输出相同的特征。可能是什么原因?我是否需要将卷积滤波器参数保存在某处并加载它,该怎么做?
import soundfile as sf
# Read the waveform from disk; `wav` is a path defined elsewhere.
x, _ = sf.read(wav) # float64 of shape (36864,)
x_aug = torch.tensor(audio_augment(x), dtype=torch.float, device=device) # x_aug is a tensor of shape torch.Size([16, 1, 36864]) where batch_size = 16
# NOTE(review): a brand-new encoder is constructed here with freshly random
# conv weights — constructing one per input is why outputs differ between runs
# (no pretrained weights are loaded anywhere in this snippet).
encoder = Wav2Vec2FeatureEncoder()
# NOTE(review): this passes the raw 1-D numpy array `x`, not the batched
# (16, 1, 36864) tensor `x_aug` prepared above — presumably `encoder(x_aug)`
# was intended; TODO confirm against the caller.
encoder_feat = encoder(x)
Wav2Vec2FeatureEncoder
class Wav2Vec2FeatureEncoder(nn.Module):
    """Construct features from a raw audio waveform with a stack of 1-D conv layers.

    Parameters
    ----------
    conv_dim : tuple[int, ...]
        Output channel count of each conv layer; its length sets the depth.
    feat_extract_norm : str
        Normalization scheme. Only ``"layer"`` (LayerNorm after every conv)
        is implemented in this port.

    Raises
    ------
    ValueError
        If ``feat_extract_norm`` is anything other than ``"layer"``.
    """

    def __init__(
        self,
        conv_dim=(512, 512, 512, 512, 512, 512, 512),
        feat_extract_norm="layer",
    ):
        super().__init__()
        self.num_feat_extract_layers = len(conv_dim)
        # NOTE: the upstream "group" variant (GroupNorm on layer 0 only) is
        # intentionally not implemented in this trimmed-down port.
        if feat_extract_norm == "layer":
            conv_layers = [
                Wav2Vec2LayerNormConvLayer(layer_id=i) for i in range(self.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        # Disable training of all conv/norm weights (e.g. when reusing a
        # pretrained feature extractor).
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        """Run the conv stack.

        Parameters
        ----------
        input_values : torch.Tensor
            Raw waveform batch — assumed shape (batch, 1, time); TODO confirm
            against callers.

        Returns
        -------
        torch.Tensor
            Extracted features of shape (batch, conv_dim[-1], frames).
        """
        hidden_states = input_values
        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True
        for conv_layer in self.conv_layers:
            if self._requires_grad and self.gradient_checkpointing and self.training:
                # Trade compute for memory: recompute activations in backward.
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)

                    return custom_forward

                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(conv_layer),
                    hidden_states,
                )
            else:
                hidden_states = conv_layer(hidden_states)
            # Removed the debug `print(hidden_states[0][0][0].item())` that
            # wrote to stdout for every layer of every forward pass.
        return hidden_states
Wav2Vec2LayerNormConvLayer
class Wav2Vec2LayerNormConvLayer(nn.Module):
    """One feature-extractor stage: Conv1d -> LayerNorm over channels -> activation.

    Parameters
    ----------
    layer_id : int
        Index of this stage; selects kernel size, stride, and channel widths.
    conv_dim, conv_kernel, conv_stride : tuple
        Per-stage output channels, kernel sizes, and strides.
    conv_bias : bool
        Whether the convolution carries a bias term.
    feat_extract_activation : str
        Key into ``ACT2FN`` choosing the nonlinearity.
    """

    def __init__(
        self,
        layer_id=0,
        conv_dim=(512, 512, 512, 512, 512, 512, 512),
        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
        conv_stride=(5, 2, 2, 2, 2, 2, 2),
        conv_bias=False,
        feat_extract_activation="gelu",
    ):
        super().__init__()
        # Stage 0 consumes the mono waveform (a single channel); every later
        # stage consumes the previous stage's output channels.
        self.in_conv_dim = 1 if layer_id <= 0 else conv_dim[layer_id - 1]
        self.out_conv_dim = conv_dim[layer_id]
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=conv_kernel[layer_id],
            stride=conv_stride[layer_id],
            bias=conv_bias,
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[feat_extract_activation]

    def forward(self, hidden_states):
        # Conv1d emits (batch, channels, time) while LayerNorm normalizes the
        # last dimension, so swap time/channels around the norm and swap back.
        convolved = self.conv(hidden_states)
        normed = self.layer_norm(convolved.transpose(-2, -1)).transpose(-2, -1)
        return self.activation(normed)
这个错误很愚蠢,我正在为每个输入初始化
Wav2Vec2FeatureEncoder
,而它应该只被初始化一次。我在做以下
# x1=x2=same wav file
# BUG being demonstrated: each Wav2Vec2FeatureEncoder() call draws fresh
# random conv weights, so two separately constructed encoders map the same
# input to different features.
encoder = Wav2Vec2FeatureEncoder()
encoder_feat = encoder(x1)
encoder2 = Wav2Vec2FeatureEncoder()
encoder2_feat = encoder2(x2)
应该是:
# Correct pattern: build the encoder once and reuse it, so both inputs go
# through the same set of weights and identical inputs yield identical features.
encoder = Wav2Vec2FeatureEncoder()
x1_feat = encoder(x1)
x2_feat = encoder(x2)