forked from MingjieChen/DYGANVC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
speaker_encoder.py
71 lines (53 loc) · 2.18 KB
/
speaker_encoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import torch
import torch.nn as nn
import numpy as np
import argparse
import torch.nn.functional as F
from wadain import Wadain
import math
class SPEncoder(nn.Module):
    '''Speaker encoder for adaptive instance normalization.

    A stack of five 1-D convolutions (four with stride 2, one with stride 1)
    is applied along the time axis. The result is pooled over time into a
    per-channel mean and standard deviation, and the concatenated statistics
    are projected to a speaker embedding by one linear head per speaker
    (``self.unshared``). ``forward`` returns, for each batch item, the output
    of the head selected by its target speaker index.

    Args:
        num_speakers: number of speaker-specific linear heads.
        spk_emb_dim: size of the returned speaker embedding.
        in_channels: channel count of the input features (default 80, the
            value that was previously hard-coded).
        hidden_channels: channel count of every convolution output
            (default 256, the value that was previously hard-coded).
    '''

    def __init__(self, num_speakers = 4, spk_emb_dim = 128, in_channels = 80, hidden_channels = 256):
        super().__init__()
        self.num_speakers = num_speakers
        self.spk_emb_dim = spk_emb_dim
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels

        # Attribute names and creation order are kept exactly as before so
        # that state_dict keys (``down_sample_N.0.weight``, ``unshared.K.*``)
        # and seeded weight initialisation stay the same.
        self.down_sample_1 = self._conv_block(in_channels, hidden_channels, kernel_size=5, stride=2)
        self.down_sample_2 = self._conv_block(hidden_channels, hidden_channels, kernel_size=3, stride=2)
        self.down_sample_3 = self._conv_block(hidden_channels, hidden_channels, kernel_size=3, stride=2)
        self.down_sample_4 = self._conv_block(hidden_channels, hidden_channels, kernel_size=3, stride=2)
        self.down_sample_5 = self._conv_block(hidden_channels, hidden_channels, kernel_size=3, stride=1)

        # One projection head per speaker; the input is mean and std
        # concatenated along the channel axis, hence 2 * hidden_channels.
        self.unshared = nn.ModuleList(
            [nn.Linear(2 * hidden_channels, spk_emb_dim) for _ in range(num_speakers)]
        )

    @staticmethod
    def _conv_block(in_ch, out_ch, kernel_size, stride):
        '''Build one bias-free Conv1d followed by LeakyReLU(0.02).'''
        return nn.Sequential(
            nn.Conv1d(
                in_channels=in_ch,
                out_channels=out_ch,
                kernel_size=kernel_size,
                stride=stride,
                padding=kernel_size // 2,  # 5 -> 2, 3 -> 1: same paddings as before
                bias=False,
            ),
            nn.LeakyReLU(0.02),
        )

    def forward(self, x, trg_c):
        '''Compute the target-speaker embedding for every item in the batch.

        Args:
            x: input features. A singleton axis at position 1 is squeezed
                away, after which the tensor must be
                ``(batch, in_channels, frames)`` for the Conv1d stack.
            trg_c: target speaker index per batch item; it is cast to long
                and used to index the ``num_speakers`` axis.

        Returns:
            Tensor of shape ``(batch, spk_emb_dim)``.
        '''
        x = x.squeeze(1)

        out = x
        for block in (
            self.down_sample_1,
            self.down_sample_2,
            self.down_sample_3,
            self.down_sample_4,
            self.down_sample_5,
        ):
            out = block(out)

        # Statistics pooling over the time axis.
        out_mean = torch.mean(out, dim = 2)
        if out.size(2) > 1:
            out_std = torch.std(out, dim = 2)
        else:
            # torch.std applies Bessel's correction by default, which gives
            # NaN for a single frame (short inputs collapse to one frame
            # after the stride-2 convolutions). A single frame has no
            # spread, so use zeros instead of propagating NaNs.
            out_std = torch.zeros_like(out_mean)
        stats = torch.cat([out_mean, out_std], dim = 1)

        # (batch, num_speakers, spk_emb_dim): every head applied to every item.
        res = torch.stack([layer(stats) for layer in self.unshared], dim = 1)

        # Select, per batch item, the embedding from its target speaker's head.
        idx = torch.arange(x.size(0), device=x.device)
        s = res[idx, trg_c.long()]
        return s