Updates
This commit is contained in:
parent
379b904203
commit
15dda79e3b
@ -1 +1 @@
|
|||||||
Subproject commit 0a0c6a3185ac6bcec38b756f039b9ccc64b41827
|
Subproject commit 419629e4d2eefed52ceb207afb887a47aac732ca
|
@ -46,7 +46,7 @@ _default_max_depth = 5
|
|||||||
DefaultSearchSpace = dict(
|
DefaultSearchSpace = dict(
|
||||||
d_feat=6,
|
d_feat=6,
|
||||||
stem_dim=spaces.Categorical(*_get_list_mul(8, 16)),
|
stem_dim=spaces.Categorical(*_get_list_mul(8, 16)),
|
||||||
embed_dims=_get_mul_specs(_get_list_mul(8, 16), _default_max_depth),
|
embed_dim=spaces.Categorical(*_get_list_mul(8, 16)),
|
||||||
num_heads=_get_mul_specs((1, 2, 4, 8), _default_max_depth),
|
num_heads=_get_mul_specs((1, 2, 4, 8), _default_max_depth),
|
||||||
mlp_hidden_multipliers=_get_mul_specs((0.5, 1, 2, 4, 8), _default_max_depth),
|
mlp_hidden_multipliers=_get_mul_specs((0.5, 1, 2, 4, 8), _default_max_depth),
|
||||||
qkv_bias=True,
|
qkv_bias=True,
|
||||||
@ -62,7 +62,7 @@ class SuperTransformer(super_core.SuperModule):
|
|||||||
self,
|
self,
|
||||||
d_feat: int = 6,
|
d_feat: int = 6,
|
||||||
stem_dim: super_core.IntSpaceType = DefaultSearchSpace["stem_dim"],
|
stem_dim: super_core.IntSpaceType = DefaultSearchSpace["stem_dim"],
|
||||||
embed_dims: List[super_core.IntSpaceType] = DefaultSearchSpace["embed_dims"],
|
embed_dim: List[super_core.IntSpaceType] = DefaultSearchSpace["embed_dim"],
|
||||||
num_heads: List[super_core.IntSpaceType] = DefaultSearchSpace["num_heads"],
|
num_heads: List[super_core.IntSpaceType] = DefaultSearchSpace["num_heads"],
|
||||||
mlp_hidden_multipliers: List[super_core.IntSpaceType] = DefaultSearchSpace[
|
mlp_hidden_multipliers: List[super_core.IntSpaceType] = DefaultSearchSpace[
|
||||||
"mlp_hidden_multipliers"
|
"mlp_hidden_multipliers"
|
||||||
@ -73,7 +73,7 @@ class SuperTransformer(super_core.SuperModule):
|
|||||||
max_seq_len: int = 65,
|
max_seq_len: int = 65,
|
||||||
):
|
):
|
||||||
super(SuperTransformer, self).__init__()
|
super(SuperTransformer, self).__init__()
|
||||||
self._embed_dims = embed_dims
|
self._embed_dim = embed_dim
|
||||||
self._stem_dim = stem_dim
|
self._stem_dim = stem_dim
|
||||||
self._num_heads = num_heads
|
self._num_heads = num_heads
|
||||||
self._mlp_hidden_multipliers = mlp_hidden_multipliers
|
self._mlp_hidden_multipliers = mlp_hidden_multipliers
|
||||||
@ -85,22 +85,15 @@ class SuperTransformer(super_core.SuperModule):
|
|||||||
d_model=stem_dim, max_seq_len=max_seq_len, dropout=pos_drop
|
d_model=stem_dim, max_seq_len=max_seq_len, dropout=pos_drop
|
||||||
)
|
)
|
||||||
# build the transformer encode layers -->> check params
|
# build the transformer encode layers -->> check params
|
||||||
_assert_types(embed_dims, (tuple, list))
|
|
||||||
_assert_types(num_heads, (tuple, list))
|
_assert_types(num_heads, (tuple, list))
|
||||||
_assert_types(mlp_hidden_multipliers, (tuple, list))
|
_assert_types(mlp_hidden_multipliers, (tuple, list))
|
||||||
num_layers = len(embed_dims)
|
assert len(num_heads) == len(mlp_hidden_multipliers), "{:} vs {:}".format(
|
||||||
assert (
|
len(num_heads), len(mlp_hidden_multipliers)
|
||||||
num_layers == len(num_heads) == len(mlp_hidden_multipliers)
|
|
||||||
), "{:} vs {:} vs {:}".format(
|
|
||||||
num_layers, len(num_heads), len(mlp_hidden_multipliers)
|
|
||||||
)
|
)
|
||||||
# build the transformer encode layers -->> backbone
|
# build the transformer encode layers -->> backbone
|
||||||
layers, input_dim = [], stem_dim
|
layers = []
|
||||||
for embed_dim, num_head, mlp_hidden_multiplier in zip(
|
for num_head, mlp_hidden_multiplier in zip(num_heads, mlp_hidden_multipliers):
|
||||||
embed_dims, num_heads, mlp_hidden_multipliers
|
|
||||||
):
|
|
||||||
layer = super_core.SuperTransformerEncoderLayer(
|
layer = super_core.SuperTransformerEncoderLayer(
|
||||||
input_dim,
|
|
||||||
embed_dim,
|
embed_dim,
|
||||||
num_head,
|
num_head,
|
||||||
qkv_bias,
|
qkv_bias,
|
||||||
@ -108,11 +101,12 @@ class SuperTransformer(super_core.SuperModule):
|
|||||||
other_drop,
|
other_drop,
|
||||||
)
|
)
|
||||||
layers.append(layer)
|
layers.append(layer)
|
||||||
input_dim = embed_dim
|
|
||||||
self.backbone = super_core.SuperSequential(*layers)
|
self.backbone = super_core.SuperSequential(*layers)
|
||||||
|
|
||||||
# the regression head
|
# the regression head
|
||||||
self.head = super_core.SuperLinear(self._embed_dims[-1], 1)
|
self.head = super_core.SuperSequential(
|
||||||
|
super_core.SuperLayerNorm1D(embed_dim), super_core.SuperLinear(embed_dim, 1)
|
||||||
|
)
|
||||||
trunc_normal_(self.cls_token, std=0.02)
|
trunc_normal_(self.cls_token, std=0.02)
|
||||||
self.apply(self._init_weights)
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
@ -123,14 +117,16 @@ class SuperTransformer(super_core.SuperModule):
|
|||||||
@property
|
@property
|
||||||
def abstract_search_space(self):
|
def abstract_search_space(self):
|
||||||
root_node = spaces.VirtualNode(id(self))
|
root_node = spaces.VirtualNode(id(self))
|
||||||
|
if not spaces.is_determined(self._stem_dim):
|
||||||
|
root_node.append("_stem_dim", self._stem_dim.abstract(reuse_last=True))
|
||||||
|
if not spaces.is_determined(self._stem_dim):
|
||||||
|
root_node.append("_embed_dim", self._embed_dim.abstract(reuse_last=True))
|
||||||
xdict = dict(
|
xdict = dict(
|
||||||
input_embed=self.input_embed.abstract_search_space,
|
input_embed=self.input_embed.abstract_search_space,
|
||||||
pos_embed=self.pos_embed.abstract_search_space,
|
pos_embed=self.pos_embed.abstract_search_space,
|
||||||
backbone=self.backbone.abstract_search_space,
|
backbone=self.backbone.abstract_search_space,
|
||||||
head=self.head.abstract_search_space,
|
head=self.head.abstract_search_space,
|
||||||
)
|
)
|
||||||
if not spaces.is_determined(self._stem_dim):
|
|
||||||
root_node.append("_stem_dim", self._stem_dim.abstract(reuse_last=True))
|
|
||||||
for key, space in xdict.items():
|
for key, space in xdict.items():
|
||||||
if not spaces.is_determined(space):
|
if not spaces.is_determined(space):
|
||||||
root_node.append(key, space)
|
root_node.append(key, space)
|
||||||
@ -196,7 +192,7 @@ def get_transformer(config):
|
|||||||
model = SuperTransformer(
|
model = SuperTransformer(
|
||||||
d_feat=config.get("d_feat"),
|
d_feat=config.get("d_feat"),
|
||||||
stem_dim=config.get("stem_dim"),
|
stem_dim=config.get("stem_dim"),
|
||||||
embed_dims=config.get("embed_dims"),
|
embed_dim=config.get("embed_dim"),
|
||||||
num_heads=config.get("num_heads"),
|
num_heads=config.get("num_heads"),
|
||||||
mlp_hidden_multipliers=config.get("mlp_hidden_multipliers"),
|
mlp_hidden_multipliers=config.get("mlp_hidden_multipliers"),
|
||||||
qkv_bias=config.get("qkv_bias"),
|
qkv_bias=config.get("qkv_bias"),
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#####################################################
|
#####################################################
|
||||||
from .super_module import SuperRunMode
|
from .super_module import SuperRunMode
|
||||||
from .super_module import IntSpaceType
|
from .super_module import IntSpaceType
|
||||||
|
from .super_module import LayerOrder
|
||||||
|
|
||||||
from .super_module import SuperModule
|
from .super_module import SuperModule
|
||||||
from .super_container import SuperSequential
|
from .super_container import SuperSequential
|
||||||
|
@ -37,8 +37,7 @@ class SuperTransformerEncoderLayer(SuperModule):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_dim: IntSpaceType,
|
d_model: IntSpaceType,
|
||||||
output_dim: IntSpaceType,
|
|
||||||
num_heads: IntSpaceType,
|
num_heads: IntSpaceType,
|
||||||
qkv_bias: BoolSpaceType = False,
|
qkv_bias: BoolSpaceType = False,
|
||||||
mlp_hidden_multiplier: IntSpaceType = 4,
|
mlp_hidden_multiplier: IntSpaceType = 4,
|
||||||
@ -48,40 +47,37 @@ class SuperTransformerEncoderLayer(SuperModule):
|
|||||||
):
|
):
|
||||||
super(SuperTransformerEncoderLayer, self).__init__()
|
super(SuperTransformerEncoderLayer, self).__init__()
|
||||||
mha = SuperAttention(
|
mha = SuperAttention(
|
||||||
input_dim,
|
d_model,
|
||||||
input_dim,
|
d_model,
|
||||||
num_heads=num_heads,
|
num_heads=num_heads,
|
||||||
qkv_bias=qkv_bias,
|
qkv_bias=qkv_bias,
|
||||||
attn_drop=drop,
|
attn_drop=drop,
|
||||||
proj_drop=drop,
|
proj_drop=drop,
|
||||||
)
|
)
|
||||||
drop1 = nn.Dropout(drop or 0.0)
|
|
||||||
norm1 = SuperLayerNorm1D(input_dim)
|
|
||||||
mlp = SuperMLPv2(
|
mlp = SuperMLPv2(
|
||||||
input_dim,
|
d_model,
|
||||||
hidden_multiplier=mlp_hidden_multiplier,
|
hidden_multiplier=mlp_hidden_multiplier,
|
||||||
out_features=output_dim,
|
out_features=d_model,
|
||||||
act_layer=act_layer,
|
act_layer=act_layer,
|
||||||
drop=drop,
|
drop=drop,
|
||||||
)
|
)
|
||||||
drop2 = nn.Dropout(drop or 0.0)
|
|
||||||
norm2 = SuperLayerNorm1D(output_dim)
|
|
||||||
if order is LayerOrder.PreNorm:
|
if order is LayerOrder.PreNorm:
|
||||||
self.norm1 = norm1
|
self.norm1 = SuperLayerNorm1D(d_model)
|
||||||
self.mha = mha
|
self.mha = mha
|
||||||
self.drop1 = drop1
|
self.drop1 = nn.Dropout(drop or 0.0)
|
||||||
self.norm2 = norm2
|
self.norm2 = SuperLayerNorm1D(d_model)
|
||||||
self.mlp = mlp
|
self.mlp = mlp
|
||||||
self.drop2 = drop2
|
self.drop2 = nn.Dropout(drop or 0.0)
|
||||||
elif order is LayerOrder.PostNoem:
|
elif order is LayerOrder.PostNorm:
|
||||||
self.mha = mha
|
self.mha = mha
|
||||||
self.drop1 = drop1
|
self.drop1 = nn.Dropout(drop or 0.0)
|
||||||
self.norm1 = norm1
|
self.norm1 = SuperLayerNorm1D(d_model)
|
||||||
self.mlp = mlp
|
self.mlp = mlp
|
||||||
self.drop2 = drop2
|
self.drop2 = nn.Dropout(drop or 0.0)
|
||||||
self.norm2 = norm2
|
self.norm2 = SuperLayerNorm1D(d_model)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unknown order: {:}".format(order))
|
raise ValueError("Unknown order: {:}".format(order))
|
||||||
|
self._order = order
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def abstract_search_space(self):
|
def abstract_search_space(self):
|
||||||
@ -108,18 +104,19 @@ class SuperTransformerEncoderLayer(SuperModule):
|
|||||||
return self.forward_raw(input)
|
return self.forward_raw(input)
|
||||||
|
|
||||||
def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
|
def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
|
||||||
if order is LayerOrder.PreNorm:
|
if self._order is LayerOrder.PreNorm:
|
||||||
x = self.norm1(input)
|
x = self.norm1(input)
|
||||||
x = x + self.drop1(self.mha(x))
|
x = x + self.drop1(self.mha(x))
|
||||||
x = self.norm2(x)
|
x = self.norm2(x)
|
||||||
x = x + self.drop2(self.mlp(x))
|
x = x + self.drop2(self.mlp(x))
|
||||||
elif order is LayerOrder.PostNoem:
|
elif self._order is LayerOrder.PostNorm:
|
||||||
# multi-head attention
|
# multi-head attention
|
||||||
x = x + self.drop1(self.mha(input))
|
x = self.mha(input)
|
||||||
|
x = x + self.drop1(x)
|
||||||
x = self.norm1(x)
|
x = self.norm1(x)
|
||||||
# feed-forward layer
|
# feed-forward layer
|
||||||
x = x + self.drop2(self.mlp(x))
|
x = x + self.drop2(self.mlp(x))
|
||||||
x = self.norm2(x)
|
x = self.norm2(x)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unknown order: {:}".format(order))
|
raise ValueError("Unknown order: {:}".format(self._order))
|
||||||
return x
|
return x
|
||||||
|
@ -53,11 +53,13 @@ class TestSuperAttention(unittest.TestCase):
|
|||||||
@parameterized.expand([[6], [12], [24], [48]])
|
@parameterized.expand([[6], [12], [24], [48]])
|
||||||
def test_transformer_encoder(self, input_dim):
|
def test_transformer_encoder(self, input_dim):
|
||||||
output_dim = spaces.Categorical(12, 24, 36)
|
output_dim = spaces.Categorical(12, 24, 36)
|
||||||
model = super_core.SuperTransformerEncoderLayer(
|
model = super_core.SuperSequential(
|
||||||
input_dim,
|
super_core.SuperLinear(input_dim, output_dim),
|
||||||
output_dim=output_dim,
|
super_core.SuperTransformerEncoderLayer(
|
||||||
num_heads=spaces.Categorical(2, 4, 6),
|
output_dim,
|
||||||
mlp_hidden_multiplier=spaces.Categorical(1, 2, 4),
|
num_heads=spaces.Categorical(2, 4, 6),
|
||||||
|
mlp_hidden_multiplier=spaces.Categorical(1, 2, 4),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
print(model)
|
print(model)
|
||||||
model.apply_verbose(True)
|
model.apply_verbose(True)
|
||||||
|
@ -36,25 +36,31 @@ def _internal_func(inputs, model):
|
|||||||
return abstract_child, outputs
|
return abstract_child, outputs
|
||||||
|
|
||||||
|
|
||||||
def _create_stel(input_dim, output_dim):
|
def _create_stel(input_dim, output_dim, order):
|
||||||
return super_core.SuperTransformerEncoderLayer(
|
return super_core.SuperSequential(
|
||||||
input_dim,
|
super_core.SuperLinear(input_dim, output_dim),
|
||||||
output_dim,
|
super_core.SuperTransformerEncoderLayer(
|
||||||
num_heads=spaces.Categorical(2, 4, 6),
|
output_dim,
|
||||||
mlp_hidden_multiplier=spaces.Categorical(1, 2, 4),
|
num_heads=spaces.Categorical(2, 4, 6),
|
||||||
|
mlp_hidden_multiplier=spaces.Categorical(1, 2, 4),
|
||||||
|
order=order,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("batch", (1, 2, 4))
|
@pytest.mark.parametrize("batch", (1, 2, 4))
|
||||||
@pytest.mark.parametrize("seq_dim", (1, 10, 30))
|
@pytest.mark.parametrize("seq_dim", (1, 10, 30))
|
||||||
@pytest.mark.parametrize("input_dim", (6, 12, 24, 27))
|
@pytest.mark.parametrize("input_dim", (6, 12, 24, 27))
|
||||||
def test_super_sequential(batch, seq_dim, input_dim):
|
@pytest.mark.parametrize(
|
||||||
|
"order", (super_core.LayerOrder.PreNorm, super_core.LayerOrder.PostNorm)
|
||||||
|
)
|
||||||
|
def test_super_sequential(batch, seq_dim, input_dim, order):
|
||||||
out1_dim = spaces.Categorical(12, 24, 36)
|
out1_dim = spaces.Categorical(12, 24, 36)
|
||||||
out2_dim = spaces.Categorical(24, 36, 48)
|
out2_dim = spaces.Categorical(24, 36, 48)
|
||||||
out3_dim = spaces.Categorical(36, 72, 100)
|
out3_dim = spaces.Categorical(36, 72, 100)
|
||||||
layer1 = _create_stel(input_dim, out1_dim)
|
layer1 = _create_stel(input_dim, out1_dim, order)
|
||||||
layer2 = _create_stel(out1_dim, out2_dim)
|
layer2 = _create_stel(out1_dim, out2_dim, order)
|
||||||
layer3 = _create_stel(out2_dim, out3_dim)
|
layer3 = _create_stel(out2_dim, out3_dim, order)
|
||||||
model = super_core.SuperSequential(layer1, layer2, layer3)
|
model = super_core.SuperSequential(layer1, layer2, layer3)
|
||||||
print(model)
|
print(model)
|
||||||
model.apply_verbose(True)
|
model.apply_verbose(True)
|
||||||
|
Loading…
Reference in New Issue
Block a user