From 7c719fce40686ea509f9212af3f31364ecae0e3a Mon Sep 17 00:00:00 2001
From: D-X-Y <280835372@qq.com>
Date: Mon, 15 Mar 2021 03:36:36 +0000
Subject: [PATCH] Decouple transformers

---
 .latent-data/qlib | 2 +-
 exps/trading/workflow_tt.py | 2 -
 lib/trade_models/quant_transformer.py | 198 +----------------------
 lib/trade_models/transformers.py | 225 ++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 193 deletions(-)
 create mode 100755 lib/trade_models/transformers.py

diff --git a/.latent-data/qlib b/.latent-data/qlib
index 358de88..07434da 160000
--- a/.latent-data/qlib
+++ b/.latent-data/qlib
@@ -1 +1 @@
-Subproject commit 358de88602b484b69756477e37084c14facafbf9
+Subproject commit 07434da8b02e6e3706c554c5081ce35ba382d08e
diff --git a/exps/trading/workflow_tt.py b/exps/trading/workflow_tt.py
index 7204ef7..08be891 100644
--- a/exps/trading/workflow_tt.py
+++ b/exps/trading/workflow_tt.py
@@ -22,8 +22,6 @@ from qlib.config import C
 from qlib.config import REG_CN
 from qlib.utils import init_instance_by_config
 from qlib.workflow import R
-from qlib.utils import flatten_dict
-from qlib.log import set_log_basic_config
 
 
 def main(xargs):
diff --git a/lib/trade_models/quant_transformer.py b/lib/trade_models/quant_transformer.py
index c8fbc86..cec910b 100755
--- a/lib/trade_models/quant_transformer.py
+++ b/lib/trade_models/quant_transformer.py
@@ -11,7 +11,6 @@ import pandas as pd
 import copy
 from functools import partial
 from typing import Optional, Text
-import logging
 
 from qlib.utils import (
     unpack_archive_with_buffer,
@@ -19,26 +18,25 @@ from qlib.utils import (
     get_or_create_path,
     drop_nan_by_y_index,
 )
-from qlib.log import get_module_logger, TimeInspector
+from qlib.log import get_module_logger
 
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data as th_data
 
-import layers as xlayers
 from log_utils import AverageMeter
 from utils import count_parameters
 
+from trade_models.transformers import DEFAULT_NET_CONFIG
+from trade_models.transformers import get_transformer
+
 from qlib.model.base import Model
 from qlib.data.dataset import DatasetH
 from qlib.data.dataset.handler import DataHandlerLP
 
 
-default_net_config = dict(d_feat=6, embed_dim=48, depth=5, num_heads=4, mlp_ratio=4.0, qkv_bias=True, pos_drop=0.1)
-
-default_opt_config = dict(
+DEFAULT_OPT_CONFIG = dict(
     epochs=200, lr=0.001, batch_size=2000, early_stop=20, loss="mse", optimizer="adam", num_workers=4
 )
 
@@ -52,8 +50,8 @@ class QuantTransformer(Model):
         self.logger.info("QuantTransformer PyTorch version...")
 
         # set hyper-parameters.
-        self.net_config = net_config or default_net_config
-        self.opt_config = opt_config or default_opt_config
+        self.net_config = net_config or DEFAULT_NET_CONFIG
+        self.opt_config = opt_config or DEFAULT_OPT_CONFIG
         self.metric = metric
         self.device = torch.device("cuda:{:}".format(GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
         self.seed = seed
@@ -81,12 +79,7 @@ class QuantTransformer(Model):
             torch.cuda.manual_seed(self.seed)
             torch.cuda.manual_seed_all(self.seed)
 
-        self.model = TransformerModel(
-            d_feat=self.net_config["d_feat"],
-            embed_dim=self.net_config["embed_dim"],
-            depth=self.net_config["depth"],
-            pos_drop=self.net_config["pos_drop"],
-        )
+        self.model = get_transformer(self.net_config)
         self.logger.info("model: {:}".format(self.model))
         self.logger.info("model size: {:.3f} MB".format(count_parameters(self.model)))
 
@@ -283,178 +276,3 @@ class QuantTransformer(Model):
             preds.append(pred)
 
         return pd.Series(np.concatenate(preds), index=index)
-
-
-# Real Model
-
-
-class Attention(nn.Module):
-
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0):
-        super(Attention, self).__init__()
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or math.sqrt(head_dim)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-
-    def forward(self, x):
-        B, N, C = x.shape
-        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-
-class Block(nn.Module):
-
-    def __init__(
-        self,
-        dim,
-        num_heads,
-        mlp_ratio=4.0,
-        qkv_bias=False,
-        qk_scale=None,
-        attn_drop=0.0,
-        mlp_drop=0.0,
-        drop_path=0.0,
-        act_layer=nn.GELU,
-        norm_layer=nn.LayerNorm,
-    ):
-        super(Block, self).__init__()
-        self.norm1 = norm_layer(dim)
-        self.attn = Attention(
-            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=mlp_drop
-        )
-        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
-        self.drop_path = xlayers.DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = xlayers.MLP(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=mlp_drop)
-
-    def forward(self, x):
-        x = x + self.drop_path(self.attn(self.norm1(x)))
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-        return x
-
-
-class SimpleEmbed(nn.Module):
-
-    def __init__(self, d_feat, embed_dim):
-        super(SimpleEmbed, self).__init__()
-        self.d_feat = d_feat
-        self.embed_dim = embed_dim
-        self.proj = nn.Linear(d_feat, embed_dim)
-
-    def forward(self, x):
-        x = x.reshape(len(x), self.d_feat, -1)  # [N, F*T] -> [N, F, T]
-        x = x.permute(0, 2, 1)  # [N, F, T] -> [N, T, F]
-        out = self.proj(x) * math.sqrt(self.embed_dim)
-        return out
-
-
-class TransformerModel(nn.Module):
-    def __init__(
-        self,
-        d_feat: int = 6,
-        embed_dim: int = 64,
-        depth: int = 4,
-        num_heads: int = 4,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = True,
-        qk_scale: Optional[float] = None,
-        pos_drop: float = 0.0,
-        mlp_drop_rate: float = 0.0,
-        attn_drop_rate: float = 0.0,
-        drop_path_rate: float = 0.0,
-        norm_layer: Optional[nn.Module] = None,
-        max_seq_len: int = 65,
-    ):
-        """
-        Args:
-            d_feat (int, tuple): input image size
-            embed_dim (int): embedding dimension
-            depth (int): depth of transformer
-            num_heads (int): number of attention heads
-            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
-            qkv_bias (bool): enable bias for qkv if True
-            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
-            pos_drop (float): dropout rate for the positional embedding
-            mlp_drop_rate (float): the dropout rate for MLP layers in a block
-            attn_drop_rate (float): attention dropout rate
-            drop_path_rate (float): stochastic depth rate
-            norm_layer: (nn.Module): normalization layer
-        """
-        super(TransformerModel, self).__init__()
-        self.embed_dim = embed_dim
-        self.num_features = embed_dim
-        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
-
-        self.input_embed = SimpleEmbed(d_feat, embed_dim=embed_dim)
-
-        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
-        self.pos_embed = xlayers.PositionalEncoder(d_model=embed_dim, max_seq_len=max_seq_len, dropout=pos_drop)
-
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
-        self.blocks = nn.ModuleList(
-            [
-                Block(
-                    dim=embed_dim,
-                    num_heads=num_heads,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                    qk_scale=qk_scale,
-                    attn_drop=attn_drop_rate,
-                    mlp_drop=mlp_drop_rate,
-                    drop_path=dpr[i],
-                    norm_layer=norm_layer,
-                )
-                for i in range(depth)
-            ]
-        )
-        self.norm = norm_layer(embed_dim)
-
-        # regression head
-        self.head = nn.Linear(self.num_features, 1)
-
-        xlayers.trunc_normal_(self.cls_token, std=0.02)
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            xlayers.trunc_normal_(m.weight, std=0.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    def forward_features(self, x):
-        batch, flatten_size = x.shape
-        feats = self.input_embed(x)  # batch * 60 * 64
-
-        cls_tokens = self.cls_token.expand(batch, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
-        feats_w_ct = torch.cat((cls_tokens, feats), dim=1)
-        feats_w_tp = self.pos_embed(feats_w_ct)
-
-        xfeats = feats_w_tp
-        for block in self.blocks:
-            xfeats = block(xfeats)
-
-        xfeats = self.norm(xfeats)[:, 0]
-        return xfeats
-
-    def forward(self, x):
-        feats = self.forward_features(x)
-        predicts = self.head(feats).squeeze(-1)
-        return predicts
diff --git a/lib/trade_models/transformers.py b/lib/trade_models/transformers.py
new file mode 100755
index 0000000..23ea865
--- /dev/null
+++ b/lib/trade_models/transformers.py
@@ -0,0 +1,225 @@
+##################################################
+# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021 #
+##################################################
+from __future__ import division
+from __future__ import print_function
+
+import math
+from functools import partial
+from typing import Optional, Text
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import layers as xlayers
+
+
+DEFAULT_NET_CONFIG = dict(
+    d_feat=6,
+    embed_dim=48,
+    depth=5,
+    num_heads=4,
+    mlp_ratio=4.0,
+    qkv_bias=True,
+    pos_drop=0.1,
+    mlp_drop_rate=0.1,
+    attn_drop_rate=0.1,
+    drop_path_rate=0.1,
+)
+
+
+# Real Model
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0):
+        super(Attention, self).__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or math.sqrt(head_dim)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        mlp_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+    ):
+        super(Block, self).__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=mlp_drop
+        )
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = xlayers.DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = xlayers.MLP(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=mlp_drop)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class SimpleEmbed(nn.Module):
+    def __init__(self, d_feat, embed_dim):
+        super(SimpleEmbed, self).__init__()
+        self.d_feat = d_feat
+        self.embed_dim = embed_dim
+        self.proj = nn.Linear(d_feat, embed_dim)
+
+    def forward(self, x):
+        x = x.reshape(len(x), self.d_feat, -1)  # [N, F*T] -> [N, F, T]
+        x = x.permute(0, 2, 1)  # [N, F, T] -> [N, T, F]
+        out = self.proj(x) * math.sqrt(self.embed_dim)
+        return out
+
+
+class TransformerModel(nn.Module):
+    def __init__(
+        self,
+        d_feat: int = 6,
+        embed_dim: int = 64,
+        depth: int = 4,
+        num_heads: int = 4,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_scale: Optional[float] = None,
+        pos_drop: float = 0.0,
+        mlp_drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        norm_layer: Optional[nn.Module] = None,
+        max_seq_len: int = 65,
+    ):
+        """
+        Args:
+            d_feat (int, tuple): input image size
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
+            pos_drop (float): dropout rate for the positional embedding
+            mlp_drop_rate (float): the dropout rate for MLP layers in a block
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            norm_layer: (nn.Module): normalization layer
+        """
+        super(TransformerModel, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_features = embed_dim
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+
+        self.input_embed = SimpleEmbed(d_feat, embed_dim=embed_dim)
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = xlayers.PositionalEncoder(d_model=embed_dim, max_seq_len=max_seq_len, dropout=pos_drop)
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList(
+            [
+                Block(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    attn_drop=attn_drop_rate,
+                    mlp_drop=mlp_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.norm = norm_layer(embed_dim)
+
+        # regression head
+        self.head = nn.Linear(self.num_features, 1)
+
+        xlayers.trunc_normal_(self.cls_token, std=0.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            xlayers.trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def forward_features(self, x):
+        batch, flatten_size = x.shape
+        feats = self.input_embed(x)  # batch * 60 * 64
+
+        cls_tokens = self.cls_token.expand(batch, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        feats_w_ct = torch.cat((cls_tokens, feats), dim=1)
+        feats_w_tp = self.pos_embed(feats_w_ct)
+
+        xfeats = feats_w_tp
+        for block in self.blocks:
+            xfeats = block(xfeats)
+
+        xfeats = self.norm(xfeats)[:, 0]
+        return xfeats
+
+    def forward(self, x):
+        feats = self.forward_features(x)
+        predicts = self.head(feats).squeeze(-1)
+        return predicts
+
+
+def get_transformer(config):
+    if not isinstance(config, dict):
+        raise ValueError("Invalid Configuration: {:}".format(config))
+    name = config.get("name", "basic")
+    if name == "basic":
+        model = TransformerModel(
+            d_feat=config.get("d_feat"),
+            embed_dim=config.get("embed_dim"),
+            depth=config.get("depth"),
+            num_heads=config.get("num_heads"),
+            mlp_ratio=config.get("mlp_ratio"),
+            qkv_bias=config.get("qkv_bias"),
+            qk_scale=config.get("qk_scale"),
+            pos_drop=config.get("pos_drop"),
+            mlp_drop_rate=config.get("mlp_drop_rate"),
+            attn_drop_rate=config.get("attn_drop_rate"),
+            drop_path_rate=config.get("drop_path_rate"),
+            norm_layer=config.get("norm_layer", None),
+        )
+    else:
+        raise ValueError("Unknown model name: {:}".format(name))
+    return model
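
A minimal usage sketch of the decoupled factory follows (illustrative only, not part of the patch). It assumes lib/ is on the import path so that trade_models.transformers and the layers package imported as xlayers resolve; batch = 32 and seq_len = 60 are arbitrary choices that fit the constructor's default max_seq_len of 65, which must also cover the extra cls token.

    import torch
    from trade_models.transformers import DEFAULT_NET_CONFIG, get_transformer

    config = dict(DEFAULT_NET_CONFIG)  # no "name" key, so get_transformer falls back to "basic"
    model = get_transformer(config)    # builds a TransformerModel from the plain config dict

    # SimpleEmbed reshapes [N, F*T] -> [N, F, T], so the flat input length must be d_feat * seq_len.
    batch, seq_len = 32, 60
    x = torch.randn(batch, config["d_feat"] * seq_len)
    preds = model(x)  # tensor of shape [batch]: one regression output per sample

Because get_transformer dispatches on the optional "name" key and raises on anything it does not recognize, alternative architectures can later be registered in transformers.py without touching QuantTransformer.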