Fix CUDA memory issues

D-X-Y 2021-03-30 09:02:41 +00:00
parent 5fb900bcb1
commit 9fc2c991f5
7 changed files with 72 additions and 60 deletions

View File

@@ -15,8 +15,8 @@
 # python exps/trading/baselines.py --alg TabNet #
 # #
 # python exps/trading/baselines.py --alg Transformer#
-# python exps/trading/baselines.py --alg TSF #
-# python exps/trading/baselines.py --alg TSF-4x64-d0
+# python exps/trading/baselines.py --alg TSF
+# python exps/trading/baselines.py --alg TSF-4x64-drop0_0
 #####################################################
 import sys
 import copy
@@ -40,10 +40,11 @@ from qlib.workflow import R
 from qlib.utils import flatten_dict
-def to_pos_drop(config, value):
+def to_drop(config, pos_drop, other_drop):
     config = copy.deepcopy(config)
     net = config["task"]["model"]["kwargs"]["net_config"]
-    net["pos_drop"] = value
+    net["pos_drop"] = pos_drop
+    net["other_drop"] = other_drop
     return config
@@ -59,11 +60,12 @@ def to_layer(config, embed_dim, depth):
 def extend_transformer_settings(alg2configs, name):
     config = copy.deepcopy(alg2configs[name])
     for i in range(1, 7):
-        for j in [6, 12, 24, 32, 48, 64]:
-            for k in [0, 0.1]:
-                alg2configs[name + "-{:}x{:}-d{:}".format(i, j, k)] = to_layer(
-                    to_pos_drop(config, k), j, i
-                )
+        for j in (6, 12, 24, 32, 48, 64):
+            for k1 in (0, 0.1, 0.2):
+                for k2 in (0, 0.1):
+                    alg2configs[
+                        name + "-{:}x{:}-drop{:}_{:}".format(i, j, k1, k2)
+                    ] = to_layer(to_drop(config, k1, k2), j, i)
     return alg2configs
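
Note: the registered names follow the pattern in the header comment above (e.g. "TSF-4x64-drop0_0"): depth, embedding dimension, then pos_drop and other_drop joined by an underscore. A stand-alone sketch of the resulting grid, outside of the qlib config plumbing and for illustration only:

# Names the loop above would register for the "TSF" family.
names = [
    "TSF-{:}x{:}-drop{:}_{:}".format(depth, dim, pos_drop, other_drop)
    for depth in range(1, 7)
    for dim in (6, 12, 24, 32, 48, 64)
    for pos_drop in (0, 0.1, 0.2)
    for other_drop in (0, 0.1)
]
# 6 * 6 * 3 * 2 = 216 variants, e.g. "TSF-4x64-drop0_0" and "TSF-3x48-drop0.1_0".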

View File

@@ -107,7 +107,7 @@ def run_exp(
             model = R.load_object(model_obj_name)
             logger.info("[Find existing object from {:}]".format(model_obj_name))
         except OSError:
-            # R.log_params(**flatten_dict(task_config))
+            R.log_params(**flatten_dict(update_gpu(task_config, None)))
             if "save_path" in inspect.getfullargspec(model.fit).args:
                 model_fit_kwargs["save_path"] = os.path.join(
                     recorder_root_dir, "model.ckp"
@@ -126,9 +126,6 @@ def run_exp(
             else:
                 R.save_objects(**{model_obj_name: model})
         except Exception as e:
-            import pdb
-            pdb.set_trace()
             raise ValueError("Something wrong: {:}".format(e))
         # Get the recorder
         recorder = R.get_recorder()
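
update_gpu is imported elsewhere and its body is not part of this diff; a plausible reading, assumed here rather than taken from the repository, is that it copies the task config and overrides the model's GPU entry, so the logged parameters do not pin a concrete CUDA device:

import copy

def update_gpu(config, gpu):
    # Assumed behavior only; the real helper lives outside this diff and may differ.
    config = copy.deepcopy(config)
    model_kwargs = config.get("model", {}).get("kwargs", {})
    if "GPU" in model_kwargs:
        model_kwargs["GPU"] = gpu
    return config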

View File

@@ -45,6 +45,32 @@ DEFAULT_OPT_CONFIG = dict(
 )
+def train_or_test_epoch(
+    xloader, model, loss_fn, metric_fn, is_train, optimizer, device
+):
+    if is_train:
+        model.train()
+    else:
+        model.eval()
+    score_meter, loss_meter = AverageMeter(), AverageMeter()
+    for ibatch, (feats, labels) in enumerate(xloader):
+        feats, labels = feats.to(device), labels.to(device)
+        # forward the network
+        preds = model(feats)
+        loss = loss_fn(preds, labels)
+        with torch.no_grad():
+            score = metric_fn(preds, labels)
+        loss_meter.update(loss.item(), feats.size(0))
+        score_meter.update(score.item(), feats.size(0))
+        # optimize the network
+        if is_train and optimizer is not None:
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
+            optimizer.step()
+    return loss_meter.avg, score_meter.avg
 class QuantTransformer(Model):
     """Transformer-based Quant Model"""
@@ -132,32 +158,6 @@ class QuantTransformer(Model):
         else:
             raise ValueError("unknown metric `{:}`".format(self.metric))
-    def train_or_test_epoch(
-        self, xloader, model, loss_fn, metric_fn, is_train, optimizer=None
-    ):
-        if is_train:
-            model.train()
-        else:
-            model.eval()
-        score_meter, loss_meter = AverageMeter(), AverageMeter()
-        for ibatch, (feats, labels) in enumerate(xloader):
-            feats = feats.to(self.device, non_blocking=True)
-            labels = labels.to(self.device, non_blocking=True)
-            # forward the network
-            preds = model(feats)
-            loss = loss_fn(preds, labels)
-            with torch.no_grad():
-                score = self.metric_fn(preds, labels)
-            loss_meter.update(loss.item(), feats.size(0))
-            score_meter.update(score.item(), feats.size(0))
-            # optimize the network
-            if is_train and optimizer is not None:
-                optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
-                optimizer.step()
-        return loss_meter.avg, score_meter.avg
     def fit(
         self,
         dataset: DatasetH,
@@ -204,14 +204,22 @@ class QuantTransformer(Model):
         def _internal_test(ckp_epoch=None, results_dict=None):
             with torch.no_grad():
-                train_loss, train_score = self.train_or_test_epoch(
-                    train_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                shared_kwards = {
+                    "model": self.model,
+                    "loss_fn": self.loss_fn,
+                    "metric_fn": self.metric_fn,
+                    "is_train": False,
+                    "optimizer": None,
+                    "device": self.device,
+                }
+                train_loss, train_score = train_or_test_epoch(
+                    train_loader, **shared_kwards
                 )
-                valid_loss, valid_score = self.train_or_test_epoch(
-                    valid_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                valid_loss, valid_score = train_or_test_epoch(
+                    valid_loader, **shared_kwards
                 )
-                test_loss, test_score = self.train_or_test_epoch(
-                    test_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                test_loss, test_score = train_or_test_epoch(
+                    test_loader, **shared_kwards
                 )
                 xstr = (
                     "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}".format(
@@ -255,13 +263,14 @@ class QuantTransformer(Model):
                     iepoch, self.opt_config["epochs"], best_epoch, best_score
                 )
             )
-            train_loss, train_score = self.train_or_test_epoch(
+            train_loss, train_score = train_or_test_epoch(
                 train_loader,
                 self.model,
                 self.loss_fn,
                 self.metric_fn,
                 True,
                 self.train_optimizer,
+                self.device,
             )
             self.logger.info(
                 "Training :: loss={:.6f}, score={:.6f}".format(train_loss, train_score)
@@ -307,7 +316,8 @@ class QuantTransformer(Model):
         self.logger.info("Reload the best parameter :: {:}".format(eval_str))
         if self.use_gpu:
-            torch.cuda.empty_cache()
+            with torch.cuda.device(self.device):
+                torch.cuda.empty_cache()
         self.fitted = True
     def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
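
The empty_cache change at the end of this file is the CUDA-memory fix named in the commit title. A likely motivation, stated as an assumption: calling torch.cuda.empty_cache() without a device context operates relative to the current (default) device and can initialize a context on cuda:0 even when the model lives on another card, so the call is scoped to self.device. A minimal sketch with a hypothetical helper name:

import torch

def free_cached_memory(device: torch.device) -> None:
    # Release unused blocks held by the caching allocator, scoped to this GPU,
    # without implicitly touching the default CUDA device.
    if device.type == "cuda":
        with torch.cuda.device(device):
            torch.cuda.empty_cache()

# usage: free_cached_memory(torch.device("cuda:1"))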

View File

@@ -30,11 +30,14 @@ class SuperLinear(SuperModule):
         self._out_features = out_features
         self._bias = bias
         # weights to be optimized
-        self._super_weight = torch.nn.Parameter(
-            torch.Tensor(self.out_features, self.in_features)
+        self.register_parameter(
+            "_super_weight",
+            torch.nn.Parameter(torch.Tensor(self.out_features, self.in_features)),
         )
         if self.bias:
-            self._super_bias = torch.nn.Parameter(torch.Tensor(self.out_features))
+            self.register_parameter(
+                "_super_bias", torch.nn.Parameter(torch.Tensor(self.out_features))
+            )
         else:
             self.register_parameter("_super_bias", None)
         self.reset_parameters()
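
Assigning an nn.Parameter attribute on an nn.Module already registers it, so register_parameter is functionally equivalent here; it simply makes the registration explicit and mirrors the existing register_parameter("_super_bias", None) branch. The same pattern is applied to SuperLayerNorm1D below. A small self-contained sketch (TinyLinear and its initialization are made up for illustration):

import torch
import torch.nn as nn

class TinyLinear(nn.Module):
    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__()
        # Explicit registration, equivalent to `self.weight = nn.Parameter(...)`.
        self.register_parameter(
            "weight", nn.Parameter(torch.empty(out_features, in_features))
        )
        if bias:
            self.register_parameter("bias", nn.Parameter(torch.empty(out_features)))
        else:
            # Also the canonical way to record that a parameter is absent.
            self.register_parameter("bias", None)
        nn.init.trunc_normal_(self.weight, std=0.02)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return nn.functional.linear(x, self.weight, self.bias)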

View File

@@ -25,8 +25,8 @@ class SuperLayerNorm1D(SuperModule):
         self._eps = eps
         self._elementwise_affine = elementwise_affine
         if self._elementwise_affine:
-            self.weight = nn.Parameter(torch.Tensor(self.in_dim))
-            self.bias = nn.Parameter(torch.Tensor(self.in_dim))
+            self.register_parameter("weight", nn.Parameter(torch.Tensor(self.in_dim)))
+            self.register_parameter("bias", nn.Parameter(torch.Tensor(self.in_dim)))
         else:
             self.register_parameter("weight", None)
             self.register_parameter("bias", None)

View File

@@ -1,7 +1,7 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf-all.sh 0 csi300 0
-# bash scripts/trade/tsf-all.sh 0 csi300 0.1
+# bash scripts/trade/tsf-all.sh 0 csi300 0_0
+# bash scripts/trade/tsf-all.sh 0 csi300 0.1_0
 # bash scripts/trade/tsf-all.sh 1 all
 #
 set -e
@@ -24,6 +24,6 @@ for channel in ${channels}
 do
 for depth in ${depths}
 do
-python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}
 done
 done

View File

@@ -1,9 +1,9 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf.sh 0 csi300 3 0
-# bash scripts/trade/tsf.sh 0 csi300 3 0.1
-# bash scripts/trade/tsf.sh 1 csi100 3
-# bash scripts/trade/tsf.sh 1 all 3
+# bash scripts/trade/tsf.sh 0 csi300 3 0_0
+# bash scripts/trade/tsf.sh 0 csi300 3 0.1_0
+# bash scripts/trade/tsf.sh 1 csi100 3 0.2_0
+# bash scripts/trade/tsf.sh 1 all 3 0.1_0
 #
 set -e
 echo script name: $0
@@ -24,6 +24,6 @@ channels="6 12 24 32 48 64"
 for channel in ${channels}
 do
-python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}
 done