Fix CUDA memory issues

D-X-Y 2021-03-30 09:02:41 +00:00
parent 5fb900bcb1
commit 9fc2c991f5
7 changed files with 72 additions and 60 deletions

View File

@@ -15,8 +15,8 @@
 # python exps/trading/baselines.py --alg TabNet      #
 #                                                     #
 # python exps/trading/baselines.py --alg Transformer #
-# python exps/trading/baselines.py --alg TSF          #
-# python exps/trading/baselines.py --alg TSF-4x64-d0
+# python exps/trading/baselines.py --alg TSF
+# python exps/trading/baselines.py --alg TSF-4x64-drop0_0
 #####################################################
 import sys
 import copy
@@ -40,10 +40,11 @@ from qlib.workflow import R
 from qlib.utils import flatten_dict


-def to_pos_drop(config, value):
+def to_drop(config, pos_drop, other_drop):
     config = copy.deepcopy(config)
     net = config["task"]["model"]["kwargs"]["net_config"]
-    net["pos_drop"] = value
+    net["pos_drop"] = pos_drop
+    net["other_drop"] = other_drop
     return config
@@ -59,11 +60,12 @@ def to_layer(config, embed_dim, depth):


 def extend_transformer_settings(alg2configs, name):
     config = copy.deepcopy(alg2configs[name])
     for i in range(1, 7):
-        for j in [6, 12, 24, 32, 48, 64]:
-            for k in [0, 0.1]:
-                alg2configs[name + "-{:}x{:}-d{:}".format(i, j, k)] = to_layer(
-                    to_pos_drop(config, k), j, i
-                )
+        for j in (6, 12, 24, 32, 48, 64):
+            for k1 in (0, 0.1, 0.2):
+                for k2 in (0, 0.1):
+                    alg2configs[
+                        name + "-{:}x{:}-drop{:}_{:}".format(i, j, k1, k2)
+                    ] = to_layer(to_drop(config, k1, k2), j, i)
     return alg2configs
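
For reference, the renamed settings now encode both dropout rates in the algorithm name. A minimal sketch of the names this loop generates, assuming the base name is "TSF" as used in the scripts below (the real loop also rewrites net_config through to_drop and to_layer):

# Illustration only: enumerate the algorithm names produced by
# extend_transformer_settings for a base name of "TSF".
names = []
for depth in range(1, 7):
    for dim in (6, 12, 24, 32, 48, 64):
        for pos_drop in (0, 0.1, 0.2):
            for other_drop in (0, 0.1):
                names.append(
                    "TSF" + "-{:}x{:}-drop{:}_{:}".format(depth, dim, pos_drop, other_drop)
                )
print(len(names))   # 6 * 6 * 3 * 2 = 216 configurations
print(names[0])     # TSF-1x6-drop0_0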

View File

@@ -107,7 +107,7 @@ def run_exp(
         model = R.load_object(model_obj_name)
         logger.info("[Find existing object from {:}]".format(model_obj_name))
     except OSError:
-        # R.log_params(**flatten_dict(task_config))
+        R.log_params(**flatten_dict(update_gpu(task_config, None)))
         if "save_path" in inspect.getfullargspec(model.fit).args:
             model_fit_kwargs["save_path"] = os.path.join(
                 recorder_root_dir, "model.ckp"
@@ -126,9 +126,6 @@ def run_exp(
         else:
             R.save_objects(**{model_obj_name: model})
     except Exception as e:
-        import pdb
-
-        pdb.set_trace()
         raise ValueError("Something wrong: {:}".format(e))
     # Get the recorder
     recorder = R.get_recorder()
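
The previously commented-out R.log_params call is restored, but the task config is first passed through update_gpu(task_config, None) so the logged parameters are not tied to a particular GPU. That helper is defined elsewhere in this file; a minimal sketch of what it plausibly does, under that assumption:

import copy

def update_gpu(config, gpu):
    # Assumed behaviour: return a copy of the task config whose model kwargs
    # carry the given GPU value (None here), leaving the original untouched.
    config = copy.deepcopy(config)
    if "GPU" in config["task"]["model"].get("kwargs", {}):
        config["task"]["model"]["kwargs"]["GPU"] = gpu
    return config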

View File

@@ -45,6 +45,32 @@ DEFAULT_OPT_CONFIG = dict(
 )


+def train_or_test_epoch(
+    xloader, model, loss_fn, metric_fn, is_train, optimizer, device
+):
+    if is_train:
+        model.train()
+    else:
+        model.eval()
+    score_meter, loss_meter = AverageMeter(), AverageMeter()
+    for ibatch, (feats, labels) in enumerate(xloader):
+        feats, labels = feats.to(device), labels.to(device)
+        # forward the network
+        preds = model(feats)
+        loss = loss_fn(preds, labels)
+        with torch.no_grad():
+            score = metric_fn(preds, labels)
+        loss_meter.update(loss.item(), feats.size(0))
+        score_meter.update(score.item(), feats.size(0))
+        # optimize the network
+        if is_train and optimizer is not None:
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
+            optimizer.step()
+    return loss_meter.avg, score_meter.avg
+
+
 class QuantTransformer(Model):
     """Transformer-based Quant Model"""
@@ -132,32 +158,6 @@ class QuantTransformer(Model):
         else:
             raise ValueError("unknown metric `{:}`".format(self.metric))

-    def train_or_test_epoch(
-        self, xloader, model, loss_fn, metric_fn, is_train, optimizer=None
-    ):
-        if is_train:
-            model.train()
-        else:
-            model.eval()
-        score_meter, loss_meter = AverageMeter(), AverageMeter()
-        for ibatch, (feats, labels) in enumerate(xloader):
-            feats = feats.to(self.device, non_blocking=True)
-            labels = labels.to(self.device, non_blocking=True)
-            # forward the network
-            preds = model(feats)
-            loss = loss_fn(preds, labels)
-            with torch.no_grad():
-                score = self.metric_fn(preds, labels)
-            loss_meter.update(loss.item(), feats.size(0))
-            score_meter.update(score.item(), feats.size(0))
-            # optimize the network
-            if is_train and optimizer is not None:
-                optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
-                optimizer.step()
-        return loss_meter.avg, score_meter.avg
-
     def fit(
         self,
         dataset: DatasetH,
@@ -204,14 +204,22 @@ class QuantTransformer(Model):

         def _internal_test(ckp_epoch=None, results_dict=None):
             with torch.no_grad():
-                train_loss, train_score = self.train_or_test_epoch(
-                    train_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                shared_kwards = {
+                    "model": self.model,
+                    "loss_fn": self.loss_fn,
+                    "metric_fn": self.metric_fn,
+                    "is_train": False,
+                    "optimizer": None,
+                    "device": self.device,
+                }
+                train_loss, train_score = train_or_test_epoch(
+                    train_loader, **shared_kwards
                 )
-                valid_loss, valid_score = self.train_or_test_epoch(
-                    valid_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                valid_loss, valid_score = train_or_test_epoch(
+                    valid_loader, **shared_kwards
                 )
-                test_loss, test_score = self.train_or_test_epoch(
-                    test_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                test_loss, test_score = train_or_test_epoch(
+                    test_loader, **shared_kwards
                 )
                 xstr = (
                     "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}".format(
@@ -255,13 +263,14 @@ class QuantTransformer(Model):
                     iepoch, self.opt_config["epochs"], best_epoch, best_score
                 )
             )
-            train_loss, train_score = self.train_or_test_epoch(
+            train_loss, train_score = train_or_test_epoch(
                 train_loader,
                 self.model,
                 self.loss_fn,
                 self.metric_fn,
                 True,
                 self.train_optimizer,
+                self.device,
             )
             self.logger.info(
                 "Training :: loss={:.6f}, score={:.6f}".format(train_loss, train_score)
@@ -307,7 +316,8 @@ class QuantTransformer(Model):
             self.logger.info("Reload the best parameter :: {:}".format(eval_str))

         if self.use_gpu:
-            torch.cuda.empty_cache()
+            with torch.cuda.device(self.device):
+                torch.cuda.empty_cache()
         self.fitted = True

     def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
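
torch.cuda.empty_cache() only releases the caching allocator of the current CUDA device, so wrapping it in torch.cuda.device(self.device) makes the release target the device the model actually trained on rather than the default cuda:0. A standalone sketch of the same pattern (the helper name and device id are illustrative):

import torch

def release_cuda_cache(device):
    # empty_cache() acts on the *current* device, so switch context first.
    if torch.cuda.is_available():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()

# release_cuda_cache(torch.device("cuda:1"))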

View File

@@ -30,11 +30,14 @@ class SuperLinear(SuperModule):
         self._out_features = out_features
         self._bias = bias
         # weights to be optimized
-        self._super_weight = torch.nn.Parameter(
-            torch.Tensor(self.out_features, self.in_features)
+        self.register_parameter(
+            "_super_weight",
+            torch.nn.Parameter(torch.Tensor(self.out_features, self.in_features)),
         )
         if self.bias:
-            self._super_bias = torch.nn.Parameter(torch.Tensor(self.out_features))
+            self.register_parameter(
+                "_super_bias", torch.nn.Parameter(torch.Tensor(self.out_features))
+            )
         else:
             self.register_parameter("_super_bias", None)
         self.reset_parameters()
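
Assigning an nn.Parameter attribute and calling register_parameter both make the tensor visible to module.parameters(); the explicit form matches the existing register_parameter("_super_bias", None) branch and keeps the two code paths symmetric. A small illustrative module (the class name is made up):

import torch
import torch.nn as nn

class TinyLinear(nn.Module):
    def __init__(self, dim, bias=True):
        super().__init__()
        # Explicit registration, mirroring the SuperLinear change above.
        self.register_parameter("weight", nn.Parameter(torch.zeros(dim, dim)))
        if bias:
            self.register_parameter("bias", nn.Parameter(torch.zeros(dim)))
        else:
            self.register_parameter("bias", None)  # registered, but skipped by parameters()

print([name for name, _ in TinyLinear(4).named_parameters()])  # ['weight', 'bias']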

View File

@@ -25,8 +25,8 @@ class SuperLayerNorm1D(SuperModule):
         self._eps = eps
         self._elementwise_affine = elementwise_affine
         if self._elementwise_affine:
-            self.weight = nn.Parameter(torch.Tensor(self.in_dim))
-            self.bias = nn.Parameter(torch.Tensor(self.in_dim))
+            self.register_parameter("weight", nn.Parameter(torch.Tensor(self.in_dim)))
+            self.register_parameter("bias", nn.Parameter(torch.Tensor(self.in_dim)))
         else:
             self.register_parameter("weight", None)
             self.register_parameter("bias", None)

View File

@@ -1,7 +1,7 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf-all.sh 0 csi300 0
-# bash scripts/trade/tsf-all.sh 0 csi300 0.1
+# bash scripts/trade/tsf-all.sh 0 csi300 0_0
+# bash scripts/trade/tsf-all.sh 0 csi300 0.1_0
 # bash scripts/trade/tsf-all.sh 1 all
 #
 set -e
@@ -24,6 +24,6 @@ for channel in ${channels}
 do
   for depth in ${depths}
   do
-    python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+    python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}
   done
 done

View File

@@ -1,9 +1,9 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf.sh 0 csi300 3 0
-# bash scripts/trade/tsf.sh 0 csi300 3 0.1
-# bash scripts/trade/tsf.sh 1 csi100 3
-# bash scripts/trade/tsf.sh 1 all 3
+# bash scripts/trade/tsf.sh 0 csi300 3 0_0
+# bash scripts/trade/tsf.sh 0 csi300 3 0.1_0
+# bash scripts/trade/tsf.sh 1 csi100 3 0.2_0
+# bash scripts/trade/tsf.sh 1 all 3 0.1_0
 #
 set -e
 echo script name: $0
@@ -24,6 +24,6 @@ channels="6 12 24 32 48 64"
 for channel in ${channels}
 do
-  python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+  python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}
 done