Fix CUDA memory issues
@@ -15,8 +15,8 @@
 # python exps/trading/baselines.py --alg TabNet     #
 #                                                   #
 # python exps/trading/baselines.py --alg Transformer#
-# python exps/trading/baselines.py --alg TSF        #
-# python exps/trading/baselines.py --alg TSF-4x64-d0
+# python exps/trading/baselines.py --alg TSF
+# python exps/trading/baselines.py --alg TSF-4x64-drop0_0
 #####################################################
 import sys
 import copy
@@ -40,10 +40,11 @@ from qlib.workflow import R
 from qlib.utils import flatten_dict


-def to_pos_drop(config, value):
+def to_drop(config, pos_drop, other_drop):
     config = copy.deepcopy(config)
     net = config["task"]["model"]["kwargs"]["net_config"]
-    net["pos_drop"] = value
+    net["pos_drop"] = pos_drop
+    net["other_drop"] = other_drop
     return config

@@ -59,11 +60,12 @@ def to_layer(config, embed_dim, depth):
 def extend_transformer_settings(alg2configs, name):
     config = copy.deepcopy(alg2configs[name])
     for i in range(1, 7):
-        for j in [6, 12, 24, 32, 48, 64]:
-            for k in [0, 0.1]:
-                alg2configs[name + "-{:}x{:}-d{:}".format(i, j, k)] = to_layer(
-                    to_pos_drop(config, k), j, i
-                )
+        for j in (6, 12, 24, 32, 48, 64):
+            for k1 in (0, 0.1, 0.2):
+                for k2 in (0, 0.1):
+                    alg2configs[
+                        name + "-{:}x{:}-drop{:}_{:}".format(i, j, k1, k2)
+                    ] = to_layer(to_drop(config, k1, k2), j, i)
     return alg2configs

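As a side note, the nested loops above register one config per (depth, embed_dim, pos_drop, other_drop) combination. A small standalone sketch of the names this grid produces (illustrative only; the names list and the literal "TSF" prefix are not part of the commit):

names = [
    "TSF-{:}x{:}-drop{:}_{:}".format(depth, dim, pos_drop, other_drop)
    for depth in range(1, 7)
    for dim in (6, 12, 24, 32, 48, 64)
    for pos_drop in (0, 0.1, 0.2)
    for other_drop in (0, 0.1)
]
print(len(names))  # 6 depths x 6 dims x 3 pos_drop values x 2 other_drop values = 216
print(names[0], names[-1])  # "TSF-1x6-drop0_0" ... "TSF-6x64-drop0.2_0.1"
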
@@ -107,7 +107,7 @@ def run_exp(
                 model = R.load_object(model_obj_name)
             logger.info("[Find existing object from {:}]".format(model_obj_name))
         except OSError:
-            # R.log_params(**flatten_dict(task_config))
+            R.log_params(**flatten_dict(update_gpu(task_config, None)))
             if "save_path" in inspect.getfullargspec(model.fit).args:
                 model_fit_kwargs["save_path"] = os.path.join(
                     recorder_root_dir, "model.ckp"
@@ -126,9 +126,6 @@ def run_exp(
             else:
                 R.save_objects(**{model_obj_name: model})
         except Exception as e:
-            import pdb
-
-            pdb.set_trace()
             raise ValueError("Something wrong: {:}".format(e))
         # Get the recorder
         recorder = R.get_recorder()

@@ -45,6 +45,32 @@ DEFAULT_OPT_CONFIG = dict(
 )


+def train_or_test_epoch(
+    xloader, model, loss_fn, metric_fn, is_train, optimizer, device
+):
+    if is_train:
+        model.train()
+    else:
+        model.eval()
+    score_meter, loss_meter = AverageMeter(), AverageMeter()
+    for ibatch, (feats, labels) in enumerate(xloader):
+        feats, labels = feats.to(device), labels.to(device)
+        # forward the network
+        preds = model(feats)
+        loss = loss_fn(preds, labels)
+        with torch.no_grad():
+            score = metric_fn(preds, labels)
+            loss_meter.update(loss.item(), feats.size(0))
+            score_meter.update(score.item(), feats.size(0))
+        # optimize the network
+        if is_train and optimizer is not None:
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
+            optimizer.step()
+    return loss_meter.avg, score_meter.avg
+
+
 class QuantTransformer(Model):
     """Transformer-based Quant Model"""

@@ -132,32 +158,6 @@ class QuantTransformer(Model):
         else:
             raise ValueError("unknown metric `{:}`".format(self.metric))

-    def train_or_test_epoch(
-        self, xloader, model, loss_fn, metric_fn, is_train, optimizer=None
-    ):
-        if is_train:
-            model.train()
-        else:
-            model.eval()
-        score_meter, loss_meter = AverageMeter(), AverageMeter()
-        for ibatch, (feats, labels) in enumerate(xloader):
-            feats = feats.to(self.device, non_blocking=True)
-            labels = labels.to(self.device, non_blocking=True)
-            # forward the network
-            preds = model(feats)
-            loss = loss_fn(preds, labels)
-            with torch.no_grad():
-                score = self.metric_fn(preds, labels)
-                loss_meter.update(loss.item(), feats.size(0))
-                score_meter.update(score.item(), feats.size(0))
-            # optimize the network
-            if is_train and optimizer is not None:
-                optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
-                optimizer.step()
-        return loss_meter.avg, score_meter.avg
-
     def fit(
         self,
         dataset: DatasetH,
@@ -204,14 +204,22 @@ class QuantTransformer(Model):

         def _internal_test(ckp_epoch=None, results_dict=None):
             with torch.no_grad():
-                train_loss, train_score = self.train_or_test_epoch(
-                    train_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                shared_kwards = {
+                    "model": self.model,
+                    "loss_fn": self.loss_fn,
+                    "metric_fn": self.metric_fn,
+                    "is_train": False,
+                    "optimizer": None,
+                    "device": self.device,
+                }
+                train_loss, train_score = train_or_test_epoch(
+                    train_loader, **shared_kwards
                 )
-                valid_loss, valid_score = self.train_or_test_epoch(
-                    valid_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                valid_loss, valid_score = train_or_test_epoch(
+                    valid_loader, **shared_kwards
                 )
-                test_loss, test_score = self.train_or_test_epoch(
-                    test_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                test_loss, test_score = train_or_test_epoch(
+                    test_loader, **shared_kwards
                 )
                 xstr = (
                     "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}".format(
@@ -255,13 +263,14 @@ class QuantTransformer(Model):
                     iepoch, self.opt_config["epochs"], best_epoch, best_score
                 )
             )
-            train_loss, train_score = self.train_or_test_epoch(
+            train_loss, train_score = train_or_test_epoch(
                 train_loader,
                 self.model,
                 self.loss_fn,
                 self.metric_fn,
                 True,
                 self.train_optimizer,
+                self.device,
             )
             self.logger.info(
                 "Training :: loss={:.6f}, score={:.6f}".format(train_loss, train_score)
@@ -307,7 +316,8 @@ class QuantTransformer(Model):
         self.logger.info("Reload the best parameter :: {:}".format(eval_str))

         if self.use_gpu:
-            torch.cuda.empty_cache()
+            with torch.cuda.device(self.device):
+                torch.cuda.empty_cache()
         self.fitted = True

     def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):

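A minimal sketch of the cache-release pattern the hunk above introduces (my reading of the fix, not stated in the commit message): scoping torch.cuda.empty_cache() with torch.cuda.device(...) makes the call run against the model's own GPU rather than the process's current/default device, which could otherwise end up touching or lazily initializing a context on cuda:0 while training happens on another card. The helper name release_cuda_cache is hypothetical and not part of the repo.

import torch

def release_cuda_cache(device):
    # Select the target device before emptying the caching allocator so the
    # call does not implicitly switch to the default GPU.
    if torch.cuda.is_available():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()

# usage sketch: release_cuda_cache(torch.device("cuda:1"))
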
@@ -30,11 +30,14 @@ class SuperLinear(SuperModule):
         self._out_features = out_features
         self._bias = bias
         # weights to be optimized
-        self._super_weight = torch.nn.Parameter(
-            torch.Tensor(self.out_features, self.in_features)
+        self.register_parameter(
+            "_super_weight",
+            torch.nn.Parameter(torch.Tensor(self.out_features, self.in_features)),
         )
         if self.bias:
-            self._super_bias = torch.nn.Parameter(torch.Tensor(self.out_features))
+            self.register_parameter(
+                "_super_bias", torch.nn.Parameter(torch.Tensor(self.out_features))
+            )
         else:
             self.register_parameter("_super_bias", None)
         self.reset_parameters()

@@ -25,8 +25,8 @@ class SuperLayerNorm1D(SuperModule):
         self._eps = eps
         self._elementwise_affine = elementwise_affine
         if self._elementwise_affine:
-            self.weight = nn.Parameter(torch.Tensor(self.in_dim))
-            self.bias = nn.Parameter(torch.Tensor(self.in_dim))
+            self.register_parameter("weight", nn.Parameter(torch.Tensor(self.in_dim)))
+            self.register_parameter("bias", nn.Parameter(torch.Tensor(self.in_dim)))
         else:
             self.register_parameter("weight", None)
             self.register_parameter("bias", None)

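For context on the register_parameter changes in the two hunks above: assigning an nn.Parameter attribute and calling register_parameter are equivalent ways to register a learnable tensor, but register_parameter also accepts None, which keeps the affine and non-affine branches symmetric. A small self-contained sketch (TinyAffine is a hypothetical module, not from this repo):

import torch
import torch.nn as nn

class TinyAffine(nn.Module):
    # Hypothetical example mirroring the registration style used above.
    def __init__(self, dim, elementwise_affine=True):
        super().__init__()
        if elementwise_affine:
            self.register_parameter("weight", nn.Parameter(torch.ones(dim)))
            self.register_parameter("bias", nn.Parameter(torch.zeros(dim)))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    def forward(self, x):
        if self.weight is None:
            return x
        return x * self.weight + self.bias
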
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf-all.sh 0 csi300 0
-# bash scripts/trade/tsf-all.sh 0 csi300 0.1
+# bash scripts/trade/tsf-all.sh 0 csi300 0_0
+# bash scripts/trade/tsf-all.sh 0 csi300 0.1_0
 # bash scripts/trade/tsf-all.sh 1 all
 #
 set -e
@@ -24,6 +24,6 @@ for channel in ${channels}
 do
   for depth in ${depths}
   do
-    python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+    python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}
   done
 done

@@ -1,9 +1,9 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf.sh 0 csi300 3 0
-# bash scripts/trade/tsf.sh 0 csi300 3 0.1
-# bash scripts/trade/tsf.sh 1 csi100 3
-# bash scripts/trade/tsf.sh 1 all    3
+# bash scripts/trade/tsf.sh 0 csi300 3 0_0
+# bash scripts/trade/tsf.sh 0 csi300 3 0.1_0
+# bash scripts/trade/tsf.sh 1 csi100 3 0.2_0
+# bash scripts/trade/tsf.sh 1 all    3 0.1_0
 #
 set -e
 echo script name: $0
@@ -24,6 +24,6 @@ channels="6 12 24 32 48 64"
 for channel in ${channels}
 do

-  python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+  python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}

 done