Fix CUDA memory issues
@@ -15,8 +15,8 @@
 # python exps/trading/baselines.py --alg TabNet     #
 #                                                   #
 # python exps/trading/baselines.py --alg Transformer#
-# python exps/trading/baselines.py --alg TSF        #
-# python exps/trading/baselines.py --alg TSF-4x64-d0
+# python exps/trading/baselines.py --alg TSF
+# python exps/trading/baselines.py --alg TSF-4x64-drop0_0
 #####################################################
 import sys
 import copy
@@ -40,10 +40,11 @@ from qlib.workflow import R
 from qlib.utils import flatten_dict


-def to_pos_drop(config, value):
+def to_drop(config, pos_drop, other_drop):
     config = copy.deepcopy(config)
     net = config["task"]["model"]["kwargs"]["net_config"]
-    net["pos_drop"] = value
+    net["pos_drop"] = pos_drop
+    net["other_drop"] = other_drop
     return config

@@ -59,11 +60,12 @@ def to_layer(config, embed_dim, depth):
 def extend_transformer_settings(alg2configs, name):
     config = copy.deepcopy(alg2configs[name])
     for i in range(1, 7):
-        for j in [6, 12, 24, 32, 48, 64]:
-            for k in [0, 0.1]:
-                alg2configs[name + "-{:}x{:}-d{:}".format(i, j, k)] = to_layer(
-                    to_pos_drop(config, k), j, i
-                )
+        for j in (6, 12, 24, 32, 48, 64):
+            for k1 in (0, 0.1, 0.2):
+                for k2 in (0, 0.1):
+                    alg2configs[
+                        name + "-{:}x{:}-drop{:}_{:}".format(i, j, k1, k2)
+                    ] = to_layer(to_drop(config, k1, k2), j, i)
     return alg2configs

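For scale: the widened grid above enumerates 6 depths x 6 embedding widths x 3 positional-dropout rates x 2 other-dropout rates = 216 named configurations. A minimal standalone sketch of the naming scheme it produces (illustrative, not part of the commit):

# Mirrors the loop above: every combination gets a unique algorithm name.
for depth in range(1, 7):
    for dim in (6, 12, 24, 32, 48, 64):
        for pos_drop in (0, 0.1, 0.2):
            for other_drop in (0, 0.1):
                name = "TSF-{:}x{:}-drop{:}_{:}".format(depth, dim, pos_drop, other_drop)
                # e.g. "TSF-4x64-drop0_0", as in the header comment above
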
@@ -107,7 +107,7 @@ def run_exp(
                 model = R.load_object(model_obj_name)
             logger.info("[Find existing object from {:}]".format(model_obj_name))
         except OSError:
-            # R.log_params(**flatten_dict(task_config))
+            R.log_params(**flatten_dict(update_gpu(task_config, None)))
             if "save_path" in inspect.getfullargspec(model.fit).args:
                 model_fit_kwargs["save_path"] = os.path.join(
                     recorder_root_dir, "model.ckp"
@@ -126,9 +126,6 @@ def run_exp(
             else:
                 R.save_objects(**{model_obj_name: model})
         except Exception as e:
-            import pdb
-
-            pdb.set_trace()
             raise ValueError("Something wrong: {:}".format(e))
         # Get the recorder
         recorder = R.get_recorder()

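The re-enabled R.log_params(**flatten_dict(...)) call above expands a nested task config into flat key/value pairs before logging. A rough stand-in for that flattening, assuming the usual dotted-key convention; flatten_dict_like is a hypothetical helper, not qlib's actual implementation:

def flatten_dict_like(d, parent_key="", sep="."):
    # Turn {"task": {"model": {"class": "QuantTransformer"}}} into
    # {"task.model.class": "QuantTransformer"} so each leaf logs as one param.
    items = {}
    for key, value in d.items():
        full_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            items.update(flatten_dict_like(value, full_key, sep))
        else:
            items[full_key] = value
    return items
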
@@ -45,6 +45,32 @@ DEFAULT_OPT_CONFIG = dict(
 )


+def train_or_test_epoch(
+    xloader, model, loss_fn, metric_fn, is_train, optimizer, device
+):
+    if is_train:
+        model.train()
+    else:
+        model.eval()
+    score_meter, loss_meter = AverageMeter(), AverageMeter()
+    for ibatch, (feats, labels) in enumerate(xloader):
+        feats, labels = feats.to(device), labels.to(device)
+        # forward the network
+        preds = model(feats)
+        loss = loss_fn(preds, labels)
+        with torch.no_grad():
+            score = metric_fn(preds, labels)
+            loss_meter.update(loss.item(), feats.size(0))
+            score_meter.update(score.item(), feats.size(0))
+        # optimize the network
+        if is_train and optimizer is not None:
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
+            optimizer.step()
+    return loss_meter.avg, score_meter.avg
+
+
 class QuantTransformer(Model):
     """Transformer-based Quant Model"""

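Usage of the hoisted helper, as a hedged sketch: the loaders, model, loss/metric functions, and optimizer below are placeholders assumed to exist.

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One optimization pass over the training split ...
train_loss, train_score = train_or_test_epoch(
    train_loader, model, loss_fn, metric_fn, True, optimizer, device
)
# ... and a gradient-free evaluation pass over the validation split.
with torch.no_grad():
    valid_loss, valid_score = train_or_test_epoch(
        valid_loader, model, loss_fn, metric_fn, False, None, device
    )
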
@@ -132,32 +158,6 @@ class QuantTransformer(Model):
         else:
             raise ValueError("unknown metric `{:}`".format(self.metric))

-    def train_or_test_epoch(
-        self, xloader, model, loss_fn, metric_fn, is_train, optimizer=None
-    ):
-        if is_train:
-            model.train()
-        else:
-            model.eval()
-        score_meter, loss_meter = AverageMeter(), AverageMeter()
-        for ibatch, (feats, labels) in enumerate(xloader):
-            feats = feats.to(self.device, non_blocking=True)
-            labels = labels.to(self.device, non_blocking=True)
-            # forward the network
-            preds = model(feats)
-            loss = loss_fn(preds, labels)
-            with torch.no_grad():
-                score = self.metric_fn(preds, labels)
-                loss_meter.update(loss.item(), feats.size(0))
-                score_meter.update(score.item(), feats.size(0))
-            # optimize the network
-            if is_train and optimizer is not None:
-                optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_value_(model.parameters(), 3.0)
-                optimizer.step()
-        return loss_meter.avg, score_meter.avg
-
     def fit(
         self,
         dataset: DatasetH,
@@ -204,14 +204,22 @@ class QuantTransformer(Model):

         def _internal_test(ckp_epoch=None, results_dict=None):
             with torch.no_grad():
-                train_loss, train_score = self.train_or_test_epoch(
-                    train_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                shared_kwards = {
+                    "model": self.model,
+                    "loss_fn": self.loss_fn,
+                    "metric_fn": self.metric_fn,
+                    "is_train": False,
+                    "optimizer": None,
+                    "device": self.device,
+                }
+                train_loss, train_score = train_or_test_epoch(
+                    train_loader, **shared_kwards
                 )
-                valid_loss, valid_score = self.train_or_test_epoch(
-                    valid_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                valid_loss, valid_score = train_or_test_epoch(
+                    valid_loader, **shared_kwards
                 )
-                test_loss, test_score = self.train_or_test_epoch(
-                    test_loader, self.model, self.loss_fn, self.metric_fn, False, None
+                test_loss, test_score = train_or_test_epoch(
+                    test_loader, **shared_kwards
                 )
                 xstr = (
                     "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}".format(
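The shared_kwards dict above packages the arguments common to all three evaluation calls so each call only names its loader. The same pattern in isolation, with placeholder names:

# Build the common arguments once, then unpack with ** for each split.
shared = dict(model=model, loss_fn=loss_fn, metric_fn=metric_fn,
              is_train=False, optimizer=None, device=device)
splits = {"train": train_loader, "valid": valid_loader, "test": test_loader}
results = {name: train_or_test_epoch(loader, **shared)
           for name, loader in splits.items()}
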
@@ -255,13 +263,14 @@ class QuantTransformer(Model):
                     iepoch, self.opt_config["epochs"], best_epoch, best_score
                 )
             )
-            train_loss, train_score = self.train_or_test_epoch(
+            train_loss, train_score = train_or_test_epoch(
                 train_loader,
                 self.model,
                 self.loss_fn,
                 self.metric_fn,
                 True,
                 self.train_optimizer,
+                self.device,
             )
             self.logger.info(
                 "Training :: loss={:.6f}, score={:.6f}".format(train_loss, train_score)
@@ -307,7 +316,8 @@ class QuantTransformer(Model):
         self.logger.info("Reload the best parameter :: {:}".format(eval_str))

         if self.use_gpu:
-            torch.cuda.empty_cache()
+            with torch.cuda.device(self.device):
+                torch.cuda.empty_cache()
         self.fitted = True

     def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):

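torch.cuda.empty_cache() only releases the caching allocator's unused blocks on the *current* CUDA device, so when the model trained on a non-default GPU the old call could leave that GPU's cache untouched. Scoping the call as above targets the right device. A minimal standalone sketch (the function name is illustrative):

import torch

def release_cached_memory(device: torch.device) -> None:
    # Switch the current CUDA device before emptying the cache, so the
    # blocks freed belong to the device the model actually ran on.
    if torch.cuda.is_available():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()

release_cached_memory(torch.device("cuda:1"))  # frees GPU 1 even if GPU 0 is current
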
@@ -30,11 +30,14 @@ class SuperLinear(SuperModule):
         self._out_features = out_features
         self._bias = bias
         # weights to be optimized
-        self._super_weight = torch.nn.Parameter(
-            torch.Tensor(self.out_features, self.in_features)
-        )
+        self.register_parameter(
+            "_super_weight",
+            torch.nn.Parameter(torch.Tensor(self.out_features, self.in_features)),
+        )
         if self.bias:
-            self._super_bias = torch.nn.Parameter(torch.Tensor(self.out_features))
+            self.register_parameter(
+                "_super_bias", torch.nn.Parameter(torch.Tensor(self.out_features))
+            )
         else:
             self.register_parameter("_super_bias", None)
         self.reset_parameters()

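Plain attribute assignment of an nn.Parameter and register_parameter both register the tensor with the module; the explicit form used here mirrors the existing register_parameter("_super_bias", None) branch, which is the only way to declare an optional, absent parameter. A toy illustration (TinyAffine is hypothetical, not from the repo):

import torch
import torch.nn as nn

class TinyAffine(nn.Module):
    def __init__(self, dim: int, bias: bool = True):
        super().__init__()
        # Explicit registration; equivalent to `self.weight = nn.Parameter(...)`.
        self.register_parameter("weight", nn.Parameter(torch.empty(dim)))
        if bias:
            self.register_parameter("bias", nn.Parameter(torch.empty(dim)))
        else:
            # None keeps "bias" a declared-but-absent parameter slot.
            self.register_parameter("bias", None)

print(list(dict(TinyAffine(4, bias=False).named_parameters())))  # ['weight']
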
@@ -25,8 +25,8 @@ class SuperLayerNorm1D(SuperModule):
         self._eps = eps
         self._elementwise_affine = elementwise_affine
         if self._elementwise_affine:
-            self.weight = nn.Parameter(torch.Tensor(self.in_dim))
-            self.bias = nn.Parameter(torch.Tensor(self.in_dim))
+            self.register_parameter("weight", nn.Parameter(torch.Tensor(self.in_dim)))
+            self.register_parameter("bias", nn.Parameter(torch.Tensor(self.in_dim)))
         else:
             self.register_parameter("weight", None)
             self.register_parameter("bias", None)

@@ -1,7 +1,7 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf-all.sh 0 csi300 0
-# bash scripts/trade/tsf-all.sh 0 csi300 0.1
+# bash scripts/trade/tsf-all.sh 0 csi300 0_0
+# bash scripts/trade/tsf-all.sh 0 csi300 0.1_0
 # bash scripts/trade/tsf-all.sh 1 all
 #
 set -e
@@ -24,6 +24,6 @@ for channel in ${channels}
 do
   for depth in ${depths}
   do
-    python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+    python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}
   done
 done

@@ -1,9 +1,9 @@
 #!/bin/bash
 #
-# bash scripts/trade/tsf.sh 0 csi300 3 0
-# bash scripts/trade/tsf.sh 0 csi300 3 0.1
-# bash scripts/trade/tsf.sh 1 csi100 3
-# bash scripts/trade/tsf.sh 1 all    3
+# bash scripts/trade/tsf.sh 0 csi300 3 0_0
+# bash scripts/trade/tsf.sh 0 csi300 3 0.1_0
+# bash scripts/trade/tsf.sh 1 csi100 3 0.2_0
+# bash scripts/trade/tsf.sh 1 all    3 0.1_0
 #
 set -e
 echo script name: $0
@@ -24,6 +24,6 @@ channels="6 12 24 32 48 64"
 for channel in ${channels}
 do

-  python exps/trading/baselines.py --alg TSF-${depth}x${channel}-d${drop} --gpu ${gpu} --market ${market}
+  python exps/trading/baselines.py --alg TSF-${depth}x${channel}-drop${drop} --gpu ${gpu} --market ${market}

 done