Update xmisc

commit 248686820c (parent 98f981dd45)
@@ -17,6 +17,6 @@ kwargs:
 module_path: torchvision.transforms
 args: []
 kwargs:
-  mean: (0.491, 0.482, 0.447)
-  std: (0.247, 0.244, 0.262)
+  mean: [0.491, 0.482, 0.447]
+  std: [0.247, 0.244, 0.262]
 kwargs: {}
@@ -25,6 +25,6 @@ kwargs:
 module_path: torchvision.transforms
 args: []
 kwargs:
-  mean: (0.491, 0.482, 0.447)
-  std: (0.247, 0.244, 0.262)
+  mean: [0.491, 0.482, 0.447]
+  std: [0.247, 0.244, 0.262]
 kwargs: {}
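A note on the two hunks above: YAML has no tuple literal, so "(0.491, 0.482, 0.447)" parses as a plain string, while "[0.491, 0.482, 0.447]" parses as a list of floats, which is what torchvision.transforms.Normalize expects. A minimal sketch with PyYAML (the key names are illustrative):

import yaml

doc = """
mean_tuple_style: (0.491, 0.482, 0.447)
mean_list_style: [0.491, 0.482, 0.447]
"""
data = yaml.safe_load(doc)
print(type(data["mean_tuple_style"]))  # <class 'str'>  -- YAML has no tuple literal
print(type(data["mean_list_style"]))   # <class 'list'> -- a real sequence of floats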
@@ -58,6 +58,7 @@ def main(args):
         pin_memory=True,
         drop_last=False,
     )
+    iters_per_epoch = len(train_data) // args.batch_size

    logger.log("The training loader: {:}".format(train_loader))
    logger.log("The validation loader: {:}".format(valid_loader))
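Since the new iters_per_epoch uses floor division while the loader above is built with drop_last=False, the two counts can differ by one batch; a quick check of that relationship (the sample numbers are illustrative):

import math

num_samples, batch_size = 50000, 256
iters_per_epoch = num_samples // batch_size       # 195: floor division, as in the hunk
loader_len = math.ceil(num_samples / batch_size)  # 196: drop_last=False keeps the tail batch
print(iters_per_epoch, loader_len)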
@@ -67,159 +68,44 @@ def main(args):
         lr=args.lr,
         weight_decay=args.weight_decay,
     )
-    loss = xmisc.nested_call_by_yaml(args.loss_config)
+    objective = xmisc.nested_call_by_yaml(args.loss_config)

    logger.log("The optimizer is:\n{:}".format(optimizer))
-    logger.log("The loss is {:}".format(loss))
+    logger.log("The objective is {:}".format(objective))
+    logger.log("The iters_per_epoch={:}".format(iters_per_epoch))

-    model, loss = torch.nn.DataParallel(model).cuda(), loss.cuda()
+    model, objective = torch.nn.DataParallel(model).cuda(), objective.cuda()
     scheduler = xmisc.LRMultiplier(
         optimizer, xmisc.get_scheduler(args.scheduler, args.lr), args.steps
     )

-    import pdb
-    pdb.set_trace()
-
-    train_func, valid_func = get_procedures(args.procedure)
-
-    total_epoch = optim_config.epochs + optim_config.warmup
-    # Main Training and Evaluation Loop
-    start_time = time.time()
-    epoch_time = AverageMeter()
-    for epoch in range(start_epoch, total_epoch):
-        scheduler.update(epoch, 0.0)
+    start_time, iter_time = time.time(), xmisc.AverageMeter()
+    for xiter, data in enumerate(train_loader):
         need_time = "Time Left: {:}".format(
-            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True)
-        )
-        epoch_str = "epoch={:03d}/{:03d}".format(epoch, total_epoch)
-        LRs = scheduler.get_lr()
-        find_best = False
-        # set-up drop-out ratio
-        if hasattr(base_model, "update_drop_path"):
-            base_model.update_drop_path(
-                model_config.drop_path_prob * epoch / total_epoch
-            )
-        logger.log(
-            "\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}".format(
-                time_string(), epoch_str, need_time, min(LRs), max(LRs), scheduler
+            xmisc.time_utils.convert_secs2time(
+                iter_time.avg * (len(train_loader) - xiter), True
             )
         )
+        iter_str = "{:6d}/{:06d}".format(xiter, len(train_loader))

-        # train for one epoch
-        train_loss, train_acc1, train_acc5 = train_func(
-            train_loader,
-            network,
-            criterion,
-            scheduler,
-            optimizer,
-            optim_config,
-            epoch_str,
-            args.print_freq,
-            logger,
-        )
-        # log the results
-        logger.log(
-            "***{:s}*** TRAIN [{:}] loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}".format(
-                time_string(), epoch_str, train_loss, train_acc1, train_acc5
-            )
-        )
+        inputs, targets = data
+        targets = targets.cuda(non_blocking=True)
+        model.train()

-        # evaluate the performance
-        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
-            logger.log("-" * 150)
-            valid_loss, valid_acc1, valid_acc5 = valid_func(
-                valid_loader,
-                network,
-                criterion,
-                optim_config,
-                epoch_str,
-                args.print_freq_eval,
-                logger,
-            )
-            valid_accuracies[epoch] = valid_acc1
-            logger.log(
-                "***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}".format(
-                    time_string(),
-                    epoch_str,
-                    valid_loss,
-                    valid_acc1,
-                    valid_acc5,
-                    valid_accuracies["best"],
-                    100 - valid_accuracies["best"],
-                )
-            )
-            if valid_acc1 > valid_accuracies["best"]:
-                valid_accuracies["best"] = valid_acc1
-                find_best = True
-                logger.log(
-                    "Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}.".format(
-                        epoch,
-                        valid_acc1,
-                        valid_acc5,
-                        100 - valid_acc1,
-                        100 - valid_acc5,
-                        model_best_path,
-                    )
-                )
-            num_bytes = (
-                torch.cuda.max_memory_cached(next(network.parameters()).device) * 1.0
-            )
-            logger.log(
-                "[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]".format(
-                    next(network.parameters()).device,
-                    int(num_bytes),
-                    num_bytes / 1e3,
-                    num_bytes / 1e6,
-                    num_bytes / 1e9,
-                )
-            )
-            max_bytes[epoch] = num_bytes
-            if epoch % 10 == 0:
-                torch.cuda.empty_cache()
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = objective(outputs, targets)

-        # save checkpoint
-        save_path = save_checkpoint(
-            {
-                "epoch": epoch,
-                "args": deepcopy(args),
-                "max_bytes": deepcopy(max_bytes),
-                "FLOP": flop,
-                "PARAM": param,
-                "valid_accuracies": deepcopy(valid_accuracies),
-                "model-config": model_config._asdict(),
-                "optim-config": optim_config._asdict(),
-                "base-model": base_model.state_dict(),
-                "scheduler": scheduler.state_dict(),
-                "optimizer": optimizer.state_dict(),
-            },
-            model_base_path,
-            logger,
-        )
-        if find_best:
-            copy_checkpoint(model_base_path, model_best_path, logger)
-        last_info = save_checkpoint(
-            {
-                "epoch": epoch,
-                "args": deepcopy(args),
-                "last_checkpoint": save_path,
-            },
-            logger.path("info"),
-            logger,
-        )
+        loss.backward()
+        optimizer.step()
+        scheduler.step()
+        if xiter % iters_per_epoch == 0:
+            logger.log("TRAIN [{:}] loss = {:.6f}".format(iter_str, loss.item()))

        # measure elapsed time
-        epoch_time.update(time.time() - start_time)
+        iter_time.update(time.time() - start_time)
         start_time = time.time()

-    logger.log("\n" + "-" * 200)
-    logger.log(
-        "Finish training/validation in {:} with Max-GPU-Memory of {:.2f} MB, and save final checkpoint into {:}".format(
-            convert_secs2time(epoch_time.sum, True),
-            max(v for k, v in max_bytes.items()) / 1e6,
-            logger.path("info"),
-        )
-    )
     logger.log("-" * 200 + "\n")
     logger.close()
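The hunk above replaces the epoch-driven train/valid loop with a single step-driven loop: per-iteration timing through an AverageMeter, a plain forward/backward/step, and one log line every iters_per_epoch steps. A self-contained sketch of that pattern with stand-in model and data (not the repo's code; the repo uses xmisc.AverageMeter and its logger instead of print):

import time
import torch

model = torch.nn.Linear(8, 2)
objective = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loader = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(20)]
iters_per_epoch = 5

start_time, iter_sum, iter_cnt = time.time(), 0.0, 0
for xiter, (inputs, targets) in enumerate(loader):
    model.train()
    optimizer.zero_grad()
    loss = objective(model(inputs), targets)
    loss.backward()
    optimizer.step()
    # per-iteration timing, mirroring iter_time.update(...) in the hunk
    iter_sum += time.time() - start_time
    iter_cnt += 1
    start_time = time.time()
    if xiter % iters_per_epoch == 0:
        print("TRAIN [{:6d}/{:06d}] loss = {:.6f}".format(xiter, len(loader), loss.item()))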
@@ -249,7 +135,7 @@ if __name__ == "__main__":
     parser.add_argument("--weight_decay", type=float, help="The weight decay")
     parser.add_argument("--scheduler", type=str, help="The scheduler indicator.")
     parser.add_argument("--steps", type=int, help="The total number of steps.")
-    parser.add_argument("--batch_size", type=int, default=2, help="The batch size.")
+    parser.add_argument("--batch_size", type=int, default=256, help="The batch size.")
     parser.add_argument("--workers", type=int, default=4, help="The number of workers")
     # Random Seed
     parser.add_argument("--rand_seed", type=int, default=-1, help="manual seed")
@@ -28,4 +28,5 @@ python ./exps/basic/xmain.py --save_dir ${save_dir} --rand_seed ${rseed} \
   --model_config ./configs/yaml.model/vit-cifar10.s0 \
   --optim_config ./configs/yaml.opt/vit.cifar \
   --loss_config ./configs/yaml.loss/cross-entropy \
+  --batch_size 256 \
   --lr 0.003 --weight_decay 0.3 --scheduler warm-cos --steps 10000
@@ -201,7 +201,6 @@ class SuperMLPv2(SuperModule):
         self._hidden_multiplier = hidden_multiplier
         self._out_features = out_features
         self._drop_rate = drop
-        self._params = nn.ParameterDict({})

        self._create_linear(
            "fc1", self.in_features, int(self.in_features * self.hidden_multiplier)
@@ -226,26 +225,22 @@ class SuperMLPv2(SuperModule):
         return spaces.get_max(self._out_features)

    def _create_linear(self, name, inC, outC):
-        self._params["{:}_super_weight".format(name)] = torch.nn.Parameter(
-            torch.Tensor(outC, inC)
+        self.register_parameter(
+            "{:}_super_weight".format(name), torch.nn.Parameter(torch.Tensor(outC, inC))
         )
-        self._params["{:}_super_bias".format(name)] = torch.nn.Parameter(
-            torch.Tensor(outC)
+        self.register_parameter(
+            "{:}_super_bias".format(name), torch.nn.Parameter(torch.Tensor(outC))
         )

    def reset_parameters(self) -> None:
-        nn.init.kaiming_uniform_(self._params["fc1_super_weight"], a=math.sqrt(5))
-        nn.init.kaiming_uniform_(self._params["fc2_super_weight"], a=math.sqrt(5))
-        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(
-            self._params["fc1_super_weight"]
-        )
+        nn.init.kaiming_uniform_(self.fc1_super_weight, a=math.sqrt(5))
+        nn.init.kaiming_uniform_(self.fc2_super_weight, a=math.sqrt(5))
+        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.fc1_super_weight)
         bound = 1 / math.sqrt(fan_in)
-        nn.init.uniform_(self._params["fc1_super_bias"], -bound, bound)
-        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(
-            self._params["fc2_super_weight"]
-        )
+        nn.init.uniform_(self.fc1_super_bias, -bound, bound)
+        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.fc2_super_weight)
         bound = 1 / math.sqrt(fan_in)
-        nn.init.uniform_(self._params["fc2_super_bias"], -bound, bound)
+        nn.init.uniform_(self.fc2_super_bias, -bound, bound)

    @property
    def abstract_search_space(self):
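Why swap the nn.ParameterDict for register_parameter: the parameter becomes a plain module attribute (hence self.fc1_super_weight throughout this commit) and its state_dict key loses the "_params." prefix. A small demonstration of the difference:

import torch
import torch.nn as nn

class WithDict(nn.Module):
    def __init__(self):
        super().__init__()
        self._params = nn.ParameterDict({})
        self._params["fc1_super_weight"] = nn.Parameter(torch.zeros(3, 2))

class WithRegister(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_parameter("fc1_super_weight", nn.Parameter(torch.zeros(3, 2)))

print(list(WithDict().state_dict()))          # ['_params.fc1_super_weight']
print(list(WithRegister().state_dict()))      # ['fc1_super_weight']
print(WithRegister().fc1_super_weight.shape)  # plain attribute access now works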
@@ -282,8 +277,8 @@ class SuperMLPv2(SuperModule):
         else:
             hmul = spaces.get_determined_value(self._hidden_multiplier)
         hidden_dim = int(expected_input_dim * hmul)
-        _fc1_weight = self._params["fc1_super_weight"][:hidden_dim, :expected_input_dim]
-        _fc1_bias = self._params["fc1_super_bias"][:hidden_dim]
+        _fc1_weight = self.fc1_super_weight[:hidden_dim, :expected_input_dim]
+        _fc1_bias = self.fc1_super_bias[:hidden_dim]
         x = F.linear(input, _fc1_weight, _fc1_bias)
         x = self.act(x)
         x = self.drop(x)
@@ -292,21 +287,17 @@ class SuperMLPv2(SuperModule):
             out_dim = self.abstract_child["_out_features"].value
         else:
             out_dim = spaces.get_determined_value(self._out_features)
-        _fc2_weight = self._params["fc2_super_weight"][:out_dim, :hidden_dim]
-        _fc2_bias = self._params["fc2_super_bias"][:out_dim]
+        _fc2_weight = self.fc2_super_weight[:out_dim, :hidden_dim]
+        _fc2_bias = self.fc2_super_bias[:out_dim]
         x = F.linear(x, _fc2_weight, _fc2_bias)
         x = self.drop(x)
         return x

    def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
-        x = F.linear(
-            input, self._params["fc1_super_weight"], self._params["fc1_super_bias"]
-        )
+        x = F.linear(input, self.fc1_super_weight, self.fc1_super_bias)
         x = self.act(x)
         x = self.drop(x)
-        x = F.linear(
-            x, self._params["fc2_super_weight"], self._params["fc2_super_bias"]
-        )
+        x = F.linear(x, self.fc2_super_weight, self.fc2_super_bias)
         x = self.drop(x)
         return x
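The forward_candidate lines above carve a sampled sub-network out of the full "super" weight by slicing, instead of allocating a new tensor per candidate; a minimal sketch of that slicing idea (the dimensions below are illustrative):

import torch
import torch.nn.functional as F

super_weight = torch.randn(64, 32)   # max out_dim x max in_dim
super_bias = torch.randn(64)

hidden_dim, input_dim = 48, 24       # a sampled candidate, smaller than the maxima
x = torch.randn(5, input_dim)
w = super_weight[:hidden_dim, :input_dim]  # top-left block of the super weight
b = super_bias[:hidden_dim]
out = F.linear(x, w, b)
print(out.shape)  # torch.Size([5, 48])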
@@ -1,319 +0,0 @@
-#####################################################
-# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021.03 #
-#####################################################
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-import math
-from typing import Optional, Callable
-
-from xautodl import spaces
-from .super_module import SuperModule
-from .super_module import IntSpaceType
-from .super_module import BoolSpaceType
-
-
-class SuperLinear(SuperModule):
-    """Applies a linear transformation to the incoming data: :math:`y = xA^T + b`"""
-
-    def __init__(
-        self,
-        in_features: IntSpaceType,
-        out_features: IntSpaceType,
-        bias: BoolSpaceType = True,
-    ) -> None:
-        super(SuperLinear, self).__init__()
-
-        # the raw input args
-        self._in_features = in_features
-        self._out_features = out_features
-        self._bias = bias
-        # weights to be optimized
-        self.register_parameter(
-            "_super_weight",
-            torch.nn.Parameter(torch.Tensor(self.out_features, self.in_features)),
-        )
-        if self.bias:
-            self.register_parameter(
-                "_super_bias", torch.nn.Parameter(torch.Tensor(self.out_features))
-            )
-        else:
-            self.register_parameter("_super_bias", None)
-        self.reset_parameters()
-
-    @property
-    def in_features(self):
-        return spaces.get_max(self._in_features)
-
-    @property
-    def out_features(self):
-        return spaces.get_max(self._out_features)
-
-    @property
-    def bias(self):
-        return spaces.has_categorical(self._bias, True)
-
-    @property
-    def abstract_search_space(self):
-        root_node = spaces.VirtualNode(id(self))
-        if not spaces.is_determined(self._in_features):
-            root_node.append(
-                "_in_features", self._in_features.abstract(reuse_last=True)
-            )
-        if not spaces.is_determined(self._out_features):
-            root_node.append(
-                "_out_features", self._out_features.abstract(reuse_last=True)
-            )
-        if not spaces.is_determined(self._bias):
-            root_node.append("_bias", self._bias.abstract(reuse_last=True))
-        return root_node
-
-    def reset_parameters(self) -> None:
-        nn.init.kaiming_uniform_(self._super_weight, a=math.sqrt(5))
-        if self.bias:
-            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._super_weight)
-            bound = 1 / math.sqrt(fan_in)
-            nn.init.uniform_(self._super_bias, -bound, bound)
-
-    def forward_candidate(self, input: torch.Tensor) -> torch.Tensor:
-        # check inputs ->
-        if not spaces.is_determined(self._in_features):
-            expected_input_dim = self.abstract_child["_in_features"].value
-        else:
-            expected_input_dim = spaces.get_determined_value(self._in_features)
-        if input.size(-1) != expected_input_dim:
-            raise ValueError(
-                "Expect the input dim of {:} instead of {:}".format(
-                    expected_input_dim, input.size(-1)
-                )
-            )
-        # create the weight matrix
-        if not spaces.is_determined(self._out_features):
-            out_dim = self.abstract_child["_out_features"].value
-        else:
-            out_dim = spaces.get_determined_value(self._out_features)
-        candidate_weight = self._super_weight[:out_dim, :expected_input_dim]
-        # create the bias matrix
-        if not spaces.is_determined(self._bias):
-            if self.abstract_child["_bias"].value:
-                candidate_bias = self._super_bias[:out_dim]
-            else:
-                candidate_bias = None
-        else:
-            if spaces.get_determined_value(self._bias):
-                candidate_bias = self._super_bias[:out_dim]
-            else:
-                candidate_bias = None
-        return F.linear(input, candidate_weight, candidate_bias)
-
-    def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
-        return F.linear(input, self._super_weight, self._super_bias)
-
-    def extra_repr(self) -> str:
-        return "in_features={:}, out_features={:}, bias={:}".format(
-            self._in_features, self._out_features, self._bias
-        )
-
-    def forward_with_container(self, input, container, prefix=[]):
-        super_weight_name = ".".join(prefix + ["_super_weight"])
-        super_weight = container.query(super_weight_name)
-        super_bias_name = ".".join(prefix + ["_super_bias"])
-        if container.has(super_bias_name):
-            super_bias = container.query(super_bias_name)
-        else:
-            super_bias = None
-        return F.linear(input, super_weight, super_bias)
-
-
-class SuperMLPv1(SuperModule):
-    """An MLP layer: FC -> Activation -> Drop -> FC -> Drop."""
-
-    def __init__(
-        self,
-        in_features: IntSpaceType,
-        hidden_features: IntSpaceType,
-        out_features: IntSpaceType,
-        act_layer: Callable[[], nn.Module] = nn.GELU,
-        drop: Optional[float] = None,
-    ):
-        super(SuperMLPv1, self).__init__()
-        self._in_features = in_features
-        self._hidden_features = hidden_features
-        self._out_features = out_features
-        self._drop_rate = drop
-        self.fc1 = SuperLinear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = SuperLinear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop or 0.0)
-
-    @property
-    def abstract_search_space(self):
-        root_node = spaces.VirtualNode(id(self))
-        space_fc1 = self.fc1.abstract_search_space
-        space_fc2 = self.fc2.abstract_search_space
-        if not spaces.is_determined(space_fc1):
-            root_node.append("fc1", space_fc1)
-        if not spaces.is_determined(space_fc2):
-            root_node.append("fc2", space_fc2)
-        return root_node
-
-    def apply_candidate(self, abstract_child: spaces.VirtualNode):
-        super(SuperMLPv1, self).apply_candidate(abstract_child)
-        if "fc1" in abstract_child:
-            self.fc1.apply_candidate(abstract_child["fc1"])
-        if "fc2" in abstract_child:
-            self.fc2.apply_candidate(abstract_child["fc2"])
-
-    def forward_candidate(self, input: torch.Tensor) -> torch.Tensor:
-        return self.forward_raw(input)
-
-    def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
-        x = self.fc1(input)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return "in_features={:}, hidden_features={:}, out_features={:}, drop={:}, fc1 -> act -> drop -> fc2 -> drop,".format(
-            self._in_features,
-            self._hidden_features,
-            self._out_features,
-            self._drop_rate,
-        )
-
-
-class SuperMLPv2(SuperModule):
-    """An MLP layer: FC -> Activation -> Drop -> FC -> Drop."""
-
-    def __init__(
-        self,
-        in_features: IntSpaceType,
-        hidden_multiplier: IntSpaceType,
-        out_features: IntSpaceType,
-        act_layer: Callable[[], nn.Module] = nn.GELU,
-        drop: Optional[float] = None,
-    ):
-        super(SuperMLPv2, self).__init__()
-        self._in_features = in_features
-        self._hidden_multiplier = hidden_multiplier
-        self._out_features = out_features
-        self._drop_rate = drop
-        self._params = nn.ParameterDict({})
-
-        self._create_linear(
-            "fc1", self.in_features, int(self.in_features * self.hidden_multiplier)
-        )
-        self._create_linear(
-            "fc2", int(self.in_features * self.hidden_multiplier), self.out_features
-        )
-        self.act = act_layer()
-        self.drop = nn.Dropout(drop or 0.0)
-        self.reset_parameters()
-
-    @property
-    def in_features(self):
-        return spaces.get_max(self._in_features)
-
-    @property
-    def hidden_multiplier(self):
-        return spaces.get_max(self._hidden_multiplier)
-
-    @property
-    def out_features(self):
-        return spaces.get_max(self._out_features)
-
-    def _create_linear(self, name, inC, outC):
-        self._params["{:}_super_weight".format(name)] = torch.nn.Parameter(
-            torch.Tensor(outC, inC)
-        )
-        self._params["{:}_super_bias".format(name)] = torch.nn.Parameter(
-            torch.Tensor(outC)
-        )
-
-    def reset_parameters(self) -> None:
-        nn.init.kaiming_uniform_(self._params["fc1_super_weight"], a=math.sqrt(5))
-        nn.init.kaiming_uniform_(self._params["fc2_super_weight"], a=math.sqrt(5))
-        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(
-            self._params["fc1_super_weight"]
-        )
-        bound = 1 / math.sqrt(fan_in)
-        nn.init.uniform_(self._params["fc1_super_bias"], -bound, bound)
-        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(
-            self._params["fc2_super_weight"]
-        )
-        bound = 1 / math.sqrt(fan_in)
-        nn.init.uniform_(self._params["fc2_super_bias"], -bound, bound)
-
-    @property
-    def abstract_search_space(self):
-        root_node = spaces.VirtualNode(id(self))
-        if not spaces.is_determined(self._in_features):
-            root_node.append(
-                "_in_features", self._in_features.abstract(reuse_last=True)
-            )
-        if not spaces.is_determined(self._hidden_multiplier):
-            root_node.append(
-                "_hidden_multiplier", self._hidden_multiplier.abstract(reuse_last=True)
-            )
-        if not spaces.is_determined(self._out_features):
-            root_node.append(
-                "_out_features", self._out_features.abstract(reuse_last=True)
-            )
-        return root_node
-
-    def forward_candidate(self, input: torch.Tensor) -> torch.Tensor:
-        # check inputs ->
-        if not spaces.is_determined(self._in_features):
-            expected_input_dim = self.abstract_child["_in_features"].value
-        else:
-            expected_input_dim = spaces.get_determined_value(self._in_features)
-        if input.size(-1) != expected_input_dim:
-            raise ValueError(
-                "Expect the input dim of {:} instead of {:}".format(
-                    expected_input_dim, input.size(-1)
-                )
-            )
-        # create the weight and bias matrix for fc1
-        if not spaces.is_determined(self._hidden_multiplier):
-            hmul = self.abstract_child["_hidden_multiplier"].value * expected_input_dim
-        else:
-            hmul = spaces.get_determined_value(self._hidden_multiplier)
-        hidden_dim = int(expected_input_dim * hmul)
-        _fc1_weight = self._params["fc1_super_weight"][:hidden_dim, :expected_input_dim]
-        _fc1_bias = self._params["fc1_super_bias"][:hidden_dim]
-        x = F.linear(input, _fc1_weight, _fc1_bias)
-        x = self.act(x)
-        x = self.drop(x)
-        # create the weight and bias matrix for fc2
-        if not spaces.is_determined(self._out_features):
-            out_dim = self.abstract_child["_out_features"].value
-        else:
-            out_dim = spaces.get_determined_value(self._out_features)
-        _fc2_weight = self._params["fc2_super_weight"][:out_dim, :hidden_dim]
-        _fc2_bias = self._params["fc2_super_bias"][:out_dim]
-        x = F.linear(x, _fc2_weight, _fc2_bias)
-        x = self.drop(x)
-        return x
-
-    def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
-        x = F.linear(
-            input, self._params["fc1_super_weight"], self._params["fc1_super_bias"]
-        )
-        x = self.act(x)
-        x = self.drop(x)
-        x = F.linear(
-            x, self._params["fc2_super_weight"], self._params["fc2_super_bias"]
-        )
-        x = self.drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return "in_features={:}, hidden_multiplier={:}, out_features={:}, drop={:}, fc1 -> act -> drop -> fc2 -> drop,".format(
-            self._in_features,
-            self._hidden_multiplier,
-            self._out_features,
-            self._drop_rate,
-        )
@@ -1,6 +1,7 @@
 #####################################################
 # Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021.06 #
 #####################################################
+"""The module and yaml related functions."""
 from .module_utils import call_by_dict
 from .module_utils import call_by_yaml
 from .module_utils import nested_call_by_dict
@@ -11,10 +12,13 @@ from .torch_utils import count_parameters

 from .logger_utils import Logger

-# sampler
+"""The data sampler related classes."""
 from .sampler_utils import BatchSampler

-# scheduler related
+"""The meter related classes."""
+from .meter_utils import AverageMeter
+
+"""The scheduler related classes."""
 from .scheduler_utils import CosineParamScheduler, WarmupParamScheduler, LRMultiplier
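One caveat about the docstring-style section markers introduced above: only the first string literal in a module becomes __doc__; the later bare strings are evaluated and discarded, so in effect they behave like comments. For example, as a standalone module:

"""The module and yaml related functions."""  # first string: becomes __doc__
"""The data sampler related classes."""       # evaluated, then discarded

print(__doc__)  # prints only the first string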
xautodl/xmisc/meter_utils.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+class AverageMeter:
+    """Computes and stores the average and current value"""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0.0
+        self.avg = 0.0
+        self.sum = 0.0
+        self.count = 0.0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __repr__(self):
+        return "{name}(val={val}, avg={avg}, count={count})".format(
+            name=self.__class__.__name__, **self.__dict__
+        )
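A short usage sketch of the new AverageMeter, importable per the __init__.py hunk above; update(val, n) weights each value by n (e.g. a batch size):

from xautodl.xmisc import AverageMeter

meter = AverageMeter()
for val, n in [(0.5, 4), (0.3, 4), (0.1, 2)]:
    meter.update(val, n)  # running sum weighted by n
print(meter.avg)  # (0.5*4 + 0.3*4 + 0.1*2) / 10 = 0.34
print(meter)      # AverageMeter(val=0.1, avg=0.34, count=10.0)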