diff --git a/configs/yaml.data/cifar10.test b/configs/yaml.data/cifar10.test index 3b8e8d8..5882a19 100644 --- a/configs/yaml.data/cifar10.test +++ b/configs/yaml.data/cifar10.test @@ -17,6 +17,6 @@ kwargs: module_path: torchvision.transforms args: [] kwargs: - mean: (0.491, 0.482, 0.447) - std: (0.247, 0.244, 0.262) + mean: [0.491, 0.482, 0.447] + std: [0.247, 0.244, 0.262] kwargs: {} diff --git a/configs/yaml.data/cifar10.train b/configs/yaml.data/cifar10.train index f787228..1dbb1ac 100644 --- a/configs/yaml.data/cifar10.train +++ b/configs/yaml.data/cifar10.train @@ -25,6 +25,6 @@ kwargs: module_path: torchvision.transforms args: [] kwargs: - mean: (0.491, 0.482, 0.447) - std: (0.247, 0.244, 0.262) + mean: [0.491, 0.482, 0.447] + std: [0.247, 0.244, 0.262] kwargs: {} diff --git a/exps/basic/xmain.py b/exps/basic/xmain.py index 0274ff0..7a0217a 100644 --- a/exps/basic/xmain.py +++ b/exps/basic/xmain.py @@ -58,6 +58,7 @@ def main(args): pin_memory=True, drop_last=False, ) + iters_per_epoch = len(train_data) // args.batch_size logger.log("The training loader: {:}".format(train_loader)) logger.log("The validation loader: {:}".format(valid_loader)) @@ -67,159 +68,44 @@ def main(args): lr=args.lr, weight_decay=args.weight_decay, ) - loss = xmisc.nested_call_by_yaml(args.loss_config) + objective = xmisc.nested_call_by_yaml(args.loss_config) logger.log("The optimizer is:\n{:}".format(optimizer)) - logger.log("The loss is {:}".format(loss)) + logger.log("The objective is {:}".format(objective)) + logger.log("The iters_per_epoch={:}".format(iters_per_epoch)) - model, loss = torch.nn.DataParallel(model).cuda(), loss.cuda() + model, objective = torch.nn.DataParallel(model).cuda(), objective.cuda() scheduler = xmisc.LRMultiplier( optimizer, xmisc.get_scheduler(args.scheduler, args.lr), args.steps ) - import pdb - - pdb.set_trace() - - train_func, valid_func = get_procedures(args.procedure) - - total_epoch = optim_config.epochs + optim_config.warmup - # Main Training and Evaluation Loop - start_time = time.time() - epoch_time = AverageMeter() - for epoch in range(start_epoch, total_epoch): - scheduler.update(epoch, 0.0) + start_time, iter_time = time.time(), xmisc.AverageMeter() + for xiter, data in enumerate(train_loader): need_time = "Time Left: {:}".format( - convert_secs2time(epoch_time.avg * (total_epoch - epoch), True) - ) - epoch_str = "epoch={:03d}/{:03d}".format(epoch, total_epoch) - LRs = scheduler.get_lr() - find_best = False - # set-up drop-out ratio - if hasattr(base_model, "update_drop_path"): - base_model.update_drop_path( - model_config.drop_path_prob * epoch / total_epoch - ) - logger.log( - "\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}".format( - time_string(), epoch_str, need_time, min(LRs), max(LRs), scheduler + xmisc.time_utils.convert_secs2time( + iter_time.avg * (len(train_loader) - xiter), True ) ) + iter_str = "{:6d}/{:06d}".format(xiter, len(train_loader)) - # train for one epoch - train_loss, train_acc1, train_acc5 = train_func( - train_loader, - network, - criterion, - scheduler, - optimizer, - optim_config, - epoch_str, - args.print_freq, - logger, - ) - # log the results - logger.log( - "***{:s}*** TRAIN [{:}] loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}".format( - time_string(), epoch_str, train_loss, train_acc1, train_acc5 - ) - ) + inputs, targets = data + targets = targets.cuda(non_blocking=True) + model.train() - # evaluate the performance - if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch): - logger.log("-" * 150) 
- valid_loss, valid_acc1, valid_acc5 = valid_func( - valid_loader, - network, - criterion, - optim_config, - epoch_str, - args.print_freq_eval, - logger, - ) - valid_accuracies[epoch] = valid_acc1 - logger.log( - "***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}".format( - time_string(), - epoch_str, - valid_loss, - valid_acc1, - valid_acc5, - valid_accuracies["best"], - 100 - valid_accuracies["best"], - ) - ) - if valid_acc1 > valid_accuracies["best"]: - valid_accuracies["best"] = valid_acc1 - find_best = True - logger.log( - "Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}.".format( - epoch, - valid_acc1, - valid_acc5, - 100 - valid_acc1, - 100 - valid_acc5, - model_best_path, - ) - ) - num_bytes = ( - torch.cuda.max_memory_cached(next(network.parameters()).device) * 1.0 - ) - logger.log( - "[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]".format( - next(network.parameters()).device, - int(num_bytes), - num_bytes / 1e3, - num_bytes / 1e6, - num_bytes / 1e9, - ) - ) - max_bytes[epoch] = num_bytes - if epoch % 10 == 0: - torch.cuda.empty_cache() + optimizer.zero_grad() + outputs = model(inputs) + loss = objective(outputs, targets) - # save checkpoint - save_path = save_checkpoint( - { - "epoch": epoch, - "args": deepcopy(args), - "max_bytes": deepcopy(max_bytes), - "FLOP": flop, - "PARAM": param, - "valid_accuracies": deepcopy(valid_accuracies), - "model-config": model_config._asdict(), - "optim-config": optim_config._asdict(), - "base-model": base_model.state_dict(), - "scheduler": scheduler.state_dict(), - "optimizer": optimizer.state_dict(), - }, - model_base_path, - logger, - ) - if find_best: - copy_checkpoint(model_base_path, model_best_path, logger) - last_info = save_checkpoint( - { - "epoch": epoch, - "args": deepcopy(args), - "last_checkpoint": save_path, - }, - logger.path("info"), - logger, - ) + loss.backward() + optimizer.step() + scheduler.step() + if xiter % iters_per_epoch == 0: + logger.log("TRAIN [{:}] loss = {:.6f}".format(iter_str, loss.item())) # measure elapsed time - epoch_time.update(time.time() - start_time) + iter_time.update(time.time() - start_time) start_time = time.time() - logger.log("\n" + "-" * 200) - logger.log( - "Finish training/validation in {:} with Max-GPU-Memory of {:.2f} MB, and save final checkpoint into {:}".format( - convert_secs2time(epoch_time.sum, True), - max(v for k, v in max_bytes.items()) / 1e6, - logger.path("info"), - ) - ) logger.log("-" * 200 + "\n") logger.close() @@ -249,7 +135,7 @@ if __name__ == "__main__": parser.add_argument("--weight_decay", type=float, help="The weight decay") parser.add_argument("--scheduler", type=str, help="The scheduler indicator.") parser.add_argument("--steps", type=int, help="The total number of steps.") - parser.add_argument("--batch_size", type=int, default=2, help="The batch size.") + parser.add_argument("--batch_size", type=int, default=256, help="The batch size.") parser.add_argument("--workers", type=int, default=4, help="The number of workers") # Random Seed parser.add_argument("--rand_seed", type=int, default=-1, help="manual seed") diff --git a/scripts/experimental/train-vit.sh b/scripts/experimental/train-vit.sh index b8aa85b..84e2b48 100644 --- a/scripts/experimental/train-vit.sh +++ b/scripts/experimental/train-vit.sh @@ -28,4 +28,5 @@ python ./exps/basic/xmain.py --save_dir ${save_dir} --rand_seed ${rseed} 
\ --model_config ./configs/yaml.model/vit-cifar10.s0 \ --optim_config ./configs/yaml.opt/vit.cifar \ --loss_config ./configs/yaml.loss/cross-entropy \ + --batch_size 256 \ --lr 0.003 --weight_decay 0.3 --scheduler warm-cos --steps 10000 diff --git a/xautodl/xlayers/super_linear.py b/xautodl/xlayers/super_linear.py index f33a6b2..f5e04bf 100644 --- a/xautodl/xlayers/super_linear.py +++ b/xautodl/xlayers/super_linear.py @@ -201,7 +201,6 @@ class SuperMLPv2(SuperModule): self._hidden_multiplier = hidden_multiplier self._out_features = out_features self._drop_rate = drop - self._params = nn.ParameterDict({}) self._create_linear( "fc1", self.in_features, int(self.in_features * self.hidden_multiplier) @@ -226,26 +225,22 @@ class SuperMLPv2(SuperModule): return spaces.get_max(self._out_features) def _create_linear(self, name, inC, outC): - self._params["{:}_super_weight".format(name)] = torch.nn.Parameter( - torch.Tensor(outC, inC) + self.register_parameter( + "{:}_super_weight".format(name), torch.nn.Parameter(torch.Tensor(outC, inC)) ) - self._params["{:}_super_bias".format(name)] = torch.nn.Parameter( - torch.Tensor(outC) + self.register_parameter( + "{:}_super_bias".format(name), torch.nn.Parameter(torch.Tensor(outC)) ) def reset_parameters(self) -> None: - nn.init.kaiming_uniform_(self._params["fc1_super_weight"], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self._params["fc2_super_weight"], a=math.sqrt(5)) - fan_in, _ = nn.init._calculate_fan_in_and_fan_out( - self._params["fc1_super_weight"] - ) + nn.init.kaiming_uniform_(self.fc1_super_weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.fc2_super_weight, a=math.sqrt(5)) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.fc1_super_weight) bound = 1 / math.sqrt(fan_in) - nn.init.uniform_(self._params["fc1_super_bias"], -bound, bound) - fan_in, _ = nn.init._calculate_fan_in_and_fan_out( - self._params["fc2_super_weight"] - ) + nn.init.uniform_(self.fc1_super_bias, -bound, bound) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.fc2_super_weight) bound = 1 / math.sqrt(fan_in) - nn.init.uniform_(self._params["fc2_super_bias"], -bound, bound) + nn.init.uniform_(self.fc2_super_bias, -bound, bound) @property def abstract_search_space(self): @@ -282,8 +277,8 @@ class SuperMLPv2(SuperModule): else: hmul = spaces.get_determined_value(self._hidden_multiplier) hidden_dim = int(expected_input_dim * hmul) - _fc1_weight = self._params["fc1_super_weight"][:hidden_dim, :expected_input_dim] - _fc1_bias = self._params["fc1_super_bias"][:hidden_dim] + _fc1_weight = self.fc1_super_weight[:hidden_dim, :expected_input_dim] + _fc1_bias = self.fc1_super_bias[:hidden_dim] x = F.linear(input, _fc1_weight, _fc1_bias) x = self.act(x) x = self.drop(x) @@ -292,21 +287,17 @@ class SuperMLPv2(SuperModule): out_dim = self.abstract_child["_out_features"].value else: out_dim = spaces.get_determined_value(self._out_features) - _fc2_weight = self._params["fc2_super_weight"][:out_dim, :hidden_dim] - _fc2_bias = self._params["fc2_super_bias"][:out_dim] + _fc2_weight = self.fc2_super_weight[:out_dim, :hidden_dim] + _fc2_bias = self.fc2_super_bias[:out_dim] x = F.linear(x, _fc2_weight, _fc2_bias) x = self.drop(x) return x def forward_raw(self, input: torch.Tensor) -> torch.Tensor: - x = F.linear( - input, self._params["fc1_super_weight"], self._params["fc1_super_bias"] - ) + x = F.linear(input, self.fc1_super_weight, self.fc1_super_bias) x = self.act(x) x = self.drop(x) - x = F.linear( - x, self._params["fc2_super_weight"], self._params["fc2_super_bias"] - ) + x = 
F.linear(x, self.fc2_super_weight, self.fc2_super_bias) x = self.drop(x) return x diff --git a/xautodl/xlayers/super_mlp.py b/xautodl/xlayers/super_mlp.py deleted file mode 100644 index f33a6b2..0000000 --- a/xautodl/xlayers/super_mlp.py +++ /dev/null @@ -1,319 +0,0 @@ -##################################################### -# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021.03 # -##################################################### -import torch -import torch.nn as nn -import torch.nn.functional as F - -import math -from typing import Optional, Callable - -from xautodl import spaces -from .super_module import SuperModule -from .super_module import IntSpaceType -from .super_module import BoolSpaceType - - -class SuperLinear(SuperModule): - """Applies a linear transformation to the incoming data: :math:`y = xA^T + b`""" - - def __init__( - self, - in_features: IntSpaceType, - out_features: IntSpaceType, - bias: BoolSpaceType = True, - ) -> None: - super(SuperLinear, self).__init__() - - # the raw input args - self._in_features = in_features - self._out_features = out_features - self._bias = bias - # weights to be optimized - self.register_parameter( - "_super_weight", - torch.nn.Parameter(torch.Tensor(self.out_features, self.in_features)), - ) - if self.bias: - self.register_parameter( - "_super_bias", torch.nn.Parameter(torch.Tensor(self.out_features)) - ) - else: - self.register_parameter("_super_bias", None) - self.reset_parameters() - - @property - def in_features(self): - return spaces.get_max(self._in_features) - - @property - def out_features(self): - return spaces.get_max(self._out_features) - - @property - def bias(self): - return spaces.has_categorical(self._bias, True) - - @property - def abstract_search_space(self): - root_node = spaces.VirtualNode(id(self)) - if not spaces.is_determined(self._in_features): - root_node.append( - "_in_features", self._in_features.abstract(reuse_last=True) - ) - if not spaces.is_determined(self._out_features): - root_node.append( - "_out_features", self._out_features.abstract(reuse_last=True) - ) - if not spaces.is_determined(self._bias): - root_node.append("_bias", self._bias.abstract(reuse_last=True)) - return root_node - - def reset_parameters(self) -> None: - nn.init.kaiming_uniform_(self._super_weight, a=math.sqrt(5)) - if self.bias: - fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self._super_weight) - bound = 1 / math.sqrt(fan_in) - nn.init.uniform_(self._super_bias, -bound, bound) - - def forward_candidate(self, input: torch.Tensor) -> torch.Tensor: - # check inputs -> - if not spaces.is_determined(self._in_features): - expected_input_dim = self.abstract_child["_in_features"].value - else: - expected_input_dim = spaces.get_determined_value(self._in_features) - if input.size(-1) != expected_input_dim: - raise ValueError( - "Expect the input dim of {:} instead of {:}".format( - expected_input_dim, input.size(-1) - ) - ) - # create the weight matrix - if not spaces.is_determined(self._out_features): - out_dim = self.abstract_child["_out_features"].value - else: - out_dim = spaces.get_determined_value(self._out_features) - candidate_weight = self._super_weight[:out_dim, :expected_input_dim] - # create the bias matrix - if not spaces.is_determined(self._bias): - if self.abstract_child["_bias"].value: - candidate_bias = self._super_bias[:out_dim] - else: - candidate_bias = None - else: - if spaces.get_determined_value(self._bias): - candidate_bias = self._super_bias[:out_dim] - else: - candidate_bias = None - return F.linear(input, 
candidate_weight, candidate_bias) - - def forward_raw(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, self._super_weight, self._super_bias) - - def extra_repr(self) -> str: - return "in_features={:}, out_features={:}, bias={:}".format( - self._in_features, self._out_features, self._bias - ) - - def forward_with_container(self, input, container, prefix=[]): - super_weight_name = ".".join(prefix + ["_super_weight"]) - super_weight = container.query(super_weight_name) - super_bias_name = ".".join(prefix + ["_super_bias"]) - if container.has(super_bias_name): - super_bias = container.query(super_bias_name) - else: - super_bias = None - return F.linear(input, super_weight, super_bias) - - -class SuperMLPv1(SuperModule): - """An MLP layer: FC -> Activation -> Drop -> FC -> Drop.""" - - def __init__( - self, - in_features: IntSpaceType, - hidden_features: IntSpaceType, - out_features: IntSpaceType, - act_layer: Callable[[], nn.Module] = nn.GELU, - drop: Optional[float] = None, - ): - super(SuperMLPv1, self).__init__() - self._in_features = in_features - self._hidden_features = hidden_features - self._out_features = out_features - self._drop_rate = drop - self.fc1 = SuperLinear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = SuperLinear(hidden_features, out_features) - self.drop = nn.Dropout(drop or 0.0) - - @property - def abstract_search_space(self): - root_node = spaces.VirtualNode(id(self)) - space_fc1 = self.fc1.abstract_search_space - space_fc2 = self.fc2.abstract_search_space - if not spaces.is_determined(space_fc1): - root_node.append("fc1", space_fc1) - if not spaces.is_determined(space_fc2): - root_node.append("fc2", space_fc2) - return root_node - - def apply_candidate(self, abstract_child: spaces.VirtualNode): - super(SuperMLPv1, self).apply_candidate(abstract_child) - if "fc1" in abstract_child: - self.fc1.apply_candidate(abstract_child["fc1"]) - if "fc2" in abstract_child: - self.fc2.apply_candidate(abstract_child["fc2"]) - - def forward_candidate(self, input: torch.Tensor) -> torch.Tensor: - return self.forward_raw(input) - - def forward_raw(self, input: torch.Tensor) -> torch.Tensor: - x = self.fc1(input) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - def extra_repr(self) -> str: - return "in_features={:}, hidden_features={:}, out_features={:}, drop={:}, fc1 -> act -> drop -> fc2 -> drop,".format( - self._in_features, - self._hidden_features, - self._out_features, - self._drop_rate, - ) - - -class SuperMLPv2(SuperModule): - """An MLP layer: FC -> Activation -> Drop -> FC -> Drop.""" - - def __init__( - self, - in_features: IntSpaceType, - hidden_multiplier: IntSpaceType, - out_features: IntSpaceType, - act_layer: Callable[[], nn.Module] = nn.GELU, - drop: Optional[float] = None, - ): - super(SuperMLPv2, self).__init__() - self._in_features = in_features - self._hidden_multiplier = hidden_multiplier - self._out_features = out_features - self._drop_rate = drop - self._params = nn.ParameterDict({}) - - self._create_linear( - "fc1", self.in_features, int(self.in_features * self.hidden_multiplier) - ) - self._create_linear( - "fc2", int(self.in_features * self.hidden_multiplier), self.out_features - ) - self.act = act_layer() - self.drop = nn.Dropout(drop or 0.0) - self.reset_parameters() - - @property - def in_features(self): - return spaces.get_max(self._in_features) - - @property - def hidden_multiplier(self): - return spaces.get_max(self._hidden_multiplier) - - @property - def out_features(self): - 
return spaces.get_max(self._out_features) - - def _create_linear(self, name, inC, outC): - self._params["{:}_super_weight".format(name)] = torch.nn.Parameter( - torch.Tensor(outC, inC) - ) - self._params["{:}_super_bias".format(name)] = torch.nn.Parameter( - torch.Tensor(outC) - ) - - def reset_parameters(self) -> None: - nn.init.kaiming_uniform_(self._params["fc1_super_weight"], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self._params["fc2_super_weight"], a=math.sqrt(5)) - fan_in, _ = nn.init._calculate_fan_in_and_fan_out( - self._params["fc1_super_weight"] - ) - bound = 1 / math.sqrt(fan_in) - nn.init.uniform_(self._params["fc1_super_bias"], -bound, bound) - fan_in, _ = nn.init._calculate_fan_in_and_fan_out( - self._params["fc2_super_weight"] - ) - bound = 1 / math.sqrt(fan_in) - nn.init.uniform_(self._params["fc2_super_bias"], -bound, bound) - - @property - def abstract_search_space(self): - root_node = spaces.VirtualNode(id(self)) - if not spaces.is_determined(self._in_features): - root_node.append( - "_in_features", self._in_features.abstract(reuse_last=True) - ) - if not spaces.is_determined(self._hidden_multiplier): - root_node.append( - "_hidden_multiplier", self._hidden_multiplier.abstract(reuse_last=True) - ) - if not spaces.is_determined(self._out_features): - root_node.append( - "_out_features", self._out_features.abstract(reuse_last=True) - ) - return root_node - - def forward_candidate(self, input: torch.Tensor) -> torch.Tensor: - # check inputs -> - if not spaces.is_determined(self._in_features): - expected_input_dim = self.abstract_child["_in_features"].value - else: - expected_input_dim = spaces.get_determined_value(self._in_features) - if input.size(-1) != expected_input_dim: - raise ValueError( - "Expect the input dim of {:} instead of {:}".format( - expected_input_dim, input.size(-1) - ) - ) - # create the weight and bias matrix for fc1 - if not spaces.is_determined(self._hidden_multiplier): - hmul = self.abstract_child["_hidden_multiplier"].value * expected_input_dim - else: - hmul = spaces.get_determined_value(self._hidden_multiplier) - hidden_dim = int(expected_input_dim * hmul) - _fc1_weight = self._params["fc1_super_weight"][:hidden_dim, :expected_input_dim] - _fc1_bias = self._params["fc1_super_bias"][:hidden_dim] - x = F.linear(input, _fc1_weight, _fc1_bias) - x = self.act(x) - x = self.drop(x) - # create the weight and bias matrix for fc2 - if not spaces.is_determined(self._out_features): - out_dim = self.abstract_child["_out_features"].value - else: - out_dim = spaces.get_determined_value(self._out_features) - _fc2_weight = self._params["fc2_super_weight"][:out_dim, :hidden_dim] - _fc2_bias = self._params["fc2_super_bias"][:out_dim] - x = F.linear(x, _fc2_weight, _fc2_bias) - x = self.drop(x) - return x - - def forward_raw(self, input: torch.Tensor) -> torch.Tensor: - x = F.linear( - input, self._params["fc1_super_weight"], self._params["fc1_super_bias"] - ) - x = self.act(x) - x = self.drop(x) - x = F.linear( - x, self._params["fc2_super_weight"], self._params["fc2_super_bias"] - ) - x = self.drop(x) - return x - - def extra_repr(self) -> str: - return "in_features={:}, hidden_multiplier={:}, out_features={:}, drop={:}, fc1 -> act -> drop -> fc2 -> drop,".format( - self._in_features, - self._hidden_multiplier, - self._out_features, - self._drop_rate, - ) diff --git a/xautodl/xmisc/__init__.py b/xautodl/xmisc/__init__.py index f6a9ce9..e47d2bc 100644 --- a/xautodl/xmisc/__init__.py +++ b/xautodl/xmisc/__init__.py @@ -1,6 +1,7 @@ 
##################################################### # Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021.06 # ##################################################### +"""The module and yaml related functions.""" from .module_utils import call_by_dict from .module_utils import call_by_yaml from .module_utils import nested_call_by_dict @@ -11,10 +12,13 @@ from .torch_utils import count_parameters from .logger_utils import Logger -# sampler +"""The data sampler related classes.""" from .sampler_utils import BatchSampler -# scheduler related +"""The meter related classes.""" +from .meter_utils import AverageMeter + +"""The scheduler related classes.""" from .scheduler_utils import CosineParamScheduler, WarmupParamScheduler, LRMultiplier diff --git a/xautodl/xmisc/meter_utils.py b/xautodl/xmisc/meter_utils.py new file mode 100644 index 0000000..923db1a --- /dev/null +++ b/xautodl/xmisc/meter_utils.py @@ -0,0 +1,22 @@ +class AverageMeter: + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0.0 + self.avg = 0.0 + self.sum = 0.0 + self.count = 0.0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __repr__(self): + return "{name}(val={val}, avg={avg}, count={count})".format( + name=self.__class__.__name__, **self.__dict__ + )
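Note (not part of the patch): a minimal usage sketch of the new xmisc.AverageMeter added in xautodl/xmisc/meter_utils.py, mirroring how the rewritten loop in exps/basic/xmain.py times each iteration. It assumes the `from xautodl import xmisc` import style used by xmain.py; the numbers in the comments are illustrative only.

    import time
    from xautodl import xmisc  # AverageMeter is re-exported via xmisc/__init__.py above

    iter_time = xmisc.AverageMeter()
    start_time = time.time()
    for step in range(100):
        # ... one training iteration would run here ...
        iter_time.update(time.time() - start_time)  # val = last duration; sum/count/avg accumulate
        start_time = time.time()
        if step % 10 == 0:
            # __repr__ reports val/avg/count, e.g. AverageMeter(val=0.010, avg=0.012, count=11.0)
            print(iter_time)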