#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """Meters.""" from collections import deque import numpy as np import pycls.core.logging as logging import torch from pycls.core.config import cfg from pycls.core.timer import Timer logger = logging.get_logger(__name__) def time_string(seconds): """Converts time in seconds to a fixed-width string format.""" days, rem = divmod(int(seconds), 24 * 3600) hrs, rem = divmod(rem, 3600) mins, secs = divmod(rem, 60) return "{0:02},{1:02}:{2:02}:{3:02}".format(days, hrs, mins, secs) def inter_union(preds, labels, num_classes): _, preds = torch.max(preds, 1) preds = preds.type(torch.uint8) + 1 labels = labels.type(torch.uint8) + 1 preds = preds * (labels > 0).type(torch.uint8) inter = preds * (preds == labels).type(torch.uint8) area_inter = torch.histc(inter.type(torch.int64), bins=num_classes, min=1, max=num_classes) area_preds = torch.histc(preds.type(torch.int64), bins=num_classes, min=1, max=num_classes) area_labels = torch.histc(labels.type(torch.int64), bins=num_classes, min=1, max=num_classes) area_union = area_preds + area_labels - area_inter return [area_inter.type(torch.float64) / labels.size(0), area_union.type(torch.float64) / labels.size(0)] def topk_errors(preds, labels, ks): """Computes the top-k error for each k.""" err_str = "Batch dim of predictions and labels must match" assert preds.size(0) == labels.size(0), err_str # Find the top max_k predictions for each sample _top_max_k_vals, top_max_k_inds = torch.topk( preds, max(ks), dim=1, largest=True, sorted=True ) # (batch_size, max_k) -> (max_k, batch_size) top_max_k_inds = top_max_k_inds.t() # (batch_size, ) -> (max_k, batch_size) rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds) # (i, j) = 1 if top i-th prediction for the j-th sample is correct top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels) # Compute the number of topk correct predictions for each k topks_correct = [top_max_k_correct[:k, :].view(-1).float().sum() for k in ks] return [(1.0 - x / preds.size(0)) * 100.0 for x in topks_correct] def gpu_mem_usage(): """Computes the GPU memory usage for the current device (MB).""" mem_usage_bytes = torch.cuda.max_memory_allocated() return mem_usage_bytes / 1024 / 1024 class ScalarMeter(object): """Measures a scalar value (adapted from Detectron).""" def __init__(self, window_size): self.deque = deque(maxlen=window_size) self.total = 0.0 self.count = 0 def reset(self): self.deque.clear() self.total = 0.0 self.count = 0 def add_value(self, value): self.deque.append(value) self.count += 1 self.total += value def get_win_median(self): return np.median(self.deque) def get_win_avg(self): return np.mean(self.deque) def get_global_avg(self): return self.total / self.count class TrainMeter(object): """Measures training stats.""" def __init__(self, epoch_iters): self.epoch_iters = epoch_iters self.max_iter = cfg.OPTIM.MAX_EPOCH * epoch_iters self.iter_timer = Timer() self.loss = ScalarMeter(cfg.LOG_PERIOD) self.loss_total = 0.0 self.lr = None # Current minibatch errors (smoothed over a window) self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD) self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD) # Number of misclassified examples self.num_top1_mis = 0 self.num_top5_mis = 0 self.num_samples = 0 def reset(self, timer=False): if timer: self.iter_timer.reset() self.loss.reset() self.loss_total = 0.0 self.lr = None self.mb_top1_err.reset() self.mb_top5_err.reset() self.num_top1_mis = 0 self.num_top5_mis = 0 self.num_samples = 0 def iter_tic(self): self.iter_timer.tic() def iter_toc(self): self.iter_timer.toc() def update_stats(self, top1_err, top5_err, loss, lr, mb_size): # Current minibatch stats self.mb_top1_err.add_value(top1_err) self.mb_top5_err.add_value(top5_err) self.loss.add_value(loss) self.lr = lr # Aggregate stats self.num_top1_mis += top1_err * mb_size self.num_top5_mis += top5_err * mb_size self.loss_total += loss * mb_size self.num_samples += mb_size def get_iter_stats(self, cur_epoch, cur_iter): cur_iter_total = cur_epoch * self.epoch_iters + cur_iter + 1 eta_sec = self.iter_timer.average_time * (self.max_iter - cur_iter_total) mem_usage = gpu_mem_usage() stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters), "time_avg": self.iter_timer.average_time, "time_diff": self.iter_timer.diff, "eta": time_string(eta_sec), "top1_err": self.mb_top1_err.get_win_median(), "top5_err": self.mb_top5_err.get_win_median(), "loss": self.loss.get_win_median(), "lr": self.lr, "mem": int(np.ceil(mem_usage)), } return stats def log_iter_stats(self, cur_epoch, cur_iter): if (cur_iter + 1) % cfg.LOG_PERIOD != 0: return stats = self.get_iter_stats(cur_epoch, cur_iter) logger.info(logging.dump_log_data(stats, "train_iter")) def get_epoch_stats(self, cur_epoch): cur_iter_total = (cur_epoch + 1) * self.epoch_iters eta_sec = self.iter_timer.average_time * (self.max_iter - cur_iter_total) mem_usage = gpu_mem_usage() top1_err = self.num_top1_mis / self.num_samples top5_err = self.num_top5_mis / self.num_samples avg_loss = self.loss_total / self.num_samples stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "time_avg": self.iter_timer.average_time, "eta": time_string(eta_sec), "top1_err": top1_err, "top5_err": top5_err, "loss": avg_loss, "lr": self.lr, "mem": int(np.ceil(mem_usage)), } return stats def log_epoch_stats(self, cur_epoch): stats = self.get_epoch_stats(cur_epoch) logger.info(logging.dump_log_data(stats, "train_epoch")) class TestMeter(object): """Measures testing stats.""" def __init__(self, max_iter): self.max_iter = max_iter self.iter_timer = Timer() # Current minibatch errors (smoothed over a window) self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD) self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD) # Min errors (over the full test set) self.min_top1_err = 100.0 self.min_top5_err = 100.0 # Number of misclassified examples self.num_top1_mis = 0 self.num_top5_mis = 0 self.num_samples = 0 def reset(self, min_errs=False): if min_errs: self.min_top1_err = 100.0 self.min_top5_err = 100.0 self.iter_timer.reset() self.mb_top1_err.reset() self.mb_top5_err.reset() self.num_top1_mis = 0 self.num_top5_mis = 0 self.num_samples = 0 def iter_tic(self): self.iter_timer.tic() def iter_toc(self): self.iter_timer.toc() def update_stats(self, top1_err, top5_err, mb_size): self.mb_top1_err.add_value(top1_err) self.mb_top5_err.add_value(top5_err) self.num_top1_mis += top1_err * mb_size self.num_top5_mis += top5_err * mb_size self.num_samples += mb_size def get_iter_stats(self, cur_epoch, cur_iter): mem_usage = gpu_mem_usage() iter_stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "iter": "{}/{}".format(cur_iter + 1, self.max_iter), "time_avg": self.iter_timer.average_time, "time_diff": self.iter_timer.diff, "top1_err": self.mb_top1_err.get_win_median(), "top5_err": self.mb_top5_err.get_win_median(), "mem": int(np.ceil(mem_usage)), } return iter_stats def log_iter_stats(self, cur_epoch, cur_iter): if (cur_iter + 1) % cfg.LOG_PERIOD != 0: return stats = self.get_iter_stats(cur_epoch, cur_iter) logger.info(logging.dump_log_data(stats, "test_iter")) def get_epoch_stats(self, cur_epoch): top1_err = self.num_top1_mis / self.num_samples top5_err = self.num_top5_mis / self.num_samples self.min_top1_err = min(self.min_top1_err, top1_err) self.min_top5_err = min(self.min_top5_err, top5_err) mem_usage = gpu_mem_usage() stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "time_avg": self.iter_timer.average_time, "top1_err": top1_err, "top5_err": top5_err, "min_top1_err": self.min_top1_err, "min_top5_err": self.min_top5_err, "mem": int(np.ceil(mem_usage)), } return stats def log_epoch_stats(self, cur_epoch): stats = self.get_epoch_stats(cur_epoch) logger.info(logging.dump_log_data(stats, "test_epoch")) class TrainMeterIoU(object): """Measures training stats.""" def __init__(self, epoch_iters): self.epoch_iters = epoch_iters self.max_iter = cfg.OPTIM.MAX_EPOCH * epoch_iters self.iter_timer = Timer() self.loss = ScalarMeter(cfg.LOG_PERIOD) self.loss_total = 0.0 self.lr = None self.mb_miou = ScalarMeter(cfg.LOG_PERIOD) self.num_inter = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_union = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_samples = 0 def reset(self, timer=False): if timer: self.iter_timer.reset() self.loss.reset() self.loss_total = 0.0 self.lr = None self.mb_miou.reset() self.num_inter = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_union = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_samples = 0 def iter_tic(self): self.iter_timer.tic() def iter_toc(self): self.iter_timer.toc() def update_stats(self, inter, union, loss, lr, mb_size): # Current minibatch stats self.mb_miou.add_value((inter / (union + 1e-10)).mean()) self.loss.add_value(loss) self.lr = lr # Aggregate stats self.num_inter += inter * mb_size self.num_union += union * mb_size self.loss_total += loss * mb_size self.num_samples += mb_size def get_iter_stats(self, cur_epoch, cur_iter): cur_iter_total = cur_epoch * self.epoch_iters + cur_iter + 1 eta_sec = self.iter_timer.average_time * (self.max_iter - cur_iter_total) mem_usage = gpu_mem_usage() stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters), "time_avg": self.iter_timer.average_time, "time_diff": self.iter_timer.diff, "eta": time_string(eta_sec), "miou": self.mb_miou.get_win_median(), "loss": self.loss.get_win_median(), "lr": self.lr, "mem": int(np.ceil(mem_usage)), } return stats def log_iter_stats(self, cur_epoch, cur_iter): if (cur_iter + 1) % cfg.LOG_PERIOD != 0: return stats = self.get_iter_stats(cur_epoch, cur_iter) logger.info(logging.dump_log_data(stats, "train_iter")) def get_epoch_stats(self, cur_epoch): cur_iter_total = (cur_epoch + 1) * self.epoch_iters eta_sec = self.iter_timer.average_time * (self.max_iter - cur_iter_total) mem_usage = gpu_mem_usage() miou = (self.num_inter / (self.num_union + 1e-10)).mean() avg_loss = self.loss_total / self.num_samples stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "time_avg": self.iter_timer.average_time, "eta": time_string(eta_sec), "miou": miou, "loss": avg_loss, "lr": self.lr, "mem": int(np.ceil(mem_usage)), } return stats def log_epoch_stats(self, cur_epoch): stats = self.get_epoch_stats(cur_epoch) logger.info(logging.dump_log_data(stats, "train_epoch")) class TestMeterIoU(object): """Measures testing stats.""" def __init__(self, max_iter): self.max_iter = max_iter self.iter_timer = Timer() self.mb_miou = ScalarMeter(cfg.LOG_PERIOD) self.max_miou = 0.0 self.num_inter = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_union = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_samples = 0 def reset(self, min_errs=False): if min_errs: self.max_miou = 0.0 self.iter_timer.reset() self.mb_miou.reset() self.num_inter = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_union = np.zeros(cfg.MODEL.NUM_CLASSES) self.num_samples = 0 def iter_tic(self): self.iter_timer.tic() def iter_toc(self): self.iter_timer.toc() def update_stats(self, inter, union, mb_size): self.mb_miou.add_value((inter / (union + 1e-10)).mean()) self.num_inter += inter * mb_size self.num_union += union * mb_size self.num_samples += mb_size def get_iter_stats(self, cur_epoch, cur_iter): mem_usage = gpu_mem_usage() iter_stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "iter": "{}/{}".format(cur_iter + 1, self.max_iter), "time_avg": self.iter_timer.average_time, "time_diff": self.iter_timer.diff, "miou": self.mb_miou.get_win_median(), "mem": int(np.ceil(mem_usage)), } return iter_stats def log_iter_stats(self, cur_epoch, cur_iter): if (cur_iter + 1) % cfg.LOG_PERIOD != 0: return stats = self.get_iter_stats(cur_epoch, cur_iter) logger.info(logging.dump_log_data(stats, "test_iter")) def get_epoch_stats(self, cur_epoch): miou = (self.num_inter / (self.num_union + 1e-10)).mean() self.max_miou = max(self.max_miou, miou) mem_usage = gpu_mem_usage() stats = { "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH), "time_avg": self.iter_timer.average_time, "miou": miou, "max_miou": self.max_miou, "mem": int(np.ceil(mem_usage)), } return stats def log_epoch_stats(self, cur_epoch): stats = self.get_epoch_stats(cur_epoch) logger.info(logging.dump_log_data(stats, "test_epoch"))