#####################################################
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2019.08 #
#####################################################
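"""Training/evaluation helpers for benchmarking candidate architectures: train a
network for one seed, record per-epoch statistics, and build the data loaders for
CIFAR-10, CIFAR-100, and ImageNet16-120."""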
import os, time, copy, torch, pathlib

import datasets
from config_utils import load_config
from procedures import prepare_seed, get_optim_scheduler
from utils import get_model_infos, obtain_accuracy
from log_utils import AverageMeter, time_string, convert_secs2time
from models import get_cell_based_tiny_net


__all__ = ["evaluate_for_seed", "pure_evaluate", "get_nas_bench_loaders"]


def pure_evaluate(xloader, network, criterion=torch.nn.CrossEntropyLoss()):
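    """Evaluate `network` on `xloader` and return (avg loss, avg top-1, avg top-5, per-batch latencies)."""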
    data_time, batch_time, batch = AverageMeter(), AverageMeter(), None
    losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
    latencies, device = [], torch.cuda.current_device()
    network.eval()
    with torch.no_grad():
        end = time.time()
        for i, (inputs, targets) in enumerate(xloader):
            targets = targets.cuda(device=device, non_blocking=True)
            inputs = inputs.cuda(device=device, non_blocking=True)
            data_time.update(time.time() - end)
            # forward
            features, logits = network(inputs)
            loss = criterion(logits, targets)
            batch_time.update(time.time() - end)
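            # track per-batch compute latency (batch time minus data-loading time);
            # only batches of the full batch size count, so a smaller final batch is skipped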
            if batch is None or batch == inputs.size(0):
                batch = inputs.size(0)
                latencies.append(batch_time.val - data_time.val)
            # record loss and accuracy
            prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))
            end = time.time()
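    # drop the first measurement, which typically includes CUDA warm-up overhead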
    if len(latencies) > 2:
        latencies = latencies[1:]
    return losses.avg, top1.avg, top5.avg, latencies


def procedure(xloader, network, criterion, scheduler, optimizer, mode: str):
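    """Run one epoch over `xloader` in "train" mode (forward + backward + step) or "valid" mode (forward only).

    Returns (avg loss, avg top-1 accuracy, avg top-5 accuracy, total batch time in seconds).
    """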
    losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
    if mode == "train":
        network.train()
    elif mode == "valid":
        network.eval()
    else:
        raise ValueError("invalid mode : {:}".format(mode))
    device = torch.cuda.current_device()
    data_time, batch_time, end = AverageMeter(), AverageMeter(), time.time()
    for i, (inputs, targets) in enumerate(xloader):
        if mode == "train":
            scheduler.update(None, 1.0 * i / len(xloader))

        targets = targets.cuda(device=device, non_blocking=True)
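        # NOTE: inputs stay on the CPU here; the DataParallel wrapper built in
        # evaluate_for_seed moves them to the GPU inside the forward call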
        if mode == "train":
            optimizer.zero_grad()
        # forward
        features, logits = network(inputs)
        loss = criterion(logits, targets)
        # backward
        if mode == "train":
            loss.backward()
            optimizer.step()
        # record loss and accuracy
        prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))
        # count time
        batch_time.update(time.time() - end)
        end = time.time()
    return losses.avg, top1.avg, top5.avg, batch_time.sum


def evaluate_for_seed(arch_config, opt_config, train_loader, valid_loaders, seed: int, logger):
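    """Train the architecture described by `arch_config` from scratch under `seed`.

    Per-epoch train metrics are keyed by epoch; valid metrics are keyed as
    "<loader-key>@<epoch>", one entry per loader in `valid_loaders`. Returns a
    dict bundling the configs, all metrics, learning rates, and the final weights.
    """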

    prepare_seed(seed)  # random seed
    net = get_cell_based_tiny_net(arch_config)
    # net = TinyNetwork(arch_config['channel'], arch_config['num_cells'], arch, config.class_num)
    flop, param = get_model_infos(net, opt_config.xshape)
    logger.log("Network : {:}".format(net.get_message()), False)
    logger.log("{:} Seed-------------------------- {:} --------------------------".format(time_string(), seed))
    logger.log("FLOP = {:} MB, Param = {:} MB".format(flop, param))
    # train and valid
    optimizer, scheduler, criterion = get_optim_scheduler(net.parameters(), opt_config)
    default_device = torch.cuda.current_device()
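    # wrap in (single-GPU) DataParallel, which also scatters the CPU-resident
    # inputs from procedure() onto the device during the forward pass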
    network = torch.nn.DataParallel(net, device_ids=[default_device]).cuda(device=default_device)
    criterion = criterion.cuda(device=default_device)
    # start training
    start_time, epoch_time, total_epoch = time.time(), AverageMeter(), opt_config.epochs + opt_config.warmup
    train_losses, train_acc1es, train_acc5es, valid_losses, valid_acc1es, valid_acc5es = {}, {}, {}, {}, {}, {}
    train_times, valid_times, lrs = {}, {}, {}
    for epoch in range(total_epoch):
        scheduler.update(epoch, 0.0)
        lr = min(scheduler.get_lr())
        train_loss, train_acc1, train_acc5, train_tm = procedure(
            train_loader, network, criterion, scheduler, optimizer, "train"
        )
        train_losses[epoch] = train_loss
        train_acc1es[epoch] = train_acc1
        train_acc5es[epoch] = train_acc5
        train_times[epoch] = train_tm
        lrs[epoch] = lr
        with torch.no_grad():
            for key, xloader in valid_loaders.items():
                valid_loss, valid_acc1, valid_acc5, valid_tm = procedure(
                    xloader, network, criterion, None, None, "valid"
                )
                valid_losses["{:}@{:}".format(key, epoch)] = valid_loss
                valid_acc1es["{:}@{:}".format(key, epoch)] = valid_acc1
                valid_acc5es["{:}@{:}".format(key, epoch)] = valid_acc5
                valid_times["{:}@{:}".format(key, epoch)] = valid_tm

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        need_time = "Time Left: {:}".format(convert_secs2time(epoch_time.avg * (total_epoch - epoch - 1), True))
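        # the valid_* values logged below come from the last loader in valid_loaders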
        logger.log(
            "{:} {:} epoch={:03d}/{:03d} :: Train [loss={:.5f}, acc@1={:.2f}%, acc@5={:.2f}%] Valid [loss={:.5f}, acc@1={:.2f}%, acc@5={:.2f}%], lr={:}".format(
                time_string(),
                need_time,
                epoch,
                total_epoch,
                train_loss,
                train_acc1,
                train_acc5,
                valid_loss,
                valid_acc1,
                valid_acc5,
                lr,
            )
        )
    info_seed = {
        "flop": flop,
        "param": param,
        "arch_config": arch_config._asdict(),
        "opt_config": opt_config._asdict(),
        "total_epoch": total_epoch,
        "train_losses": train_losses,
        "train_acc1es": train_acc1es,
        "train_acc5es": train_acc5es,
        "train_times": train_times,
        "valid_losses": valid_losses,
        "valid_acc1es": valid_acc1es,
        "valid_acc5es": valid_acc5es,
        "valid_times": valid_times,
        "learning_rates": lrs,
        "net_state_dict": net.state_dict(),
        "net_string": "{:}".format(net),
        "finish-train": True,
    }
    return info_seed


def get_nas_bench_loaders(workers):
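    """Create the train/valid/test DataLoaders for CIFAR-10, CIFAR-100, and ImageNet16-120.

    Requires the TORCH_HOME environment variable to point at the dataset root;
    split files are read from <repo-root>/configs/nas-benchmark/. The returned
    dict is keyed as "<dataset>@<split>".
    """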

    torch.set_num_threads(workers)

    root_dir = (pathlib.Path(__file__).parent / ".." / "..").resolve()
    torch_dir = pathlib.Path(os.environ["TORCH_HOME"])
    # cifar
    cifar_config_path = root_dir / "configs" / "nas-benchmark" / "CIFAR.config"
    cifar_config = load_config(cifar_config_path, None, None)
    get_datasets = datasets.get_datasets  # a function to return the dataset
    break_line = "-" * 150
    print("{:} Create data-loader for all datasets".format(time_string()))
    print(break_line)
    TRAIN_CIFAR10, VALID_CIFAR10, xshape, class_num = get_datasets("cifar10", str(torch_dir / "cifar.python"), -1)
    print(
        "original CIFAR-10 : {:} training images, {:} test images, input shape {:}, {:} classes".format(
            len(TRAIN_CIFAR10), len(VALID_CIFAR10), xshape, class_num
        )
    )
    cifar10_splits = load_config(root_dir / "configs" / "nas-benchmark" / "cifar-split.txt", None, None)
    assert cifar10_splits.train[:10] == [0, 5, 7, 11, 13, 15, 16, 17, 20, 24]
    assert cifar10_splits.valid[:10] == [1, 2, 3, 4, 6, 8, 9, 10, 12, 14]
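    # the CIFAR-10 valid split re-uses training images, so clone the training set
    # and swap in the evaluation transform (no augmentation)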
    temp_dataset = copy.deepcopy(TRAIN_CIFAR10)
    temp_dataset.transform = VALID_CIFAR10.transform
    # data loader
    trainval_cifar10_loader = torch.utils.data.DataLoader(
        TRAIN_CIFAR10, batch_size=cifar_config.batch_size, shuffle=True, num_workers=workers, pin_memory=True
    )
    train_cifar10_loader = torch.utils.data.DataLoader(
        TRAIN_CIFAR10,
        batch_size=cifar_config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(cifar10_splits.train),
        num_workers=workers,
        pin_memory=True,
    )
    valid_cifar10_loader = torch.utils.data.DataLoader(
        temp_dataset,
        batch_size=cifar_config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(cifar10_splits.valid),
        num_workers=workers,
        pin_memory=True,
    )
    test__cifar10_loader = torch.utils.data.DataLoader(
        VALID_CIFAR10, batch_size=cifar_config.batch_size, shuffle=False, num_workers=workers, pin_memory=True
    )
    print(
        "CIFAR-10  : trval-loader has {:3d} batches with {:} samples per batch".format(
            len(trainval_cifar10_loader), cifar_config.batch_size
        )
    )
    print(
        "CIFAR-10  : train-loader has {:3d} batches with {:} samples per batch".format(
            len(train_cifar10_loader), cifar_config.batch_size
        )
    )
    print(
        "CIFAR-10  : valid-loader has {:3d} batches with {:} samples per batch".format(
            len(valid_cifar10_loader), cifar_config.batch_size
        )
    )
    print(
        "CIFAR-10  : test--loader has {:3d} batches with {:} samples per batch".format(
            len(test__cifar10_loader), cifar_config.batch_size
        )
    )
    print(break_line)
    # CIFAR-100
    TRAIN_CIFAR100, VALID_CIFAR100, xshape, class_num = get_datasets("cifar100", str(torch_dir / "cifar.python"), -1)
    print(
        "original CIFAR-100: {:} training images, {:} test images, input shape {:}, {:} classes".format(
            len(TRAIN_CIFAR100), len(VALID_CIFAR100), xshape, class_num
        )
    )
    cifar100_splits = load_config(root_dir / "configs" / "nas-benchmark" / "cifar100-test-split.txt", None, None)
    assert cifar100_splits.xvalid[:10] == [1, 3, 4, 5, 8, 10, 13, 14, 15, 16]
    assert cifar100_splits.xtest[:10] == [0, 2, 6, 7, 9, 11, 12, 17, 20, 24]
    train_cifar100_loader = torch.utils.data.DataLoader(
        TRAIN_CIFAR100, batch_size=cifar_config.batch_size, shuffle=True, num_workers=workers, pin_memory=True
    )
    valid_cifar100_loader = torch.utils.data.DataLoader(
        VALID_CIFAR100,
        batch_size=cifar_config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(cifar100_splits.xvalid),
        num_workers=workers,
        pin_memory=True,
    )
    test__cifar100_loader = torch.utils.data.DataLoader(
        VALID_CIFAR100,
        batch_size=cifar_config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(cifar100_splits.xtest),
        num_workers=workers,
        pin_memory=True,
    )
    print("CIFAR-100  : train-loader has {:3d} batch".format(len(train_cifar100_loader)))
    print("CIFAR-100  : valid-loader has {:3d} batch".format(len(valid_cifar100_loader)))
    print("CIFAR-100  : test--loader has {:3d} batch".format(len(test__cifar100_loader)))
    print(break_line)

    imagenet16_config_path = root_dir / "configs" / "nas-benchmark" / "ImageNet-16.config"
    imagenet16_config = load_config(imagenet16_config_path, None, None)
    TRAIN_ImageNet16_120, VALID_ImageNet16_120, xshape, class_num = get_datasets(
        "ImageNet16-120", str(torch_dir / "cifar.python" / "ImageNet16"), -1
    )
    print(
        "original ImageNet16-120 : {:} training images, {:} test images, input shape {:}, {:} classes".format(
            len(TRAIN_ImageNet16_120), len(VALID_ImageNet16_120), xshape, class_num
        )
    )
    imagenet_splits = load_config(root_dir / "configs" / "nas-benchmark" / "imagenet-16-120-test-split.txt", None, None)
    assert imagenet_splits.xvalid[:10] == [1, 2, 3, 6, 7, 8, 9, 12, 16, 18]
    assert imagenet_splits.xtest[:10] == [0, 4, 5, 10, 11, 13, 14, 15, 17, 20]
    train_imagenet_loader = torch.utils.data.DataLoader(
        TRAIN_ImageNet16_120,
        batch_size=imagenet16_config.batch_size,
        shuffle=True,
        num_workers=workers,
        pin_memory=True,
    )
    valid_imagenet_loader = torch.utils.data.DataLoader(
        VALID_ImageNet16_120,
        batch_size=imagenet16_config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(imagenet_splits.xvalid),
        num_workers=workers,
        pin_memory=True,
    )
    test__imagenet_loader = torch.utils.data.DataLoader(
        VALID_ImageNet16_120,
        batch_size=imagenet16_config.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(imagenet_splits.xtest),
        num_workers=workers,
        pin_memory=True,
    )
    print(
        "ImageNet-16-120  : train-loader has {:3d} batches with {:} samples per batch".format(
            len(train_imagenet_loader), imagenet16_config.batch_size
        )
    )
    print(
        "ImageNet-16-120  : valid-loader has {:3d} batches with {:} samples per batch".format(
            len(valid_imagenet_loader), imagenet16_config.batch_size
        )
    )
    print(
        "ImageNet-16-120  : test--loader has {:3d} batches with {:} samples per batch".format(
            len(test__imagenet_loader), imagenet16_config.batch_size
        )
    )

    # loaders are keyed as "<dataset>@<split>" for cifar10, cifar100, and ImageNet16-120
    loaders = {
        "cifar10@trainval": trainval_cifar10_loader,
        "cifar10@train": train_cifar10_loader,
        "cifar10@valid": valid_cifar10_loader,
        "cifar10@test": test__cifar10_loader,
        "cifar100@train": train_cifar100_loader,
        "cifar100@valid": valid_cifar100_loader,
        "cifar100@test": test__cifar100_loader,
        "ImageNet16-120@train": train_imagenet_loader,
        "ImageNet16-120@valid": valid_imagenet_loader,
        "ImageNet16-120@test": test__imagenet_loader,
    }
    return loaders