import torch
import torch.nn.functional as F

import sde_lib

_MODELS = {}


def register_model(cls=None, *, name=None):
    """A decorator for registering model classes."""

    def _register(cls):
        if name is None:
            local_name = cls.__name__
        else:
            local_name = name
        if local_name in _MODELS:
            raise ValueError(
                f'Already registered model with name: {local_name}')
        _MODELS[local_name] = cls
        return cls

    if cls is None:
        return _register
    else:
        return _register(cls)


def get_model(name):
    return _MODELS[name]


def create_model(config):
    """Create the model."""
    model_name = config.model.name
    model = get_model(model_name)(config)
    model = model.to(config.device)
    return model


def get_model_fn(model, train=False):
    """Create a function to give the output of the score-based model.

    Args:
        model: The score model.
        train: `True` for training and `False` for evaluation.

    Returns:
        A model function.
    """

    def model_fn(x, labels, *args, **kwargs):
        """Compute the output of the score-based model.

        Args:
            x: A mini-batch of input data (adjacency matrices).
            labels: A mini-batch of conditioning variables for time steps.
                Should be interpreted differently for different models.
            *args, **kwargs: Extra inputs forwarded to the model, e.g. the
                mask for the adjacency matrices.

        Returns:
            The model output.
        """
        if not train:
            model.eval()
        else:
            model.train()
        return model(x, labels, *args, **kwargs)

    return model_fn


def get_score_fn(sde, model, train=False, continuous=False):
    """Wraps `score_fn` so that the model output corresponds to a real
    time-dependent score function.

    Args:
        sde: An `sde_lib.SDE` object that represents the forward SDE.
        model: A score model.
        train: `True` for training and `False` for evaluation.
        continuous: If `True`, the score-based model is expected to directly
            take continuous time steps.

    Returns:
        A score function.
    """
    model_fn = get_model_fn(model, train=train)

    if isinstance(sde, (sde_lib.VPSDE, sde_lib.subVPSDE)):
        def score_fn(x, t, *args, **kwargs):
            # Scale the neural network output by the standard deviation and
            # flip the sign.
            if continuous or isinstance(sde, sde_lib.subVPSDE):
                # For VP-trained models, t=0 corresponds to the lowest noise
                # level. The maximum value of the time embedding is assumed
                # to be 999 for continuously-trained models.
                labels = t * 999
                score = model_fn(x, labels, *args, **kwargs)
                std = sde.marginal_prob(torch.zeros_like(x), t)[1]
            else:
                # For VP-trained models, t=0 corresponds to the lowest noise level.
                labels = t * (sde.N - 1)
                score = model_fn(x, labels, *args, **kwargs)
                std = sde.sqrt_1m_alpha_cumprod.to(labels.device)[labels.long()]
            score = -score / std[:, None, None]
            return score

    elif isinstance(sde, sde_lib.VESDE):
        def score_fn(x, t, *args, **kwargs):
            if continuous:
                labels = sde.marginal_prob(torch.zeros_like(x), t)[1]
            else:
                # For VE-trained models, t=0 corresponds to the highest noise level.
                labels = sde.T - t
                labels *= sde.N - 1
                labels = torch.round(labels).long()
            score = model_fn(x, labels, *args, **kwargs)
            return score

    else:
        raise NotImplementedError(
            f"SDE class {sde.__class__.__name__} not yet supported.")

    return score_fn
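

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, never called at import time):
# how `register_model`, `create_model`, and `get_score_fn` fit together.
# `_ToyScoreModel`, the ad-hoc config, and the
# `sde_lib.VPSDE(beta_min, beta_max, N)` signature are assumptions borrowed
# from the score_sde conventions this module follows, not definitions from
# this repo.
# ---------------------------------------------------------------------------
def _score_fn_usage_sketch():
    import torch.nn as nn
    from types import SimpleNamespace

    @register_model(name='toy')
    class _ToyScoreModel(nn.Module):
        def __init__(self, config):
            super().__init__()
            self.scale = nn.Parameter(torch.ones(1))

        def forward(self, x, labels, *args, **kwargs):
            # A real model would condition on the time embedding `labels`.
            return self.scale * x

    config = SimpleNamespace(model=SimpleNamespace(name='toy'), device='cpu')
    model = create_model(config)

    sde = sde_lib.VPSDE(beta_min=0.1, beta_max=20., N=1000)
    score_fn = get_score_fn(sde, model, train=False, continuous=True)

    x = torch.randn(4, 8, 8)  # [B, N, N] batch of noisy adjacency matrices
    t = torch.rand(4)         # one diffusion time per sample
    return score_fn(x, t)     # same shape as x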


def get_classifier_grad_fn(sde, classifier, train=False, continuous=False,
                           regress=True, labels='max'):
    logit_fn = get_logit_fn(sde, classifier, train, continuous)

    def classifier_grad_fn(x, t, *args, **kwargs):
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            if regress:
                assert labels in ['max', 'min']
                logit = logit_fn(x_in, t, *args, **kwargs)
                if labels == 'max':
                    prob = logit.sum()
                elif labels == 'min':
                    prob = -logit.sum()
            else:
                logit = logit_fn(x_in, t, *args, **kwargs)
                log_prob = F.log_softmax(logit, dim=-1)
                prob = log_prob[range(len(logit)), labels.view(-1)].sum()
            classifier_grad = torch.autograd.grad(prob, x_in)[0]
        return classifier_grad

    return classifier_grad_fn


def get_logit_fn(sde, classifier, train=False, continuous=False):
    classifier_fn = get_model_fn(classifier, train=train)

    if isinstance(sde, (sde_lib.VPSDE, sde_lib.subVPSDE)):
        def logit_fn(x, t, *args, **kwargs):
            if continuous or isinstance(sde, sde_lib.subVPSDE):
                # For VP-trained models, t=0 corresponds to the lowest noise
                # level. The maximum value of the time embedding is assumed
                # to be 999 for continuously-trained models.
                labels = t * 999
            else:
                # For VP-trained models, t=0 corresponds to the lowest noise level.
                labels = t * (sde.N - 1)
            logit = classifier_fn(x, labels, *args, **kwargs)
            return logit

    elif isinstance(sde, sde_lib.VESDE):
        def logit_fn(x, t, *args, **kwargs):
            if continuous:
                labels = sde.marginal_prob(torch.zeros_like(x), t)[1]
            else:
                # For VE-trained models, t=0 corresponds to the highest noise level.
                labels = sde.T - t
                labels *= sde.N - 1
                labels = torch.round(labels).long()
            logit = classifier_fn(x, labels, *args, **kwargs)
            return logit

    else:
        raise NotImplementedError(
            f"SDE class {sde.__class__.__name__} not yet supported.")

    return logit_fn


def get_predictor_fn(sde, model, train=False, continuous=False):
    """Wraps a predictor model so that it consumes the right time
    conditioning for the given SDE.

    Args:
        sde: An `sde_lib.SDE` object that represents the forward SDE.
        model: A predictor model.
        train: `True` for training and `False` for evaluation.
        continuous: If `True`, the model is expected to directly take
            continuous time steps.

    Returns:
        A predictor function.
    """
    model_fn = get_model_fn(model, train=train)

    if isinstance(sde, (sde_lib.VPSDE, sde_lib.subVPSDE)):
        def predictor_fn(x, t, *args, **kwargs):
            # Unlike `score_fn`, the prediction is returned unscaled.
            if continuous or isinstance(sde, sde_lib.subVPSDE):
                # For VP-trained models, t=0 corresponds to the lowest noise
                # level. The maximum value of the time embedding is assumed
                # to be 999 for continuously-trained models.
                labels = t * 999
            else:
                # For VP-trained models, t=0 corresponds to the lowest noise level.
                labels = t * (sde.N - 1)
            pred = model_fn(x, labels, *args, **kwargs)
            return pred

    elif isinstance(sde, sde_lib.VESDE):
        def predictor_fn(x, t, *args, **kwargs):
            if continuous:
                labels = sde.marginal_prob(torch.zeros_like(x), t)[1]
            else:
                # For VE-trained models, t=0 corresponds to the highest noise level.
                labels = sde.T - t
                labels *= sde.N - 1
                labels = torch.round(labels).long()
            pred = model_fn(x, labels, *args, **kwargs)
            return pred

    else:
        raise NotImplementedError(
            f"SDE class {sde.__class__.__name__} not yet supported.")

    return predictor_fn
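

# Sketch of classifier guidance (illustration only): during sampling, the
# classifier gradient is typically added to the unconditional score, i.e.
# grad log p(x_t) + scale * grad log p(y | x_t). The `scale` knob and this
# combination are assumptions, not code from this repo; `target` is a tensor
# of class indices, matching `labels.view(-1)` in `classifier_grad_fn`.
def _guided_score_sketch(sde, score_model, classifier, target, scale=1.0):
    score_fn = get_score_fn(sde, score_model, train=False, continuous=True)
    grad_fn = get_classifier_grad_fn(sde, classifier, train=False,
                                     continuous=True, regress=False,
                                     labels=target)

    def guided_score_fn(x, t, *args, **kwargs):
        score = score_fn(x, t, *args, **kwargs)
        guidance = grad_fn(x, t, *args, **kwargs)
        return score + scale * guidance

    return guided_score_fn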


def to_flattened_numpy(x):
    """Flatten a torch tensor `x` and convert it to numpy."""
    return x.detach().cpu().numpy().reshape((-1,))


def from_flattened_numpy(x, shape):
    """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
    return torch.from_numpy(x.reshape(shape))


@torch.no_grad()
def mask_adj2node(adj_mask):
    """Convert batched adjacency mask matrices to batched node mask matrices.

    Args:
        adj_mask: [B, N, N] Batched adjacency mask matrices without self-loop edges.

    Returns:
        node_mask: [B, N] Batched node mask matrices indicating the valid nodes.
    """
    # A node is valid iff it may share an edge with node 0; node 0 itself is
    # always valid but has no self-loop entry, so set it explicitly.
    node_mask = adj_mask[:, 0, :].clone()
    node_mask[:, 0] = 1
    return node_mask


@torch.no_grad()
def get_rw_feat(k_step, dense_adj):
    """Compute k_step random-walk features for a dense adjacency matrix."""
    rw_list = []
    deg = dense_adj.sum(-1, keepdim=True)
    AD = dense_adj / (deg + 1e-8)  # row-normalized transition matrix
    rw_list.append(AD)

    for _ in range(k_step):
        rw = torch.bmm(rw_list[-1], AD)
        rw_list.append(rw)
    rw_map = torch.stack(rw_list[1:], dim=1)  # [B, k_step, N, N]

    # Return probabilities: the diagonal of each random-walk power.
    rw_landing = torch.diagonal(rw_map, offset=0, dim1=2, dim2=3)  # [B, k_step, N]
    rw_landing = rw_landing.permute(0, 2, 1)  # [B, N, k_step]

    # Get the shortest-path-distance indices (capped at k_step).
    tmp_rw = rw_map.sort(dim=1)[0]
    spd_ind = (tmp_rw <= 0).sum(dim=1)  # [B, N, N]
    spd_onehot = torch.nn.functional.one_hot(
        spd_ind, num_classes=k_step + 1).to(torch.float)
    spd_onehot = spd_onehot.permute(0, 3, 1, 2)  # [B, k_step+1, N, N]

    return rw_landing, spd_onehot
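

# Quick illustration (assumed toy input): random-walk features for a batch of
# two 5-node path graphs. The shapes follow directly from `get_rw_feat` above.
def _rw_feat_sketch():
    adj = torch.zeros(2, 5, 5)
    idx = torch.arange(4)
    adj[:, idx, idx + 1] = 1.  # path edges 0-1, 1-2, 2-3, 3-4
    adj[:, idx + 1, idx] = 1.
    rw_landing, spd_onehot = get_rw_feat(k_step=3, dense_adj=adj)
    assert rw_landing.shape == (2, 5, 3)     # [B, N, k_step]
    assert spd_onehot.shape == (2, 4, 5, 5)  # [B, k_step+1, N, N]
    return rw_landing, spd_onehot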