Source code for snf.cv

# -*- coding: utf-8 -*-
"""
Code for implementing cross-validation of similarity network fusion. Useful for
determining the "optimal" number of clusters in a dataset within a
cross-validated, data-driven framework.
"""

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import spectral_clustering
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.utils.extmath import cartesian
from sklearn.utils.validation import check_random_state
from . import compute, metrics

try:
    from numba import njit, prange
    use_numba = True
except ImportError:
    prange = range
    use_numba = False

try:
    from joblib import delayed, Parallel
    use_joblib = True
except ImportError:
    use_joblib = False


def compute_SNF(*data, metric='sqeuclidean', K=20, mu=1, n_clusters=None,
                t=20, n_perms=1000, normalize=True, seed=None):
    """
    Runs a full SNF on `data` and returns cluster affinity scores and labels

    Parameters
    ----------
    *data : (N, M) array_like
        Raw data arrays, where `N` is samples and `M` is features.
    metric : str or list-of-str, optional
        Distance metrics to compute on `data`. Must be one of available metrics
        in ``scipy.spatial.distance.pdist``. If a list is provided for `data` a
        list of equal length may be supplied here. Default: 'sqeuclidean'
    K : int, optional
        Number of neighbors to compare similarity against. Default: 20
    mu : float, optional
        Hyperparameter normalization factor for scaling. Default: 1
    n_clusters : int or list-of-int, optional
        Number of clusters to find in combined data. Default: determined by
        eigengap (see `compute.get_n_clusters()`)
    t : int, optional
        Number of iterations to perform information swapping. Default: 20
    n_perms : int, optional
        Number of permutations for calculating z_affinity. Default: 1000
    normalize : bool, optional
        Whether to normalize (z-score) the data before constructing the
        affinity matrix. Each feature is separately normalized. Default: True
    seed : int, optional
        Random seed. Default: None

    Returns
    -------
    z_affinity : list-of-float
        Z-score of silhouette (affinity) score; only returned if `n_perms`
        is a positive integer
    snf_labels : list of (N,) np.ndarray
        Cluster labels for subjects
    """

    rs = check_random_state(seed)

    # make affinity matrices for all inputs and run SNF
    all_aff = compute.make_affinity(*data, metric=metric, K=K, mu=mu,
                                    normalize=normalize)
    snf_aff = compute.snf(*all_aff, K=K, t=t)

    # get estimated number of clusters (if not provided)
    if n_clusters is None:
        n_clusters = [compute.get_n_clusters(snf_aff)[0]]
    elif isinstance(n_clusters, int):
        n_clusters = [n_clusters]

    # perform spectral clustering across all `n_clusters`
    snf_labels = [spectral_clustering(snf_aff, clust, random_state=rs)
                  for clust in n_clusters]

    # get z-affinity as desired
    if n_perms is not None and n_perms > 0:
        z_affinity = [metrics.affinity_zscore(snf_aff, label, n_perms, seed=rs)
                      for label in snf_labels]
        return z_affinity, snf_labels

    return snf_labels
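
# Example usage (a minimal sketch; `data1` and `data2` below are hypothetical
# (N, M) data arrays used purely for illustration):
#
#     >>> import numpy as np
#     >>> rs = np.random.RandomState(1234)
#     >>> data1, data2 = rs.rand(100, 50), rs.rand(100, 40)
#     >>> zaff, labels = compute_SNF(data1, data2, n_clusters=[2, 3, 4],
#     ...                            n_perms=100, seed=1234)
#     >>> len(zaff), len(labels)  # one entry per requested cluster number
#     (3, 3)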


def snf_gridsearch(*data, metric='sqeuclidean', mu=None, K=None,
                   n_clusters=None, t=20, folds=3, n_perms=1000,
                   normalize=True, seed=None):
    """
    Performs grid search for SNF hyperparameters `mu`, `K`, and `n_clusters`

    Uses `folds`-fold CV to subsample `data` and performs grid search on the
    `mu`, `K`, and `n_clusters` hyperparameters for SNF. There is no testing
    on the left-out sample for each CV fold---it is simply removed.

    Parameters
    ----------
    *data : (N, M) array_like
        Raw data arrays, where `N` is samples and `M` is features.
    metric : str or list-of-str, optional
        Distance metrics to compute on `data`. Must be one of available
        metrics in ``scipy.spatial.distance.pdist``. If a list is provided for
        `data` a list of equal length may be supplied here. Default:
        'sqeuclidean'
    mu : array_like, optional
        Array of `mu` values to search over. Default:
        np.arange(0.35, 1.05, 0.05)
    K : array_like, optional
        Array of `K` values to search over. Default:
        np.arange(5, N // folds, 5)
    n_clusters : array_like, optional
        Array of cluster numbers to search over. Default:
        np.arange(2, N // 20)
    t : int, optional
        Number of iterations for SNF. Default: 20
    folds : int, optional
        Number of folds to use for cross-validation. Default: 3
    n_perms : int, optional
        Number of permutations for generating z-score of silhouette (affinity)
        to assess reliability of SNF clustering output. Default: 1000
    normalize : bool, optional
        Whether to normalize (z-score) `data` arrays before constructing
        affinity matrices. Each feature is separately normalized. Default:
        True
    seed : int, optional
        Random seed. Default: None

    Returns
    -------
    grid_zaff : (F,) list of (S, K, C) np.ndarray
        Where `S` is mu, `K` is K, `C` is n_clusters, and `F` is the number of
        folds for CV. The entries in the individual arrays correspond to the
        z-scored silhouette (affinity).
    grid_labels : (F,) list of (S, K, C, N) np.ndarray
        Where `S` is mu, `K` is K, `C` is n_clusters, and `F` is the number of
        folds for CV. The `N` entries along the last dimension correspond to
        the cluster labels for the given parameter combination.
    """

    # get inputs
    rs = check_random_state(seed)

    # check gridsearch parameter inputs
    n_samples = len(data[0])
    if mu is None:
        mu = np.arange(0.35, 1.05, 0.05)
    if K is None:
        K = np.arange(5, n_samples // folds, 5, dtype='int')
    if n_clusters is None:
        n_clusters = np.arange(2, n_samples // 20, dtype='int')
    mu, K, n_clusters = np.asarray(mu), np.asarray(K), np.asarray(n_clusters)

    # empty matrices to hold outputs of SNF
    grid_zaff, grid_labels = [], []

    # iterate through folds
    kf = KFold(n_splits=folds)
    for n_fold, (train_index, _) in enumerate(kf.split(data[0])):
        # subset data arrays
        fold_data = [d[train_index] for d in data]

        # create empty arrays to store outputs
        fold_zaff = np.empty(shape=(K.size, mu.size, n_clusters.size,))
        fold_labels = np.empty(shape=(K.size, mu.size, n_clusters.size,
                                      len(train_index)))

        # generate SNF metrics for all parameter combinations in current fold
        for n, curr_params in enumerate(ParameterGrid(dict(K=K, mu=mu))):
            zaff, labels = compute_SNF(*fold_data, metric=metric,
                                       n_clusters=n_clusters, t=t,
                                       n_perms=n_perms, normalize=normalize,
                                       seed=rs, **curr_params)

            # get indices for current parameters and store zaff / labels
            inds = np.unravel_index(n, fold_zaff.shape[:2])
            fold_zaff[inds] = zaff
            fold_labels[inds] = labels

        # we want matrices of shape (mu, K, n_clusters[, folds]);
        # ParameterGrid iterates through params alphabetically, so transpose
        grid_zaff.append(fold_zaff.transpose(1, 0, 2))
        grid_labels.append(fold_labels.transpose(1, 0, 2, 3))

    return grid_zaff, grid_labels


def get_optimal_params(zaff, labels, neighbors='edges'):
    """
    Finds optimal parameters for SNF based on K-folds grid search

    Parameters
    ----------
    zaff : (F,) list of (S, K, C) np.ndarray
        Where `S` is mu, `K` is K, `C` is n_clusters, and `F` is the number of
        folds for CV. The entries in the individual arrays correspond to the
        z-scored silhouette (affinity).
    labels : (F,) list of (S, K, C, N) np.ndarray
        Where `S` is mu, `K` is K, `C` is n_clusters, and `F` is the number of
        folds for CV. The `N` entries along the last dimension correspond to
        the cluster labels for the given parameter combination.
    neighbors : str, optional
        How many neighbors to consider when calculating z-Rand kernel. Must be
        in ['edges', 'corners']. Default: 'edges'

    Returns
    -------
    mu : int
        Index along S indicating optimal mu parameter
    K : int
        Index along K indicating optimal K parameter
    """

    # extract S x K information from zaff
    shape = zaff[0].shape[:-1] + (-1,)

    # get max indices for z-affinity arrays
    indices = [extract_max_inds(aff) for aff in zaff]

    # get labels corresponding to optimal parameters
    labgrid = [lab[tuple(inds)] for inds, lab in zip(indices, labels)]

    # convolve zrand with label array to find stable parameter solutions;
    # `zrand_convolve` returns the (S, K) average z-Rand array by default
    zrand = [zrand_convolve(lab, neighbors) for lab in labgrid]

    # get indices of parameters
    mu, K = np.unravel_index(np.mean(zrand, axis=0).argmax(), shape[:-1])

    return mu, K
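

# Example usage (a minimal sketch continuing the hypothetical grid search
# above; the index-to-value conversion assumes the default `snf_gridsearch`
# grids were used):
#
#     >>> mu_idx, K_idx = get_optimal_params(zaff, labels, neighbors='edges')
#     >>> best_mu = np.arange(0.35, 1.05, 0.05)[mu_idx]
#     >>> best_K = np.arange(5, len(data1) // 3, 5)[K_idx]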


def get_neighbors(ijk, shape, neighbors='faces'):
    """
    Returns indices of neighbors to `ijk` in array of shape `shape`

    Parameters
    ----------
    ijk : array_like
        Indices of coordinates of interest
    shape : tuple
        Tuple indicating shape of array from which `ijk` is drawn
    neighbors : str, optional
        One of ['faces', 'edges', 'corners']. Default: 'faces'

    Returns
    -------
    inds : tuple of tuples
        Indices of neighbors to `ijk` (includes input coordinates)
    """

    neigh = ['faces', 'edges', 'corners']
    if neighbors not in neigh:
        raise ValueError('Provided neighbors {} not valid. Must be one of {}.'
                         .format(neighbors, neigh))

    ijk = np.asarray(ijk)
    if ijk.ndim != 2:
        ijk = ijk[np.newaxis]
    if ijk.shape[-1] != len(shape):
        raise ValueError('Provided coordinate {} needs to have same '
                         'dimensions as provided shape {}'.format(ijk, shape))

    dist = np.sqrt(neigh.index(neighbors) + 1)
    xyz = cartesian([range(i) for i in shape])
    inds = tuple(map(tuple, xyz[np.ravel(cdist(ijk, xyz) <= dist)].T))

    return inds


def extract_max_inds(grid, axis=-1):
    """
    Returns indices to extract max arguments from `grid` along `axis`

    Parameters
    ----------
    grid : array_like
        Input array
    axis : int, optional
        Which axis to extract maximum arguments along. Default: -1

    Returns
    -------
    inds : list of np.ndarray
        Indices
    """

    # get shape of `grid` without `axis`
    shape = np.delete(np.asarray(grid.shape), axis)

    # get indices to index maximum values along `axis`
    iind = np.meshgrid(*(range(f) for f in shape[::-1]))
    if len(iind) > 1:
        iind = [iind[1], iind[0], *iind[2:]]
    imax = grid.argmax(axis=axis)

    if axis == -1:
        return iind + [imax]
    elif axis < -1:
        axis += 1

    iind.insert(axis, imax)

    return iind


def _dummyvar(labels):
    """
    Generates dummy-coded array from provided community assignment `labels`

    Parameters
    ----------
    labels : (N,) array_like
        Labels assigning `N` samples to `G` groups

    Returns
    -------
    ci : (N, G) numpy.ndarray
        Dummy-coded array where 1 indicates that a sample belongs to a group
    """

    comms = np.unique(labels)

    ci = np.zeros((len(labels), len(comms)))
    for n, grp in enumerate(comms):
        ci[:, n] = labels == grp

    return ci


def zrand(X, Y):
    """
    Calculates the z-Rand index of two community assignments

    Parameters
    ----------
    X, Y : (n, 1) array_like
        Community assignment vectors to compare

    Returns
    -------
    z_rand : float
        Z-rand index

    References
    ----------
    .. [1] `Amanda L. Traud, Eric D. Kelsic, Peter J. Mucha, and Mason A.
       Porter. (2011). Comparing Community Structure to Characteristics in
       Online Collegiate Social Networks. SIAM Review, 53, 526-543
       <https://arxiv.org/abs/0809.0690>`_
    """

    if X.ndim > 1 or Y.ndim > 1:
        if X.shape[-1] > 1 or Y.shape[-1] > 1:
            raise ValueError('X and Y must have only one-dimension each. '
                             'Please check inputs.')

    Xf = X.flatten()
    Yf = Y.flatten()

    n = len(Xf)
    indx, indy = _dummyvar(Xf), _dummyvar(Yf)
    Xa = indx.dot(indx.T)
    Ya = indy.dot(indy.T)

    M = n * (n - 1) / 2
    M1 = Xa.nonzero()[0].size / 2
    M2 = Ya.nonzero()[0].size / 2
    wab = np.logical_and(Xa, Ya).nonzero()[0].size / 2

    mod = n * (n**2 - 3 * n - 2)
    C1 = mod - (8 * (n + 1) * M1) + (4 * np.power(indx.sum(0), 3).sum())
    C2 = mod - (8 * (n + 1) * M2) + (4 * np.power(indy.sum(0), 3).sum())

    a = M / 16
    b = ((4 * M1 - 2 * M)**2) * ((4 * M2 - 2 * M)**2) / (256 * (M**2))
    c = C1 * C2 / (16 * n * (n - 1) * (n - 2))
    d = ((((4 * M1 - 2 * M)**2) - (4 * C1) - (4 * M))
         * (((4 * M2 - 2 * M)**2) - (4 * C2) - (4 * M))
         / (64 * n * (n - 1) * (n - 2) * (n - 3)))

    sigw2 = a - b + c + d
    # catch any negatives
    if sigw2 < 0:
        return 0
    z_rand = (wab - ((M1 * M2) / M)) / np.sqrt(sigw2)

    return z_rand


def zrand_partitions(communities):
    """
    Calculates average and std of z-Rand for all pairs of community
    assignments

    Iterates through every pair of community assignment vectors in
    `communities` and calculates the z-Rand score to assess their similarity.
    Returns the mean and standard deviation of all z-Rand scores.

    Parameters
    ----------
    communities : (S, R) array_like
        Community assignments for `S` samples over `R` partitions

    Returns
    -------
    zrand_avg : float
        Average z-Rand score over pairs of community assignments
    zrand_std : float
        Standard deviation of z-Rand over pairs of community assignments
    """

    n_partitions = communities.shape[-1]
    all_zrand = np.zeros(int(n_partitions * (n_partitions - 1) / 2))

    for c1 in range(n_partitions):
        for c2 in range(c1 + 1, n_partitions):
            idx = int((c1 * n_partitions) + c2 - ((c1 + 1) * (c1 + 2) // 2))
            all_zrand[idx] = zrand(communities[:, c1], communities[:, c2])

    return np.nanmean(all_zrand), np.nanstd(all_zrand)


if use_numba:
    _dummyvar = njit(_dummyvar)
    zrand = njit(zrand)
    zrand_partitions = njit(zrand_partitions)


def zrand_convolve(labelgrid, neighbors='edges', return_std=False, n_proc=-1):
    """
    Calculates the avg and std z-Rand index using kernel over `labelgrid`

    Kernel is determined by `neighbors`, which can include all entries with
    touching edges (i.e., 4 neighbors) or corners (i.e., 8 neighbors).

    Parameters
    ----------
    labelgrid : (S, K, N) array_like
        Array containing cluster labels for each `N` samples, where `S` is mu
        and `K` is K.
    neighbors : str, optional
        How many neighbors to consider when calculating z-Rand kernel. Must be
        in ['edges', 'corners']. Default: 'edges'
    return_std : bool, optional
        Whether to return `zrand_std` in addition to `zrand_avg`. Default:
        False
    n_proc : int, optional
        Number of processes to use when parallelizing with ``joblib`` (if it
        is available). Default: -1 (all available processors)

    Returns
    -------
    zrand_avg : (S, K) np.ndarray
        Array containing average of the z-Rand index calculated using provided
        neighbor kernel
    zrand_std : (S, K) np.ndarray
        Array containing standard deviation of the z-Rand index; only returned
        if `return_std` is True
    """

    def _get_zrand(ijk):
        ninds = get_neighbors(ijk, shape=shape, neighbors=neighbors)
        return zrand_partitions(labelgrid[ninds].T)

    shape = labelgrid.shape[:-1]
    inds = cartesian([range(i) for i in shape])

    if use_joblib:
        _zr = Parallel(n_jobs=n_proc)(delayed(_get_zrand)(ijk)
                                      for ijk in inds)
    else:
        _zr = [_get_zrand(ijk) for ijk in inds]

    zr = np.empty(shape=shape + (2,))
    for ijk, z in zip(inds, _zr):
        zr[tuple(ijk)] = z

    if return_std:
        return zr[..., 0], zr[..., 1]

    return zr[..., 0]
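

# Example usage of the z-Rand helpers (a minimal sketch; the toy assignment
# vectors below are purely illustrative):
#
#     >>> X = np.array([0, 0, 0, 1, 1, 1])
#     >>> Y = np.array([0, 0, 1, 1, 1, 1])
#     >>> zrand(X, X) > zrand(X, Y)  # identical partitions score highest
#     True
#     >>> avg, std = zrand_partitions(np.column_stack([X, Y, X]))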