# -*- coding: utf-8 -*-
"""
Functions for computing various metrics to aid interpretation of similarity
network fusion outputs.
"""

import numpy as np
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
from sklearn.utils.validation import check_random_state
from . import compute


def nmi(labels):
    """
    Calculates normalized mutual information for all combinations of `labels`

    Uses :py:func:`sklearn.metrics.v_measure_score` for the calculation;
    refer to that documentation for information on the underlying algorithm.

    Parameters
    ----------
    labels : m-length list of (N,) array_like
        List of label arrays

    Returns
    -------
    nmi : (m x m) np.ndarray
        NMI score for all combinations of `labels`

    Examples
    --------
    >>> import numpy as np
    >>> label1 = np.array([1, 1, 1, 2, 2, 2])
    >>> label2 = np.array([1, 1, 2, 2, 2, 2])

    >>> from snf import metrics
    >>> metrics.nmi([label1, label2])
    array([[1.        , 0.47870397],
           [0.47870397, 1.        ]])
    """

    # create empty array for output
    nmi = np.empty(shape=(len(labels), len(labels)))

    # get indices for all combinations of labels and calculate NMI
    for x, y in np.column_stack(np.triu_indices_from(nmi)):
        nmi[x, y] = v_measure_score(labels[x], labels[y])

    # make output symmetric
    nmi = np.triu(nmi) + np.triu(nmi, k=1).T

    return nmi


def rank_feature_by_nmi(inputs, W, *, K=20, mu=0.5, n_clusters=None):
    """
    Calculates NMI of each feature in `inputs` with `W`

    Parameters
    ----------
    inputs : list-of-tuple
        Each tuple should contain (1) an (N, M) data array, where N is
        samples and M is features, and (2) a string indicating the metric
        to use to compute a distance matrix for the given data. This MUST
        be one of the options available in
        :py:func:`scipy.spatial.distance.cdist`
    W : (N, N) array_like
        Similarity array generated by :py:func:`snf.compute.snf`
    K : (0, N) int, optional
        Number of neighbors to consider when generating the affinity matrix
        for each feature. Default: 20
    mu : (0, 1) float, optional
        Normalization factor used to scale the similarity kernel when
        generating the affinity matrix for each feature. Default: 0.5
    n_clusters : int, optional
        Number of desired clusters. Default: determined by eigengap (see
        :py:func:`snf.get_n_clusters`)

    Returns
    -------
    nmi : list of (M,) np.ndarray
        Normalized mutual information scores for each feature of input
        arrays
    """

    if n_clusters is None:
        n_clusters = compute.get_n_clusters(W)[0]

    # cluster the fused network to get "ground truth" labels
    snf_labels = spectral_clustering(W, n_clusters=n_clusters)
    nmi = [np.empty(shape=(d.shape[-1],)) for d, m in inputs]

    for ndtype, (dtype, metric) in enumerate(inputs):
        for nfeature, feature in enumerate(np.asarray(dtype).T):
            # cluster each feature in isolation and compare the resulting
            # labels to those derived from the fused network
            aff = compute.make_affinity(np.vstack(feature), K=K, mu=mu,
                                        metric=metric)
            aff_labels = spectral_clustering(aff, n_clusters=n_clusters)
            nmi[ndtype][nfeature] = v_measure_score(snf_labels, aff_labels)

    return nmi
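

# Usage sketch for `rank_feature_by_nmi` (illustrative only; not part of the
# original module). The toy data, shapes, and random seed below are
# assumptions made purely for demonstration.
def _example_rank_feature_by_nmi():
    rng = np.random.RandomState(1234)
    # two hypothetical data modalities measured on the same 50 samples
    data1, data2 = rng.rand(50, 10), rng.rand(50, 20)

    # build per-modality affinity matrices and fuse them with SNF
    aff = compute.make_affinity(data1, data2, metric='euclidean')
    fused = compute.snf(*aff)

    # score each feature by how well its solo clustering solution recovers
    # the clustering of the fused network; returns one (M,) array per input
    return rank_feature_by_nmi([(data1, 'euclidean'), (data2, 'euclidean')],
                               fused)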


def _silhouette_samples(arr, labels):
    """
    Calculates modified silhouette score from affinity matrix

    The Silhouette Coefficient is calculated using the mean intra-cluster
    affinity (`a`) and the mean nearest-cluster affinity (`b`) for each
    sample. The Silhouette Coefficient for a sample is `(a - b) / max(a, b)`.
    To clarify, `b` is the mean affinity between a sample and the nearest
    cluster of which the sample is not a member. This corresponds to the
    cluster with the next *highest* affinity (opposite of how this metric
    would be computed for a distance matrix).

    Parameters
    ----------
    arr : (N, N) array_like
        Array of pairwise affinities between samples
    labels : (N,) array_like
        Predicted labels for each sample

    Returns
    -------
    sil_samples : (N,) np.ndarray
        Modified (affinity) silhouette scores for each sample

    Notes
    -----
    Code is *lightly* modified from the ``sklearn`` implementation. See:
    :py:func:`sklearn.metrics.silhouette_samples`

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis. Journal of
       Computational and Applied Mathematics, 20, 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
    .. [3] `Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion,
       B., Grisel, O., ... & Vanderplas, J. (2011). Scikit-learn: Machine
       learning in Python. Journal of Machine Learning Research, 12,
       2825-2830. <https://github.com/scikit-learn/>`_
    """

    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import check_X_y

    def check_number_of_labels(n_labels, n_samples):
        if not 1 < n_labels < n_samples:
            raise ValueError("Number of labels is %d. Valid values are 2 "
                             "to n_samples - 1 (inclusive)" % n_labels)

    arr, labels = check_X_y(arr, labels, accept_sparse=['csc', 'csr'],
                            copy=True)
    # self-affinity should not count towards the intra-cluster mean
    arr[np.diag_indices_from(arr)] = 0

    le = LabelEncoder()
    labels = le.fit_transform(labels)
    check_number_of_labels(len(le.classes_), arr.shape[0])
    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    # for sample i, store the mean affinity of the cluster to which it
    # belongs in intra_clust_aff[i]
    intra_clust_aff = np.zeros(arr.shape[0], dtype=arr.dtype)

    # for sample i, store the mean affinity of the second "closest" (i.e.,
    # next most affine) cluster in inter_clust_aff[i]
    inter_clust_aff = intra_clust_aff.copy()

    for curr_label in range(len(unique_labels)):
        # find the intra-cluster affinity for all samples belonging to the
        # current label
        mask = labels == curr_label
        current_affinities = arr[mask]

        # leave out the current sample
        n_samples_curr_lab = n_samples_per_label[curr_label] - 1
        if n_samples_curr_lab != 0:
            intra_clust_aff[mask] = np.sum(
                current_affinities[:, mask], axis=1) / n_samples_curr_lab

        # now iterate over all other labels, finding the mean cluster
        # affinity that is highest for every sample
        for other_label in range(len(unique_labels)):
            if other_label != curr_label:
                other_mask = labels == other_label
                other_affinities = np.mean(
                    current_affinities[:, other_mask], axis=1)
                inter_clust_aff[mask] = np.maximum(
                    inter_clust_aff[mask], other_affinities)

    sil_samples = intra_clust_aff - inter_clust_aff
    sil_samples /= np.maximum(intra_clust_aff, inter_clust_aff)

    # score 0 for clusters of size 1, according to the paper
    sil_samples[n_samples_per_label.take(labels) == 1] = 0

    return sil_samples
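

# Worked toy check for the modified (affinity) silhouette above (illustrative
# only; not part of the original module). The 4-sample affinity matrix and
# labels are fabricated solely to show `(a - b) / max(a, b)` in action.
def _example_silhouette_samples():
    # two tight pairs: high within-pair affinity, low across-pair affinity
    aff = np.array([[1.0, 0.9, 0.1, 0.1],
                    [0.9, 1.0, 0.1, 0.1],
                    [0.1, 0.1, 1.0, 0.8],
                    [0.1, 0.1, 0.8, 1.0]])
    labels = np.array([0, 0, 1, 1])

    # for sample 0: a = 0.9 (mean affinity to its own cluster, self excluded)
    # and b = 0.1 (mean affinity to the other cluster), so its score is
    # (0.9 - 0.1) / 0.9 = 0.889
    return _silhouette_samples(aff, labels)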


def silhouette_score(arr, labels):
    """
    Calculates modified silhouette score from affinity matrix

    The Silhouette Coefficient is calculated using the mean intra-cluster
    affinity (`a`) and the mean nearest-cluster affinity (`b`) for each
    sample. The Silhouette Coefficient for a sample is `(a - b) / max(a, b)`.
    To clarify, `b` is the mean affinity between a sample and the nearest
    cluster of which the sample is not a member. This corresponds to the
    cluster with the next *highest* affinity (opposite of how this metric
    would be computed for a distance matrix).

    Parameters
    ----------
    arr : (N, N) array_like
        Array of pairwise affinities between samples
    labels : (N,) array_like
        Predicted labels for each sample

    Returns
    -------
    silhouette_score : float
        Modified (affinity) silhouette score; the mean of the per-sample
        scores returned by :py:func:`_silhouette_samples`

    Notes
    -----
    Code is *lightly* modified from the ``sklearn`` implementation. See:
    :py:func:`sklearn.metrics.silhouette_score`
    """

    return np.mean(_silhouette_samples(arr, labels))
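

# Usage sketch for `silhouette_score` (illustrative only; not part of the
# original module). The toy data and the choice of two clusters are
# assumptions made purely for demonstration.
def _example_silhouette_score():
    rng = np.random.RandomState(1234)
    data = rng.rand(50, 10)

    # cluster an affinity matrix, then gauge how compact and well separated
    # the clusters are; values near 1 are good, near 0 indicate overlap
    aff = compute.make_affinity(data, metric='euclidean')
    labels = spectral_clustering(aff, n_clusters=2)
    return silhouette_score(aff, labels)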


def affinity_zscore(arr, labels, n_perms=1000, seed=None):
    """
    Calculates z-score of silhouette (affinity) score by permutation

    Parameters
    ----------
    arr : (N, N) array_like
        Array of pairwise affinities between samples
    labels : (N,) array_like
        Predicted labels for each sample
    n_perms : int, optional
        Number of permutations. Default: 1000
    seed : int, optional
        Random seed. Default: None

    Returns
    -------
    z_aff : float
        Z-score of silhouette (affinity) score
    """

    rs = check_random_state(seed)

    # generate a null distribution of silhouette scores by permuting labels
    dist = np.empty(shape=(n_perms,))
    for perm in range(n_perms):
        new_labels = rs.permutation(labels)
        dist[perm] = silhouette_score(arr, new_labels)

    # z-score the true silhouette score against the null distribution
    true_aff_score = silhouette_score(arr, labels)
    z_aff = (true_aff_score - dist.mean()) / dist.std()

    return z_aff
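

# Usage sketch for `affinity_zscore` (illustrative only; not part of the
# original module). The toy data, seed, and permutation count are assumptions
# made purely for demonstration.
def _example_affinity_zscore():
    rng = np.random.RandomState(1234)
    data = rng.rand(50, 10)

    aff = compute.make_affinity(data, metric='euclidean')
    labels = spectral_clustering(aff, n_clusters=2)

    # a large z-score indicates the observed silhouette is unlikely to arise
    # from randomly permuted labels (i.e., clustering is better than chance)
    return affinity_zscore(aff, labels, n_perms=100, seed=1234)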