"""K-means clustering""" # Authors: Gael Varoquaux # Thomas Rueckstiess # James Bergstra # Jan Schlueter # Nelle Varoquaux # Peter Prettenhofer # License: BSD import warnings from itertools import cycle, izip import numpy as np import scipy.sparse as sp from math import floor from scikits.learn.base import BaseEstimator from scikits.learn.metrics.pairwise import euclidean_distances from scikits.learn.utils import check_random_state from scikits.learn.utils import check_arrays from scikits.learn.utils import shuffle from scikits.learn.utils import gen_even_slices from scikits.learn.cluster import _k_means ############################################################################### # Initialisation heuristic def k_init(X, k, n_local_trials=None, random_state=None, x_squared_norms=None, return_indices=False): """Init k seeds according to kmeans++ Parameters ----------- X: array, shape (n_samples, n_features) The data to pick seeds for k: integer The number of seeds to choose n_local_trials: integer, optional The number of seeding trials for each center (except the first), of which the one reducing inertia the most is greedily chosen. Set to None to make the number of trials depend logarithmically on the number of seeds (2+log(k)); this is the default. random_state: numpy.RandomState, optional The generator used to initialize the centers. Defaults to numpy.random. x_squared_norms: array, shape (n_samples,), optional Squared euclidean norm of each data point. Pass it if you have it at hands already to avoid it being recomputed here. Default: None return_indices: boolean, optional True, return the indices of X which are chosen to be seeds False, return the seeds themselves Notes ------ Selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. see: Arthur, D. and Vassilvitskii, S. "k-means++: the advantages of careful seeding". ACM-SIAM symposium on Discrete algorithms. 2007 Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, which is the implementation used in the aforementioned paper. """ n_samples, n_features = X.shape random_state = check_random_state(random_state) centers = np.empty((k, n_features)) centers_id = np.empty(k,dtype=np.int) # Set the number of local seeding trials if none is given if n_local_trials is None: # This is what Arthur/Vassilvitskii tried, but did not report # specific results for other than mentioning in the conclusion # that it helped. n_local_trials = 2 + int(np.log(k)) # Pick first center randomly center_id = random_state.randint(n_samples) centers[0] = X[center_id] centers_id[0] = center_id # Initialize list of closest distances and calculate current potential if x_squared_norms is None: x_squared_norms = X.copy() x_squared_norms **= 2 x_squared_norms = x_squared_norms.sum(axis=1) closest_dist_sq = euclidean_distances( np.atleast_2d(centers[0]), X, Y_norm_squared=x_squared_norms, squared=True) current_pot = closest_dist_sq.sum() # Pick the remaining k-1 points for c in xrange(1, k): # Choose center candidates by sampling with probability proportional # to the squared distance to the closest existing center rand_vals = random_state.random_sample(n_local_trials) * current_pot candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals) # Compute distances to center candidates distance_to_candidates = euclidean_distances( X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) # Decide which candidate is the best best_candidate = None best_pot = None best_dist_sq = None for trial in xrange(n_local_trials): # Compute potential when including center candidate new_dist_sq = np.minimum(closest_dist_sq, distance_to_candidates[trial]) new_pot = new_dist_sq.sum() # Store result if it is the best local trial so far if (best_candidate is None) or (new_pot < best_pot): best_candidate = candidate_ids[trial] best_pot = new_pot best_dist_sq = new_dist_sq # Permanently add best center candidate found in local tries centers[c] = X[best_candidate] centers_id[c]=best_candidate current_pot = best_pot closest_dist_sq = best_dist_sq if return_indices: return centers_id else: return centers