"""K-means clustering"""

# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
#          Thomas Rueckstiess <ruecksti@in.tum.de>
#          James Bergstra <james.bergstra@umontreal.ca>
#          Jan Schlueter <scikit-learn@jan-schlueter.de>
#          Nelle Varoquaux
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
# License: BSD

import warnings
from itertools import cycle, izip

import numpy as np
import scipy.sparse as sp
from math import floor

from scikits.learn.base import BaseEstimator
from scikits.learn.metrics.pairwise import euclidean_distances
from scikits.learn.utils import check_random_state
from scikits.learn.utils import check_arrays
from scikits.learn.utils import shuffle
from scikits.learn.utils import gen_even_slices

from scikits.learn.cluster import _k_means


###############################################################################
# Initialisation heuristic


def k_init(X, k, n_local_trials=None, random_state=None, x_squared_norms=None, return_indices=False):
    """Init k seeds according to kmeans++

    Parameters
    -----------
    X: array, shape (n_samples, n_features)
        The data to pick seeds for

    k: integer
        The number of seeds to choose

    n_local_trials: integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    random_state: numpy.RandomState, optional
        The generator used to initialize the centers. Defaults to numpy.random.

    x_squared_norms: array, shape (n_samples,), optional
        Squared euclidean norm of each data point. Pass it if you have it at
        hands already to avoid it being recomputed here. Default: None
    
    return_indices: boolean, optional
        True, return the indices of X which are chosen to be seeds
        False, return the seeds themselves

    Notes
    ------
    Selects initial cluster centers for k-mean clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape
    random_state = check_random_state(random_state)

    centers = np.empty((k, n_features))
    centers_id = np.empty(k,dtype=np.int)
    
    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(k))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]
    centers_id[0] = center_id
    
    # Initialize list of closest distances and calculate current potential
    if x_squared_norms is None:
        x_squared_norms = X.copy()
        x_squared_norms **= 2
        x_squared_norms = x_squared_norms.sum(axis=1)
    closest_dist_sq = euclidean_distances(
        np.atleast_2d(centers[0]), X, Y_norm_squared=x_squared_norms,
        squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining k-1 points
    for c in xrange(1, k):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in xrange(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]
        centers_id[c]=best_candidate
        current_pot = best_pot
        closest_dist_sq = best_dist_sq
        
    if return_indices:
        return centers_id
    else:
        return centers