How to use the elbow method for clustering evaluation with K-Medoids

  • kikiegoguma
    New Member
    • Oct 2016
    • 2

    How to use the elbow method for clustering evaluation with K-Medoids

    I am using this code as my reference:


    Code:
    import pylab as plt
    import numpy as np
    from scipy.spatial.distance import cdist, pdist
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    
    iris = load_iris()
    
    k = range(1,11)
    
    clusters = [KMeans(n_clusters = c,init = 'k-means++').fit(iris.data) for c in k]
    centr_lst = [cc.cluster_centers_ for cc in clusters]
    
    # distance from every point to each cluster centre, for each value of k
    k_distance = [cdist(iris.data, cent, 'euclidean') for cent in centr_lst]
    clust_indx = [np.argmin(kd, axis=1) for kd in k_distance]
    distances = [np.min(kd, axis=1) for kd in k_distance]
    # average within-cluster distance (the quantity plotted for the elbow)
    avg_within = [np.sum(dist)/iris.data.shape[0] for dist in distances]
    
    with_in_sum_square = [np.sum(dist ** 2) for dist in distances]
    to_sum_square = np.sum(pdist(iris.data) ** 2)/iris.data.shape[0]
    # convert the list to an array so the subtraction broadcasts
    bet_sum_square = to_sum_square - np.array(with_in_sum_square)
    
    kidx = 2  # index into k of the elbow to highlight (k[2] = 3 clusters)
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(k, avg_within, 'g*-')
    ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12, \
    markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering (IRIS Data)')
    plt.show()
    I want to replace K-Means with K-Medoids in the code above.
    This is my K-Medoids code (my rough idea for the elbow loop follows after the class):
    Code:
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    from copy import deepcopy
    from IPython import embed
    import time
    
    def _get_init_centers(n_clusters, n_samples):
        '''return random points as initial centers'''
        init_ids = []
        while len(init_ids) < n_clusters:
            _ = np.random.randint(0,n_samples)
            if not _ in init_ids:
                init_ids.append(_)
        return init_ids
    
    def _get_distance(data1, data2):
        '''example distance function'''
        return np.sqrt(np.sum((data1 - data2)**2))
    
    def _get_cost(X, centers_id, dist_func):
        '''return total cost and cost of each cluster'''
        st = time.time()
        dist_mat = np.zeros((len(X),len(centers_id)))
        # compute distance matrix
        for j in range(len(centers_id)):
            center = X[centers_id[j],:]
            for i in range(len(X)):
                if i == centers_id[j]:
                    dist_mat[i,j] = 0.
                else:
                    dist_mat[i,j] = dist_func(X[i,:], center)
        # print('cost ', time.time() - st)
        mask = np.argmin(dist_mat,axis=1)
        members = np.zeros(len(X))
        costs = np.zeros(len(centers_id))
        for i in range(len(centers_id)):
            mem_id = np.where(mask==i)
            members[mem_id] = i
            costs[i] = np.sum(dist_mat[mem_id,i])
        return members, costs, np.sum(costs), dist_mat
    
    def _kmedoids_run(X, n_clusters, dist_func, max_iter=3, tol=0.000001, verbose=True):
        '''run algorithm return centers, members, and etc.'''
        # Get initial centers
        n_samples, n_features = X.shape
        init_ids = _get_init_centers(n_clusters, n_samples)
        if verbose:
            print('Initial centers are ', init_ids)
        centers = init_ids
        members, costs, tot_cost, dist_mat = _get_cost(X, init_ids, dist_func)
        cc, swapped = 0, True
        while True:
            swapped = False
            for i in range(n_samples):
                if i not in centers:
                    for j in range(len(centers)):
                        centers_ = deepcopy(centers)
                        centers_[j] = i
                        members_, costs_, tot_cost_, dist_mat_ = _get_cost(X, centers_, dist_func)
                        # accept the swap only if it lowers the total cost by more than tol
                        if tot_cost_ - tot_cost < -tol:
                            members, costs, tot_cost, dist_mat = members_, costs_, tot_cost_, dist_mat_
                            centers = centers_
                            swapped = True
                            if verbose:
                                print('Change centers to ', centers)
            if cc > max_iter:
                if verbose:
                    print('End Searching by reaching maximum iteration', max_iter)
                break
            if not swapped:
                if verbose:
                    print('End Searching by no swaps')
                break
            cc += 1
        return centers, members, costs, tot_cost, dist_mat
    
    class KMedoids(object):
        '''
        Main API of KMedoids Clustering
    
        Parameters
        --------
            n_clusters: number of clusters
            dist_func : distance function
            max_iter: maximum number of iterations
            tol: tolerance
    
        Attributes
        --------
            labels_    :  cluster labels for each data item
            centers_   :  cluster centers id
            costs_     :  array of costs for each cluster
            n_iter_    :  number of iterations for the best trail
    
        Methods
        -------
            fit(X): fit the model
                - X: 2-D numpy array, size = (n_sample, n_features)
    
            predict(X): predict cluster id given a test dataset.
        '''
        def __init__(self, n_clusters, dist_func=_get_distance, max_iter=3, tol=0.000001):
            self.n_clusters = n_clusters
            self.dist_func = dist_func
            self.max_iter = max_iter
            self.tol = tol
    
        def fit(self, X, plotit=True, verbose=True):
            centers, members, costs, tot_cost, dist_mat = _kmedoids_run(
                X, self.n_clusters, self.dist_func, max_iter=self.max_iter, tol=self.tol, verbose=verbose)
            # store the results as the attributes listed in the class docstring
            self.centers_ = centers
            self.labels_ = members
            self.costs_ = costs
            if plotit:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                for i in range(len(centers)):
                    X_c = X[members == i, :]
                    ax.scatter(X_c[:, 0], X_c[:, 1], label=i + 1, alpha=0.5, s=30)
                    ax.scatter(X[centers[i], 0], X[centers[i], 1], alpha=1., s=250, marker='*')
                #ax.legend(bbox_to_anchor=(1, 1), fontsize="small", loc=2, borderaxespad=0.)
                colormap = plt.cm.gist_ncar  # nipy_spectral, Set1, Paired
                colorst = [colormap(i) for i in np.linspace(0, 0.9, len(ax.collections))]
                for t, j1 in enumerate(ax.collections):
                    j1.set_color(colorst[t])
            return self
    
    
        def predict(self,X):
            raise NotImplementedError()
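    This is my rough idea for the elbow loop. It assumes fit() fills in the costs_ attribute listed in the class docstring, and it uses the average distance of each point to its medoid in place of the average within-cluster sum of squares, so I am not sure it is correct:
    Code:
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    
    iris = load_iris()
    
    k = range(1, 11)
    avg_within = []
    
    for c in k:
        km = KMedoids(n_clusters=c, dist_func=_get_distance)
        # plotit=False so a scatter plot is not drawn for every value of k;
        # note: the pure-Python swap search is slow, so this loop can take a while
        km.fit(iris.data, plotit=False, verbose=False)
        # total within-cluster cost = sum of distances of each point to its medoid,
        # averaged over the number of samples (analogous to avg_within above)
        avg_within.append(np.sum(km.costs_) / iris.data.shape[0])
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(list(k), avg_within, 'g*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster distance to medoid')
    plt.title('Elbow for KMedoids clustering (IRIS Data)')
    plt.show()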
    Could you help me with how to do this?
    Thanks.