How to use the elbow method for clustering evaluation with K-Medoids

  • kikiegoguma
    New Member
    • Oct 2016
    • 2

    How to use the elbow method for clustering evaluation with K-Medoids

    I am using this code as my reference:


    Code:
    import pylab as plt
    import numpy as np
    from scipy.spatial.distance import cdist, pdist
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    
    iris = load_iris()
    
    k = range(1,11)
    
    clusters = [KMeans(n_clusters = c,init = 'k-means++').fit(iris.data) for c in k]
    centr_lst = [cc.cluster_centers_ for cc in clusters]
    
    # distance from every point to each cluster centre, for each value of k
    k_distance = [cdist(iris.data, cent, 'euclidean') for cent in centr_lst]
    clust_indx = [np.argmin(kd, axis=1) for kd in k_distance]
    distances = [np.min(kd, axis=1) for kd in k_distance]
    # average within-cluster distance (the quantity plotted for the elbow)
    avg_within = [np.sum(dist)/iris.data.shape[0] for dist in distances]
    
    with_in_sum_square = [np.sum(dist ** 2) for dist in distances]
    to_sum_square = np.sum(pdist(iris.data) ** 2)/iris.data.shape[0]
    # convert the list to an array so the subtraction broadcasts
    bet_sum_square = to_sum_square - np.array(with_in_sum_square)
    
    kidx = 2  # index into k of the elbow to highlight (k[2] = 3 clusters)
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(k, avg_within, 'g*-')
    ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12, \
    markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering (IRIS Data)')
    plt.show()
    I want to replace K-Means with K-Medoids in the code above.
    This is my K-Medoids code (my rough idea for the elbow loop follows after the class):
    Code:
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    from copy import deepcopy
    from IPython import embed
    import time
    
    def _get_init_centers(n_clusters, n_samples):
        '''return random points as initial centers'''
        init_ids = []
        while len(init_ids) < n_clusters:
            _ = np.random.randint(0,n_samples)
            if not _ in init_ids:
                init_ids.append(_)
        return init_ids
    
    def _get_distance(data1, data2):
        '''example distance function'''
        return np.sqrt(np.sum((data1 - data2)**2))
    
    def _get_cost(X, centers_id, dist_func):
        '''return total cost and cost of each cluster'''
        st = time.time()
        dist_mat = np.zeros((len(X),len(centers_id)))
        # compute distance matrix
        for j in range(len(centers_id)):
            center = X[centers_id[j],:]
            for i in range(len(X)):
                if i == centers_id[j]:
                    dist_mat[i,j] = 0.
                else:
                    dist_mat[i,j] = dist_func(X[i,:], center)
        # print('cost ', time.time() - st)
        mask = np.argmin(dist_mat,axis=1)
        members = np.zeros(len(X))
        costs = np.zeros(len(centers_id))
        for i in range(len(centers_id)):
            mem_id = np.where(mask==i)
            members[mem_id] = i
            costs[i] = np.sum(dist_mat[mem_id,i])
        return members, costs, np.sum(costs), dist_mat
    
    def _kmedoids_run(X, n_clusters, dist_func, max_iter=3, tol=0.000001, verbose=True):
        '''run algorithm return centers, members, and etc.'''
        # Get initial centers
        n_samples, n_features = X.shape
        init_ids = _get_init_centers(n_clusters, n_samples)
        if verbose:
            print('Initial centers are ', init_ids)
        centers = init_ids
        members, costs, tot_cost, dist_mat = _get_cost(X, init_ids, dist_func)
        cc, swapped = 0, True
        while True:
            swapped = False
            for i in range(n_samples):
                if i not in centers:
                    for j in range(len(centers)):
                        centers_ = deepcopy(centers)
                        centers_[j] = i
                        members_, costs_, tot_cost_, dist_mat_ = _get_cost(X, centers_, dist_func)
                        # accept the swap only if it lowers the total cost by more than tol
                        if tot_cost_ - tot_cost < -tol:
                            members, costs, tot_cost, dist_mat = members_, costs_, tot_cost_, dist_mat_
                            centers = centers_
                            swapped = True
                            if verbose:
                                print('Change centers to ', centers)
            if cc > max_iter:
                if verbose:
                    print('End Searching by reaching maximum iteration', max_iter)
                break
            if not swapped:
                if verbose:
                    print('End Searching by no swaps')
                break
            cc += 1
        return centers, members, costs, tot_cost, dist_mat
    
    class KMedoids(object):
        '''
        Main API of KMedoids Clustering
    
        Parameters
        --------
            n_clusters: number of clusters
            dist_func : distance function
            max_iter: maximum number of iterations
            tol: tolerance
    
        Attributes
        --------
            labels_    :  cluster labels for each data item
            centers_   :  cluster centers id
            costs_     :  array of costs for each cluster
            n_iter_    :  number of iterations for the best trail
    
        Methods
        -------
            fit(X): fit the model
                - X: 2-D numpy array, size = (n_sample, n_features)
    
            predict(X): predict cluster id given a test dataset.
        '''
        def __init__(self, n_clusters, dist_func=_get_distance, max_iter=3, tol=0.000001):
            self.n_clusters = n_clusters
            self.dist_func = dist_func
            self.max_iter = max_iter
            self.tol = tol
    
        def fit(self, X, plotit=True, verbose=True):
            centers, members, costs, tot_cost, dist_mat = _kmedoids_run(
                X, self.n_clusters, self.dist_func, max_iter=self.max_iter, tol=self.tol, verbose=verbose)
            # store the results as the attributes listed in the class docstring
            self.centers_ = centers
            self.labels_ = members
            self.costs_ = costs
            if plotit:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                for i in range(len(centers)):
                    X_c = X[members == i, :]
                    ax.scatter(X_c[:, 0], X_c[:, 1], label=i + 1, alpha=0.5, s=30)
                    ax.scatter(X[centers[i], 0], X[centers[i], 1], alpha=1., s=250, marker='*')
                #ax.legend(bbox_to_anchor=(1, 1), fontsize="small", loc=2, borderaxespad=0.)
                colormap = plt.cm.gist_ncar  # nipy_spectral, Set1, Paired
                colorst = [colormap(i) for i in np.linspace(0, 0.9, len(ax.collections))]
                for t, j1 in enumerate(ax.collections):
                    j1.set_color(colorst[t])
            return self
    
    
        def predict(self,X):
            raise NotImplementedError()
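    This is my rough idea for the elbow loop. It assumes fit() fills in the costs_ attribute listed in the class docstring, and it uses the average distance of each point to its medoid in place of the average within-cluster sum of squares, so I am not sure it is correct:
    Code:
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    
    iris = load_iris()
    
    k = range(1, 11)
    avg_within = []
    
    for c in k:
        km = KMedoids(n_clusters=c, dist_func=_get_distance)
        # plotit=False so a scatter plot is not drawn for every value of k;
        # note: the pure-Python swap search is slow, so this loop can take a while
        km.fit(iris.data, plotit=False, verbose=False)
        # total within-cluster cost = sum of distances of each point to its medoid,
        # averaged over the number of samples (analogous to avg_within above)
        avg_within.append(np.sum(km.costs_) / iris.data.shape[0])
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(list(k), avg_within, 'g*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster distance to medoid')
    plt.title('Elbow for KMedoids clustering (IRIS Data)')
    plt.show()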
    Could you help me with how to do this?
    Thanks.