I use this code as my reference:
I want to replace K-Means with K-Medoids.
This is my K-Medoids code:
Could you help me with how to do it?
Thanks.
Code:
import pylab as plt
import numpy as np
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
# Elbow plot for K-Means on the IRIS data set: fit one model per candidate
# number of clusters and plot the average distance of the samples to their
# nearest centroid.
iris = load_iris()
data = iris.data
n_samples = data.shape[0]

# Candidate numbers of clusters.
k = range(1, 11)

# One fitted K-Means model (and its centroids) per candidate.
clusters = [KMeans(n_clusters=c, init='k-means++').fit(data) for c in k]
centr_lst = [cc.cluster_centers_ for cc in clusters]

# Per model: Euclidean distance from every sample to every centroid.
k_distance = [cdist(data, cent, 'euclidean') for cent in centr_lst]
# Per model: index of, and distance to, the nearest centroid of each sample.
clust_indx = [np.argmin(kd, axis=1) for kd in k_distance]
distances = [np.min(kd, axis=1) for kd in k_distance]

# Mean (unsquared) distance to the nearest centroid -- the plotted quantity.
avg_within = [np.sum(dist) / n_samples for dist in distances]

# Within-cluster, total and between-cluster sums of squares.
with_in_sum_square = [np.sum(dist ** 2) for dist in distances]
to_sum_square = np.sum(pdist(data) ** 2) / n_samples
# Convert explicitly so the scalar-minus-list subtraction is element-wise
# by construction rather than by implicit NumPy coercion.
bet_sum_square = to_sum_square - np.asarray(with_in_sum_square)

# Position in `k` of the point to highlight (k[2] == 3 clusters).
kidx = 2

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k, avg_within, 'g*-')
ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
# avg_within is a mean distance, not a sum of squares, so label it as such.
plt.ylabel('Average within-cluster distance')
plt.title('Elbow for KMeans clustering (IRIS Data)')
This is my K-Medoids code:
Code:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from copy import deepcopy
from IPython import embed
import time
def _get_init_centers(n_clusters, n_samples):
'''return random points as initial centers'''
init_ids = []
while len(init_ids) < n_clusters:
_ = np.random.randint(0,n_samples)
if not _ in init_ids:
init_ids.append(_)
return init_ids
def _get_distance(data1, data2):
'''example distance function'''
return np.sqrt(np.sum((data1 - data2)**2))
def _get_cost(X, centers_id, dist_func):
'''return total cost and cost of each cluster'''
st = time.time()
dist_mat = np.zeros((len(X),len(centers_id)))
# compute distance matrix
for j in range(len(centers_id)):
center = X[centers_id[j],:]
for i in range(len(X)):
if i == centers_id[j]:
dist_mat[i,j] = 0.
else:
dist_mat[i,j] = dist_func(X[i,:], center)
#print 'cost ', -st+time.time()
mask = np.argmin(dist_mat,axis=1)
members = np.zeros(len(X))
costs = np.zeros(len(centers_id))
for i in range(len(centers_id)):
mem_id = np.where(mask==i)
members[mem_id] = i
costs[i] = np.sum(dist_mat[mem_id,i])
return members, costs, np.sum(costs), dist_mat
def _kmedoids_run(X, n_clusters, dist_func, max_iter=3, tol=0.000001, verbose=True):
    '''Run the swap-based k-medoids search.

    Starting from random centers, every non-center sample is tried as a
    replacement for every current center; a swap is kept only when it lowers
    the total cost by more than tol. Full sweeps over the data are repeated
    until a sweep makes no swap or max_iter sweeps have been done.

    Parameters
    --------
    X         : 2-D numpy array, size = (n_sample, n_features)
    n_clusters: number of clusters
    dist_func : distance function taking two rows of X
    max_iter  : maximum number of sweeps over the data
    tol       : minimum decrease in total cost for a swap to be accepted
    verbose   : print progress messages when True

    Returns
    --------
    centers, members, costs, tot_cost, dist_mat
        centers is the list of row indices of the final centers; the other
        four are the values _get_cost returns for those centers.
    '''
    # Unpacking two values also rejects input that is not 2-D.
    n_samples, n_features = X.shape

    # Get initial centers
    centers = _get_init_centers(n_clusters, n_samples)
    # Messages are built as one string so the output is the same whether
    # print is the Python 2 statement or the Python 3 function.
    if verbose:
        print('Initial centers are  %s' % (centers,))
    members, costs, tot_cost, dist_mat = _get_cost(X, centers, dist_func)

    n_sweeps = 0
    while True:
        swapped = False
        for i in range(n_samples):
            if i in centers:
                continue
            for j in range(len(centers)):
                trial = list(centers)
                trial[j] = i
                members_, costs_, tot_cost_, dist_mat_ = _get_cost(X, trial, dist_func)
                # Require a real improvement. Accepting equal or slightly
                # worse costs would let the search swap back and forth and
                # never finish with "no swaps".
                if tot_cost - tot_cost_ > tol:
                    members, costs, tot_cost, dist_mat = members_, costs_, tot_cost_, dist_mat_
                    centers = trial
                    swapped = True
                    if verbose:
                        print('Change centers to  %s' % (centers,))
                    # Sample i is a center now; trying it in the remaining
                    # slots would only build sets with a duplicated center.
                    break
        n_sweeps += 1
        if not swapped:
            if verbose:
                print('End Searching by no swaps')
            break
        if n_sweeps >= max_iter:
            if verbose:
                print('End Searching by reaching maximum iteration %s' % (max_iter,))
            break
    return centers, members, costs, tot_cost, dist_mat
class KMedoids(object):
    '''
    Main API of KMedoids Clustering

    Parameters
    --------
    n_clusters: number of clusters
    dist_func : distance function
    max_iter: maximum number of iterations (forwarded to _kmedoids_run)
    tol: tolerance (forwarded to _kmedoids_run)

    Attributes (available after fit)
    --------
    labels_   : integer cluster label for each data item
    centers_  : cluster centers id (row indices into the fitted X)
    costs_    : array of costs for each cluster
    tot_cost_ : total cost over all clusters
    dist_mat_ : distances from every data item to every center

    Methods
    -------
    fit(X): fit the model and return the fitted estimator
    - X: 2-D numpy array, size = (n_sample, n_features)
    predict(X): predict cluster id given a test dataset (not implemented).
    '''

    def __init__(self, n_clusters, dist_func=_get_distance, max_iter=3, tol=0.000001):
        self.n_clusters = n_clusters
        self.dist_func = dist_func
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X, plotit=True, verbose=True):
        '''Cluster X, store the result on the instance and return self.

        Parameters
        --------
        X      : 2-D numpy array, size = (n_sample, n_features)
        plotit : when True, scatter-plot the first two columns of X by
                 cluster, with each center drawn as a large star
        verbose: forwarded to _kmedoids_run

        Returns
        --------
        self, so that KMedoids(n).fit(X) can be used as an expression.
        '''
        centers, members, costs, tot_cost, dist_mat = _kmedoids_run(
            X, self.n_clusters, self.dist_func,
            max_iter=self.max_iter, tol=self.tol, verbose=verbose)

        # Keep the outcome; without this the fitted model cannot be queried.
        self.centers_ = centers
        self.labels_ = np.asarray(members).astype(int)
        self.costs_ = costs
        self.tot_cost_ = tot_cost
        self.dist_mat_ = dist_mat

        if plotit:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            for label, center_id in enumerate(centers):
                X_c = X[members == label, :]
                ax.scatter(X_c[:, 0], X_c[:, 1], label=label + 1, alpha=0.5, s=30)
                ax.scatter(X[center_id, 0], X[center_id, 1], alpha=1., s=250, marker='*')
            # Recolor every scatter collection from one colormap, in the
            # order the collections were added to the axes.
            colormap = plt.cm.gist_ncar  # nipy_spectral, Set1,Paired
            colorst = [colormap(v) for v in np.linspace(0, 0.9, len(ax.collections))]
            for collection, color in zip(ax.collections, colorst):
                collection.set_color(color)
        return self

    def predict(self, X):
        '''Predict cluster id for new data; not implemented.'''
        raise NotImplementedError()
Thanks.