使用 MSMBuilder 进行数据聚类

发表于 2019-08-21 分类于 python

　　之前的一篇博文讲述了通过scipy.cluster进行聚类，最近发现MSMBuilder也具有类似的功能，并且使用比较方便，遂记录于此。

准备工作

　　在构建 MSM 的过程中，一般会有如下降维的步骤：

import numpy as np

from msmbuilder.featurizer import RawPositionsFeaturizer
from msmbuilder.decomposition import PCA
from msmbuilder.cluster import MiniBatchKMeans

# NOTE: `atoms` (atom indices to keep) and `trajs` (the trajectories) are not
# defined in this snippet; they must be prepared beforehand.
featurizer = RawPositionsFeaturizer(atom_indices=atoms)
# trajs is a list with one trajectory per element; feat is a list whose
# elements are 2D arrays of shape (n_frames, n_features)
feat = featurizer.fit_transform(trajs)

pca = PCA(n_components=2)
# pca_traj is a list with one entry per trajectory; each entry is a 2D array
# of shape (n_frames, n_components)
pca_traj = pca.fit_transform(feat)
# All frames of all trajectories stacked into a single 2D array
pca_data = np.concatenate(pca_traj)

# Clustering
n_clusters = 200
clusterer = MiniBatchKMeans(n_clusters=n_clusters, random_state=1)
# fit() only fits the model and returns the estimator itself; fit_transform()
# fits it and returns the cluster assignments, one array per trajectory.
clustered_traj = clusterer.fit_transform(pca_traj)  # pca_traj is a list

clusterer.labels_           # cluster labels
clusterer.cluster_centers_  # cluster centers

　　先使用 RawPositionsFeaturizer 提取特征，然后使用 PCA 进行降维，再使用 MiniBatchKMeans 进行聚类。需要注意的是：

1. clusterer.fit(pca_traj) 表示使用 pca_traj 来拟合（fit）模型的参数；
2. clusterer.transform(new_data) 表示计算 new_data 在这个模型上的投影（对聚类器而言，即给出 new_data 中每一帧所属的聚类标签）；
3. pca_traj 是一个列表，它的每一个元素是一组数据（即一条轨迹对应的 2D array）；
4. 除了 MiniBatchKMeans 之外，msmbuilder 还提供了其它的聚类方案：

API	Description
KCenters	K-Centers clustering
KMeans	K-Means clustering
KMedoids	K-Medoids clustering
MiniBatchKMedoids	Mini-Batch K-Medoids clustering
RegularSpatial	Regular spatial clustering
LandmarkAgglomerative	Landmark-based agglomerative hierarchical clustering
AffinityPropagation	Perform Affinity Propagation Clustering of data
GMM	Gaussian Mixture
MeanShift	Mean shift clustering using a flat kernel
MiniBatchKMeans	Mini-Batch K-Means clustering
SpectralClustering	Apply clustering to a projection to the normalized laplacian
Ward	Ward hierarchical clustering

测试

%pylab 

from msmbuilder.cluster import MiniBatchKMeans

######## generate data
## Rectangle: points drawn uniformly from the unit square [0, 1) x [0, 1)
n = 200000
np.random.seed(100)  # fixed seed so the data set is reproducible
pos1 = np.random.random((n, 2))  # one (x, y) position per row

# Closed rings: two concentric annuli, with n points sampled per ring
n=100000
def gen_circle(radii,n):
  """Sample ``n`` points from an annulus centred on the origin.

  The global NumPy RNG is reseeded with ``200 + int(radii)``, so the same
  ``radii`` always yields the same points. The radial coordinate lies in
  ``[radii/8, (radii+1)/8)`` and the angle in ``[0, 2*pi)``.

  Returns a 2D array of shape ``(n, 2)`` holding the (x, y) coordinates.
  """
  np.random.seed(200+int(radii))
  samples=np.random.random(size=(n,2))
  # Column 0 drives the radial coordinate, column 1 the angle.
  radius=(samples[:,0]+radii)/8
  angle=samples[:,1]*2*np.pi
  xs=radius*np.cos(angle)
  ys=radius*np.sin(angle)
  return np.column_stack((xs,ys))
 
# Inner ring (radii=1) and outer ring (radii=3), stacked into one data set
p1, p2 = (gen_circle(radii=radius, n=n) for radius in (1, 3))
pos2 = np.concatenate([p1, p2])

# Half rings: two interleaved open arcs, with n points sampled per arc
n=100000
def gen_open(top,n):
  """Sample ``n`` points along a thickened half-sine arc.

  The global NumPy RNG is reseeded with ``300 + int(top)``. A truthy ``top``
  gives the upward-opening arc shifted down by 0.4; a falsy ``top`` gives the
  mirrored arc, shifted right by ``pi/2`` and up by 0.4.

  Returns a 2D array of shape ``(n, 2)`` holding the (x, y) coordinates.
  """
  np.random.seed(300+int(top))
  uv=np.random.random(size=(n,2))
  x=uv[:,0]*np.pi
  # The sine curve plus a uniform vertical jitter of width 0.2
  y=np.sin(x)+uv[:,1]*0.2
  if not top:
    return np.column_stack((x+np.pi/2,-1*y+0.4))
  return np.column_stack((x,y-0.4))
  
# Upper arc (top=1) and lower arc (top=0), stacked into one data set
p1, p2 = (gen_open(top=flag, n=n) for flag in (1, 0))
pos3 = np.concatenate([p1, p2])

######## plot data 
from msmbuilder.cluster import KCenters 
from msmbuilder.cluster import KMeans 
from msmbuilder.cluster import KMedoids 
from msmbuilder.cluster import MiniBatchKMedoids 
from msmbuilder.cluster import RegularSpatial 
from msmbuilder.cluster import LandmarkAgglomerative 
from msmbuilder.cluster import AffinityPropagation 
from msmbuilder.cluster import GMM 
from msmbuilder.cluster import MeanShift 
from msmbuilder.cluster import MiniBatchKMeans 
from msmbuilder.cluster import SpectralClustering  

def plot_cluster(data,method,n_clusters,ax):
  """Cluster ``data`` with an msmbuilder clusterer and scatter-plot the result.

  Parameters
  ----------
  data : 2D array
      Points to cluster; columns 0 and 1 are plotted as x and y.
  method : str
      Name of a clusterer class exported by ``msmbuilder.cluster``
      (e.g. ``'KMeans'``). It is built with ``n_clusters`` and
      ``random_state=1``.
  n_clusters : int
      Number of clusters requested from the clusterer.
  ax : matplotlib Axes
      Axes to draw on; its ticks and tick labels are removed.

  Raises
  ------
  ValueError
      If ``method`` does not name a class in ``msmbuilder.cluster``.
  """
  # Resolve the class by attribute lookup instead of eval(): this cannot
  # execute arbitrary code and gives a clear error for a misspelled name.
  from msmbuilder import cluster as msm_cluster

  clusterer_cls = getattr(msm_cluster, method, None)
  if not isinstance(clusterer_cls, type):
    raise ValueError('Unknown msmbuilder.cluster method: %r' % (method,))
  clusterer = clusterer_cls(n_clusters=int(n_clusters), random_state=1)
  print(clusterer)

  # msmbuilder estimators take a list of sequences, so the single data set is
  # wrapped in a list and its labels are read back from index 0.
  clusterer.fit([data])
  labels = clusterer.labels_[0]

  # Map the label range onto the jet colormap
  cmap = plt.cm.jet
  norm = matplotlib.colors.Normalize(vmin=labels.min(), vmax=labels.max())
  ax.scatter(data[:,0],data[:,1],c=cmap(norm(labels)))
  ax.set_xticks([])
  ax.set_yticks([])
  ax.set_xticklabels([])
  ax.set_yticklabels([])

# 
# Clustering methods to compare; each produces one PNG named after the method
methods=['KCenters','KMeans','MiniBatchKMeans','MiniBatchKMedoids']

for method in methods:
  fig=None
  try:
    # One row of three panels: rectangle, closed rings, half rings
    fig,axes=plt.subplots(figsize=(5.6,2),nrows=1,ncols=3)
    axes=axes.ravel()
    axes[0].set_ylabel(method)
    for ax,data in zip(axes,(pos1,pos2,pos3)):
      plot_cluster(data=data,method=method,n_clusters=10,ax=ax)
    fig.tight_layout()
    fig.savefig(method+'.png',dpi=300)
  except Exception as err:
    # Best effort: report the failing method together with its cause and move
    # on. Catching Exception (not a bare except) keeps Ctrl-C working.
    print('Error: %s (%r)\n\n'%(method,err))
  finally:
    # Always release the figure, including when clustering or saving failed
    if fig is not None:
      plt.close(fig)

效果如下：