introclusterproblem
April 27, 2024
Intradata Face Clustering: an introduction to "unsupervised" machine learning problems, applied to the problem at hand:
1. How to group similar people.
[ ]: import pandas as pd
import numpy as np
import seaborn as sns
from pickle import load
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
from sklearn import cluster
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
from warnings import filterwarnings
filterwarnings('ignore')
#plt.rcParams['figure.figsize'] = [13, 6]
#plt.rcParams['font.size'] = 13
[ ]: from umap.umap_ import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
1.0. Artificial Datasets
1.1. Make Dataset
[ ]: #np.random.seed(7)
# Random sample with overlapping and grouped points
X, Y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
    return_centers=False,
)
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=Y, ax=ax[0], palette=['red', 'blue'])
ax[1].hist(X[:,0], label='0', color='red');
ax[1].hist(X[:,1], label='1', color='blue');
ax[0].set_title('Clusters for 2-D');
ax[1].set_title('Hists for 2-D');
ax[1].legend();
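A quick sanity check on the generated sample can confirm the blob sizes and their empirical centers; a minimal sketch using the `X` and `Y` arrays above:
[ ]: # Points per blob, then the empirical center of each blob.
print(np.bincount(Y))
print(X[Y == 0].mean(axis=0), X[Y == 1].mean(axis=0))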
1.2. Apply Cluster Models
1.2.1. KMeans + Metrics
[ ]: clusters = [2, 3, 4, 5, 6]
kmeans = KElbowVisualizer(cluster.KMeans(), k=clusters, metric='silhouette')
kmeans.fit(X)
ax1 = kmeans.show();
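As a cross-check on the visualizer, the mean silhouette coefficient for each candidate k can also be computed directly with `sklearn.metrics.silhouette_score`; a minimal sketch reusing the `clusters` list above:
[ ]: from sklearn.metrics import silhouette_score

# Higher mean silhouette indicates more compact, better-separated clusters.
for k in clusters:
    labels = cluster.KMeans(n_clusters=k, n_init=10).fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))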
[ ]: fig, ax = plt.subplots(2, 2, figsize=(8, 6))
ax = ax.flatten()
for k, i in zip(clusters, ax):
    kmeans = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=10)
    kmeans.fit(X)
    i.scatter(x=X[:,0], y=X[:,1], c=kmeans.labels_, cmap='Dark2')
    i.set_title(f'Number of Clusters: {k}')
plt.tight_layout()
[ ]: # Predict the closest cluster for a new user image (demonstrated here on the training data).
kmeans.predict(X)
[ ]: array([3, 4, 2, 4, 4, 1, 1, 3, 4, 0, 0, 0, 2, 0, 0, 1, 3, 1, 2, 0, 2, 0,
1, 2, 4, 1, 4, 4, 0, 1, 0, 2, 0, 0, 2, 3, 4, 4, 0, 3, 4, 1, 1, 4,
3, 1, 0, 0, 3, 3, 2, 2, 3, 4, 4, 1, 4, 4, 0, 0, 2, 4, 2, 4, 4, 0,
1, 0, 2, 2, 4, 0, 0, 0, 3, 1, 2, 2, 3, 2, 3, 1, 3, 0, 0, 0, 0, 1,
2, 0, 3, 1, 4, 1, 4, 4, 3, 2, 3, 2], dtype=int32)
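The same fitted model can assign a cluster to a previously unseen sample; a minimal sketch (the 2-D point below is made up for illustration, and `kmeans` is the last model fitted in the loop above):
[ ]: # Hypothetical new sample in the same 2-D feature space.
new_point = np.array([[0.5, -1.0]])
kmeans.predict(new_point)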
[ ]: fig, ax = plt.subplots(2, 2, figsize=(10, 10))
ax = ax.flatten()
for k, i in zip(clusters, ax):
    kmeans = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=10)
    viz = SilhouetteVisualizer(kmeans, ax=i)
    viz.fit(X)
    viz.finalize()
plt.tight_layout()
1.2.2. Shared Nearest Neighbors
[ ]: from SharedNearestNeighbors.shared_nearest_neighbors import SNN

eps = [3, 4, 5, 6]
fig, ax = plt.subplots(2, 2)
ax = ax.flatten()
for ep, i in zip(eps, range(len(eps))):
    snn = SNN(
        n_neighbors=7,
        eps=ep,
        min_samples=2,
        algorithm="auto",
        leaf_size=30,
        metric="euclidean",
        p=None,
        metric_params=None,
    ).fit(X)
    sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn.labels_, palette='inferno', size=snn.labels_, ax=ax[i])
    ax[i].set_title(f"Eps: {ep}")
    print(f'For {ep}, size is: {np.unique(snn.labels_).shape}')
For 3, size is: (1,)
For 4, size is: (5,)
For 5, size is: (15,)
For 6, size is: (25,)
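Since these labels follow DBSCAN's convention of marking noise as -1, it can help to count clusters and noise points separately; a sketch using the last model fitted in the loop above:
[ ]: labels = snn.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = int(np.sum(labels == -1))
print(f'Clusters: {n_clusters}, noise points: {n_noise}')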
[ ]: snn = SNN(
    n_neighbors=8,
    eps=7,
    min_samples=1,
    algorithm="auto",
    leaf_size=30,
    metric="euclidean",
    p=None,
    metric_params=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn.labels_, palette='inferno', size=snn.labels_)
print(f'Unique Clusters: {np.unique(snn.labels_).shape}')
Unique Clusters: (54,)
1.2.3. Shared Nearest Neighbors 2
[ ]: # Note: these keyword arguments match the manual SNN implementation in 1.2.6,
# not the SNN imported in 1.2.2, which takes n_neighbors/eps/min_samples.
snn_model = SNN(neighbor_num=1, min_shared_neighbor_proportion=1).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn_model.labels_, palette='inferno', size=snn_model.labels_)
[ ]: <AxesSubplot: >
[ ]: np.unique(snn_model.labels_).shape
[ ]: (71,)
1.2.4. OPTICS
[ ]: opt = cluster.OPTICS(
    min_samples=2,
    max_eps=np.inf,
    metric="euclidean",
    p=2,
    metric_params=None,
    cluster_method="xi",
    eps=1,  # only used when cluster_method="dbscan"; ignored for "xi"
    xi=0.05,
    predecessor_correction=True,
    min_cluster_size=None,
    algorithm="auto",
    leaf_size=30,
    memory=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=opt.labels_, palette='inferno', size=opt.labels_)
np.unique(opt.labels_).shape
[ ]: (28,)
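Because OPTICS stores the full reachability ordering, DBSCAN-style flat clusters can be extracted from the fitted model at any eps without refitting, via `sklearn.cluster.cluster_optics_dbscan`; a sketch (eps=2.0 is an arbitrary choice):
[ ]: labels_eps = cluster.cluster_optics_dbscan(
    reachability=opt.reachability_,
    core_distances=opt.core_distances_,
    ordering=opt.ordering_,
    eps=2.0,
)
np.unique(labels_eps).shape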
1.2.5. DBSCAN
[ ]: dbs = cluster.DBSCAN(
    eps=0.001,
    min_samples=1,
    metric="canberra",
    metric_params=None,
    algorithm="auto",
    leaf_size=30,
    p=None,
    n_jobs=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=dbs.labels_, palette='inferno', size=dbs.labels_)
np.unique(dbs.labels_).shape
[ ]: (100,)
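With eps=0.001 and min_samples=1, every point becomes its own cluster, which is what the (100,) above shows. A common way to pick a workable eps is the sorted k-distance plot, where the "knee" suggests a density threshold; a minimal sketch with `NearestNeighbors` (k=5 is an arbitrary choice, and distances here are Euclidean rather than Canberra):
[ ]: from sklearn.neighbors import NearestNeighbors

k = 5
# n_neighbors=k+1 because the query point itself is returned at distance 0.
dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
plt.plot(np.sort(dist[:, -1]))
plt.xlabel('points sorted by k-distance')
plt.ylabel(f'distance to {k}-th neighbor');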
1.2.6. Manual Configuration
[ ]: # Manual config: a reference SNN implementation, kept commented out.
#from sklearn.cluster import DBSCAN
#from sklearn.neighbors import kneighbors_graph
#
#import numpy as np
#from sklearn.base import BaseEstimator, ClusterMixin
#
#
#def get_snn_similarity(x0, x1):
#    """Calculate the shared-neighbor similarity of two sets of nearest neighbors, normalized by the maximum number of shared neighbors"""
#    return len(x0.intersection(x1)) / len(x0)
#
#
#def get_snn_distance(x0, x1):
#    """Calculate the shared-neighbor distance of two sets of nearest neighbors, normalized by the maximum number of shared neighbors"""
#    return 1 - get_snn_similarity(x0, x1)
#
#
#def snn(X, neighbor_num, min_shared_neighbor_num):
#    """Perform Shared Nearest Neighbor (SNN) clustering.
#    Parameters
#    ----------
#    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
#        A feature array
#    neighbor_num : int
#        K number of neighbors to consider for shared nearest neighbor similarity
#    min_shared_neighbor_num : int
#        Number of nearest neighbors that two data points need to share to be considered part of the same cluster
#    """
#    # for each data point, find their set of K nearest neighbors
#    knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
#    neighbors = np.array([set(knn_graph[i].nonzero()[1]) for i in range(len(X))])
#
#    # the distance matrix is the complement of the proportion of shared neighbors between each pair of data points
#    snn_distance_matrix = np.asarray([[get_snn_distance(neighbors[i], neighbors[j]) for j in range(len(neighbors))] for i in range(len(neighbors))])
#
#    # perform DBSCAN with the shared-neighbor distance criteria for density estimation
#    dbscan = DBSCAN(min_samples=min_shared_neighbor_num, metric="precomputed")
#    dbscan = dbscan.fit(snn_distance_matrix)
#    return dbscan.core_sample_indices_, dbscan.labels_
#
#
#class SNN(BaseEstimator, ClusterMixin):
#    """Class for performing the Shared Nearest Neighbor (SNN) clustering algorithm.
#    Parameters
#    ----------
#    neighbor_num : int
#        K number of neighbors to consider for shared nearest neighbor similarity
#    min_shared_neighbor_proportion : float [0, 1]
#        Proportion of the K nearest neighbors that two data points need to share to be considered part of the same cluster
#    Note: Naming conventions for attributes are based on the analogous ones of DBSCAN
#    """
#
#    def __init__(self, neighbor_num, min_shared_neighbor_proportion):
#        """Constructor"""
#        self.neighbor_num = neighbor_num
#        self.min_shared_neighbor_num = round(neighbor_num * min_shared_neighbor_proportion)
#
#    def fit(self, X):
#        """Perform SNN clustering from features or distance matrix.
#        Parameters
#        ----------
#        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
#            A feature array
#        """
#        clusters = snn(X, neighbor_num=self.neighbor_num, min_shared_neighbor_num=self.min_shared_neighbor_num)
#        self.core_sample_indices_, self.labels_ = clusters
#        if len(self.core_sample_indices_):
#            # fix for scipy sparse indexing issue
#            self.components_ = X[self.core_sample_indices_].copy()
#        else:
#            # no core samples
#            self.components_ = np.empty((0, X.shape[1]))
#        return self
#
#    def fit_predict(self, X, y=None, sample_weight=None):
#        """Performs clustering on X and returns cluster labels.
#        Parameters
#        ----------
#        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
#            A feature array, or array of distances between samples if ``metric='precomputed'``.
#        sample_weight : array, shape (n_samples,), optional
#            Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core.
#            Note that weights are absolute, and default to 1.
#        y : Ignored
#        Returns
#        -------
#        y : ndarray, shape (n_samples,)
#            cluster labels
#        """
#        self.fit(X)
#        return self.labels_
#
#snn_model = SNN(neighbor_num=1, min_shared_neighbor_proportion=0.5).fit(X[:, :337].sample(10_000))
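The core idea of the commented implementation above, DBSCAN over a precomputed shared-neighbor distance matrix, can be exercised directly on the blob data without uncommenting the class; a minimal sketch (k=7, eps=0.5, and min_samples=3 are arbitrary choices):
[ ]: from sklearn.cluster import DBSCAN
from sklearn.neighbors import kneighbors_graph

k = 7
knn = kneighbors_graph(X, n_neighbors=k, include_self=False)
neigh = [set(knn[i].nonzero()[1]) for i in range(X.shape[0])]
# Distance = 1 - proportion of shared k-nearest neighbors.
D = np.array([[1 - len(a & b) / k for b in neigh] for a in neigh])
labels = DBSCAN(eps=0.5, min_samples=3, metric='precomputed').fit_predict(D)
np.unique(labels).shape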