introclusterproblem
April 27, 2024
Intradata Face Clustering: an introduction to "unsupervised" machine learning problems, applied to the problem at hand:
1. How to group similar people.
[ ]: import pandas as pd
import numpy as np
import seaborn as sns
from pickle import load
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
from sklearn import cluster
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
from warnings import filterwarnings
filterwarnings('ignore')
#plt.rcParams['figure.figsize'] = [13, 6]
#plt.rcParams['font.size'] = 13
[ ]: from umap.umap_ import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
1.0. Artificial Datasets
1.1. Make Dataset
[ ]: #np.random.seed(7)
# Random sample with overlapping and grouped points
X, Y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
    return_centers=False,
)
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=Y, ax=ax[0], palette=['red', 'blue'])
ax[1].hist(X[:,0], label='0', color='red');
ax[1].hist(X[:,1], label='1', color='blue');
ax[0].set_title('Clusters for 2-D');
ax[1].set_title('Hists for 2-D');
ax[1].legend();
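A quick sanity check on the generated sample can confirm the blob sizes and their empirical centers; a minimal sketch using the `X` and `Y` arrays above:
[ ]: # Points per blob, then the empirical center of each blob.
print(np.bincount(Y))
print(X[Y == 0].mean(axis=0), X[Y == 1].mean(axis=0))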
1.2. Apply Cluster Models
1.2.1. KMeans + Metrics
[ ]: clusters = [2, 3, 4, 5, 6]
kmeans = KElbowVisualizer(cluster.KMeans(), k=clusters, metric='silhouette')
kmeans.fit(X)
ax1 = kmeans.show();
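As a cross-check on the visualizer, the mean silhouette coefficient for each candidate k can also be computed directly with `sklearn.metrics.silhouette_score`; a minimal sketch reusing the `clusters` list above:
[ ]: from sklearn.metrics import silhouette_score

# Higher mean silhouette indicates more compact, better-separated clusters.
for k in clusters:
    labels = cluster.KMeans(n_clusters=k, n_init=10).fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))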
[ ]: fig, ax = plt.subplots(2, 2, figsize=(8, 6))
ax = ax.flatten()
for k, i in zip(clusters, ax):
    kmeans = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=10)
    kmeans.fit(X)
    i.scatter(x=X[:,0], y=X[:,1], c=kmeans.labels_, cmap='Dark2')
    i.set_title(f'Number of Clusters: {k}')
plt.tight_layout()
[ ]: # Predict the closest cluster for a new user image (demonstrated here on the training data).
kmeans.predict(X)
[ ]: array([3, 4, 2, 4, 4, 1, 1, 3, 4, 0, 0, 0, 2, 0, 0, 1, 3, 1, 2, 0, 2, 0,
1, 2, 4, 1, 4, 4, 0, 1, 0, 2, 0, 0, 2, 3, 4, 4, 0, 3, 4, 1, 1, 4,
3, 1, 0, 0, 3, 3, 2, 2, 3, 4, 4, 1, 4, 4, 0, 0, 2, 4, 2, 4, 4, 0,
1, 0, 2, 2, 4, 0, 0, 0, 3, 1, 2, 2, 3, 2, 3, 1, 3, 0, 0, 0, 0, 1,
2, 0, 3, 1, 4, 1, 4, 4, 3, 2, 3, 2], dtype=int32)
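The same fitted model can assign a cluster to a previously unseen sample; a minimal sketch (the 2-D point below is made up for illustration, and `kmeans` is the last model fitted in the loop above):
[ ]: # Hypothetical new sample in the same 2-D feature space.
new_point = np.array([[0.5, -1.0]])
kmeans.predict(new_point)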
[ ]: fig, ax = plt.subplots(2, 2, figsize=(10, 10))
ax = ax.flatten()
for k, i in zip(clusters, ax):
    kmeans = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=10)
    viz = SilhouetteVisualizer(kmeans, ax=i)
    viz.fit(X)
    viz.finalize()
plt.tight_layout()
1.2.2. Shared Nearest Neighbors
[ ]: from SharedNearestNeighbors.shared_nearest_neighbors import SNN

eps = [3, 4, 5, 6]
fig, ax = plt.subplots(2, 2)
ax = ax.flatten()
for ep, i in zip(eps, range(len(eps))):
    snn = SNN(
        n_neighbors=7,
        eps=ep,
        min_samples=2,
        algorithm="auto",
        leaf_size=30,
        metric="euclidean",
        p=None,
        metric_params=None,
    ).fit(X)
    sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn.labels_, palette='inferno', size=snn.labels_, ax=ax[i])
    ax[i].set_title(f"Eps: {ep}")
    print(f'For {ep}, size is: {np.unique(snn.labels_).shape}')
For 3, size is: (1,)
For 4, size is: (5,)
For 5, size is: (15,)
For 6, size is: (25,)
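Since these labels follow DBSCAN's convention of marking noise as -1, it can help to count clusters and noise points separately; a sketch using the last model fitted in the loop above:
[ ]: labels = snn.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = int(np.sum(labels == -1))
print(f'Clusters: {n_clusters}, noise points: {n_noise}')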
[ ]: snn = SNN(
    n_neighbors=8,
    eps=7,
    min_samples=1,
    algorithm="auto",
    leaf_size=30,
    metric="euclidean",
    p=None,
    metric_params=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn.labels_, palette='inferno', size=snn.labels_)
print(f'Unique Clusters: {np.unique(snn.labels_).shape}')
Unique Clusters: (54,)
1.2.3. Shared Nearest Neighbors 2
[ ]: # Note: these keyword arguments match the manual SNN implementation in 1.2.6,
# not the SNN imported in 1.2.2, which takes n_neighbors/eps/min_samples.
snn_model = SNN(neighbor_num=1, min_shared_neighbor_proportion=1).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=snn_model.labels_, palette='inferno', size=snn_model.labels_)
[ ]: <AxesSubplot: >
[ ]: np.unique(snn_model.labels_).shape
[ ]: (71,)
1.2.4. OPTICS
[ ]: opt = cluster.OPTICS(
    min_samples=2,
    max_eps=np.inf,
    metric="euclidean",
    p=2,
    metric_params=None,
    cluster_method="xi",
    eps=1,  # only used when cluster_method="dbscan"; ignored for "xi"
    xi=0.05,
    predecessor_correction=True,
    min_cluster_size=None,
    algorithm="auto",
    leaf_size=30,
    memory=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=opt.labels_, palette='inferno', size=opt.labels_)
np.unique(opt.labels_).shape
[ ]: (28,)
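Because OPTICS stores the full reachability ordering, DBSCAN-style flat clusters can be extracted from the fitted model at any eps without refitting, via `sklearn.cluster.cluster_optics_dbscan`; a sketch (eps=2.0 is an arbitrary choice):
[ ]: labels_eps = cluster.cluster_optics_dbscan(
    reachability=opt.reachability_,
    core_distances=opt.core_distances_,
    ordering=opt.ordering_,
    eps=2.0,
)
np.unique(labels_eps).shape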
1.2.5. DBSCAN
[ ]: dbs = cluster.DBSCAN(
    eps=0.001,
    min_samples=1,
    metric="canberra",
    metric_params=None,
    algorithm="auto",
    leaf_size=30,
    p=None,
    n_jobs=None,
).fit(X)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=dbs.labels_, palette='inferno', size=dbs.labels_)
np.unique(dbs.labels_).shape
[ ]: (100,)
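With eps=0.001 and min_samples=1, every point becomes its own cluster, which is what the (100,) above shows. A common way to pick a workable eps is the sorted k-distance plot, where the "knee" suggests a density threshold; a minimal sketch with `NearestNeighbors` (k=5 is an arbitrary choice, and distances here are Euclidean rather than Canberra):
[ ]: from sklearn.neighbors import NearestNeighbors

k = 5
# n_neighbors=k+1 because the query point itself is returned at distance 0.
dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
plt.plot(np.sort(dist[:, -1]))
plt.xlabel('points sorted by k-distance')
plt.ylabel(f'distance to {k}-th neighbor');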
1.2.6. Manual Configuration
[ ]: # Manual config: a reference SNN implementation, kept commented out.
#from sklearn.cluster import DBSCAN
#from sklearn.neighbors import kneighbors_graph
#
#import numpy as np
#from sklearn.base import BaseEstimator, ClusterMixin
#
#
#def get_snn_similarity(x0, x1):
#    """Calculate the shared-neighbor similarity of two sets of nearest neighbors, normalized by the maximum number of shared neighbors"""
#    return len(x0.intersection(x1)) / len(x0)
#
#
#def get_snn_distance(x0, x1):
#    """Calculate the shared-neighbor distance of two sets of nearest neighbors, normalized by the maximum number of shared neighbors"""
#    return 1 - get_snn_similarity(x0, x1)
#
#
#def snn(X, neighbor_num, min_shared_neighbor_num):
#    """Perform Shared Nearest Neighbor (SNN) clustering.
#    Parameters
#    ----------
#    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
#        A feature array
#    neighbor_num : int
#        K number of neighbors to consider for shared nearest neighbor similarity
#    min_shared_neighbor_num : int
#        Number of nearest neighbors that two data points need to share to be considered part of the same cluster
#    """
#    # for each data point, find their set of K nearest neighbors
#    knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
#    neighbors = np.array([set(knn_graph[i].nonzero()[1]) for i in range(len(X))])
#
#    # the distance matrix is the complement of the proportion of shared neighbors between each pair of data points
#    snn_distance_matrix = np.asarray([[get_snn_distance(neighbors[i], neighbors[j]) for j in range(len(neighbors))] for i in range(len(neighbors))])
#
#    # perform DBSCAN with the shared-neighbor distance criteria for density estimation
#    dbscan = DBSCAN(min_samples=min_shared_neighbor_num, metric="precomputed")
#    dbscan = dbscan.fit(snn_distance_matrix)
#    return dbscan.core_sample_indices_, dbscan.labels_
#
#
#class SNN(BaseEstimator, ClusterMixin):
#    """Class for performing the Shared Nearest Neighbor (SNN) clustering algorithm.
#    Parameters
#    ----------
#    neighbor_num : int
#        K number of neighbors to consider for shared nearest neighbor similarity
#    min_shared_neighbor_proportion : float [0, 1]
#        Proportion of the K nearest neighbors that two data points need to share to be considered part of the same cluster
#    Note: Naming conventions for attributes are based on the analogous ones of DBSCAN
#    """
#
#    def __init__(self, neighbor_num, min_shared_neighbor_proportion):
#        """Constructor"""
#        self.neighbor_num = neighbor_num
#        self.min_shared_neighbor_num = round(neighbor_num * min_shared_neighbor_proportion)
#
#    def fit(self, X):
#        """Perform SNN clustering from features or distance matrix.
#        Parameters
#        ----------
#        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
#            A feature array
#        """
#        clusters = snn(X, neighbor_num=self.neighbor_num, min_shared_neighbor_num=self.min_shared_neighbor_num)
#        self.core_sample_indices_, self.labels_ = clusters
#        if len(self.core_sample_indices_):
#            # fix for scipy sparse indexing issue
#            self.components_ = X[self.core_sample_indices_].copy()
#        else:
#            # no core samples
#            self.components_ = np.empty((0, X.shape[1]))
#        return self
#
#    def fit_predict(self, X, y=None, sample_weight=None):
#        """Performs clustering on X and returns cluster labels.
#        Parameters
#        ----------
#        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or array of shape (n_samples, n_samples)
#            A feature array, or array of distances between samples if ``metric='precomputed'``.
#        sample_weight : array, shape (n_samples,), optional
#            Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core.
#            Note that weights are absolute, and default to 1.
#        y : Ignored
#        Returns
#        -------
#        y : ndarray, shape (n_samples,)
#            cluster labels
#        """
#        self.fit(X)
#        return self.labels_
#
#snn_model = SNN(neighbor_num=1, min_shared_neighbor_proportion=0.5).fit(X[:, :337].sample(10_000))
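The core idea of the commented implementation above, DBSCAN over a precomputed shared-neighbor distance matrix, can be exercised directly on the blob data without uncommenting the class; a minimal sketch (k=7, eps=0.5, and min_samples=3 are arbitrary choices):
[ ]: from sklearn.cluster import DBSCAN
from sklearn.neighbors import kneighbors_graph

k = 7
knn = kneighbors_graph(X, n_neighbors=k, include_self=False)
neigh = [set(knn[i].nonzero()[1]) for i in range(X.shape[0])]
# Distance = 1 - proportion of shared k-nearest neighbors.
D = np.array([[1 - len(a & b) / k for b in neigh] for a in neigh])
labels = DBSCAN(eps=0.5, min_samples=3, metric='precomputed').fit_predict(D)
np.unique(labels).shape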