Dendrogram and 3D clustering

In [1]:
import scipy.cluster.hierarchy as shc
import pandas as pd
import matplotlib.pyplot as plt

# Import Data
df = pd.read_csv('c:/1/USArrests.csv')

USArrests

Source of data: https://www.kaggle.com/deepakg/usarrests

In [2]:
df.rename(columns = {'Unnamed: 0': 'State'}, inplace=True)
df.head(4)
Out[2]:
State Murder Assault UrbanPop Rape
0 Alabama 13.2 236 58 21.2
1 Alaska 10.0 263 48 44.5
2 Arizona 8.1 294 80 31.0
3 Arkansas 8.8 190 50 19.5
In [3]:
# Plot
plt.figure(figsize=(17, 4), dpi= 280)  
plt.title("USArrests Dendograms", fontsize=22)  
dend = shc.dendrogram(shc.linkage(df[['Murder', 'Assault', 'UrbanPop', 'Rape']], method='ward'), labels=df.State.values, color_threshold=100)  
plt.xticks(fontsize=12)
plt.show()
In [4]:
df3 = pd.read_csv('c:/1/hierarchical-clustering-with-python-and-scikit-learn-shopping-data.csv')
df3.head()
Out[4]:
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40

The table shows each customer's gender, age, annual income and spending score. From the DataFrame we take a two-dimensional feature vector: annual income (in k$) and spending score, a propensity to spend on a scale from 1 to 100.
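The same two columns can also be selected by name rather than by position, which is slightly more robust if the CSV layout ever changes; a minimal alternative (not in the original notebook) to the iloc call below:

data = df3[['Annual Income (k$)', 'Spending Score (1-100)']].values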

In [5]:
data = df3.iloc[:, 3:5].values
data
Out[5]:
array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],
       ...])
In [6]:
plt.figure(figsize=(10, 3))
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

The dendrogram suggests 5 clusters (5 main branches) of customers. We now fit the agglomerative clustering: since we ask for five clusters, the output contains five labels, 0 to 4.
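The branch count below a chosen cut height can also be checked programmatically with scipy's fcluster instead of counting branches by eye. A small sketch reusing the same ward linkage; the distance threshold of 200 is only an assumption read off the dendrogram above:

import numpy as np

Z = shc.linkage(data, method='ward')

# cut the tree at a fixed distance (hypothetical value taken from the plot)
flat = shc.fcluster(Z, t=200, criterion='distance')
print(len(np.unique(flat)))            # expected: 5 at a suitable threshold

# or ask for exactly five flat clusters and look at their sizes
flat5 = shc.fcluster(Z, t=5, criterion='maxclust')
print(np.bincount(flat5)[1:])          # labels start at 1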

In [7]:
from sklearn.cluster import AgglomerativeClustering

# note: in newer scikit-learn versions the 'affinity' parameter is called 'metric'
cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
cluster.fit_predict(data)
Out[7]:
array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int64)
In [8]:
plt.figure(figsize=(10, 7))
plt.scatter(data[:,0], data[:,1], c=cluster.labels_, cmap='rainbow')
plt.title('Customer clusters')
plt.xlabel('Annual earnings')
plt.ylabel('Spending')
Out[8]:
Text(0, 0.5, 'Spending')

The purple cluster (lower right corner) contains customers with high income but low spending. The customers in the middle (blue points) have average income and an average spending score; this is the largest group.
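This reading of the colours can be cross-checked numerically by averaging income and spending score per cluster and counting members. A short sketch, assuming df3 still holds the shopping data loaded above:

summary = df3.assign(Cluster=cluster.labels_).groupby('Cluster')[
    ['Annual Income (k$)', 'Spending Score (1-100)']
].agg(['mean', 'count'])
print(summary)

The cluster with the highest mean income and lowest mean spending score should match the group described as high earners with low expenses, and the largest count should fall on the middle cluster.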

Clinical tests

Source of data: https://www.kaggle.com/saurabh00007/diabetescsv

In [21]:
df3 = pd.read_csv('c:/1/diabetes.csv')
df3.head()
Out[21]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [22]:
PKP = df3[['Age','SkinThickness','BMI']]
In [23]:
PKP.head()
Out[23]:
Age SkinThickness BMI
0 50 35 33.6
1 31 29 26.6
2 32 0 23.3
3 21 23 28.1
4 33 35 43.1
The dendrogram will tell us how many clusters to choose.
In [24]:
plt.figure(figsize=(17, 4), dpi= 280)  
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(PKP, method='ward'))
It looks like 5 clusters.
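Before committing to five clusters, a quick numeric cross-check is possible; one common option (not used in the original notebook) is the silhouette score over a few candidate cluster counts:

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# compare a few cluster counts on the same three features
for k in range(2, 8):
    labels_k = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(PKP)
    print(k, round(silhouette_score(PKP, labels_k), 3))

A higher score means tighter, better-separated clusters; if k=5 is not clearly ahead, the dendrogram reading is only a heuristic.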
In [26]:
from mpl_toolkits.mplot3d import Axes3D  # required for the 3D axes

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(PKP['Age'], PKP['SkinThickness'], PKP['BMI'], color='black',marker='o')

ax.set_title('Clusters', fontsize= 30, alpha=0.6)
ax.set_xlabel('Age', fontsize= 20, alpha=0.6)
ax.set_ylabel('SkinThickness', fontsize= 20, alpha=0.6)
ax.set_zlabel('BMI', fontsize= 20, alpha=0.6)
Out[26]:
Text(0.5, 0, 'BMI')
In [27]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
KF = cluster.fit_predict(PKP)
KF
Out[27]:
array([3, 0, 4, 0, 1, 4, 0, 4, 3, 2, 4, 4, 2, 3, 3, 4, 1, 4, 1, 1, 1, 2,
       2, 1, 3, 3, 2, 0, 3, 4, 3, 1, 0, 4, 3, 1, 4, 3, 1, 3, 0, 4, 3, 3,
       4, 1, 4, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 1, 2, 1, 0, 4, 4, 0, 2, 0,
       3, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 4, 0, 0, 0, 3, 0, 4, 0, 1, 0,
       3, 0, 4, 0, 1, 2, 0, 3, 0, 0, 0, 1, 4, 4, 4, 0, 4, 0, 4, 3, 0, 0,
       0, 3, 0, 4, 1, 2, 4, 4, 0, 0, 1, 1, 0, 2, 4, 1, 0, 1, 3, 2, 0, 4,
       1, 3, 0, 0, 0, 0, 4, 0, 2, 3, 0, 2, 0, 0, 1, 1, 2, 0, 1, 4, 3, 1,
       2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 3, 0, 4, 4, 0, 4, 0, 0, 1, 0, 1,
       2, 1, 2, 4, 4, 0, 0, 4, 4, 3, 3, 1, 1, 0, 4, 1, 4, 4, 3, 1, 4, 0,
       1, 0, 0, 4, 0, 0, 3, 0, 3, 2, 0, 3, 0, 1, 3, 0, 1, 1, 1, 0, 0, 2,
       0, 2, 4, 3, 0, 0, 4, 1, 1, 0, 4, 1, 0, 4, 0, 4, 3, 0, 0, 4, 0, 0,
       4, 0, 0, 3, 2, 1, 1, 0, 2, 4, 0, 0, 0, 1, 1, 0, 0, 3, 0, 4, 0, 3,
       4, 3, 4, 1, 4, 4, 1, 0, 4, 1, 2, 1, 0, 0, 2, 0, 4, 3, 0, 2, 2, 3,
       1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 1, 0, 3, 2, 4, 0, 1, 4, 4, 0, 3, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 4, 1, 2, 0, 0, 0, 1, 0, 0, 0, 4, 1, 1,
       3, 0, 2, 4, 0, 1, 2, 2, 1, 2, 0, 0, 0, 4, 2, 3, 0, 4, 0, 3, 4, 4,
       3, 0, 4, 2, 1, 3, 3, 1, 1, 2, 3, 2, 0, 0, 4, 0, 0, 3, 1, 0, 0, 1,
       1, 3, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 4, 0, 0, 4, 0,
       1, 1, 4, 0, 4, 2, 1, 3, 2, 0, 2, 4, 4, 1, 1, 0, 0, 0, 1, 0, 0, 3,
       4, 0, 1, 0, 1, 0, 3, 1, 0, 3, 1, 3, 4, 0, 0, 4, 0, 4, 3, 4, 0, 4,
       3, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 4, 0, 2, 0, 3, 2, 0, 3, 3, 3, 4,
       1, 1, 4, 0, 0, 1, 4, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 3, 1, 1, 0, 0,
       4, 1, 1, 3, 0, 2, 0, 3, 1, 3, 0, 2, 4, 0, 3, 1, 0, 0, 1, 3, 1, 4,
       3, 0, 0, 2, 3, 0, 2, 4, 0, 0, 3, 2, 2, 2, 0, 0, 0, 2, 4, 0, 0, 0,
       0, 4, 0, 4, 1, 4, 0, 4, 2, 2, 1, 1, 1, 0, 3, 0, 0, 1, 3, 0, 3, 1,
       0, 0, 2, 0, 0, 1, 1, 2, 1, 4, 2, 0, 1, 0, 4, 0, 0, 3, 3, 1, 4, 4,
       0, 0, 0, 1, 0, 4, 4, 3, 1, 0, 3, 2, 3, 0, 2, 4, 3, 4, 1, 1, 2, 0,
       1, 0, 2, 0, 4, 0, 0, 4, 1, 3, 4, 0, 1, 0, 1, 0, 0, 3, 1, 0, 3, 4,
       4, 0, 3, 4, 1, 0, 2, 0, 4, 1, 4, 4, 2, 0, 4, 1, 4, 0, 4, 4, 2, 0,
       0, 0, 0, 4, 2, 4, 0, 0, 0, 1, 1, 0, 0, 0, 1, 4, 0, 0, 0, 1, 2, 0,
       2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 0, 3, 1, 2, 4, 2, 4, 4, 0, 0, 1,
       1, 4, 2, 0, 4, 0, 0, 1, 4, 2, 0, 1, 4, 3, 0, 4, 0, 4, 0, 3, 3, 2,
       0, 0, 0, 0, 2, 0, 0, 3, 1, 0, 4, 1, 1, 3, 1, 3, 0, 1, 1, 3, 2, 1,
       0, 0, 4, 4, 0, 4, 1, 0, 2, 0, 0, 3, 0, 2, 1, 0, 0, 2, 1, 3, 1, 1,
       3, 2, 4, 1, 0, 1, 3, 1, 1, 2, 4, 2, 0, 3, 4, 3, 0, 0, 2, 0],
      dtype=int64)
In [28]:
from sklearn.cluster import KMeans

# Initializing KMeans
kmeans = KMeans(n_clusters=5)
# Fitting with inputs
kmeans = kmeans.fit(PKP)
# Predicting the clusters
labels = kmeans.predict(PKP)
# Getting the cluster centers
C = kmeans.cluster_centers_
In [29]:
C
Out[29]:
array([[25.10138249, 21.01382488, 27.84147465],
       [45.82014388, 32.33093525, 33.90359712],
       [28.86486486,  0.33108108, 29.1527027 ],
       [52.08695652,  1.26086957, 31.24782609],
       [27.02906977, 38.09883721, 38.52732558]])
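The plot below colours the points by the agglomerative labels KF, while the red markers are the k-means centres, so it is worth checking how far the two partitions agree. A hedged sketch using the labels already computed above:

from sklearn.metrics import adjusted_rand_score

# 1.0 = identical partitions, ~0 = agreement no better than chance
print(adjusted_rand_score(KF, labels))

# contingency table of the two labelings (cluster numbers are arbitrary)
print(pd.crosstab(KF, labels, rownames=['agglomerative'], colnames=['kmeans']))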
In [31]:
fig = plt.figure()
ax = Axes3D(fig)
# points coloured by the agglomerative labels (KF)
ax.scatter(PKP['Age'], PKP['SkinThickness'], PKP['BMI'], c=KF)
# red markers: the k-means cluster centres computed above
ax.scatter(C[:, 0], C[:, 1], C[:, 2], marker='.', c='red', s=1000)

ax.set_title('Clusters', fontsize= 30, alpha=0.6)
ax.set_xlabel('Age', fontsize= 20, alpha=0.6)
ax.set_ylabel('SkinThickness', fontsize= 20, alpha=0.6)
ax.set_zlabel('BMI', fontsize= 20, alpha=0.6)
Out[31]:
Text(0.5, 0, 'BMI')