Hierarchical Clustering
IBM Data Science Specialization: Hierarchical Clustering
from matplotlib.axes._axes import _log as matplotlib_axes_logger
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial import distance_matrix
from scipy.spatial.distance import euclidean
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import warnings
# Keep notebook output clean: suppress matplotlib color-argument warnings
# and all Python warnings.
matplotlib_axes_logger.setLevel('ERROR')
warnings.filterwarnings('ignore')

# Load the cars dataset; coerce the measurement columns to numeric
# (unparseable entries become NaN) and drop any incomplete rows.
df = pd.read_csv('./data/ibm/cars_clus.csv')
columns = [
    'sales', 'resale', 'type', 'price', 'engine_s', 'horsepow',
    'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap', 'mpg',
    'lnsales',
]
df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
df = df.dropna().reset_index(drop=True)
# Select the physical characteristics used for clustering and rescale
# each feature to the [0, 1] range with min-max normalization.
columns = [
    'engine_s', 'horsepow', 'wheelbas', 'width',
    'length', 'curb_wgt', 'fuel_cap', 'mpg',
]
dataset = df[columns]
X = dataset.values
mms = MinMaxScaler()
features = mms.fit_transform(X)
# Build the full pairwise Euclidean distance matrix over the scaled
# features.
# NOTE: scipy.zeros was deprecated and removed (SciPy 1.3+); use
# np.zeros instead.
leng = features.shape[0]
D = np.zeros((leng, leng))
for i in range(leng):
    for j in range(leng):
        D[i, j] = euclidean(features[i], features[j])
# linkage() expects a condensed (1-D, upper-triangular) distance matrix;
# passing the square matrix directly would make SciPy silently treat its
# rows as raw observations and warn. Convert with squareform first.
Z = linkage(scipy.spatial.distance.squareform(D, checks=False), 'complete')
# Cut the tree at cophenetic distance k: criterion='distance' treats k as
# a height threshold, not a desired number of clusters.
k = 3
clusters = fcluster(Z, k, criterion='distance')
def llf(leaf_id):
    """Dendrogram leaf-label callback: '[manufact model type]' for a row.

    Called positionally by scipy's dendrogram(); the parameter was
    renamed from `id` to avoid shadowing the builtin.
    """
    return '[%s %s %s]' % (
        df['manufact'][leaf_id],
        df['model'][leaf_id],
        # 'type' was coerced to float during cleaning; render as int.
        int(float(df['type'][leaf_id])),
    )
# Render the hierarchy as a horizontal dendrogram with custom leaf labels.
fig = pylab.figure(figsize=(18, 50))
dendro = dendrogram(
    Z,
    leaf_label_func=llf,
    leaf_rotation=0,
    leaf_font_size=12,
    orientation='right',
)
Now we can use the 'AgglomerativeClustering' class from the scikit-learn library to cluster the dataset. AgglomerativeClustering performs hierarchical clustering using a bottom-up approach. The linkage criterion determines the metric used for the merge strategy:
- Ward minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.
- Maximum or complete linkage minimizes the maximum distance between observations of pairs of clusters.
- Average linkage minimizes the average of the distances between all observations of pairs of clusters.
# Pairwise distance matrix, computed here for illustration; the fit()
# below operates on the feature vectors themselves.
dist_matrix = distance_matrix(features, features)
# Bottom-up (agglomerative) clustering with complete linkage, 6 clusters.
agglom = AgglomerativeClustering(n_clusters=6, linkage='complete')
agglom.fit(features)
# Attach each car's cluster id back onto the dataframe.
df['cluster'] = agglom.labels_
n_clusters = max(agglom.labels_) + 1
# One distinct rainbow color per cluster.
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
cluster_labels = list(range(n_clusters))
plt.figure(figsize=(16, 14))
# One scatter series per cluster: marker area scales with price and each
# point is annotated with the car's model name.
for clr, lbl in zip(colors, cluster_labels):
    subset = df[df.cluster == lbl]
    for idx in subset.index:
        plt.text(
            subset.horsepow[idx],
            subset.mpg[idx],
            str(subset['model'][idx]),
            rotation=25,
        )
    plt.scatter(
        subset.horsepow,
        subset.mpg,
        s=subset.price * 10,
        c=clr,
        label='cluster' + str(lbl),
        alpha=0.5,
    )
plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg');
# Mean of the key numeric characteristics for every (cluster, type) pair.
cols = ['horsepow', 'engine_s', 'mpg', 'price']
agg_cars = df.groupby(['cluster', 'type'])[cols].mean()
agg_cars
There are 3 main clusters, which together contain the majority of the vehicles.
Cars:
- Cluster 1: with almost high mpg, and low in horsepower.
- Cluster 2: with good mpg and horsepower, but higher price than average.
- Cluster 3: with low mpg, high horsepower, highest price.
Trucks:
- Cluster 1: with almost highest mpg among trucks, and lowest in horsepower and price.
- Cluster 2: with almost low mpg and medium horsepower, but higher price than average.
- Cluster 3: with good mpg and horsepower, low price.
Notice that we did not use the type or price of the cars in the clustering process, yet hierarchical clustering was able to form the clusters and discriminate between them with quite high accuracy.
plt.figure(figsize=(16, 10))
# One point per (cluster, type) aggregate: x = mean horsepower, y = mean
# mpg, marker area scales with mean price, annotated with type and price.
for color, label in zip(colors, cluster_labels):
    subset = agg_cars.loc[(label,), ]
    for i in subset.index:
        row = subset.loc[i]
        # Use .iloc for positional access: plain Series[int] on a
        # label-based index is deprecated in pandas 2.x (removed in 3.x).
        plt.text(
            row.iloc[0] + 5,   # horsepow (shifted right of the marker)
            row.iloc[2],       # mpg
            'type=' + str(int(i)) + ', price=' + str(int(row.iloc[3])) + 'k'
        )
    plt.scatter(subset.horsepow, subset.mpg, s=subset.price*20, c=color, label='cluster'+str(label))
plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg');