"""k-means clustering of Bach chorales.

Clusters Bach chorales into modes based on their pitch-class
distributions using k-means.
"""

from music21 import corpus
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os

# Process the chorales
#
# Iterates through all Bach chorales, discarding those where the number
# of parts is not 4.  Collects the pitch-class histogram for each
# remaining chorale and saves the data as 'chorales_pc_counts.npy'
# using np.save.

def process_chorales():
    """Collect pitch-class histograms for all 4-part Bach chorales.

    Iterates over the music21 Bach chorale corpus, skipping any chorale
    that does not have exactly 4 parts.  For each remaining chorale the
    pitch classes are transposed so the tonic is 0 and counted into a
    12-bin histogram.  The stacked histograms are saved to
    'chorales_pc_counts.npy' and returned as a numpy array.
    """
    chorales_pc_counts = []

    for chorale in corpus.chorales.Iterator():
        # Discard if the number of parts is not 4.
        if len(chorale.parts) != 4:
            continue

        pcs = get_pcs(chorale)
        tonic_pc = get_tonic_pc(chorale)

        # Transpose pcs so that the tonic pitch class becomes 0.
        pcs = [(pc - tonic_pc) % 12 for pc in pcs]

        # Histogram over the 12 pitch classes.
        counts = np.zeros(12)
        for pc in pcs:
            counts[pc] += 1
        # Bug fix: the histogram was computed but never collected.
        chorales_pc_counts.append(counts)

        # Print progress every 10 chorales.
        length = len(chorales_pc_counts)
        if length % 10 == 0:
            print(length, 'chorales processed')

    chorales_pc_counts = np.array(chorales_pc_counts)

    # Cache the pc counts on disk for later runs.
    np.save('chorales_pc_counts.npy', chorales_pc_counts)
    return chorales_pc_counts

# Helper function to get all pcs from a stream.
# Returns a list of pcs (integers mod 12).

def get_pcs(s):
    """Return all pitch classes (integers mod 12) found in stream *s*.

    Chords are expanded into their constituent notes in ascending
    order; plain notes contribute a single pitch class.
    """
    pcs = []
    for n in s.flat.notes:
        if n.isChord:
            # Expand the chord note by note, lowest first.
            for p in n.sortAscending():
                pcs.append(p.pitch.pitchClass)
        else:
            pcs.append(n.pitch.pitchClass)
    return pcs

# Helper function that returns the pitch class of the last bass note
# of the chorale.

def get_tonic_pc(s):
    """Return the pitch class of the last bass note of the chorale.

    The bass is taken to be the last part of the stream; if the final
    event is a chord, its lowest note is used.
    """
    # Bug fix: the truncated `bass =[-1]` lost the `s.parts` reference.
    bass = s.parts[-1]
    tonic_note = bass.flat.notes[-1]
    if tonic_note.isChord:
        # Use the lowest note of the final chord.
        tonic_note = tonic_note.sortAscending()[0]
    return tonic_note.pitch.pitchClass

# Load the data
#
# Loads 'chorales_pc_counts.npy', which is a list of Bach chorale
# pitch-class counts, using np.load.  If 'chorales_pc_counts.npy' does
# not exist, calls process_chorales() to generate this list.
#
# Returns a list of Bach chorale pitch-class counts.

def load_data():
    """Load cached pitch-class counts, computing them if necessary.

    Returns the array stored in 'chorales_pc_counts.npy' when it
    exists; otherwise falls back to process_chorales() to generate
    (and cache) the data.
    """
    if os.path.isfile('chorales_pc_counts.npy'):
        return np.load('chorales_pc_counts.npy')
    # Bug fix: this return was unreachable (indented under the return
    # above), so the function returned None when the cache was absent.
    return process_chorales()

# Reduce the data to two principal components (dimensions)
#
# Reduces the 12-D pc histograms to two principal components.
#
# Returns the reduced data.

def reduce_data(data):
    """Project the 12-D pitch-class histograms onto two principal components.

    The samples are normalized first, then reduced with PCA.
    Returns the 2-D reduced data.
    """
    pca = PCA(n_components=2)
    return pca.fit_transform(normalize(data))

# Normalize the data
#
# Normalizes the features in each sample.
#
# Returns the normalized data.

def normalize(data):
    """Scale each sample so that its features sum to 1.

    Returns the normalized data as a numpy array.
    """
    return np.array([row / row.sum() for row in data])

# Fit the data
#
# Fits the data using k-means clustering, where k is the number of
# clusters.
#
# Returns the fitted k-means model.

def fit_data(data, k=2):
    """Fit a k-means clustering with *k* clusters to the data.

    Returns the fitted KMeans model.
    """
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    # Bug fix: the model was returned without ever being fitted, so
    # every later predict()/cluster_centers_ access would fail.
    kmeans.fit(data)
    return kmeans

# Silhouette scores
#
# Plots the silhouette scores for cluster counts ranging from 2 to 6.
#
# Returns the number of clusters with the maximum silhouette score.

def silhouette(data):
    """Plot silhouette scores for cluster counts 2 through 6.

    Fits one k-means model per candidate k, scores its predictions,
    and plots score against k.  Returns the k with the highest score.
    """
    candidate_ks = range(2, 7)
    scores = []
    for k in candidate_ks:
        model = fit_data(data, k=k)
        labels = model.predict(data)
        scores.append(silhouette_score(data, labels))

    plt.plot(candidate_ks, scores)
    plt.xticks(candidate_ks)
    plt.xlabel('number of clusters')
    plt.ylabel('silhouette score')

    # Index 0 corresponds to k=2, hence the offset.
    return np.argmax(scores) + 2

# Visualize the data as a Voronoi diagram
#
# Visualization code adapted from the scikit-learn k-means digits
# example.

def visualize_data(data, kmeans):
    """Plot the k-means decision regions (a Voronoi diagram) for 2-D data.

    Draws the cluster regions as an image, the samples as black dots,
    and the centroids as white crosses.
    """
    # Step size of the mesh.  Decrease to increase the quality of the VQ.
    h = .0002     # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Bounding box of the data, padded a little on every side.
    excess = 0.1
    x_min = data[:, 0].min() - excess
    x_max = data[:, 0].max() + excess
    y_min = data[:, 1].min() - excess
    y_max = data[:, 1].max() + excess
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Label every mesh point with its nearest cluster, using the
    # trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Render the labels as a color image spanning the bounding box.
    Z = Z.reshape(xx.shape)
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               aspect='auto', origin='lower')

    # Overlay the samples as small black dots.
    plt.plot(data[:, 0], data[:, 1], 'k.', markersize=2)

    # Mark each centroid with a white X.
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title('K-means clustering on the pc_distribution of Bach Chorales:\n'
              '(PCA-reduced data) Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

# Putting it all together.

# Load (or compute) the pitch-class counts.
data = load_data()

# Reduce the 12-D histograms to two principal components.
reduced_data = reduce_data(data)

# Find the number of clusters with the best silhouette score.
best = silhouette(reduced_data)

# Fit the final model with that cluster count.
kmeans = fit_data(reduced_data, k=best)

# Visualize the clustering.  Bug fix: visualize_data() returns None,
# so the misleading `pred = visualize_data(...)` assignment was dropped.
visualize_data(reduced_data, kmeans)