machine_learning_5.py

#

k-means clustering of Bach chorales

#

k-means clustering of Bach chorales into modes based on pitch-class distributions. Original code here.

from music21 import corpus
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os
#

Process the chorales

#

Iterates through all Bach chorales, discarding those where the number of parts is not 4.

Collect the pc histogram for each remaining chorale and saves the data as 'chorales_pc_counts.npy' using np.save.

def process_chorales():
#
    chorales_pc_counts = []

    for chorale in corpus.chorales.Iterator():
#

Discard if number of parts is not 4

        if len(chorale.parts) != 4:
            continue
        pcs = get_pcs(chorale)
        tonic_pc = get_tonic_pc(chorale)
#

transpose pcs so that tonic_pc is 0

        pcs = [(pc - tonic_pc) % 12 for pc in pcs]
#

counter for pcs

        counts = np.zeros(12)
        for pc in pcs:
            counts[pc] += 1
        chorales_pc_counts.append(counts)
#

Print progress

        length = len(chorales_pc_counts)
        if length % 10 == 0:
            print(length, 'chorales processed')

    chorales_pc_counts = np.array(chorales_pc_counts)
#

Save the pc counts

    np.save('chorales_pc_counts.npy', chorales_pc_counts)
    return chorales_pc_counts
#

Helper function to get all pcs from a stream. Returns a list of pcs (integers mod 12).

def get_pcs(s):
#
    pcs = []
    for n in s.flat.notes:
        if n.isChord:
            for p in n.sortAscending():
                pcs.append(p.pitch.pitchClass)
        else:
            pcs.append(n.pitch.pitchClass)
    return pcs
#

Helper function that returns the pitch class of the last bass note of the choral.

def get_tonic_pc(s):
#
    bass = s.parts[-1]
    tonic_note = bass.flat.notes[-1]
    if tonic_note.isChord:
        tonic_note = tonic_note.sortAscending()[0]
    return tonic_note.pitch.pitchClass
#

Load the data

#

Loads 'chorales_pc_counts.npy', which is a list of Bach chorale pitch- class counts, using np.load. If 'chorales_pc_counts.npy' does not exist, calls process_chorales() to generate this list.

Returns a list of Bach chorale pitch-class counts.

def load_data():
#
    if os.path.isfile('chorales_pc_counts.npy'):       
        return np.load('chorales_pc_counts.npy')
    else:
        return process_chorales()
#

Reduce the data to two principal components (dimensions)

#

Reduce 12D pc histograms to two principal components.

Returns the reduced data.

def reduce_data(data):
#
    normalized_data = normalize(data)
    reduced_data = PCA(n_components=2).fit_transform(normalized_data)
    return reduced_data
#

Normalize the data

#

Normalize the features in each sample.

Returns the normalized data.

def normalize(data):
#
    rows = []
    for row in data:
        rows.append(row / row.sum())
    return np.array(rows)
#

Fit the data

#

Fit the data using a kmeans clustering, where k is the number of clusters.

Returns the fitted kmeans model.

def fit_data(data, k=2):
#
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit(data)

    return kmeans
#

Silhouette scores

#

Plot the silhouette scores for number of clusters ranging from 2 to 6.

Returns the number of clusters with the max silhouette score.

def silhouette(data):
#
    scores = []
    for n in range(2, 7):
        kmeans = fit_data(data, k=n)
        pred = kmeans.predict(data)
        scores.append(silhouette_score(data, pred))
    xs = range(2, 7)
    plt.plot(xs, scores)
    plt.xticks(range(2, 7))
    plt.xlabel('number of clusters')
    plt.ylabel('silhouette score')
    plt.show()

    return np.argmax(scores) + 2
#

Visualize the data as a Voronoi diagram

#

Visualization code taken from here

def visualize_data(data, kmeans):
#

Step size of the mesh. Decrease to increase the quality of the VQ.

    h = .0002     # point in the mesh [x_min, x_max]x[y_min, y_max].
#

Plot the decision boundary. For that, we will assign a color to each

    excess = 0.1
    x_min, x_max = data[:, 0].min() - excess, data[:, 0].max() + excess
    y_min, y_max = data[:, 1].min() - excess, data[:, 1].max() + excess
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
#

Obtain labels for each point in mesh. Use last trained model.

    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
#

Put the result into a color plot

    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(data[:, 0], data[:, 1], 'k.', markersize=2)
#

Plot the centroids as a white X

    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title('K-means clustering on the pc_distribution of Bach Chorales:\n'
              '(PCA-reduced data) Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
#

Putting it all together

#

Load the data

data = load_data()
#

Reduce the data

reduced_data = reduce_data(data)
#

Find the best number of clusters

best = silhouette(reduced_data)
#

Fit the data

kmeans = fit_data(reduced_data, k=best)
#

Visualize the data

pred = visualize_data(reduced_data, kmeans)