from music21 import features
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection)
import os
import pickle

# Import four different classifiers for comparison
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Import class for Principal Component Analysis
from sklearn.decomposition import PCA

# Import class for selecting the k best features
from sklearn.feature_selection import SelectKBest

# Import class for cross-validation and scoring
from sklearn.model_selection import cross_val_score

Create a DataSet, add features with addFeatureExtractors, and parse and add streams of Monteverdi madrigals 1-8 from both books three and five. Pickle the lists given by getFeaturesAsList() and getAttributeLabels() and return these lists as a tuple.
def process_madrigals():
    books = {}
    # Create DataSet
    ds = features.DataSet(classLabel='Book')
    # Create list of ids for feature extractors
    ids = ['m' + str(i) for i in range(2, 20) if i not in [3, 4]]
    ids.extend(['p' + str(i) for i in range(1, 17)
                if i not in [5, 6, 11, 16]])
    ids.append('r17')
    # Get the feature extractors by id and add them to the DataSet
    fes = features.extractorsById(ids)
    ds.addFeatureExtractors(fes)
    # Get paths for madrigals 1-8 in both books three and five
    for book in ['3', '5']:
        prefix = 'monteverdi/madrigal.'
        suffix = '.mxl'
        books[book] = [prefix + book + '.' + str(i) + suffix
                       for i in range(1, 9)]
    # Parse madrigals and add streams to DataSet using addData
    for book in books:
        for piece in books[book]:
            print('Parsing', piece)
            ds.addData(piece, classValue=book)
    # Process the DataSet; name the feature values 'dataset' and the
    # attribute labels 'attributes'
    ds.process()
    dataset = ds.getFeaturesAsList()
    attributes = ds.getAttributeLabels()
    # Pickle dataset and attributes
    with open('monteverdi_dataset.pkl', 'wb') as f:
        pickle.dump(dataset, f)
    with open('monteverdi_attributes.pkl', 'wb') as f:
        pickle.dump(attributes, f)
    return (dataset, attributes)
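As a quick check on what the id strings select, extractorsById resolves each short id to one of music21's feature extractor classes. A minimal sketch using a few of the ids from above:

from music21 import features

for fe in features.extractorsById(['m2', 'p10', 'r17']):
    print(fe)  # shows which extractor class each id resolves to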
If 'monteverdi_dataset.pkl' exists, unpickle and return a tuple of the dataset and the attribute labels. Otherwise, call process_madrigals() and return the tuple.
def load_data():
    if os.path.isfile('monteverdi_dataset.pkl'):
        with open('monteverdi_dataset.pkl', 'rb') as d, \
                open('monteverdi_attributes.pkl', 'rb') as a:
            return (pickle.load(d), pickle.load(a))
    else:
        return process_madrigals()
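Each row of dataset bundles an identifier, the extracted feature values, and the class label, which is why initialize() below strips the first and last columns. A quick check of what load_data() returns (a sketch; the first run parses the corpus and is slow):

dataset, attributes = load_data()
print(len(dataset))      # should be 16 rows: 8 madrigals from each of 2 books
print(len(attributes))   # identifier + one label per feature column + class label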
Perform a Principal Component Analysis with three components. Fit to and transform X. Plot the resulting transformed data in 3D, with the data points colored by their labels.
def plot_3D(X, y, title=None):
    fig = plt.figure(1, figsize=(8, 6))
    ax = fig.add_subplot(projection='3d')
    ax.view_init(elev=-150, azim=110)
    pca = PCA(n_components=3)
    pca.fit(X)
    X_reduced = pca.transform(X)
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y)
    ax.set_title(title)
    ax.set_xlabel("1st eigenvector")
    ax.w_xaxis.set_ticklabels([])
    ax.set_ylabel("2nd eigenvector")
    ax.w_yaxis.set_ticklabels([])
    ax.set_zlabel("3rd eigenvector")
    ax.w_zaxis.set_ticklabels([])
    plt.show()
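Before reading too much into the 3D picture, it is worth checking how much of the variance the three components actually capture; a minimal sketch, assuming X is the feature matrix defined below:

pca = PCA(n_components=3)
pca.fit(X)
print(pca.explained_variance_ratio_)  # variance fraction per component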
Compare four different classifiers on data X and targets y. Use cross_val_score to perform cross-validation and score each fold.
def test_classifiers(X, y):
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    svc = SVC()
    dtc = DecisionTreeClassifier()
    classifiers = [gnb, knn, svc, dtc]
    classifier_names = ['GaussianNB', 'KNeighbors', 'SVC', 'Decision Tree']
    for clf, name in zip(classifiers, classifier_names):
        scores = cross_val_score(clf, X, y, cv=8)
        print('Accuracy for {} is {}.'.format(name, np.mean(scores)))
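Note that cross_val_score returns one accuracy per fold rather than a single number, so the spread across folds is available too; a minimal sketch, again assuming X and y as defined below:

scores = cross_val_score(GaussianNB(), X, y, cv=8)
print(scores)                        # one accuracy per fold
print(scores.mean(), scores.std())  # summary over the 8 folds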
Find the best num_features features for data X and targets y. If verbose=True, print the features in order from highest to lowest score. Return the feature values reduced to those features.
def feature_reduction(num_features, X, y, attributes, verbose=True):
    select = SelectKBest(k=num_features)
    select.fit(X, y)
    indices = np.argsort(select.scores_)
    if verbose:
        print('\nBest features:')
        for index in indices[::-1]:
            # Offset by one: attributes[0] labels the identifier column,
            # which initialize() strips from X
            print('feature: {}, score: {}'.format(attributes[index + 1],
                                                  select.scores_[index]))
    # Columns of the top num_features features, highest score first
    return X[:, indices[-1:-num_features - 1:-1]]
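feature_reduction slices the columns manually so that they come out ordered by score, highest first. The stock alternative is select.transform(X), which returns the same top-k columns but in their original column order; a sketch of the equivalent call (f_classif, an ANOVA F-test, is SelectKBest's default scorer and is what produces the scores printed below):

from sklearn.feature_selection import SelectKBest, f_classif

select = SelectKBest(score_func=f_classif, k=6)  # identical to SelectKBest(k=6)
X_top = select.fit_transform(X, y)  # top 6 columns, original column order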
def initialize(reload):
    if reload:
        process_madrigals()
    dataset, attributes = load_data()
    data = []
    targets = []
    for obs in dataset:
        # obs[0] is the identifier, obs[1:-1] the feature values, and
        # obs[-1] the class label (the book number)
        data.append(obs[1:-1])
        targets.append(obs[-1])
    return (np.array(data), np.array(targets), attributes)
Flag whether or not to force reloading and parsing of the madrigals.

reload = False

Assign X, y, and attributes to the data, targets, and attribute labels.
X, y, attributes = initialize(reload)
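A quick sanity check on what initialize() returns: X should hold one row per madrigal and y one book label per row.

print(X.shape)        # (16, number_of_feature_columns)
print(np.unique(y))   # the two class labels, '3' and '5'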
Score the classifiers trained on the data with all features.

print('Accuracies based on all features:')
test_classifiers(X, y)

Accuracies based on all features:
Accuracy for GaussianNB is 0.75.
Accuracy for KNeighbors is 0.5625.
Accuracy for SVC is 0.625.
Accuracy for Decision Tree is 0.5625.
Determine the best features and assign the data restricted to these features to X_new.

num_features = 6
X_new = feature_reduction(num_features, X, y, attributes)
Best features:
feature: Relative_Strength_of_Top_Pitch_Classes, score: 9.115085492894748
feature: Stepwise_Motion, score: 8.90606828647132
feature: Chromatic_Motion, score: 7.922172419095363
feature: Amount_of_Arpeggiation, score: 7.74151861513606
feature: Repeated_Notes, score: 6.978814103596693
feature: Direction_of_Motion, score: 5.053424575750202
feature: Melodic_Fifths, score: 3.4287923964333364
feature: Most_Common_Pitch_Class_Prevalence, score: 3.4206375187543414
feature: Pitch_Class_Variety, score: 2.7777777777777777
feature: Primary_Register, score: 2.6155071146418294
feature: Pitch_Variety, score: 2.5444264943457187
feature: Most_Common_Pitch_Prevalence, score: 2.287553543524226
feature: Relative_Strength_of_Top_Pitches, score: 1.5412578570414568
feature: Number_of_Common_Pitches, score: 1.3358778625954197
feature: Average_Melodic_Interval, score: 1.1026124301184528
feature: Melodic_Thirds, score: 1.0747339356403958
feature: Most_Common_Melodic_Interval_Prevalence, score: 1.0222533224729131
feature: Number_of_Common_Melodic_Intervals, score: 1.0
feature: Importance_of_Bass_Register, score: 0.8003018580068948
feature: Importance_of_High_Register, score: 0.6444487928561894
feature: Range, score: 0.42748091603053434
feature: Relative_Strength_of_Most_Common_Intervals, score: 0.41668508666123516
feature: Average_Note_Duration, score: 0.18684910135877145
feature: Melodic_Tritones, score: 0.16621202611352615
feature: Importance_of_Middle_Register, score: 0.009758707453006818
feature: Size_of_Melodic_Arcs, score: 0.006591088847781841
feature: Melodic_Octaves, score: 0.0019111831441988583
feature: Duration_of_Melodic_Arcs, score: 0.0008730810676606502  
plot_3D(X_new, y, title='PCA, best {} features'.format(num_features))

Score the classifiers trained on the data with only the best features.

print('\nAccuracies based on best', num_features, 'features:')
test_classifiers(X_new, y)

Accuracies based on best 6 features:
Accuracy for GaussianNB is 0.8125.
Accuracy for KNeighbors is 0.75.
Accuracy for SVC is 0.875.
Accuracy for Decision Tree is 0.625.
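One caveat on the scores above: feature_reduction sees all of X and y before cross-validation runs, so the accuracies on X_new are mildly optimistic. A leakage-free variant refits the feature selection inside each fold by wrapping it in a Pipeline; a minimal sketch with the SVC:

from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

pipe = make_pipeline(SelectKBest(k=num_features), SVC())
scores = cross_val_score(pipe, X, y, cv=8)  # selection is refit per fold
print('Leakage-free SVC accuracy:', scores.mean())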