machine_learning_2.py

#

scikit-learn classifiers
Based on example code from two external sources (the original hyperlinks were not preserved in this copy)

import numpy as np
import matplotlib.pyplot as plt
#

Gaussian Naive Bayes: Simple example

#

Create two NumPy arrays.
X contains the feature values: one row per observation, one column per feature

# Feature matrix for the toy example: six observations, two features each.
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
])
#

Y contains the target values

# Class labels for the six observations: three of class 1, then three of class 2.
Y = np.repeat([1, 2], 3)
#

Plot the data, coloring each point by its target value

# X.T has one row per feature, so unpacking it supplies the x and y
# coordinates; the class labels in Y drive the point colors.
plt.scatter(*X.T, c=Y)
#

feature values for new data

# Three unlabeled observations, laid out like X (one row each, two features).
new_obs = np.array([
    [-0.8, -1],
    [1, 0.8],
    [1, -0.5],
])
#

Plot the new data on the same axes; no target values are passed, so these points get a different color from the labeled ones

# Overlay the unlabeled points (no `c` argument is given), then display
# the figure holding both scatter layers.
plt.scatter(*new_obs.T)
plt.show()
#

Import the Gaussian naive Bayes classifier

from sklearn.naive_bayes import GaussianNB
#

Create an instance of the classifier

# Construct the classifier with no arguments, i.e. with its default settings.
clf = GaussianNB()
#

Fit the classifier to the data

# Train the classifier on the labeled toy data: X holds the features,
# Y the matching class labels.
clf.fit(X=X, y=Y)
#

Make predictions on new feature values based on the fit

# Ask the fitted classifier for a label for every row of new_obs.
predictions = clf.predict(new_obs)

# The trailing "\n" argument leaves a blank line after the report.
print(
    "Predicted targets (labels) for new data:",
    predictions,
    "\n",
)
#

Extend feature and target vectors with new data and predicted targets

# Stack the new observations beneath the original feature rows, and extend
# the label vector with the labels the classifier predicted for them.
new_X = np.concatenate((X, new_obs), axis=0)
new_Y = np.concatenate((Y, predictions))
#

Create new scatterplot with new data points colored according to their predicted classification

# Plot original and new points together; colors come from new_Y, which
# holds the given labels followed by the predicted ones.
plt.scatter(*new_X.T, c=new_Y)
plt.show()
#

Gaussian Naive Bayes: Example with Iris dataset

from sklearn import datasets

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Fit a fresh Gaussian naive Bayes model, then predict on the very same
# samples it was trained on, so the error count printed below is a
# training-set figure.
gnb = GaussianNB().fit(iris.data, iris.target)
y_pred = gnb.predict(iris.data)

# List every prediction next to its true label.
print("Iris dataset predictions based on GaussianNB classifier")
for pred, targ in zip(y_pred, iris.target):
    print('Predicted:', pred, 'Target:', targ)

# Summary line: total sample count, then the number of disagreements.
print(
    "\nNumber of mislabeled points out of a total %d points : %d"
    % (len(iris.data), (iris.target != y_pred).sum())
)
#

k-nearest-neighbor classifier: Example with Iris dataset

#

Split iris dataset into data and target arrays

# Rebind X to the iris feature array and bind y to its label array.
X, y = iris.data, iris.target
#

Split the iris data into training and test sets for a simple hold-out validation, using numpy.random.permutation to shuffle the sample indices randomly.
Run the code several times to get different randomly assigned training and
test samples.

# Random ordering of the row indices 0 .. n_samples-1. No seed is set, so
# the ordering differs from run to run.
indices = np.random.permutation(X.shape[0])
#

Assign all but the last 10 randomly permuted indices for training

# Every shuffled index except the final ten selects a training sample;
# features and labels are picked with the same indices so they stay aligned.
X_train, y_train = X[indices[:-10]], y[indices[:-10]]
#

Assign the remaining 10 indices for testing

# The final ten shuffled indices select the held-out test samples.
X_test, y_test = X[indices[-10:]], y[indices[-10:]]
#

Create and fit a nearest-neighbor classifier

from sklearn.neighbors import KNeighborsClassifier
# Build a k-nearest-neighbors classifier with its default settings and
# train it on the training split in one step (fit returns the estimator).
knn = KNeighborsClassifier().fit(X_train, y_train)
#

Predict the target for the test data

# Predict a label for each of the held-out test samples.
pred = knn.predict(X_test)

# Leading and trailing newlines set this report apart from earlier output.
print(
    '\n\nPredicted values for test data based on KNN classifier:\n',
    pred,
    '\n',
)
#

The true targets for the test data

# Show the actual labels of the test samples for comparison with the
# predictions printed just before.
print('True values for test data:\n', y_test, '\n')
#

Use the classifier's .score method for measuring the accuracy of predictions

# Score the fitted classifier against the held-out test features and labels,
# then report the result.
score = knn.score(X=X_test, y=y_test)
print('Accuracy is', score)