The goal of this mini-project is to program a Bayesian classifier. The chosen example is the classification of very fine specimens of iris (the flower). Each iris is described by 4 characteristic parameters: the width and the length of its petals and of its sepals. Each iris is therefore encoded as a vector of 4 real numbers.
The requested work consists of two phases: training the classifier on a learning set (estimating its parameters), then evaluating it on held-out development and test sets.
The following Python instructions load the Iris dataset and display its data part (the description of each sample in terms of attributes) and its target part (the class, i.e. the target label):
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
iris = datasets.load_iris()
print(iris.data[:4])
print(iris.target)
# put data and target side by side so that they get shuffled together
Ciris = np.c_[iris.data.reshape(len(iris.data), -1), iris.target.reshape(len(iris.target), -1)]
np.random.seed(987654321)
print(Ciris[:4])
np.random.shuffle(Ciris)
shuffledIrisData = Ciris[:, :iris.data.size//len(iris.data)].reshape(iris.data.shape)
shuffledIrisTarget = Ciris[:, iris.data.size//len(iris.data):].reshape(iris.target.shape)
learn = shuffledIrisData[0:100]
dev = shuffledIrisData[101:130]
test = shuffledIrisData[131:]
learn_target = shuffledIrisTarget[0:100]
dev_target = shuffledIrisTarget[101:130]
test_target = shuffledIrisTarget[131:]
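As a quick sanity check on the split (these lines are only illustrative, not part of the required work), one can print the subset sizes; note that, with the bounds above, rows 100 and 130 end up in none of the three subsets:
# expected sizes: 100 / 29 / 19, i.e. 148 of the 150 samples are used
print(len(learn), len(dev), len(test))
print(len(learn) + len(dev) + len(test), "of", len(Ciris), "samples used")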
Two types of probabilities must be estimated: the a priori probability P(c) of each class, and the probability P(x | c) of the features given the class, modeled here by one Gaussian per feature and per class.
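The classifier built below combines these two quantities through the (naive) Bayes decision rule, under the assumption that the features are conditionally independent given the class: for a feature vector $x = (x_1, \dots, x_d)$, the predicted class is

$$\hat{c} = \arg\max_{c} \; P(c)\,\prod_{i=1}^{d} \mathcal{N}\!\left(x_i \mid \mu_{c,i}, \sigma_{c,i}^{2}\right),$$

where $\mu_{c,i}$ and $\sigma_{c,i}$ are the mean and standard deviation of feature $i$ within class $c$, estimated on the learning set.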
class GaussianNB(object):
all_color = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
def __init__(self, feature_names, target_names, select_features):
"""
feature_names is Array( "name of feature 0", "name of feature 1", ..)
target_names is Array ( "name of class0", ..)
"""
self.feature_names = np.array(feature_names)[select_features]
self.target_names = target_names
self._init_color()
def _init_color(self):
self.color = self.all_color[:len(self.target_names)]
def fit(self, X, y, select_features):
separated = self._extract_feature_class(X[:, select_features],y)
# print(separated[0])
# print(np.mean(separated[0], axis=0))
self.model = self._calcul_model(separated)
# print(self.model)
self.wi_mean = self.calculate_wi_mean(y)
return self
def calculate_wi_mean(self, y):
"""Les probabilités a priori"""
return np.array( [ list(y).count(class_x)/len(y) for class_x in np.unique(y)] )
def _extract_feature_class(self, X, y):
"""
extract features by class association
return Array(
Array(feature_1, feature_2,..) /* for class 0 */,
Array(feature_1, feature_2,..) /* for class 1 */,
..)
"""
result = []
for class_x in np.unique(y):
separated = []
for features, target in zip(X, y):
if target == class_x:
separated.append(features)
result.append(separated)
return result
def _calcul_model(self, separated):
"""
        calculate the mean and the standard deviation of each feature for each class.
return Array(
Array(mean, std) /* for class 0 */,
Array(mean, std) /* for class 1 */,
..)
"""
return np.array([np.c_[np.mean(i, axis=0), np.std(i, axis=0)] # np.c_ == zip
for i in separated])
    def _make_plot(self, mu, std):
        sigma = std
        x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
        label = "μ = %.02f σ = %.02f" % (mu, sigma)
        # mlab.normpdf was removed from recent matplotlib, so use our own Gaussian pdf
        plt.plot(x, self._norm_pdf(x, mu, sigma), self.color.pop(), label=label)
def plot_features(self):
"""
        display one plot per selected feature; each plot shows one Gaussian curve per class
"""
tmp_feature_name = list(self.feature_names) # copy the list
transpose = self.model.transpose()
# Array ( Array(class1, class2, ...) --> feature1, ... )
mu = transpose[0]
stds = transpose[1]
for m,s in zip(mu, stds):
plt.title(tmp_feature_name.pop(0), y=1.10)
plt.xlabel('x')
plt.ylabel('P(μ, σ)(x)')
for a, b in zip(m,s):
self._make_plot(a,b)
self._init_color()
first_legend = plt.legend()
ax = plt.gca().add_artist(first_legend)
        # create a second legend mapping the plotted curves to the class names
plt.legend(self.target_names, bbox_to_anchor=(0,1.02,1,0.2), loc="lower right",
mode="expand", borderaxespad=0, ncol=len(self.target_names))
plt.show()
    def _norm_pdf(self, x, mean, std):
        """
        Gaussian probability density function
        http://en.wikipedia.org/wiki/Normal_distribution#Probability_density_function
        (works on scalars as well as numpy arrays)
        """
        var = std**2
        denom = np.sqrt(2*np.pi*var)
        num = np.exp(-(x - mean)**2/(2*var))
        return num/denom
    def predict_proba(self, X):
        # unnormalized posterior: P(c) * prod_i P(x_i | c) for each sample and each class
        return np.array([self.wi_mean * [np.prod([self._norm_pdf(i, *m) for m, i in zip(model, x)])
                                         for model in self.model]
                         for x in X])
def predict(self, X):
return np.argmax(self.predict_proba(X), axis=1)
def score(self, X, y):
return sum(self.predict(X) == y) / len(y)
def plot_for_2_feature(self, X, Y, feature_index = [0,1]):
"""
plot only 2 dim (2 features)
by default it will be the first(0) and second(1) features
"""
# Plot
fig = plt.figure(figsize=(5, 3.75))
ax = fig.add_subplot(111)
for x, y, class_x, class_res in zip(X[:, feature_index[0]], X[:, feature_index[1]], self.predict(X), Y):
# target class
ax.scatter(x,y,100, color=self.all_color[class_res.astype(int)],
edgecolor='k')
# predict class
ax.scatter(x,y,20, color=self.all_color[class_x],
edgecolor='k')
# Plot formatting
plt.title("Confusion Plot (IN = predict, LAYER = real)")
ax.set_xlabel(self.feature_names[feature_index[0]])
ax.set_ylabel(self.feature_names[feature_index[1]])
plt.tight_layout()
plt.show()
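With only four features the product of densities in predict_proba stays in a safe numerical range, but with many features it can underflow. A common variant (a minimal sketch, not part of the original code) scores the classes in log space instead:
class LogGaussianNB(GaussianNB):
    # hypothetical variant: same fitted model, but scores computed as
    # log P(c) + sum_i log P(x_i | c), which is argmax-equivalent to the product
    def predict_log_proba(self, X):
        return np.array([np.log(self.wi_mean) +
                         np.array([sum(np.log(self._norm_pdf(i, *m)) for m, i in zip(model, x))
                                   for model in self.model])
                         for x in X])

    def predict(self, X):
        return np.argmax(self.predict_log_proba(X), axis=1)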
feature_selected = [0,1,2,3]
feature_selected_name = np.array(iris.feature_names)[feature_selected]
nb = GaussianNB(iris.feature_names, iris.target_names, feature_selected)
nb.fit(learn, learn_target, feature_selected)
nb.plot_features()
Looking at the plots, the features petal width and petal length individually appear to be the most discriminative ones.
An expert would probably point out that they are strongly correlated with each other.
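To quantify that remark (a quick optional check, not part of the original assignment), one can compute the correlation between the two petal measurements on the learning set:
# Pearson correlation between petal length (column 2) and petal width (column 3);
# a value close to 1 confirms that the two features carry largely redundant information
corr = np.corrcoef(learn[:, 2], learn[:, 3])[0, 1]
print("corr(petal length, petal width) =", corr)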
def combs(x):
    """Return every subset of x (as numpy arrays), from the empty set up to the full set."""
    import itertools
    return [np.array(c) for i in range(len(x)+1) for c in itertools.combinations(x, i)]
combi_best = 0.0
features_best = []
for comb in combs(range(len(iris.feature_names))):
    if len(comb) >= 2:
        # choose the features for this run
        feature_selected = comb
        feature_selected_name = np.array(iris.feature_names)[feature_selected]
        nb = GaussianNB(iris.feature_names, iris.target_names, feature_selected)
        nb.fit(learn, learn_target, feature_selected)
        score = nb.score(dev[:, feature_selected], dev_target)
        if score > combi_best:
            combi_best = score
            features_best = feature_selected_name
        print("\nLearn with", feature_selected_name, "comb:", comb)
        print("Score Learn:", nb.score(learn[:, feature_selected], learn_target))
        nb.plot_for_2_feature(dev[:, feature_selected], dev_target)
        print("Score dev:", nb.score(dev[:, feature_selected], dev_target))
print("\nBest of the best")
print(combi_best, features_best)
On the dev set, the best score is 0.931034482759 (i.e. 27 of the 29 dev samples correctly classified), obtained when training on:
* 'sepal length (cm)', 'petal width (cm)'
* 'sepal length (cm)', 'sepal width (cm)', 'petal width (cm)'
* 'sepal length (cm)', 'petal length (cm)', 'petal width (cm)'
* 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'
* 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'
This suggests that the features 'sepal length (cm)' and 'petal width (cm)' (indices [0, 3]) are apparently the ones that discriminate our irises best, since they already reach this score on their own.
def matriceConfusion(estimation, target):
    """Confusion matrix: rows = predicted class, columns = true class."""
    matrice = np.zeros((3, 3))
    for i in range(len(estimation)):
        matrice[estimation[i]][int(target[i])] += 1
    return matrice
feature_selected = [0,3]
nb = GaussianNB(iris.feature_names, iris.target_names, feature_selected)
nb.fit(learn, learn_target, feature_selected)
print("Matrice de confusion DEV :")
print(matriceConfusion(nb.predict(dev[:, feature_selected]), dev_target))
print("Matrice de confusion TEST :")
print(matriceConfusion(nb.predict(test[:, feature_selected]), test_target))
corpus = test[:, feature_selected]
target = test_target
nb.plot_for_2_feature(corpus, target)
print("Score test:", nb.score(corpus, target))
from sklearn.naive_bayes import GaussianNB as GaussianNB_sklearn
clf = GaussianNB_sklearn()
clf.fit(learn, learn_target)
print("SKLEAN naive bayes")
print("Score test:",clf.score(test, test_target))