In this kernel I will use several machine learning models, including Random Forest, Logistic Regression, Extra Trees, K Nearest Neighbors, a fully connected neural network, and a convolutional neural network, to classify 69 classes of fruit from images. Transfer learning is also applied, based on a VGG16 model pretrained on the ImageNet dataset. The data come from a Kaggle dataset: the training set contains 34641 images and the test set contains 11640 images.
import numpy as np
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
import os
import glob
import platform
import cv2
from sklearn import svm
import joblib  # sklearn.externals.joblib is deprecated; use the standalone joblib package
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.layers import LSTM, Input
from keras.optimizers import RMSprop, Adamax
# Reading all pictures
# Each picture -> array of height x width x 3 dimensions
img_height = 48
img_width = 48
train_fruit_images = []
train_labels = []
# Loop over folder inside Training
whatos = platform.system()
for fruit_dir_path in glob.glob("Training/*"):
if whatos == 'Windows':
fruit_label = fruit_dir_path.split("\\")[-1]
else: # Linux and Mac (Unix)
fruit_label = fruit_dir_path.split("/")[-1]
# Loop over each pic in each folder inside Training
for image_path in glob.glob(os.path.join(fruit_dir_path, "*.jpg")):
image = cv2.imread(image_path, cv2.IMREAD_COLOR)
image = cv2.resize(image, (img_height, img_width))
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
train_fruit_images.append(image)
train_labels.append(fruit_label)
train_fruit_images = np.array(train_fruit_images)
train_labels = np.array(train_labels)
label_to_id_dict = {v:i for i,v in enumerate(np.unique(train_labels))}
id_to_label_dict = {v: k for k, v in label_to_id_dict.items()}
train_label_ids = np.array([label_to_id_dict[label] for label in train_labels])
# Same for test set, reading all pictures
# Each picture -> array of height x width x 3 dimensions
test_fruit_images = []
test_labels = []
# Loop over folder inside Validation
whatos = platform.system()
for fruit_dir_path in glob.glob("Validation/*"):
if whatos == 'Windows':
fruit_label = fruit_dir_path.split("\\")[-1]
else: # Linux and Mac (Unix)
fruit_label = fruit_dir_path.split("/")[-1]
# Loop over each pic in each folder inside Validation
for image_path in glob.glob(os.path.join(fruit_dir_path, "*.jpg")):
image = cv2.imread(image_path, cv2.IMREAD_COLOR)
image = cv2.resize(image, (img_height, img_width))
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
test_fruit_images.append(image)
test_labels.append(fruit_label)
test_fruit_images = np.array(test_fruit_images)
test_labels = np.array(test_labels)
test_label_ids = np.array([label_to_id_dict[label] for label in test_labels])
# Plotting some training samples
nplot=10
fig, axes = plt.subplots(nplot, nplot, sharex=True, sharey=True, figsize=(9,9))
train_index = np.arange(0,train_fruit_images.shape[0])
for i in range(nplot):
    for j in range(nplot):
        axes[i,j].imshow(train_fruit_images[np.random.choice(train_index,1,replace=False)[0]])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        #axes[i,j].axis('off')
plt.show()
# Examples of training samples for one fruit
nplot = 10
offset = 492*5
fig, axes = plt.subplots(nplot, nplot, sharex=True, sharey=True, figsize=(9,9))
for i in range(nplot):
    for j in range(nplot):
        axes[i,j].imshow(train_fruit_images[i*nplot+j+offset])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        #axes[i,j].axis('off')
plt.show()
# Train-Test
X_train = train_fruit_images
Y_train = train_label_ids
X_test = test_fruit_images
Y_test = test_label_ids
# Normalizing features between 0 and 1
X_train = X_train/255
X_test = X_test/255
# Flattening each image features to 1D-array
X_train_flat = X_train.reshape(X_train.shape[0], img_height*img_width*3)
X_test_flat = X_test.reshape(X_test.shape[0], img_height*img_width*3)
# One-hot encode of the Output
Y_train = keras.utils.to_categorical(Y_train, len(label_to_id_dict))
Y_test = keras.utils.to_categorical(Y_test, len(label_to_id_dict))
# Shape of input and output
print('Original Sizes:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
print('Flattened:', X_train_flat.shape, X_test_flat.shape)
error_nestimators = []
for n_estimators in [10, 20, 30, 40, 50, 60, 70, 80]:
    #print('Number of Trees: ', n_estimators)
    rf_clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
    rf_clf.fit(X_train_flat, train_label_ids)
    fit_score = rf_clf.score(X_train_flat, train_label_ids)
    test_score = rf_clf.score(X_test_flat, test_label_ids)
    #print('Fitting score: ', fit_score)
    #print('Testing score: ', test_score)
    error_nestimators.append([n_estimators, fit_score, test_score, rf_clf.oob_score_])
error_nestimators = np.array(error_nestimators)
error_maxfeatures = []
for max_features in ['sqrt','log2', None]:
    #print('Max features: ', max_features)
    rf_clf = RandomForestClassifier(n_estimators=30, max_features=max_features)
    rf_clf.fit(X_train_flat, train_label_ids)
    fit_score = rf_clf.score(X_train_flat, train_label_ids)
    test_score = rf_clf.score(X_test_flat, test_label_ids)
    #print('Fitting score: ', fit_score)
    #print('Testing score: ', test_score)
    error_maxfeatures.append([max_features, fit_score, test_score])
error_maxfeatures = np.array(error_maxfeatures)
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(8,4))
ax1.plot(error_nestimators[:,0], error_nestimators[:,1], marker='o', label='Fitting')
ax1.plot(error_nestimators[:,0], error_nestimators[:,2], marker='o', label='Testing')
ax1.plot(error_nestimators[:,0], error_nestimators[:,3], marker='o', label='OOB')
ax1.set_xlabel('# of trees')
ax1.set_ylabel('Score')
ax1.legend()
ax2.plot([0,1,2], error_maxfeatures[:,1].astype(float), marker='o', markersize=10, label='Fitting')
ax2.plot([0,1,2], error_maxfeatures[:,2].astype(float), marker='o', markersize=10, label='Testing')
ax2.set_xlabel('max_features setting')
#ax2.set_ylabel('Score')
ax2.set_xlim(0,2)
ax2.set_xticks([0,1,2])
ax2.set_xticklabels(['Sqrt','Log2','All'])
ax2.legend()
plt.show()
We get the best Random Forest classifier with 60 trees: the fitting score is 1.0, the testing score is about 0.954, and the out-of-bag (OOB) score is 0.993.
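As a sanity check, we can refit the Random Forest with the best setting found above (60 trees) and report its scores; this is a minimal sketch, and the variable name rf_best is introduced here only for illustration.
# Refit the best Random Forest found above (60 trees, OOB score enabled)
rf_best = RandomForestClassifier(n_estimators=60, oob_score=True)
rf_best.fit(X_train_flat, train_label_ids)
print('Fitting score: ', rf_best.score(X_train_flat, train_label_ids))
print('Testing score: ', rf_best.score(X_test_flat, test_label_ids))
print('OOB score: ', rf_best.oob_score_)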
error_nestimators = []
for n_estimators in [10, 20, 30, 40, 50, 60, 70, 80]:
    #print('Number of Trees: ', n_estimators)
    et_clf = ExtraTreesClassifier(n_estimators=n_estimators)
    et_clf.fit(X_train_flat, train_label_ids)
    fit_score = et_clf.score(X_train_flat, train_label_ids)
    test_score = et_clf.score(X_test_flat, test_label_ids)
    error_nestimators.append([n_estimators, fit_score, test_score])
error_nestimators = np.array(error_nestimators)
fig, ax1 = plt.subplots(1,1, figsize=(4,4))
ax1.plot(error_nestimators[:,0], error_nestimators[:,1], marker='o', label='Fitting')
ax1.plot(error_nestimators[:,0], error_nestimators[:,2], marker='o', label='Testing')
ax1.set_xlabel('# of trees')
ax1.set_ylabel('Score')
ax1.legend()
plt.show()
The best result is obtained with 80 trees: the fitting score is 1.0 and the testing score is 0.952, very slightly below the Random Forest classifier. This is reasonable, since the Random Forest and Extra Trees methods are very similar. However, both models overfit considerably.
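For later reuse, the best Extra Trees model (80 trees) could be refitted and persisted with joblib, which is imported above; this is only a sketch, and the filename is an example.
# Refit the best Extra Trees model (80 trees) and save it to disk for later reuse
et_best = ExtraTreesClassifier(n_estimators=80)
et_best.fit(X_train_flat, train_label_ids)
joblib.dump(et_best, 'extra_trees_fruits.pkl')  # example filename
#et_best = joblib.load('extra_trees_fruits.pkl')  # reload later if needed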
nplotx=6
nploty=6
model = rf_clf
fig, axes = plt.subplots(nploty, nplotx, sharex=True, sharey=True, figsize=(10,10))
test_index = np.arange(0,test_fruit_images.shape[0])
for i in range(nploty):
    for j in range(nplotx):
        ind=np.random.choice(test_index,1,replace=False)[0]
        axes[i,j].imshow(test_fruit_images[ind])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        axes[i,j].set_title(str(test_labels[ind])+'\n'+'Predicted: '+str(id_to_label_dict[model.predict(X_test_flat[ind:ind+1])[0]]), fontsize=8)
plt.tight_layout()
plt.show()
lg_error = []
for C in [10000, 100, 1.0, 0.01, 0.0001]:
    lg_clf = LogisticRegression(C=C)
    lg_clf.fit(X_train_flat, train_label_ids)
    fit_score = lg_clf.score(X_train_flat, train_label_ids)
    test_score = lg_clf.score(X_test_flat, test_label_ids)
    lg_error.append([C, fit_score, test_score])
lg_error = np.array(lg_error)
fig, ax1 = plt.subplots(1,1, figsize=(4,4))
ax1.plot(np.log10(1/lg_error[:,0]), lg_error[:,1], marker='o', label='Fitting')
ax1.plot(np.log10(1/lg_error[:,0]), lg_error[:,2], marker='o', label='Testing')
ax1.set_xlabel('Regularization strength, log10(1/C)')
ax1.set_ylabel('Score')
ax1.legend()
plt.show()
With C = 1 we get a fitting score of 1.0 and a testing score of 0.878, which is quite overfit. Increasing the regularization strength does not reduce the overfitting significantly. For very large regularization strength, both the fitting and testing scores drop substantially, to about 0.78 for fitting and 0.66 for testing.
C=1
lg_clf = LogisticRegression(C=C)
lg_clf.fit(X_train_flat, train_label_ids)
nplotx=6
nploty=6
model = lg_clf
fig, axes = plt.subplots(nploty, nplotx, sharex=True, sharey=True, figsize=(10,10))
test_index = np.arange(0,test_fruit_images.shape[0])
for i in range(nploty):
    for j in range(nplotx):
        ind=np.random.choice(test_index,1,replace=False)[0]
        axes[i,j].imshow(test_fruit_images[ind])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        axes[i,j].set_title(str(test_labels[ind])+'\n'+'Predicted: '+str(id_to_label_dict[model.predict(X_test_flat[ind:ind+1])[0]]), fontsize=8)
plt.tight_layout()
plt.show()
knn_error = []
for kn in [3, 5, 10, 20]:
    knn_clf = KNeighborsClassifier(n_neighbors=kn)
    knn_clf.fit(X_train_flat, train_label_ids)
    fit_score = knn_clf.score(X_train_flat, train_label_ids)
    test_score = knn_clf.score(X_test_flat, test_label_ids)
    knn_error.append([kn, fit_score, test_score])
knn_error = np.array(knn_error)
fig, ax1 = plt.subplots(1,1, figsize=(4,4))
ax1.plot(knn_error[:,0], knn_error[:,1], marker='o', label='Fitting')
ax1.plot(knn_error[:,0], knn_error[:,2], marker='o', label='Testing')
ax1.set_xlabel('# neighbor')
ax1.set_ylabel('Score')
ax1.legend()
plt.show()
We get a testing score of about 0.919 with k = 3. As the number of neighbors increases, both the fitting and testing scores decrease, but the overfitting is not reduced. Like the models above, KNN is also quite overfit on this fruit classification problem.
kn = 3
knn_clf = KNeighborsClassifier(n_neighbors=kn)
knn_clf.fit(X_train_flat, train_label_ids)
nplotx=6
nploty=6
model = knn_clf
fig, axes = plt.subplots(nploty, nplotx, sharex=True, sharey=True, figsize=(10,10))
test_index = np.arange(0,test_fruit_images.shape[0])
for i in range(nploty):
    for j in range(nplotx):
        ind=np.random.choice(test_index,1,replace=False)[0]
        axes[i,j].imshow(test_fruit_images[ind])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        axes[i,j].set_title(str(test_labels[ind])+'\n'+'Predicted: '+str(id_to_label_dict[model.predict(X_test_flat[ind:ind+1])[0]]), fontsize=8)
plt.tight_layout()
plt.show()
First we will use a fully connected neural network model, built with the Keras package on a TensorFlow backend.
#1st very simple model
model_dense = Sequential()
model_dense.add(Dense(256, activation='relu', input_shape=(X_train_flat.shape[1],)))
# Dropout layer to reduce variance
model_dense.add(Dropout(0.1))
model_dense.add(Dense(64, activation='relu'))
model_dense.add(Dropout(0.1))
model_dense.add(Dense(Y_train.shape[1], activation='softmax'))
# Summary of model
model_dense.summary()
# Compiling model
model_dense.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
history_model_dense = model_dense.fit(X_train_flat, Y_train, batch_size=128, epochs=30, validation_data=(X_test_flat, Y_test))
score = model_dense.evaluate(X_test_flat, Y_test, verbose=0)
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])
# Deeper model
model_dense = Sequential()
model_dense.add(Dense(256, activation='relu', input_shape=(X_train_flat.shape[1],)))
# Dropout layer to reduce variance
model_dense.add(Dropout(0.2))
model_dense.add(Dense(128, activation='relu'))
# Batch Normalization to stabilize fitting and partially to reduce variance
model_dense.add(BatchNormalization())
model_dense.add(Dense(128, activation='relu'))
model_dense.add(BatchNormalization())
model_dense.add(Dense(128, activation='relu'))
model_dense.add(Dropout(0.5))
model_dense.add(Dense(Y_train.shape[1], activation='softmax'))
# Summary of model
model_dense.summary()
# Compiling model
model_dense.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
history_model_dense = model_dense.fit(X_train_flat, Y_train, batch_size=128, epochs=30, validation_data=(X_test_flat, Y_test))
score = model_dense.evaluate(X_test_flat, Y_test, verbose=0)
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])
It's great! With a slightly deeper network, we achieve 0.934 for testing and 0.962 for fitting. There is still overfitting, as in the models above, but less of it. Dropout was already used to reduce overfitting, but it cannot be avoided completely. We can use early stopping to reduce overfitting further: in our case, stopping after 7 epochs yields a very well generalized neural network model with both fitting and testing scores around 0.92. We can expect a better model with a convolutional neural network (CNN). Let's try CNN!
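As a minimal sketch of the early stopping mentioned above, Keras provides an EarlyStopping callback that halts training once the validation loss stops improving; the patience value below is an illustrative choice, and in practice the callback would be passed to the original fit call rather than to an already-trained model.
# Sketch: stop training when the validation loss stops improving
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history_model_dense_es = model_dense.fit(X_train_flat, Y_train, batch_size=128, epochs=30,
                                         validation_data=(X_test_flat, Y_test),
                                         callbacks=[early_stop])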
nplotx=6
nploty=6
model = model_dense
fig, axes = plt.subplots(nploty, nplotx, sharex=True, sharey=True, figsize=(10,10))
test_index = np.arange(0,test_fruit_images.shape[0])
for i in range(nploty):
    for j in range(nplotx):
        ind=np.random.choice(test_index,1,replace=False)[0]
        axes[i,j].imshow(test_fruit_images[ind])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        axes[i,j].set_title(str(test_labels[ind])+'\n'+'Predicted: '+str(id_to_label_dict[model.predict(X_test_flat[ind:ind+1]).argmax()]), fontsize=8)
plt.tight_layout()
plt.show()
# CNN
# Stacking model
model_cnn = Sequential()
model_cnn.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu', input_shape=(img_height, img_width, 3)))
model_cnn.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu'))
model_cnn.add(MaxPool2D(pool_size=(2,2)))
#model_cnn.add(BatchNormalization())
model_cnn.add(Dropout(0.25))
model_cnn.add(Flatten())
model_cnn.add(Dense(128, activation='relu'))
model_cnn.add(Dropout(0.25))
model_cnn.add(Dense(Y_train.shape[1], activation='softmax'))
# Compiling model
model_cnn.compile(loss='categorical_crossentropy',
optimizer=Adamax(),
metrics=['accuracy']
)
model_cnn.summary()
history_model_cnn = model_cnn.fit(X_train, Y_train, batch_size=128,
epochs=5, validation_data=(X_test, Y_test))
score = model_cnn.evaluate(X_test, Y_test)
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])
# Continue training the same CNN for 5 more epochs (10 in total)
history_model_cnn = model_cnn.fit(X_train, Y_train, batch_size=128,
                                  epochs=5, validation_data=(X_test, Y_test))
score = model_cnn.evaluate(X_test, Y_test)
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])
# deeper CNN
# Stacking model
model_cnn2 = Sequential()
model_cnn2.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', input_shape=(img_height, img_width, 3)))
model_cnn2.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu'))
model_cnn2.add(MaxPool2D(pool_size=(2,2)))
#model_cnn2.add(Dropout(0.1))
model_cnn2.add(BatchNormalization())
model_cnn2.add(Conv2D(filters=128, kernel_size=(5,5), activation='relu'))
model_cnn2.add(Conv2D(filters=128, kernel_size=(5,5), activation='relu'))
model_cnn2.add(MaxPool2D(pool_size=(2,2)))
model_cnn2.add(Flatten())
model_cnn2.add(Dropout(0.25))
model_cnn2.add(Dense(128, activation='relu'))
model_cnn2.add(Dropout(0.25))
model_cnn2.add(Dense(Y_train.shape[1], activation='softmax'))
# Compiling model
model_cnn2.compile(loss='categorical_crossentropy',
optimizer=Adamax(),
metrics=['accuracy']
)
model_cnn2.summary()
history_model_cnn2 = model_cnn2.fit(X_train, Y_train, batch_size=128,
epochs=10, validation_data=(X_test, Y_test))
score = model_cnn2.evaluate(X_test, Y_test)
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])
It's great! We achieve 0.952 for testing and 0.986 for fitting. There is still a bit of overfitting, and we can use early stopping to reduce it further: stopping after 7 epochs gives a very well generalized CNN model with fitting and testing scores of 0.980 and 0.970. We can see that CNNs clearly outperform the other models for image classification.
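To see where early stopping would kick in, we can plot the per-epoch training and validation accuracy stored in the fit history; note that the history keys are 'acc'/'val_acc' in older Keras versions and 'accuracy'/'val_accuracy' in newer ones, which the sketch below checks for.
# Plot training vs. validation accuracy per epoch to choose a stopping point
acc_key = 'acc' if 'acc' in history_model_cnn2.history else 'accuracy'
fig, ax = plt.subplots(1, 1, figsize=(4,4))
ax.plot(history_model_cnn2.history[acc_key], marker='o', label='Fitting')
ax.plot(history_model_cnn2.history['val_' + acc_key], marker='o', label='Testing')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()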
nplotx=6
nploty=6
fig, axes = plt.subplots(nploty, nplotx, sharex=True, sharey=True, figsize=(10,10))
test_index = np.arange(0,test_fruit_images.shape[0])
for i in range(nploty):
    for j in range(nplotx):
        ind=np.random.choice(test_index,1,replace=False)[0]
        axes[i,j].imshow(test_fruit_images[ind])
        axes[i,j].set_xticklabels([])
        axes[i,j].set_yticklabels([])
        axes[i,j].set_title(str(test_labels[ind])+'\n'+'Predicted: '+str(id_to_label_dict[model_cnn.predict(X_test[ind:ind+1]).argmax()]), fontsize=8)
plt.tight_layout()
plt.show()
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
vgg_base = VGG16( include_top=False, weights='imagenet', input_shape=(img_height, img_width, 3))
# Do not change the weights
for layer in vgg_base.layers:
    layer.trainable = False
x = Flatten()(vgg_base.output)
x = Dense(256, activation='relu')(x)
prediction = Dense(Y_train.shape[1], activation='softmax')(x)
vgg_model = Model(inputs=vgg_base.input, outputs=prediction)
vgg_model.summary()
vgg_model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
history_vgg_model = vgg_model.fit(X_train, Y_train, batch_size=128, epochs=10, validation_data=(X_test, Y_test))
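For consistency with the models above, we can also evaluate the transfer-learning model on the test set, following the same pattern as before:
score = vgg_model.evaluate(X_test, Y_test, verbose=0)
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])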
We get good results from transfer learning, but not as good as the CNN above, since we fit only the final layers and keep all parameters of the VGG16 base (pretrained on ImageNet) fixed. We could get better results by allowing all parameters to change, but that fitting would take quite a long time and, more importantly, the current dataset is quite small (~35k samples compared with the ~15 million parameters of the VGG16 base) for fitting all parameters.
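As a rough sketch of that fine-tuning idea, one could unfreeze only the last convolutional block of the VGG16 base and continue training with a small learning rate; the layer-name prefix follows the standard Keras VGG16 naming ('block5'), and the learning rate and number of epochs below are illustrative assumptions, not tuned values.
# Sketch: fine-tune only the last VGG16 block with a small learning rate
for layer in vgg_base.layers:
    layer.trainable = layer.name.startswith('block5')  # unfreeze block5 only
vgg_model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(lr=1e-5),  # small learning rate for fine-tuning
                  metrics=['accuracy'])
history_vgg_finetune = vgg_model.fit(X_train, Y_train, batch_size=128, epochs=5,
                                     validation_data=(X_test, Y_test))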