Intro to k-NN

Working with Sample Fruits Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

%matplotlib inline
In [3]:
# load data
fruits = pd.read_table('fruit_data_with_colors.txt')
fruits.head()
Out[3]:
fruit_label fruit_name fruit_subtype mass width height color_score
0 1 apple granny_smith 192 8.4 7.3 0.55
1 1 apple granny_smith 180 8.0 6.8 0.59
2 1 apple granny_smith 176 7.4 7.2 0.60
3 2 mandarin mandarin 86 6.2 4.7 0.80
4 2 mandarin mandarin 84 6.0 4.6 0.79
In [4]:
fruits.shape
Out[4]:
(59, 7)

Train / test split

In [39]:
X = fruits[['height','width','mass','color_score']]

y = fruits['fruit_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=41)
In [40]:
X_train.head()
Out[40]:
height width mass color_score
27 9.2 7.5 204 0.77
47 9.7 7.3 196 0.72
18 7.1 7.5 162 0.83
39 7.4 6.8 144 0.75
56 8.1 5.9 116 0.73
In [41]:
y_train.head(2)
Out[41]:
27    3
47    4
Name: fruit_label, dtype: int64
In [42]:
X_test.head(2)
Out[42]:
height width mass color_score
49 8.7 5.8 132 0.73
9 7.0 7.4 172 0.89
In [43]:
y_test.head(2)
Out[43]:
49    4
9     1
Name: fruit_label, dtype: int64
In [44]:
# from matplotlib import cm 
# cmap = cm.get_cmap('gnuplot')
# scatter = pd.plotting.scatter_matrix(X_train,c=y_train,marker='o',
#                            s=40,hist_kwds={'bins':15},figsize=(12,12),cmap=cmap)
In [45]:
sns.pairplot(fruits,hue='fruit_label')
Out[45]:
<seaborn.axisgrid.PairGrid at 0x1a1b5dd7f0>

using KNN Classification

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report

Create classifyer object

In [60]:
knn = KNeighborsClassifier(n_neighbors=5)

Train the classifyer (fit the estimator)

In [61]:
knn.fit(X_train,y_train)
Out[61]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
In [49]:
pred = knn.predict(X_test)

Test a prediction on a sample

In [50]:
X_sample = np.array([[7.0,7.4,172,0.89]]) #

pred_sample = knn.predict(X_sample)
In [51]:
fruits['fruit_name'][pred_sample]
Out[51]:
1    apple
Name: fruit_name, dtype: object

Evaluate model

In [52]:
print (classification_report(y_test,pred))
             precision    recall  f1-score   support

          1       0.57      0.80      0.67         5
          2       1.00      1.00      1.00         2
          3       0.50      0.33      0.40         6
          4       0.71      0.71      0.71         7

avg / total       0.64      0.65      0.64        20

In [53]:
print (confusion_matrix(y_test,pred))
[[4 0 1 0]
 [0 2 0 0]
 [2 0 2 2]
 [1 0 1 5]]

Test Accuracy score

  • right predictions
In [54]:
np.mean(pred == y_test)
Out[54]:
0.65
  • test set accuracy
In [55]:
knn.score(X_test,y_test)
Out[55]:
0.65

Visual Plots of k-NN classifyers on sample fruit data

Fruits in the dataset

In [59]:
print (fruits['fruit_name'].unique())
['apple' 'mandarin' 'orange' 'lemon']
In [ ]:
from adspy_shared_utilities import plot_fruit_knn

Decision Boundaries with K=30

In [85]:
plot_fruit_knn(X_train,y_train, 30, 'uniform')
# uniform - weight method. could take 'distance'

Decision Boundaries with K=10

In [64]:
plot_fruit_knn(X_train,y_train, 10, 'uniform')

Decision Boundaries with K=1

In [65]:
plot_fruit_knn(X_train,y_train, 1, 'uniform')

Choosing the appropriate K value

In [75]:
error_rate = []

for i in range(1,30):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
In [77]:
plt.figure(figsize=(10,6))
plt.plot(range(1,30),error_rate,color='b',
         ls='--',marker='o',markerfacecolor='r')
plt.title('Error Rate vs K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Out[77]:
Text(0,0.5,'Error Rate')

Classify with the best value of k

In [78]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred_i = knn.predict(X_test)
In [79]:
print(classification_report(y_test,pred_i))
             precision    recall  f1-score   support

          1       0.57      0.80      0.67         5
          2       1.00      1.00      1.00         2
          3       0.60      0.50      0.55         6
          4       0.83      0.71      0.77         7

avg / total       0.71      0.70      0.70        20

In [80]:
np.mean
Out[80]:
<function numpy.core.fromnumeric.mean>