# set matplotlib backend to inline
%matplotlib inline 

# import modules
from sklearn import datasets 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd

# load data
wine=datasets.load_wine()
#print(wine.DESCR)

# this dataset has 13 features, we will only choose a subset of these
df_wine = pd.DataFrame(wine.data, columns = wine.feature_names )

selected_features = ['alcohol','flavanoids','color_intensity','ash']

# extract the data as numpy arrays of features, X, and target, y
X = df_wine[selected_features].values
y = wine.target


# Adding the Wine Type to the dataset
# This had to be done since I have used Seaborn for the graph
# Seaborn only uses dataframe for hue and to show each classes...
# ...hence a combined copy of data was made by using target and existing data
target = pd.DataFrame(y, columns=['wine'])
frames = [target, df_wine]
combined = pd.concat(frames, axis=1, join='inner')

combined


# define plotting function
# Using seaborn library
import seaborn as sns
# Using seaborn librarie's pairplot function
sns.pairplot(data=combined, vars=selected_features, hue='wine', palette='tab10' )

# run the plotting function
plt.show()


# noise code
mySeed = 123456
np.random.seed(mySeed)
XN = X + np.random.normal(0, 0.6, X.shape);


noise_Data = pd.DataFrame(XN, columns=selected_features);

noise_arr = [target, noise_Data]

Noise_df = pd.concat(noise_arr, axis=1, join='inner')
Noise_df


# SHOWING THE NOISY DATA
sns.pairplot(data=Noise_df, vars=selected_features, hue='wine', palette='tab10')

plt.show()


#  Splitting the test and training data using Sklearn library
# THIS IS THE ONLY PLACE WHER SKLEARN LIBRARY VALUES ARE USED IN MY CODE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Testing code to compare with SKLEARN
# Sklearn values will be later compared to my KNN function

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

knn.fit(X_train, y_train)

y_pred_sklearn = knn.predict(X_test)

acc_sklearn = np.sum(y_pred_sklearn == y_test) / len(y_test)

print("Testing: %s " % y_test)
print("Predicted Sklearn: %s " % y_pred_sklearn)
print("Accuracy Sklearn: %s " % round(acc_sklearn,2))

Testing: [2 2 0 1 0 0 1 0 2 2 0 0 1 0 1 2 1 1 2 1 1 1 0 0 1 0 0 1 0 0 0 1 2 1 0 0] 
Predicted Sklearn: [2 2 0 1 0 0 1 0 2 2 0 0 0 0 1 2 1 1 2 1 1 1 0 0 1 0 0 1 0 0 0 1 2 1 0 0] 
Accuracy Sklearn: 0.97


# My KNN Code
# CODED BY USING DIFFERENT EXTERNAL SOURCES
# REFERENCES ARE GIVEN IN THE END OF THE NOTEBOOK
import numpy as np
from collections import Counter

# Euclidean and Manhattan distance functions
def euclidean_dist(x1, x2):
        return np.sqrt(np.sum((x1-x2)**2))
def manhattan_dist(x1, x2):
     return sum(abs(val1-val2) for val1, val2 in zip(x1,x2))

# My Function based KNN
def myKNN(X_train, y_train, X_test, k, distance):
    output = []
    # Iterating over the length of X_test
    for i in range(len(X_test)):
            # Arrays for indices and labels
            indices = []
            labels = []
            for j in range(len(X_train)):
                # Checking for type of distance given
                if(distance == 'euclidean'):
                    dist = euclidean_dist(X_train[j], X_test[i])
                elif(distance == "manhattan"):
                     dist = manhattan_dist(X_train[j], X_test[i])
                #  Appending indices using the distance
                indices.append([dist, j])
            # Sorting the indices and iterating over them to appeand to output array
            indices.sort()
            indices = indices[0:k]
            for indices, j in indices:
                labels.append(y_train[j])
            ans = Counter(labels).most_common(1)[0][0]
            output.append(ans)
    return output


# Getting predicted values using my KNN function
predict = myKNN(X_train, y_train, X_test, 5, 'euclidean')

# Getting accuracy from the prediction values
acc_knn = np.sum(predict == y_test) / len(y_test)

# Printing out the values
# AFTER COMPARING THE VALUES, THEY ARE SIMILAR TO SKLEARN
print("Testing: %s " % y_test)
print("Predicted My KNN: %s " % predict)
print("Accuracy My KNN: %s " % round(acc_knn, 2))

Testing: [2 2 0 1 0 0 1 0 2 2 0 0 1 0 1 2 1 1 2 1 1 1 0 0 1 0 0 1 0 0 0 1 2 1 0 0] 
Predicted My KNN: [2, 2, 0, 1, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 0] 
Accuracy My KNN: 0.97


# Testing first with SKLEARN
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

# Printing out values using Sklearn to compare later
print("Confusion Matrix (Sklearn):")
print(confusion_matrix(y_test, y_pred_sklearn))
print("Accuracy Score (Sklearn): %s " % round(accuracy_score(y_test, y_pred_sklearn), 2))
print("Precision Score (Sklearn): %s " % precision_score(y_test, y_pred_sklearn, average=None))
print("Recall Score (Sklearn): %s " % recall_score(y_test, y_pred_sklearn, average=None))

Confusion Matrix (Sklearn):
[[16  0  0]
 [ 1 12  0]
 [ 0  0  7]]
Accuracy Score (Sklearn): 0.97 
Precision Score (Sklearn): [0.94117647 1.         1.        ] 
Recall Score (Sklearn): [1.         0.92307692 1.        ]


# confusion matrix, accuracy, precision, recall, etc.
# CODED BY USING EXTERNAL AND COURSERA LAB SOURCES 
# REFERENCES ARE GIVEN IN THE END OF THE NOTEBOOK
# Creating a confusion matrix function
def conf_matrix(y_actual, y_pred):
    classes = np.unique(y_actual)
    matrix = np.zeros((len(classes), len(classes)), dtype=int)

    for i in range(len(classes)):
        for j in range(len(classes)):
            matrix[i, j] = np.sum((y_actual == classes[i]) & (y_pred == classes[j]))
    
    return matrix
# Creating a accuracy function
def accur(y_actual, y_pred):
    return np.sum(y_pred == y_actual) / len(y_actual)

# Creating prepcision function
def precision(y_actual, y_pred):
    matrix = conf_matrix(y_actual, y_pred)
    classes = np.unique(y_actual)
    prec = np.zeros(classes.shape)

    for i in classes:
        prec[i] = matrix[i,i] / sum(matrix[:,i])
    return prec
# Creating recall function
def recall(y_actual, y_pred):
    classes = np.unique(y_pred)
    rec = np.zeros(classes.shape)
    matrix = conf_matrix(y_actual, y_pred)

    for i in classes:
        rec[i] = matrix[i, i] / sum(matrix[i,:])

    return rec

# Printing out values for the data
# In comparison to the SKlearn, the our data evaluation is similar as well.
print('My Confusion Matrix: ')   
print(conf_matrix(y_test, predict))
print('My Accuracy Score:  %s' % round(accur(y_test, predict), 2))
print('My Precision Score:  %s' % precision(y_test, predict));
print('My Precision Score:  %s' % recall(y_test, predict));

My Confusion Matrix: 
[[16  0  0]
 [ 1 12  0]
 [ 0  0  7]]
My Accuracy Score:  0.97
My Precision Score:  [0.94117647 1.         1.        ]
My Precision Score:  [1.         0.92307692 1.        ]


# My Nested Cross Validation Function
# CODED BY USING EXTERNAL AND COURSERA LAB SOURCES
# REFERENCES ARE GIVEN IN THE END OF THE NOTEBOOK
def NestedCrossValidation(X, y, k_fold, neighbour, distance, mySeed):
    
    # Declaring the fold arrays to store the accuracy, parameters and confusion matrix
    acc_fold = [] 
    param_fold = []
    confusion_matrix = []
   
    np.random.seed(mySeed)
    
    # Generating shuffled list from 0 to length of data
    indices = np.random.permutation(np.arange(0,len(X),1))
    
    # Splitting data into different bins
    bins = np.array_split(indices, k_fold)
    
    # Iterating over number of folds, in this case 5
    for i in range(k_fold):
        # list to save indices for training, testing and validation data
        foldTrain=[] 
        foldTest=[]  
        valid_fold = []
        # Setting up initial values for best accuracy and neighbours before...
        # ...going into the two loops
        accuracy_best = 0
        neighbour_best = 0

        # Taking bin i for testing
        foldTest = bins[i]
        # Setting up bin for fold and checking if it is still under 5
        valid_bin = i + 1
        if valid_bin >= k_fold:
            valid_bin = 0
        # Dividing bins into training and validation
        for j in range(len(bins)):
            if j == valid_bin:
                valid_fold = bins[valid_bin]
            else:
                foldTrain.extend( bins[j] )
        
        # Nested Loop for Nested Cross Validation
        # First loop is for distances
        for x in distance:
            # Second loop for number of neighbours
            for z in neighbour:
                # Calling our KNN function
                y_prediction = myKNN(X[foldTrain], y[foldTrain], X[valid_fold], z, x)
                # Calculating accuracy
                acc_score = accur(y[valid_fold], y_prediction)
                # Checking if accuracy is better or not
                # If it is then setting up appropriate parameters
                if acc_score > accuracy_best:
                    neighbour_best = z
                    distance_best = x
                    accuracy_best = acc_score
        
        # Extending the training array with validation data
        foldTrain.extend(valid_fold)
        # Doing a KNN on each fold using the best values taken out from nested loop
        y_final = myKNN(X[foldTrain], y[foldTrain], X[foldTest], neighbour_best, distance_best)
        # Calculating final accuracy for the fold
        acc_final = accur(y[foldTest], y_final)
        # Calculating final matrix for the fold
        matrix = conf_matrix(y[foldTest], y_final)

        # Printing all the values
        print("==============================")
        fold_no = i + 1
        print("Fold Number: %s" % (fold_no))
        print("This Best Accuracy: %s" % round(acc_final, 2))
        print("This Best Distance (Parameter): %s" % distance_best )
        print("This Best Neighbour (Parameter): %s" % neighbour_best )
        print("This Confusion Matrix Per Fold:")
        print(matrix)
        
        # Appending the values so they can be returned and used for summary
        param_fold.append((distance_best, neighbour_best))
        confusion_matrix.append(matrix)
        acc_fold.append(acc_final)

    # Returning the values
    matrices = np.array(confusion_matrix)
    return acc_fold, param_fold, matrices


# evaluate clean data code
dists=["euclidean", "manhattan"]
mySeed=123456
folds=5

# Calling the NCV for noisy data and passing in XN value
# Prints out fold, best accuracy, distance, neighbour and confusion matrix per fold
a_fold_clean, p_fold_clean, conf_matrices_clean = NestedCrossValidation(X,y,folds,list(range(1,11)),dists,mySeed)

==============================
Fold Number: 1
This Best Accuracy: 1.0
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 3
This Confusion Matrix Per Fold:
[[11  0  0]
 [ 0 17  0]
 [ 0  0  8]]
==============================
Fold Number: 2
This Best Accuracy: 1.0
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 1
This Confusion Matrix Per Fold:
[[13  0  0]
 [ 0 11  0]
 [ 0  0 12]]
==============================
Fold Number: 3
This Best Accuracy: 0.92
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 3
This Confusion Matrix Per Fold:
[[11  0  0]
 [ 2 12  1]
 [ 0  0 10]]
==============================
Fold Number: 4
This Best Accuracy: 0.91
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 7
This Confusion Matrix Per Fold:
[[12  0  0]
 [ 3 11  0]
 [ 0  0  9]]
==============================
Fold Number: 5
This Best Accuracy: 1.0
This Best Distance (Parameter): manhattan
This Best Neighbour (Parameter): 3
This Confusion Matrix Per Fold:
[[12  0  0]
 [ 0 14  0]
 [ 0  0  9]]


# evaluate noisy  data code

# Calling the NCV for noisy data and passing in XN value
# Prints out fold, best accuracy, distance, neighbour and confusion matrix per fold
a_fold_noisy, p_fold_noisy, conf_matrices_noisy = NestedCrossValidation(XN,y,folds,list(range(1,11)),dists,mySeed)

==============================
Fold Number: 1
This Best Accuracy: 0.83
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 3
This Confusion Matrix Per Fold:
[[ 9  2  0]
 [ 4 13  0]
 [ 0  0  8]]
==============================
Fold Number: 2
This Best Accuracy: 0.89
This Best Distance (Parameter): manhattan
This Best Neighbour (Parameter): 4
This Confusion Matrix Per Fold:
[[11  1  1]
 [ 0 11  0]
 [ 0  2 10]]
==============================
Fold Number: 3
This Best Accuracy: 0.97
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 3
This Confusion Matrix Per Fold:
[[11  0  0]
 [ 1 14  0]
 [ 0  0 10]]
==============================
Fold Number: 4
This Best Accuracy: 0.83
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 8
This Confusion Matrix Per Fold:
[[11  1  0]
 [ 4  9  1]
 [ 0  0  9]]
==============================
Fold Number: 5
This Best Accuracy: 0.94
This Best Distance (Parameter): euclidean
This Best Neighbour (Parameter): 5
This Confusion Matrix Per Fold:
[[12  0  0]
 [ 2 12  0]
 [ 0  0  9]]


# Summary Code
# A result summary function that takes in parameter and accuracy array
# Extracts the values from the array and saves it in a dataframe
# Written by Myself
import pandas as pd

def result_summary(a_fold, p_fold):

    # Iterating over accuracies_fold
    round_acc=[round(acc, 2) for acc in a_fold]
    # Iterating over parameter fold and separating distances and neighbours
    dist_fold, neighbour = zip(*p_fold)
    # Combining the data in a numpy array
    data = np.array([round_acc,neighbour,dist_fold])
    # Calculating indicies
    indices = [i for i in range(1, folds+1)]
    # Saving in a dataframe using pandas
    df=pd.DataFrame(data.T,indices, ["accuracy", "k", "distance"],)
    # Returning dataframe
    return df


# Clean Data Summary
# Calling summary function
clean_summary = result_summary(a_fold_clean, p_fold_clean)

# Calculating average accuracy and standard deviation
avg_accuracy_clean = np.average(a_fold_clean)
sd_clean = np.std(a_fold_clean)

# Printing average accuracy, standard deviation and showing the dataframe
print("Average Clean Accuracy:  %2f ± %2f" % (avg_accuracy_clean, sd_clean))

clean_summary

Average Clean Accuracy:  0.966190 ± 0.041415


# Noisy Data Summary
# Calling summary function
noisy_summary = result_summary(a_fold_noisy, p_fold_noisy)

# Calculating average accuracy and standard deviation
avg_accuracy_noisy = np.average(a_fold_noisy)
sd_noisy = np.std(a_fold_noisy)

# Printing average accuracy, standard deviation and showing the dataframe
print("Average Noisy Accuracy:  %2f ± %2f" % (avg_accuracy_noisy, sd_noisy))

noisy_summary

Average Noisy Accuracy:  0.893175 ± 0.057428


# New custom functions for class precision and recall
# The older one's couldn't be used
# Written by Myself
def class_precision(matrix):
    classes = np.unique(y)
    result = []
    # goes over the length of classes
    for i in range(len(classes)):
        # For each class, calculates TP and FP
        true_pos = matrix[i, i]
        false_pos = np.sum(matrix[:, i]) - true_pos
        # Uses the Precision Formula to calculate and appeand in array
        c_precision = true_pos / (true_pos + false_pos)
        result.append(c_precision)
    # Returning precision
    return result

def class_recall(matrix):
    classes = np.unique(y)
    result = []
    # goes over the length of classes
    for i in range(len(classes)):
        # For each class, calculates TP and FN
        true_pos = matrix[i, i]
        false_neg = np.sum(matrix[i, :]) - true_pos
        # Uses the Recall Formula to calculate and appeand in array
        c_recall = true_pos / (true_pos + false_neg)
        result.append(c_recall)
    # Returning recall
    return result


# Written by Myself
print('CLEAN')
# clean data summary results
# Summing up the clean matrices
sum_clean_matrix = np.sum(conf_matrices_clean, axis=0)
# Printing recall and precision values for clean data
print("Confusion Matrix:")
print(sum_clean_matrix)
print("Precision: %s" % class_precision(sum_clean_matrix))
print("Recall: %s" % class_recall(sum_clean_matrix))

print("===============")
print('NOISY')

# clean data summary results
# Summing up the noisy matrices
sum_noisy_matrix = np.sum(conf_matrices_noisy, axis=0)
# Printing recall and precision values for noisy data
print("Confusion Matrix:")
print(sum_noisy_matrix)
print("Precision: %s" % class_precision(sum_noisy_matrix))
print("Recall: %s" % class_recall(sum_noisy_matrix))

CLEAN
Confusion Matrix:
[[59  0  0]
 [ 5 65  1]
 [ 0  0 48]]
Precision: [0.921875, 1.0, 0.9795918367346939]
Recall: [1.0, 0.9154929577464789, 1.0]
===============
NOISY
Confusion Matrix:
[[54  4  1]
 [11 59  1]
 [ 0  2 46]]
Precision: [0.8307692307692308, 0.9076923076923077, 0.9583333333333334]
Recall: [0.9152542372881356, 0.8309859154929577, 0.9583333333333334]


import numpy as np
# tie breaker function that is called if there's a tie, takes in neighbour array
def tie_breaker(neighbours):
    # sorting array to see which one is closest
    sorted_neighbours = np.sort(neighbours)
    # assigning the value to a variable
    nearest_value = sorted_neighbours[0]
    # Calculating the length of class and unique values 
    class_length = len(neighbours)
    class_unique = len(np.unique(neighbours))

    # Checking if length and unique values are equal to 2
    if class_length == 2 and class_unique == 2:
        tie_breaker_class = nearest_value
    # Returns the class
    return tie_breaker_class

	wine	alcohol	malic_acid	ash	alcalinity_of_ash	magnesium	total_phenols	flavanoids	nonflavanoid_phenols	proanthocyanins	color_intensity	hue	od280/od315_of_diluted_wines	proline
0	0	14.23	1.71	2.43	15.6	127.0	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065.0
1	0	13.20	1.78	2.14	11.2	100.0	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050.0
2	0	13.16	2.36	2.67	18.6	101.0	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185.0
3	0	14.37	1.95	2.50	16.8	113.0	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480.0
4	0	13.24	2.59	2.87	21.0	118.0	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
173	2	13.71	5.65	2.45	20.5	95.0	1.68	0.61	0.52	1.06	7.70	0.64	1.74	740.0
174	2	13.40	3.91	2.48	23.0	102.0	1.80	0.75	0.43	1.41	7.30	0.70	1.56	750.0
175	2	13.27	4.28	2.26	20.0	120.0	1.59	0.69	0.43	1.35	10.20	0.59	1.56	835.0
176	2	13.17	2.59	2.37	20.0	120.0	1.65	0.68	0.53	1.46	9.30	0.60	1.62	840.0
177	2	14.13	4.10	2.74	24.5	96.0	2.05	0.76	0.56	1.35	9.20	0.61	1.60	560.0

	wine	alcohol	flavanoids	color_intensity	ash
0	0	14.511467	2.890282	4.734565	1.748621
1	0	13.927267	2.656071	4.451525	1.513458
2	0	12.642891	1.977258	5.383042	3.313082
3	0	14.802933	3.065937	7.176255	2.663116
4	0	12.985017	3.030212	4.485739	2.217560
...	...	...	...	...	...
173	2	14.536851	0.815131	7.288758	2.436147
174	2	12.269144	1.184646	7.029788	2.513411
175	2	12.507302	0.683971	10.051960	1.402465
176	2	12.410797	1.106591	8.809080	2.696125
177	2	15.076628	0.570022	8.736084	1.770933

Fold	accuracy	k	distance
1	.?	?	?
2	.?	?	?
3	.?	?	?
4	.?	?	?
5	.?	?	?
total	.? $\pm$ ?

Fold	accuracy	k	distance
1	.?	?	?
2	.?	?	?
3	.?	?	?
4	.?	?	?
5	.?	?	?
total	.? $\pm$ ?

MIDTERM - Machine Learning¶

1. Exploratory Data Analysis¶

1.1. Visualising the data¶

1.2. Exploratory Data Analysis under noise¶

Q1. Exploratory data analysis¶

Q2. Data with noise¶

2. Implementing kNN¶

3. Classifier evaluation¶

4. Nested Cross-validation using your implementation of KNN¶

5. Summary of results¶

A CUSTOM SUMMARY IS PRINTED BELOW USING PANDAS AND CUSTOM SUMMARY FUNCTION¶

5.2. Confusion matrix summary¶

6. More questions¶

Q3. Influence of noise¶

Q4. Tie break¶

Q5. Beyond Wine¶

References:¶

KNN Function (Section 2):¶

Classifier Evaluation (Section 3):¶

Nested Cross Validation Function (Section 4):¶

	accuracy	k	distance
1	1.0	3	euclidean
2	1.0	1	euclidean
3	0.92	3	euclidean
4	0.91	7	euclidean
5	1.0	3	manhattan

	accuracy	k	distance
1	0.83	3	euclidean
2	0.89	4	manhattan
3	0.97	3	euclidean
4	0.83	8	euclidean
5	0.94	5	euclidean