Concepts¶

variationalform https://variationalform.github.io/¶

Just Enough: progress at pace¶

https://variationalform.github.io/

https://github.com/variationalform

https://www.brunel.ac.uk/people/simon-shaw.

This work is licensed under CC BY-SA 4.0 (Attribution-ShareAlike 4.0 International)

Visit http://creativecommons.org/licenses/by-sa/4.0/ to see the terms.

This document uses Python and also makes use of LaTeX in Markdown.

Penguins: Exploring Some Key Concepts¶

We'll be using the penguins data that we have seen before in order to introduce, understand and work with these concepts:

  • Binary Classifiers, Confusion Matrices, Decision Boundaries
  • True and False Positives and Negatives
  • Sensitivity, specificity, precision, recall, F1 score, prevalence

These terms and their usage will be coming back again and again.

We will also discuss the fairness dilemma, using Sumpter (2021) as a source:

  • Sumpter (2021) Ethics in Machine Learning, In: Machine Learning: A First Course for Engineers and Scientists, Cambridge University Press

This is simply Chapter 12 of our reference book [MLFCES], as introduced in an earlier session:

  • MLFCES: Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön. Cambridge University Press. http://smlbook.org
In [1]:
# First we bring in our standard imports...
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
import pandas as pd
import seaborn as sns
import time

... plus the newer ones we learned about last time.

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Back to where we were¶

We'll bring in the penguins data set and clean it up just as in the previous session. We do this bit quickly because there is nothing new here.

NOTE: we won't always be going through such detailed coding. The purpose of these introductory sessions is to set you up with code templates that you can take and adjust to suit your needs. Once you have those we will be able to spend more time looking at the machine learning models themselves.

In [3]:
dfp = sns.load_dataset('penguins')
dfp2 = dfp.dropna()
dfp2.isna().sum()
dfp2 = dfp2.reset_index(drop=True)
In [4]:
dfp.head()
Out[4]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
In [5]:
dfp2.head()
Out[5]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
4 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male

We have already seen how we can use $k$-NN to predict the species in column zero, from the numerical data in columns 3 - 6 (indexed as 2 - 5).

Here is that code again...

We fitted the model using the Manhattan metric: $\Vert\boldsymbol{x}^* - \boldsymbol{x}_i\Vert_1$, and then plotted the confusion matrix and performance data.
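
As a quick reminder of what that metric computes, the Manhattan ($\ell_1$) distance is just the sum of the absolute differences of the coordinates. Here is a minimal sketch using the raw (unscaled) features of the first two penguins in the table above:

In [ ]:
# Manhattan (l_1) distance between the raw feature vectors of the first two
# penguins above - just the sum of absolute coordinate differences
a = np.array([39.1, 18.7, 181.0, 3750.0])
b = np.array([39.5, 17.4, 186.0, 3800.0])
print(np.sum(np.abs(a - b)))   # 0.4 + 1.3 + 5.0 + 50.0 = 56.7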

In [6]:
# create our labelled training and test data sets with 60/40 train/test split
X = dfp2.iloc[:, 2:6].values
y = dfp2.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
In [7]:
# scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [8]:
# obtain the classifier and fit it using 2 nearest neighbours
# and the Manhattan norm
classifier = KNeighborsClassifier(n_neighbors=2, p=1)
classifier.fit(X_train, y_train)
Out[8]:
KNeighborsClassifier(n_neighbors=2, p=1)
In [9]:
# Now use the fitted model from the training data to predict
# from the test data.
y_pred = classifier.predict(X_test)
In [10]:
# create a confusion matrix to assess the performance

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

clsrep = classification_report(y_test, y_pred)
print("Classification Report:",)
print(clsrep)

accsc = accuracy_score(y_test,y_pred)
print("Accuracy:", accsc)
Confusion Matrix:
[[58  0  0]
 [ 2 27  0]
 [ 0  0 47]]
Classification Report:
              precision    recall  f1-score   support

      Adelie       0.97      1.00      0.98        58
   Chinstrap       1.00      0.93      0.96        29
      Gentoo       1.00      1.00      1.00        47

    accuracy                           0.99       134
   macro avg       0.99      0.98      0.98       134
weighted avg       0.99      0.99      0.98       134

Accuracy: 0.9850746268656716

The Confusion Matrix¶

The confusion matrix is square with the same number of rows/columns as there are values for the label. In our case there are three possible label values: Adelie, Chinstrap, and Gentoo. We can refer to these as group 1, 2 and 3.

The entry in row $i$ and column $j$ of the confusion matrix tells us how many of the data points in X_test belonging to group $i$ were predicted by the model to be in group $j$. For details see

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
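
If you want to convince yourself of this, here is a minimal sanity check, assuming the y_test and y_pred arrays from above (the classes are ordered as in classifier.classes_):

In [ ]:
# count, for each (true, predicted) pair of species, the number of test points
# in that cell - this should reproduce the confusion matrix entry by entry
for i, true_label in enumerate(classifier.classes_):
    for j, pred_label in enumerate(classifier.classes_):
        count = np.sum((y_test == true_label) & (y_pred == pred_label))
        print(f'cm[{i},{j}] ({true_label} -> {pred_label}) = {count}')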

Now, the representation of the confusion matrix above is a numpy array. Although that is useful for coding, it isn't very user-friendly. The following code gives us something much nicer, and much easier to understand.

In [11]:
from sklearn.metrics import ConfusionMatrixDisplay
cmplot = ConfusionMatrixDisplay(cm, display_labels=classifier.classes_)
cmplot.plot()
plt.show()

We can now immediately get a feeling for how good the model is. The diagonal elements tell us how many species predictions match the true value. The off-diagonals tell us how many misses there are, and how they missed.

For example, the number in the middle of the top row tells us how many Adelie penguins were mistakenly predicted to be Chinstraps.

Also, the overall accuracy can be determined by adding up all the numbers in the matrix to get a total $B$, and adding all the diagonal elements together to get $A$. The value of $A/B$ then tells us the proportion of correct predictions - and that is the Accuracy score above.
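
In code, a minimal check of this using the cm array computed above:

In [ ]:
# accuracy as (sum of diagonal) / (sum of all entries) - this should match
# the accuracy score printed earlier
A = np.trace(cm)
B = cm.sum()
print(A, B, A / B)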

We haven't yet properly reviewed the mathematical concept of a matrix, although we will do so soon. We will be coming back to confusion matrices over and over again though.

The confusion matrix can be large or small. A particularly important case is the $2\times 2$ case.

Binary Classifier¶

What we are doing above is classifying: given data from a specific observation we are deciding what class that observation belongs to.

In the case above we are attempting to use physiological data to classify the species of penguin.

A particularly important type of classifier arises when we are simply trying to decide 'Yes' or 'No', or 'True' or 'False', 'Guilty' or 'Innocent', 'Diseased' or 'Healthy' and so on.

There are only two classes: generically termed positive and negative.

Such a classifier is called a Binary Classifier, and the confusion matrix bears further discussion in this case.

Let's return to the penguin data and try to predict gender from the other physiological characteristics. This will be a binary classifier, because it will either predict 'Female' (positive) or 'Not-Female' (negative).

Let's start by recalling the structure of the data frame.

In [12]:
dfp2.head()
Out[12]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
4 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male

We can double check that 'Female' and 'Male' are the only entries in the gender column like this:

In [13]:
dfp2['sex'].unique()
Out[13]:
array(['Male', 'Female'], dtype=object)

Predicting gender from just two data items¶

Below we want to introduce the notion of a decision boundary. This is best done graphically and for that we want to work with 2D plots.

This means that we want our $k$-NN classifier to be able to predict gender from just two columns of data.

The question, then, is which two columns to choose?

Let's have a look at the choices available to us. We could try looking at the table...

In [14]:
dfp2.head()
Out[14]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
4 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male

... but how are we ever going to make a good decision about which pair of columns to use just by looking at the table like that?

Here is a better way (at least if you don't have too many columns):

In [15]:
sns.pairplot(dfp2, corner=True, hue='sex', height=1.7)
Out[15]:
<seaborn.axisgrid.PairGrid at 0x7fc3399aa630>

It looks from this that bill depth and body mass should work well to separate out genders.

Even here, the term Decision Boundary might make some intuitive sense to you. What do you think it might mean?

Let's build the binary classifier using just these two columns, and get the confusion matrix just as before.

We'll do it all in one go, because we've seen all the steps before...

In [16]:
# extract the input/features X, and the output/labels y
X = dfp2.iloc[:, [3,5]].values
y = dfp2.iloc[:, 6].values
# optionally, print the first few just to check
print(X[0:4,:])
print(y[0:4])
# bifurcate the data to get a 40% test set, and 60% training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

# print out the sizes of the train and test sets
print('\n')
print('X_train has ', X_train.shape[0], ' rows and ', X_train.shape[1], ' columns')
print('y_train has ', y_train.shape[0], ' rows')
print('\n')
print('X_test has ', X_test.shape[0], ' rows and ', X_test.shape[1], ' columns')
print('y_test has ', y_test.shape[0], ' rows')
print('\n')

# get scaling parameters from the training data
scaler = StandardScaler()
scaler.fit(X_train)
# scale both the training and the test data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# now classify using k=2, p=1 - as before
classifier = KNeighborsClassifier(n_neighbors=2, p=1)
classifier.fit(X_train, y_train)
# and then make predictions from the test data 
y_pred = classifier.predict(X_test)

# compare the predictions with the ground truth, or hold-out set, y_test
cm = confusion_matrix(y_test, y_pred)

# and print the results out
print("Confusion Matrix:")
print(cm)
clsrep = classification_report(y_test, y_pred)
print('\n')
print("Classification Report:",)
print(clsrep)
accsc = accuracy_score(y_test,y_pred)
print('\n')
print("Accuracy:", accsc)
[[  18.7 3750. ]
 [  17.4 3800. ]
 [  18.  3250. ]
 [  19.3 3450. ]]
['Male' 'Female' 'Female' 'Female']


X_train has  199  rows and  2  columns
y_train has  199  rows


X_test has  134  rows and  2  columns
y_test has  134  rows


Confusion Matrix:
[[69  1]
 [15 49]]


Classification Report:
              precision    recall  f1-score   support

      Female       0.82      0.99      0.90        70
        Male       0.98      0.77      0.86        64

    accuracy                           0.88       134
   macro avg       0.90      0.88      0.88       134
weighted avg       0.90      0.88      0.88       134



Accuracy: 0.8805970149253731

We have seen before that there is another way to display the confusion matrix that is friendlier on the eye. We'll do this below.

Also, we note from above that there are 134 data points in the test set and so we should expect the numbers in the confusion matrix to sum to that number. It's a useful check...

In [17]:
print(cm.sum())
134
In [18]:
cmplot = ConfusionMatrixDisplay(cm, display_labels=classifier.classes_)
cmplot.plot()
plt.show()

True and False Positives and Negatives¶

There are a few particularly important pieces of terminology that are associated with binary classifiers.

  • TP, True Positives: This is the number of test data points that are labelled POSITIVE for which the classifier correctly (truly) predicted them as POSITIVE.

  • FP, False Positives: This is the number of test data points that are labelled NEGATIVE for which the classifier incorrectly (falsely) predicted them as POSITIVE.

Since we are regarding Female as POSITIVE, we can see these numbers in the first column above.

Furthermore,

  • FN, False Negatives. This is the number of test data points that are labelled POSITIVE for which the classifier incorrectly (falsely) predicted them as NEGATIVE.

  • TN, True Negatives. This is the number of test data points that are labelled NEGATIVE for which the classifier correctly (truly) predicted them as NEGATIVE.

These last two are in the second column.

Diagrammatically we have

$$ \begin{array}{rcc} \begin{array}{r} \text{target, or true} \\ \text{label/class} \end{array}\quad & \begin{array}{c} + \\ - \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & & \begin{array}{cc} + & - \end{array} \\ & & \text{output, or predicted} \\ & & \text{label/class} \\ \end{array} $$

We can access these numbers in code as follows:

In [19]:
TP = cm[0,0]; FP = cm[1,0]; FN = cm[0,1]; TN = cm[1,1]
print(TP, FN)
print(FP, TN)
69 1
15 49

Related Formulae and Measures¶

Once we have these data we can compute a wide variety of different performance indicators. Here are the most commonly used, where we use $\mathrm{P}$ and $\mathrm{N}$ for the number of positives and negatives overall in the test set.

  • Prevalence:
$$ \mathrm{Prevalence} = \frac{\mathrm{P}}{\mathrm{P}+\mathrm{N}} $$
  • TPR: True Positive Rate, sensitivity, recall:
$$ \mathrm{TPR} = \frac{\mathrm{TP}}{\mathrm{P}} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}} $$
  • TNR: True Negative Rate, specificity, selectivity:
$$ \mathrm{TNR} = \frac{\mathrm{TN}}{\mathrm{N}} = \frac{\mathrm{TN}}{\mathrm{TN}+\mathrm{FP}} $$
  • FPR: False Positive Rate:
$$ \mathrm{FPR} = \frac{\mathrm{FP}}{\mathrm{N}} = \frac{\mathrm{FP}}{\mathrm{FP}+\mathrm{TN}} $$
  • FNR: False Negative Rate:
$$ \mathrm{FNR} = \frac{\mathrm{FN}}{\mathrm{P}} = \frac{\mathrm{FN}}{\mathrm{FN}+\mathrm{TP}} $$
  • Accuracy:
$$ \mathrm{Accuracy} = \frac{\mathrm{TP}+\mathrm{TN}}{\mathrm{P}+\mathrm{N}} $$
  • Balanced Accuracy:
$$ \mathrm{Balanced\ Accuracy} = \frac{1}{2}\left(\mathrm{TPR}+\mathrm{TNR}\right) $$
  • PPV: Positive Predictive Value, precision:
$$ \mathrm{PPV} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FP}} $$
  • NPV: Negative Predictive Value:
$$ \mathrm{NPV} = \frac{\mathrm{TN}}{\mathrm{TN}+\mathrm{FN}} $$
  • F1 score: this is the harmonic mean of precision and sensitivity:
$$ \mathrm{F1} = \left( \frac{\displaystyle\left(\frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FP}}\right)^{-1} + \left(\frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}}\right)^{-1}}{2} \right)^{-1} = \frac{2\,\mathrm{TP}}{2\,\mathrm{TP}+\mathrm{FP}+\mathrm{FN}} $$

Let's calculate some of these...

In [20]:
P=TP+FN
N=FP+TN
print('P (# positives) = ', P)
print('N (# negatives) = ', N)
print('TPR (recall)    = ', TP/(TP+FN))
print('PPV (precision) = ', TP/(TP+FP))
print('F1              = ', 2*TP/(2*TP+FP+FN))
print('Accuracy        = ', (TP+TN)/(P+N))
P (# positives) =  70
N (# negatives) =  64
TPR (recall)    =  0.9857142857142858
PPV (precision) =  0.8214285714285714
F1              =  0.8961038961038961
Accuracy        =  0.8805970149253731
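
The remaining measures in the list above can be computed in just the same way from TP, FP, FN and TN; for example:

In [ ]:
# the other performance measures, computed from the same TP, FP, FN, TN, P, N
print('TNR (specificity) = ', TN/(TN+FP))
print('FPR               = ', FP/(FP+TN))
print('FNR               = ', FN/(FN+TP))
print('NPV               = ', TN/(TN+FN))
print('Balanced Accuracy = ', 0.5*(TP/(TP+FN) + TN/(TN+FP)))
print('Prevalence        = ', P/(P+N))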

But - remember this?

In [21]:
print("Classification Report:",)
print(clsrep)
accsc = accuracy_score(y_test,y_pred)
print("Accuracy:", accsc)
Classification Report:
              precision    recall  f1-score   support

      Female       0.82      0.99      0.90        70
        Male       0.98      0.77      0.86        64

    accuracy                           0.88       134
   macro avg       0.90      0.88      0.88       134
weighted avg       0.90      0.88      0.88       134

Accuracy: 0.8805970149253731

With Female as the positive class we can read off the precision, recall and F1 score from the table, and we obtain the accuracy as a by-product of the classification as well.

Also, support tells us what $\mathrm{P}$ is.
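
If you only need one or two of these numbers, scikit-learn also provides individual helper functions. A minimal sketch, assuming the current y_test and y_pred and taking Female as the positive class:

In [ ]:
# these should agree with the Female row of the classification report above
from sklearn.metrics import precision_score, recall_score, f1_score
print('precision = ', precision_score(y_test, y_pred, pos_label='Female'))
print('recall    = ', recall_score(y_test, y_pred, pos_label='Female'))
print('F1        = ', f1_score(y_test, y_pred, pos_label='Female'))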

Decision Boundaries¶

Let's now try and develop a bit more intuition as to how this classifier works, and what we can expect from it.

The following observation will be relevant to other classification techniques as well.

Let's recap:

  • we have a training data set of features.
  • these features are points in space.
  • each training point has a label - its class
  • we introduce a new point: it will have some 'nearest neighbours'
  • we use the nearest neighbours' classes to classify the new point

Imagine this for a binary classifier where the data points, the features, are points in 2D.

We can imagine that the binary output can be coloured. For example:

RED for POSITIVE (i.e. female)

BLUE for NEGATIVE (i.e. male)

We can already see something like this from the scatter plots.

In [22]:
sns.scatterplot(data=dfp2, x="bill_depth_mm", y="body_mass_g", hue="sex")
Out[22]:
<AxesSubplot:xlabel='bill_depth_mm', ylabel='body_mass_g'>

From this diagram we can imagine drawing a line that attempts to separate the RED and BLUE regions. Such a line is called a Decision Boundary.

On one side of the boundary we decide POSITIVE, and on the other we decide NEGATIVE.

We're not going to actually get a pen and draw this line though.

We're going to illustrate the decision boundary with code.

Plotting the decision boundary¶

We are going to do this at a low level using matplotlib. First let's look closely at our predictions.

In [23]:
print(y_pred)
['Female' 'Male' 'Female' 'Male' 'Female' 'Female' 'Female' 'Female'
 'Female' 'Female' 'Male' 'Female' 'Male' 'Female' 'Male' 'Male' 'Male'
 'Female' 'Female' 'Male' 'Female' 'Female' 'Female' 'Male' 'Female'
 'Male' 'Female' 'Female' 'Male' 'Female' 'Female' 'Male' 'Female'
 'Female' 'Male' 'Male' 'Female' 'Female' 'Male' 'Female' 'Male' 'Female'
 'Female' 'Female' 'Female' 'Female' 'Female' 'Male' 'Female' 'Male'
 'Female' 'Female' 'Female' 'Female' 'Female' 'Female' 'Male' 'Male'
 'Male' 'Male' 'Male' 'Female' 'Female' 'Female' 'Male' 'Male' 'Female'
 'Female' 'Male' 'Female' 'Female' 'Female' 'Female' 'Female' 'Female'
 'Female' 'Female' 'Female' 'Female' 'Female' 'Male' 'Female' 'Male'
 'Female' 'Female' 'Female' 'Female' 'Female' 'Female' 'Male' 'Female'
 'Female' 'Female' 'Female' 'Female' 'Male' 'Female' 'Male' 'Female'
 'Female' 'Male' 'Female' 'Female' 'Female' 'Male' 'Female' 'Female'
 'Male' 'Female' 'Female' 'Male' 'Male' 'Male' 'Female' 'Female' 'Male'
 'Female' 'Male' 'Male' 'Female' 'Male' 'Male' 'Female' 'Female' 'Male'
 'Female' 'Male' 'Male' 'Male' 'Male' 'Female' 'Male' 'Male' 'Male']

We're going to find all the array indices for POSITIVE (Female) predictions and all the indices for NEGATIVE (Male) predictions.

We can print them out to see what is going on, but we don't have to...

In [24]:
indxM = np.where(y_pred == 'Male')[0]
indxF = np.where(y_pred != 'Male')[0]
print(indxM, len(indxM))
print(indxF, len(indxF))
[  1   3  10  12  14  15  16  19  23  25  28  31  34  35  38  40  47  49
  56  57  58  59  60  64  65  68  80  82  89  95  97 100 104 107 110 111
 112 115 117 118 120 121 124 126 127 128 129 131 132 133] 50
[  0   2   4   5   6   7   8   9  11  13  17  18  20  21  22  24  26  27
  29  30  32  33  36  37  39  41  42  43  44  45  46  48  50  51  52  53
  54  55  61  62  63  66  67  69  70  71  72  73  74  75  76  77  78  79
  81  83  84  85  86  87  88  90  91  92  93  94  96  98  99 101 102 103
 105 106 108 109 113 114 116 119 122 123 125 130] 84

We can visualize these predictions with a scatter plot like this:

In [25]:
plt.scatter(X_test[indxM,0], X_test[indxM,1], color='blue')
plt.scatter(X_test[indxF,0], X_test[indxF,1], color='red')
Out[25]:
<matplotlib.collections.PathCollection at 0x7fc339a4fcc0>

And we can recall the ground truth in the whole data set like this:

In [26]:
sns.scatterplot(data=dfp2, x="bill_depth_mm", y="body_mass_g", hue="sex")
print(dfp2.shape, 0.4*dfp2.shape[0])
(333, 7) 133.20000000000002

They aren't the same. They shouldn't be. This is because:

  • The number of predictions is less than the number of data points because of the train-test split.

  • Also, the predictions may not be correct, so points at the same location in 2D may have different colours.

We can easily check the first of these above by looking at the shape of the arrays like this...

In [27]:
print(y_pred.shape)
print(X_test.shape)
print(X_test[indxM,:].shape)
print(X_test[indxF,:].shape)
print(X_test[indxM,:].shape[0] + X_test[indxF,:].shape[0])
(134,)
(134, 2)
(50, 2)
(84, 2)
134

However, these scatter plots don't really give us the separating line that we have been discussing - the Decision Boundary.

That is going to take a bit more work.

The idea is to decide on a range of values for the features in X_test and then to create a regular grid.

We then predict the label at each point of that regular grid and colour the points accordingly.

It is a bit involved, but here we go... The code was adapted from that found here:

https://hackernoon.com/how-to-plot-a-decision-boundary-for-machine-learning-algorithms-in-python-3o1n3w07

In [28]:
# define bounds of the domain using max and min of our features
x1min, x1max = X_test[:, 0].min()-1, X_test[:, 0].max()+1
x2min, x2max = X_test[:, 1].min()-1, X_test[:, 1].max()+1

# define the x and y scale - this sets up a point spacing of 0.1
x1grid = np.arange(x1min, x1max, 0.1)
x2grid = np.arange(x2min, x2max, 0.1)

# create arrays for the grid coordinates
xx1, xx2 = np.meshgrid(x1grid, x2grid)
print('xx1.shape = ', xx1.shape)
print('xx2.shape = ', xx2.shape)
xx1.shape =  (62, 61)
xx2.shape =  (62, 61)

This is fine for plotting, but the prediction model needs just a list of pairs of features, so we have to flatten these grids.

In [29]:
xx1xx2 = np.stack((xx1.flatten(), xx2.flatten()), axis=-1)
print(xx1xx2.shape)
(3782, 2)

Now we can predict the label at each of the grid points.

NOTE: X_test has already been normalized.

In [30]:
y_pred = classifier.predict(xx1xx2)

And then scatter plot these in different colours according to prediction.

In [31]:
indxF = np.where(y_pred == 'Female')[0]
indxM = np.where(y_pred != 'Female')[0]
plt.scatter(xx1xx2[indxM,0], xx1xx2[indxM,1], color='blue' , s=2)
plt.scatter(xx1xx2[indxF,0], xx1xx2[indxF,1], color='red', s=2)
Out[31]:
<matplotlib.collections.PathCollection at 0x7fc349344f28>
In [32]:
# the ground truth of the entire data set we have seen earlier, it's this:
sns.scatterplot(data=dfp2, x="bill_depth_mm", y="body_mass_g", hue="sex")
print(dfp2.shape, 0.4*dfp2.shape[0])
(333, 7) 133.20000000000002

It would be nice to overlay these onto the decision boundary diagram.

Well, first we plot these in matplotlib like so:

In [33]:
indxM = np.where(y == 'Male')[0]
indxF = np.where(y != 'Male')[0]
plt.scatter(X[indxM,0], X[indxM,1], color='blue')
plt.scatter(X[indxF,0], X[indxF,1], color='red')
Out[33]:
<matplotlib.collections.PathCollection at 0x7fc3399345f8>

It is now just a case of repeating the decision boundary plot from earlier, and then following up with the one above.

We have to do this in the same cell in order that they appear on the same set of axes.

Remember also that the ground truth in X and y needs to be scaled. We do this by creating X_trans.

In [34]:
# repeat the plot from above
indxF = np.where(y_pred == 'Female')[0]
indxM = np.where(y_pred != 'Female')[0]
plt.scatter(xx1xx2[indxM,0], xx1xx2[indxM,1], color='blue' , s=2)
plt.scatter(xx1xx2[indxF,0], xx1xx2[indxF,1], color='red', s=2)

# scale the ground truth
X_trans = scaler.transform(X)

# now plot the ground truth using 'empty' circles
indxM = np.where(y == 'Male')[0]
indxF = np.where(y != 'Male')[0]
plt.scatter(X_trans[indxM,0], X_trans[indxM,1], facecolors='w', edgecolors='b')
plt.scatter(X_trans[indxF,0], X_trans[indxF,1], facecolors='w', edgecolors='r')
Out[34]:
<matplotlib.collections.PathCollection at 0x7fc358a179e8>
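
As an aside, recent versions of scikit-learn (1.1 and later) provide DecisionBoundaryDisplay, which automates the grid-building and shading we did by hand. A sketch, assuming such a version is installed, and reusing X_trans, indxM and indxF from above:

In [ ]:
# a sketch using scikit-learn's built-in helper (requires scikit-learn >= 1.1);
# it builds the prediction grid and shades the two decision regions for us
from sklearn.inspection import DecisionBoundaryDisplay
disp = DecisionBoundaryDisplay.from_estimator(
    classifier, X_trans, response_method='predict', alpha=0.4,
    xlabel='bill_depth_mm (scaled)', ylabel='body_mass_g (scaled)')
# overlay the scaled ground truth with 'empty' circles, as before
disp.ax_.scatter(X_trans[indxM,0], X_trans[indxM,1], facecolors='w', edgecolors='b')
disp.ax_.scatter(X_trans[indxF,0], X_trans[indxF,1], facecolors='w', edgecolors='r')
plt.show()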

We'll come back to decision boundaries several times later on. It is worth noting that we did everything here in 2D because these cases are easy to plot and illustrate.

If we had more than two features then we would need to plot the features in higher dimensional space - which is quite a challenge.

Nonetheless, a decision boundary can still be imagined that, for a binary classifier, will separate these points into two disjoint regions.

Fairness - setting up the discussion¶

The last thing we are going to touch on here is the notion of fairness of a binary classifier.

  • what do we mean by this?
  • can a binary classifier ever be fair?

As mentioned above, we'll be using Sumpter (2021) as a source:

  • Sumpter (2021) Ethics in Machine Learning, In: Machine Learning: A First Course for Engineers and Scientists, Cambridge University Press

This is in Chapter 12 of our reference book [MLFCES]:

  • MLFCES: Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön. Cambridge University Press. http://smlbook.org

Consider a binary classifier applied to a population containing $p>0$ POSITIVE (or true) outcomes and $n>0$ NEGATIVE (or false) outcomes. We have seen that the confusion matrix takes this form:

$$ \begin{array}{rcc} \begin{array}{r} \text{target} \\ \text{or true} \end{array}\quad & \begin{array}{c} + \\ - \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & & \begin{array}{cc} + & - \end{array} \\ & & \text{predicted} \\ \end{array} $$

If there are $t$ TP's and $f$ FP's then:

$$ \begin{array}{cc} \begin{array}{c} + \\ - \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & \begin{array}{cc} + & - \end{array} \end{array} \qquad\qquad\text{ becomes }\qquad\qquad \left( \begin{array}{cc} t & p-t \\ f & n-f \\ \end{array} \right). $$

Suppose now that this classifier is applied to data harvested from two distinct populations. It could be that the classifier is making a decision as to whether ...

  • a certain age range of people in various demographic groups are inclined to crime
  • individuals in various ethnic groups are entitled to healthcare
  • children in different regions should get extra academic help
  • males and females respond to a certain treatment

The question we want to consider is whether or not this classifier can treat each group fairly.

Sumpter in [MLFCES] discusses it this way. We suppose that the classifier is applied to two separate groups, giving these two confusion matrices:

$$ \left( \begin{array}{cc} t_1 & p_1-t_1 \\ f_1 & n_1-f_1 \\ \end{array} \right) \qquad\text{ and }\qquad \left( \begin{array}{cc} t_2 & p_2-t_2 \\ f_2 & n_2-f_2 \\ \end{array} \right). $$

We note that

  • $n_1$, $p_1$, $n_2$, $p_2$ are beyond our control - they are simply 'facts'.
  • We do though assume that $n_1,\, p_1,\, n_2,\, p_2 > 0$ (otherwise this is pointless).
  • $f_1$, $t_1$ and $f_2$, $t_2$ are a property of the model and how well it works.

Remarks¶

Following Sumpter's ideas we now ask if this classifier can be fair. Can we be sure that it performs the same on two different groups?

To make progress on this question note again that in this scenario $p_i$ and $n_i$ are beyond our control for each group ($i=1$ and $i=2$) - they are just facts.

Moreover, the $f_i$ and $t_i$ in each case are determined by the classification method chosen, and the configuration of the algorithm (the selection of hyperparameters for example). We have some control over these by our training procedure, but we cannot guarantee perfection.

Fairness¶

We can attempt to tune the classifier by insisting that it performs equally on both groups. Sumpter does this by...

  • Asking that the FPR's are equal:
$$ \frac{f_1}{n_1} = \frac{f_2}{n_2} \qquad\Longrightarrow\qquad f_1 = \frac{f_2 n_1}{n_2}. $$
  • Asking that the TPR's are equal:
$$ \frac{t_1}{p_1} = \frac{t_2}{p_2} \qquad\Longrightarrow\qquad t_1 = \frac{t_2 p_1}{p_2}. $$
  • Asking for equal precision:
$$ \frac{t_1}{t_1 +f_1} = \frac{t_2}{t_2+f_2} \qquad\Longrightarrow\qquad \frac{\displaystyle\frac{t_2p_1}{p_2}}{\displaystyle\frac{t_2p_1}{p_2}+\frac{f_2n_1}{n_2}} = \frac{t_2}{t_2+f_2} $$

The last of these uses the first two and, when simplified, implies that

$$ \frac{t_2}{t_2+f_2} = \frac{t_2}{\displaystyle t_2 +f_2\frac{n_1}{n_2}\frac{p_2}{p_1}} \qquad\Longrightarrow\qquad f_1, f_2=0\quad\text{ or }\quad \frac{p_1}{n_1} =\frac{p_2}{n_2} $$
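
If you want to check this algebra, here is a minimal symbolic sketch using sympy (which is not one of our standard imports):

In [ ]:
# substitute the equal-TPR and equal-FPR conditions into the equal-precision
# condition and look at the numerator of the resulting difference
import sympy as sp
t2, f2, p1, p2, n1, n2 = sp.symbols('t2 f2 p1 p2 n1 n2', positive=True)
t1 = t2 * p1 / p2       # equal TPRs
f1 = f2 * n1 / n2       # equal FPRs
diff = sp.cancel(t1 / (t1 + f1) - t2 / (t2 + f2))
num, den = sp.fraction(diff)
print(sp.factor(num))
# expected: f2*t2*(n2*p1 - n1*p2), which vanishes only if f2 = 0 (and hence
# f1 = 0) or p1/n1 = p2/n2 - exactly the dichotomy described below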

Implication¶

Fairness, in the sense described above, is probably not possible in general. If the three measures above are to be indifferent to the two groups then:

  • Either: the classifier is a perfect predictor for positive results in the sense that each $f_i$, the number of FP's, is zero.

  • Or: the ratios of positives to negatives are equal in each group.

Both of these are unlikely.

Closing Thoughts¶

You are referred to Sumpter's article for more details where, in particular, we hear that solving this fairness dilemma is not a technical matter but rather one of ethics.

Data Science is more than just computation. It is important to be aware of the wider context in which we operate, as well as the limitations of our models and methods.

What do you think?