$k$-NN's: $k$-Nearest Neighbours¶

variationalform https://variationalform.github.io/¶

Just Enough: progress at pace¶

https://variationalform.github.io/

https://github.com/variationalform

https://www.brunel.ac.uk/people/simon-shaw.

This work is licensed under CC BY-SA 4.0 (Attribution-ShareAlike 4.0 International)

Visit http://creativecommons.org/licenses/by-sa/4.0/ to see the terms.

This document uses python and also makes use of LaTeX in Markdown

What this is about:¶

You will be introduced to ...

  • The penguins data set, data frames, data selection
  • Data engineering: mean imputation, and dropping unknowns
  • Data bifurcation and trifurcation; calibration; tuning and hyperparameters
  • $k$-Nearest Neighbours - classifying by nearness
  • Using the KNeighborsClassifier from sklearn.neighbors
  • Confusion Matrices

The idea is that by using vectors to represent our data set, we can classify a new data point by finding the nearest data point to it for which the class is known. We then assign the same class to the new point.

As usual our emphasis will be on doing rather than proving: just enough: progress at pace.

Assigned Reading¶

For this worksheet you should read pages 19 - 25 of

  • MLFCES: Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön. Cambridge University Press. http://smlbook.org.

The pages leading up to Page 19 are also highly recommended as an overview of concepts, purpose and uses of Machine Learning.

Penguins: An Example Data Set¶

We bring in our standard imports and then recall the data sets that are available in seaborn. We'll be using the penguins data.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
import pandas as pd
import seaborn as sns
In [2]:
# See, for example,
#   https://github.com/mwaskom/seaborn-data
#   https://blog.enterprisedna.co/how-to-load-sample-datasets-in-python/
sns.get_dataset_names()
Out[2]:
['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

Some Data-Engineering¶

As we have seen, there are a lot of data sets here that can be used to demonstrate various aspects of, and techniques in, Machine Learning and Data Science, and we'll look at a few of them - and others - as we progress.

To start with though we'll be working with the penguins data set. Before we do any machine learning we are going to have to do some data cleaning, see e.g. https://en.wikipedia.org/wiki/Data_cleansing, to remove some undefined values.

This shouldn't be confused with https://en.wikipedia.org/wiki/Feature_engineering.

Let's grab the penguins data and see what is in it. We load it into a data frame called dfp, as in data frame for penguins, and then look at the head of the table - the first few rows.

In [3]:
dfp = sns.load_dataset('penguins')
dfp.head()
Out[3]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female

Let's look at the shape of the data set - how many rows and columns does it have?

In [4]:
num_rows, num_columns = dfp.shape
print('number of data points (or observations) = ', num_rows)
print('number of features (or measurements) = ', num_columns)
number of data points (or observations) =  344
number of features (or measurements) =  7

So, the data set contains 344 rows and seven columns. Each row corresponds to a single penguin, and each column to a feature of that penguin. We can see its species, the island it was found on, its bill length, bill depth and flipper length (all in millimetres), its body mass (in grams), and its sex.

We can also see NaN values in row 3. That's the fourth row - be careful, indexing starts at zero. NaN stands for Not a Number and means that we can't use those values as they stand. We don't know why they are there - perhaps the data got corrupted. It's a fact of life though that data sets are often a bit messy, with wrong, missing or corrupted values. We'll see a couple of ways to deal with these instances below.

We haven't listed every row - just the head of the data table. Another way to visualize these data is to use a scatter plot.

See e.g. https://seaborn.pydata.org/generated/seaborn.scatterplot.html

In [5]:
sns.scatterplot(data=dfp, x="body_mass_g", y="bill_depth_mm", hue="species", style="sex")
Out[5]:
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_depth_mm'>

If that looks a little cramped you can control the size like this:

In [6]:
plt.figure(figsize=(8, 8))
sns.scatterplot(data=dfp, x="body_mass_g", y="bill_depth_mm", hue="island")
Out[6]:
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_depth_mm'>

When we issued the command dfp.head() above we got to see the top of the table. We can also see the bottom like this:

In [7]:
dfp.tail()
Out[7]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

This has given us two species, Adelie and Gentoo, but from the plots above we know there is also a third: Chinstrap.

We can also see from the head and tail functions that there are two islands, Torgersen and Biscoe, and that - from the plots - there is a third, Dream.

How could we find these without having to plot the data? Well, we could look at the whole table with this command:

print(dfp.to_string())

Try it in the cell below - uncomment it and execute the cell. It's a bit messy (and what if we had millions of rows?).

Now re-comment it and execute the cell again to clear that very long output.

In [8]:
# print(dfp.to_string())

A simpler way is to ask for all the unique entries in the species column, and
in the island column:

In [9]:
dfp.species.unique()
Out[9]:
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
In [10]:
dfp.island.unique()
Out[10]:
array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

Summary¶

We have seen that three species are documented on three Antarctic islands.

We have also seen that some values are undefined: NaN stands for Not a Number. This may indicate that the data was not captured reliably.

We can see how many rows contain undefined values with this command:

In [11]:
dfp.isna().sum()
Out[11]:
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

There are at least eleven such rows - and there could be as many as 11+2+2+2+2 = 19 if every NaN were in a different row. Let's find them.

In the following, axis=1 tells pandas that we want to find rows containing NaN, as opposed to columns.

In [12]:
dfp[dfp.isna().any(axis=1)]
Out[12]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
3 Adelie Torgersen NaN NaN NaN NaN NaN
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN
10 Adelie Torgersen 37.8 17.1 186.0 3300.0 NaN
11 Adelie Torgersen 37.8 17.3 180.0 3700.0 NaN
47 Adelie Dream 37.5 18.9 179.0 2975.0 NaN
246 Gentoo Biscoe 44.5 14.3 216.0 4100.0 NaN
286 Gentoo Biscoe 46.2 14.4 214.0 4650.0 NaN
324 Gentoo Biscoe 47.3 13.8 216.0 4725.0 NaN
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 NaN
339 Gentoo Biscoe NaN NaN NaN NaN NaN

We can get a list of the row index numbers like this:

In [13]:
NaN_rows = dfp[dfp.isna().any(axis=1)]
print(NaN_rows.index)
Int64Index([3, 8, 9, 10, 11, 47, 246, 286, 324, 336, 339], dtype='int64')

And we can use these as an alternative to the axis=1 command above:

In [14]:
dfp.loc[NaN_rows.index]
Out[14]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
3 Adelie Torgersen NaN NaN NaN NaN NaN
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN
10 Adelie Torgersen 37.8 17.1 186.0 3300.0 NaN
11 Adelie Torgersen 37.8 17.3 180.0 3700.0 NaN
47 Adelie Dream 37.5 18.9 179.0 2975.0 NaN
246 Gentoo Biscoe 44.5 14.3 216.0 4100.0 NaN
286 Gentoo Biscoe 46.2 14.4 214.0 4650.0 NaN
324 Gentoo Biscoe 47.3 13.8 216.0 4725.0 NaN
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 NaN
339 Gentoo Biscoe NaN NaN NaN NaN NaN

Data Engineering - our first method¶

One way to deal with missing values like this is to simply fill them with 'reasonable' values. For example, we can replace the numerical values with the mean, or average, of that feature, and replace categorical values with just one of the possible categories.

For example, let's use the mean for numerical values and treat all missing genders as Female.

In [15]:
# from https://datagy.io/pandas-fillna/
dfp1 = dfp.fillna({'bill_length_mm'   : dfp['bill_length_mm'].mean(),
                   'bill_depth_mm'    : dfp['bill_depth_mm'].mean(),
                   'flipper_length_mm': dfp['flipper_length_mm'].mean(),
                   'body_mass_g'      : dfp['body_mass_g'].mean(),
                   'sex': 'Female'})

Mean Imputation¶

Replacing a missing numerical feature value with the mean of the known feature values in this way is called imputing the mean. It is easy to implement - just one line above - but you should be aware that it introduces made-up values into the data set.

  • On the upside this process maintains the sample size
  • On the downside it (probably) alters some statistical properties of the data (the unknown variance, for example).

As an analyst you would be responsible for taking a decision as to how to deal with missing values. You may not be the only one involved in that decision.
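As an aside, here is a tiny self-contained sketch (with made-up numbers) of the variance point: imputing the mean leaves the mean unchanged but shrinks the sample standard deviation.

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan, 4.0])
s_imp = s.fillna(s.mean())        # impute the mean for the missing value
print(s.mean(), s_imp.mean())     # the means agree
print(s.std(), s_imp.std())       # the imputed series has the smaller std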

We can compare the old and new data frames just to check this worked as expected.

In [16]:
# Here is the new one with the NaN's replaced - or engineered out
dfp1.loc[NaN_rows.index]
Out[16]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
3 Adelie Torgersen 43.92193 17.15117 200.915205 4201.754386 Female
8 Adelie Torgersen 34.10000 18.10000 193.000000 3475.000000 Female
9 Adelie Torgersen 42.00000 20.20000 190.000000 4250.000000 Female
10 Adelie Torgersen 37.80000 17.10000 186.000000 3300.000000 Female
11 Adelie Torgersen 37.80000 17.30000 180.000000 3700.000000 Female
47 Adelie Dream 37.50000 18.90000 179.000000 2975.000000 Female
246 Gentoo Biscoe 44.50000 14.30000 216.000000 4100.000000 Female
286 Gentoo Biscoe 46.20000 14.40000 214.000000 4650.000000 Female
324 Gentoo Biscoe 47.30000 13.80000 216.000000 4725.000000 Female
336 Gentoo Biscoe 44.50000 15.70000 217.000000 4875.000000 Female
339 Gentoo Biscoe 43.92193 17.15117 200.915205 4201.754386 Female
In [17]:
# Here is the old one with the NaN's
dfp.loc[NaN_rows.index]
Out[17]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
3 Adelie Torgersen NaN NaN NaN NaN NaN
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN
10 Adelie Torgersen 37.8 17.1 186.0 3300.0 NaN
11 Adelie Torgersen 37.8 17.3 180.0 3700.0 NaN
47 Adelie Dream 37.5 18.9 179.0 2975.0 NaN
246 Gentoo Biscoe 44.5 14.3 216.0 4100.0 NaN
286 Gentoo Biscoe 46.2 14.4 214.0 4650.0 NaN
324 Gentoo Biscoe 47.3 13.8 216.0 4725.0 NaN
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 NaN
339 Gentoo Biscoe NaN NaN NaN NaN NaN

It is always good practice to check your work. This can be challenging when dealing with large data sets because you can't keep printing them out and checking every item to make sure that no errors have been introduced.

One way to make sure that these commands didn't do something unexpected behind the scenes is just to plot each data set and make sure they look the same.

For example:

In [18]:
sns.scatterplot(data=dfp, x="body_mass_g", y="bill_depth_mm")
Out[18]:
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_depth_mm'>
In [19]:
sns.scatterplot(data=dfp1, x="body_mass_g", y="bill_depth_mm")
Out[19]:
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_depth_mm'>

Alternatively, the describe() function prints summary statistics. These should be broadly the same for each.

Below we see how this works. What do you think? Is everything broadly OK with our data set?

Can you explain the differences?

In [20]:
dfp.describe()
Out[20]:
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
count 342.000000 342.000000 342.000000 342.000000
mean 43.921930 17.151170 200.915205 4201.754386
std 5.459584 1.974793 14.061714 801.954536
min 32.100000 13.100000 172.000000 2700.000000
25% 39.225000 15.600000 190.000000 3550.000000
50% 44.450000 17.300000 197.000000 4050.000000
75% 48.500000 18.700000 213.000000 4750.000000
max 59.600000 21.500000 231.000000 6300.000000
In [21]:
dfp1.describe()
Out[21]:
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
count 344.000000 344.000000 344.000000 344.000000
mean 43.921930 17.151170 200.915205 4201.754386
std 5.443643 1.969027 14.020657 799.613058
min 32.100000 13.100000 172.000000 2700.000000
25% 39.275000 15.600000 190.000000 3550.000000
50% 44.250000 17.300000 197.000000 4050.000000
75% 48.500000 18.700000 213.000000 4750.000000
max 59.600000 21.500000 231.000000 6300.000000

Data Engineering - our second method¶

In the method above we just replaced missing values with (hopefully) nearby ones.

On the other hand, if we have a lot of data and are able to live with a little less of it then we can just drop the data items (rows) that contain one or more undefined values.

THINK ABOUT: what could go wrong?

For example: let's recall the rows with NaN entries and then total up how many there are in each column, and in total:

In [22]:
dfp.loc[NaN_rows.index]
Out[22]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
3 Adelie Torgersen NaN NaN NaN NaN NaN
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN
10 Adelie Torgersen 37.8 17.1 186.0 3300.0 NaN
11 Adelie Torgersen 37.8 17.3 180.0 3700.0 NaN
47 Adelie Dream 37.5 18.9 179.0 2975.0 NaN
246 Gentoo Biscoe 44.5 14.3 216.0 4100.0 NaN
286 Gentoo Biscoe 46.2 14.4 214.0 4650.0 NaN
324 Gentoo Biscoe 47.3 13.8 216.0 4725.0 NaN
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 NaN
339 Gentoo Biscoe NaN NaN NaN NaN NaN
In [23]:
dfp.isna().sum()
Out[23]:
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

We could have written dfp.isna().sum(axis=0) to insist that we are counting down columns here, but that's the default so the axis=0 isn't needed.

We can see that there are no more than two NaN values in the third to sixth columns, but eleven in the last, the seventh, column.

NOTE: the number in the leftmost column of the table above is just the row index - it is not considered part of the data set.
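As a further aside, summing along axis=1 instead gives a per-row NaN count; a quick sketch using the NaN_rows we found above:

# count the NaN entries across each of the flagged rows
print(dfp.isna().sum(axis=1).loc[NaN_rows.index])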

So, given that we have 344 data points (penguins), it looks like we can afford to drop these bad data rows from the set. We can do it like this:

In [24]:
dfp2 = dfp.dropna()

Let's compare...

In [25]:
dfp
Out[25]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

344 rows × 7 columns

In [26]:
dfp2
Out[26]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male
... ... ... ... ... ... ... ...
338 Gentoo Biscoe 47.2 13.7 214.0 4925.0 Female
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

It looks fine - the NaN values have disappeared from the newly engineered dataset. We can check, as above, by counting how many NaN's are found in the new data set:

In [27]:
dfp2.isna().sum()
Out[27]:
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

On the other hand, the index values in the leftmost column now have gaps. There is no 3, for example. We can reset them with the reset_index() function, but we have to make sure we drop the original indices, otherwise they will persist as a new column.

In [28]:
# don't do this - you'll just get a column of old and useless index labels.
# dfp2 = dfp2.reset_index()
# instead reset the index and drop the original index column
dfp2 = dfp2.reset_index(drop=True)
In [29]:
dfp2
Out[29]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
4 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male
... ... ... ... ... ... ... ...
328 Gentoo Biscoe 47.2 13.7 214.0 4925.0 Female
329 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

Now we have a clean data set with no false values introduced, with no undefined entries, and with consecutive labelling down the left.

Visualization¶

Data sets are often much too large to work with effectively in tabular form. Visualization is then more useful.

Let's pause to explore a few visuals of our cleaned-up data set.

In [30]:
sns.scatterplot(data=dfp2, x="bill_length_mm", y="bill_depth_mm", hue="species")
Out[30]:
<AxesSubplot:xlabel='bill_length_mm', ylabel='bill_depth_mm'>
In [31]:
sns.scatterplot(data=dfp2, x="body_mass_g", y="flipper_length_mm", hue="species")
Out[31]:
<AxesSubplot:xlabel='body_mass_g', ylabel='flipper_length_mm'>
In [32]:
sns.pairplot(dfp2, hue='species')
Out[32]:
<seaborn.axisgrid.PairGrid at 0x7f887873fb70>
In [33]:
# lots of options for the above. See
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
sns.pairplot(dfp2, corner=True, hue='species', height=1.5)
Out[33]:
<seaborn.axisgrid.PairGrid at 0x7f8858741da0>
In [34]:
g = sns.pairplot(dfp2, diag_kind="kde", hue='species')
g.map_lower(sns.kdeplot, levels=4, color=".2")
Out[34]:
<seaborn.axisgrid.PairGrid at 0x7f8858b27940>

Further Exploration of the Data Set¶

So far we have loaded the data, and operated on it row by row as well as plotted various views of the data.

Let's look now at how to manipulate the data set at a lower level, and see how we might separate out clusters of data - data items that each share a common feature.

Recall, this is what our set contains...

In [35]:
dfp2.head()
Out[35]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
4 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male

We can see how the species form almost distinct clusters with the following plot.

In [36]:
sns.scatterplot(data=dfp2, x="bill_length_mm", y="bill_depth_mm", hue="species")
Out[36]:
<AxesSubplot:xlabel='bill_length_mm', ylabel='bill_depth_mm'>

We can access the column of species data using square brackets like this

dfp2['species']

This refers to every row - with lots of repeated values. In fact they won't all get printed out.

In [37]:
dfp2['species']
Out[37]:
0      Adelie
1      Adelie
2      Adelie
3      Adelie
4      Adelie
        ...  
328    Gentoo
329    Gentoo
330    Gentoo
331    Gentoo
332    Gentoo
Name: species, Length: 333, dtype: object

We can squeeze out the repeats, leaving just the uniquely occurring feature values, like this...

In [38]:
dfp2['species'].unique()
Out[38]:
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

This tells us that there are three unique species. We knew this from the plots - but that was a human taking a look. This method allows the code to determine the same information without human intervention.

Creating Data Subsets¶

It is sometimes useful to be able to separate out data subsets by a given feature value. If we choose to separate by 'species' then this command

dfp2.loc[ dfp2['species'] == 'Adelie' ]

will give us back a new data frame that just contains the Adelie penguin data. It does this by using square brackets and double equals so that this statement,

dfp2['species'] == 'Adelie'

evaluates to true if, for a given row, the species feature is Adelie. Then

dfp2.loc[ ? ]

keeps only those rows for which the question mark is true. We can assign these rows to a new data frame.

This means that we can create three data subsets - one for each species - as follows...

In [39]:
dfA = dfp2.loc[dfp2['species'] == 'Adelie']
dfC = dfp2.loc[dfp2['species'] == 'Chinstrap']
dfG = dfp2.loc[dfp2['species'] == 'Gentoo']

Using matplotlib to plot the clusters separately¶

We can use plt.scatter to plot scatter plots directly in matplotlib as below. First we create arrays (vectors if you like) of values, and then we plot them in 2D.

In [40]:
blA=np.array(dfA['bill_length_mm'].tolist())
bdA=np.array(dfA['bill_depth_mm'].tolist())
plt.scatter(blA,bdA,color='blue')

blC=np.array(dfC['bill_length_mm'].tolist())
bdC=np.array(dfC['bill_depth_mm'].tolist())
plt.scatter(blC,bdC,color='orange')

blG=np.array(dfG['bill_length_mm'].tolist())
bdG=np.array(dfG['bill_depth_mm'].tolist())
plt.scatter(blG,bdG,color='green')
plt.xlabel('bill_length_mm')
plt.ylabel('bill_depth_mm')
plt.legend(['Adelie', 'Chinstrap', 'Gentoo'],loc='lower right')
Out[40]:
<matplotlib.legend.Legend at 0x7f8878d733c8>

Let's pause to examine and check the plot.

We can get some statistics by using describe - as we have seen before. By comparing the means, below, with the plot above we can check that all is as it should be.

Finding shortcut ways to sanity-check your working like this is useful.

Here dfA is plotted in blue, and we can check that the means look reasonable given the axis labelling.

In [41]:
dfA.describe()
Out[41]:
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
count 146.000000 146.000000 146.000000 146.000000
mean 38.823973 18.347260 190.102740 3706.164384
std 2.662597 1.219338 6.521825 458.620135
min 32.100000 15.500000 172.000000 2850.000000
25% 36.725000 17.500000 186.000000 3362.500000
50% 38.850000 18.400000 190.000000 3700.000000
75% 40.775000 19.000000 195.000000 4000.000000
max 46.000000 21.500000 210.000000 4775.000000

If you are interested in the arrays that we created in order to do these plots you can take a look at them like this.

In [42]:
blA
Out[42]:
array([39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6, 34.6, 36.6,
       38.7, 42.5, 34.4, 46. , 37.8, 37.7, 35.9, 38.2, 38.8, 35.3, 40.6,
       40.5, 37.9, 40.5, 39.5, 37.2, 39.5, 40.9, 36.4, 39.2, 38.8, 42.2,
       37.6, 39.8, 36.5, 40.8, 36. , 44.1, 37. , 39.6, 41.1, 36. , 42.3,
       39.6, 40.1, 35. , 42. , 34.5, 41.4, 39. , 40.6, 36.5, 37.6, 35.7,
       41.3, 37.6, 41.1, 36.4, 41.6, 35.5, 41.1, 35.9, 41.8, 33.5, 39.7,
       39.6, 45.8, 35.5, 42.8, 40.9, 37.2, 36.2, 42.1, 34.6, 42.9, 36.7,
       35.1, 37.3, 41.3, 36.3, 36.9, 38.3, 38.9, 35.7, 41.1, 34. , 39.6,
       36.2, 40.8, 38.1, 40.3, 33.1, 43.2, 35. , 41. , 37.7, 37.8, 37.9,
       39.7, 38.6, 38.2, 38.1, 43.2, 38.1, 45.6, 39.7, 42.2, 39.6, 42.7,
       38.6, 37.3, 35.7, 41.1, 36.2, 37.7, 40.2, 41.4, 35.2, 40.6, 38.8,
       41.5, 39. , 44.1, 38.5, 43.1, 36.8, 37.5, 38.1, 41.1, 35.6, 40.2,
       37. , 39.7, 40.2, 40.6, 32.1, 40.7, 37.3, 39. , 39.2, 36.6, 36. ,
       37.8, 36. , 41.5])

These are numpy arrays. There are a number of ways that you can select out just a subset of an array by using square brackets with slicing.

For example, we can look at the third to fifth entries like this:

blA[2:5]

Indexing starts at zero, hence the 2. The 5 denotes the first index that is not used. This can be confusing, so watch out for it.

In [43]:
blA[2:5]
Out[43]:
array([40.3, 36.7, 39.3])

And we can look at all entries except the last five like this:

In [44]:
blA[:-5]
Out[44]:
array([39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6, 34.6, 36.6,
       38.7, 42.5, 34.4, 46. , 37.8, 37.7, 35.9, 38.2, 38.8, 35.3, 40.6,
       40.5, 37.9, 40.5, 39.5, 37.2, 39.5, 40.9, 36.4, 39.2, 38.8, 42.2,
       37.6, 39.8, 36.5, 40.8, 36. , 44.1, 37. , 39.6, 41.1, 36. , 42.3,
       39.6, 40.1, 35. , 42. , 34.5, 41.4, 39. , 40.6, 36.5, 37.6, 35.7,
       41.3, 37.6, 41.1, 36.4, 41.6, 35.5, 41.1, 35.9, 41.8, 33.5, 39.7,
       39.6, 45.8, 35.5, 42.8, 40.9, 37.2, 36.2, 42.1, 34.6, 42.9, 36.7,
       35.1, 37.3, 41.3, 36.3, 36.9, 38.3, 38.9, 35.7, 41.1, 34. , 39.6,
       36.2, 40.8, 38.1, 40.3, 33.1, 43.2, 35. , 41. , 37.7, 37.8, 37.9,
       39.7, 38.6, 38.2, 38.1, 43.2, 38.1, 45.6, 39.7, 42.2, 39.6, 42.7,
       38.6, 37.3, 35.7, 41.1, 36.2, 37.7, 40.2, 41.4, 35.2, 40.6, 38.8,
       41.5, 39. , 44.1, 38.5, 43.1, 36.8, 37.5, 38.1, 41.1, 35.6, 40.2,
       37. , 39.7, 40.2, 40.6, 32.1, 40.7, 37.3, 39. , 39.2])

Let's look now at how we can interrogate our three smaller data subsets.

Here are two ways to determine the number of rows in each.

First, using shape[0]...

In [45]:
print('number of rows in dfA = ', dfA.shape[0], '; in dfC = ', dfC.shape[0], ' and in dfG = ', dfG.shape[0])
number of rows in dfA =  146 ; in dfC =  68  and in dfG =  119

And second, using the fact that shape provides a tuple of two values, and that we can ignore the second with _...

In [46]:
rA, _ = dfA.shape; rC, _ = dfC.shape; rG, _ = dfG.shape
print('number of rows in dfA = ', rA, '; in dfC = ', rC, ' and in dfG = ', rG)
number of rows in dfA =  146 ; in dfC =  68  and in dfG =  119

Each of these can be used to determine how many of each species there are, because there is one row for each penguin in each data subset.
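As an aside, pandas can produce these counts in one line with value_counts():

# counts of each unique species value - matching the row counts above
print(dfp2['species'].value_counts())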

$k$-NN's - developing intuition¶

We can now look at the $k$ Nearest Neighbours, or $k$-NN, method for classification of data.

The setting we assume at the outset is that we have a 'training set' of data such that each row of the data set corresponds to one observation.

Moreover, in each row there are numerical features which can be organized into a vector, $\boldsymbol{x}=(x_1,x_2,\ldots,x_n)^T$, and a label, $y$, which is categorical.

There may be other numerical and categorical data that we choose not to use.

We imagine plotting these data points in $n$-dimensional space (hard to imagine when $n>3$, which is why the abstraction of mathematics is so useful), and we imagine them being coloured according to the value of the label $y$.

In the example above we had

\begin{align} \boldsymbol{x} & = (\mathtt{bill\underline{\ }length\underline{\ }mm}, \mathtt{bill\underline{\ }depth\underline{\ }mm})^T \\ y & \in \{\mathtt{Adelie}, \mathtt{Chinstrap}, \mathtt{Gentoo}\} \end{align}

and we coloured the labels as blue, orange or green.

Now imagine that a field researcher reports some new measurements for a penguin, and that we want to classify its species based only on those measurements.

The idea is to plot the new measurements and see which cluster of like colour they are closest to. This closest cluster (colour) is then used to assign the species to that new measurement.

Let's see a dummy run of this in a picture.

In the diagram below we pretend that we only have the first twenty rows of each of the data subsets. We plot them as coloured dots, just as above.

Then we pretend that we get three new observations. For illustration purposes we take the entries from the fourth from last position in each data set.

But in the REAL WORLD we would be expecting new data to be arriving UNSEEN from the field.

We plot these 'new observations' with a cross.

In [47]:
# plot first twenty rows of each as coloured dots.
plt.scatter(blA[0:20],bdA[0:20],color='blue')
plt.scatter(blC[0:20],bdC[0:20],color='orange')
plt.scatter(blG[0:20],bdG[0:20],color='green')
plt.legend(['Adelie', 'Chinstrap', 'Gentoo'],loc='lower right')
plt.xlabel('bill_length_mm')
plt.ylabel('bill_depth_mm')

# pick out the data item fourth from the end in each
indx = -4
# and plot each as a cross
plt.scatter(blA[indx],bdA[indx],color='blue', marker='x', s=500)
plt.scatter(blC[indx],bdC[indx],color='orange', marker='x', s=500)
plt.scatter(blG[indx],bdG[indx],color='green', marker='x', s=500)
Out[47]:
<matplotlib.collections.PathCollection at 0x7f8858df3a90>

We carry out the classification as follows:

  1. The green cross is quite central in the green, Gentoo, cluster and so we can classify this new observation as a Gentoo penguin.

  2. The blue cross isn't that central in the blue cluster, but on the other hand it is far away from the orange and green clusters and so we can safely classify this observation as an Adelie penguin.

  3. The orange cross presents us with more of a dilemma though. A careful look suggests that it is slightly closer to the orange cluster than the blue and so, on that basis, we would probably choose to classify that penguin as a Chinstrap.

Any comments, thoughts, questions?¶

The first two steps seem safe, and justifiable. They are explainable. The third less so. We can see that the orange cross corresponds to a fairly typical bill depth for an Adelie.

  • So is it a Chinstrap?

  • We can also see that Adelie penguins have bill lengths that straddle the value indicated by the orange cross.

  • So should the orange cross observation really be classified as a Chinstrap?

  • We see here that the issue of explainability can be vexed.

  • If we had more data the orange cross might become obviously a Chinstrap,

  • Or it might be obvious that it is an Adelie.

Explainability may or may not matter. But it is increasingly becoming a hot topic in data science.

Suppose your pension fund invested everything in a new tech venture that was going to design batteries with infinite life. It will fail of course.

If this venture was suggested by an Artificially Intelligent agent powered by machine learning algorithms, then the pension company directors won't be able to explain their reasoning if the underlying data science was not explainable.

This is hardly realistic, but explainability is a big and important deal in areas like finance and investing, and in medical diagnosis, to name but two. The reasons for its importance are obvious.

$k$-NN's - the mathematical details¶

We index each data point in the training set with a subscript. So we have the feature vectors $\boldsymbol{x}_1$, $\boldsymbol{x}_2$, $\boldsymbol{x}_3$, $\ldots$. Each of these has a label, $y_1$, $y_2$, $y_3$, $\ldots$.

These are the coloured dots above. The positions are the features. The colours are the labels.

We now get a new observation, $\boldsymbol{x}^*$ and we want to classify it - we want to apply a label to it using the data from the training set.

The mathematical version of the process we followed above was to determine the distance between $\boldsymbol{x}^*$ and each $\boldsymbol{x}_i$ using

$$ \Vert\boldsymbol{x}^* - \boldsymbol{x}_i\Vert_2 \qquad\text{(recall: the Euclidean, Pythagorean or $\ell_2$ norm).} $$

We then choose the value of $i$ for which this distance is a minimum. The label, $y_i$, corresponding to that particular $i$ is then assigned to the new observation $\boldsymbol{x}^*$.
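Here is a minimal numpy sketch of this nearest-neighbour rule (the toy feature vectors, labels and query point are made up for illustration):

import numpy as np

# a tiny 'training set': one feature vector per row, with a label for each
X_toy = np.array([[39.1, 18.7], [46.8, 14.3], [49.9, 16.1]])
y_toy = np.array(['Adelie', 'Gentoo', 'Gentoo'])

x_star = np.array([45.0, 15.0])                  # the new observation
dists = np.linalg.norm(X_toy - x_star, axis=1)   # Euclidean distance to each row
i = np.argmin(dists)                             # index of the nearest neighbour
print('predicted label:', y_toy[i])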

Cross-Reference to the Assigned Reading¶

You were recommended to read pages 19 - 25 of

  • MLFCES: Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön. Cambridge University Press. http://smlbook.org.

More details on this are given there, in particular:

  • the use of $k$-NN for regression as well as classification.
  • the use of more than one 'nearest' neighbour - see which cluster 'wins' a vote.
  • notes on how to choose the number of neighbours, and 'overfitting'.
  • the importance of normalizing the inputs.

Also of importance, but not mentioned in the book, is the choice of norm. We referred to the Euclidean or Pythagorean norm above, but we could just as easily have chosen any of the other $p$ norms that we discussed when we reviewed the material on vectors.
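As a quick illustration of how the choice of $p$ changes the measured distance, here is a sketch using two made-up vectors:

import numpy as np

u = np.array([39.1, 18.7])
v = np.array([46.8, 14.3])
for p in (1, 2, 3):
    # the p-norm of the difference is the distance between u and v
    print('p =', p, ' distance =', np.linalg.norm(u - v, ord=p))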

Hyperparameters¶

In the discussion above we just touched upon the important issue of picking hyperparameters. These are values and choices that need to be specified for the algorithm, in the code, prior to the machine learning phase.

In the above we mentioned that we need to choose:

  • $k$ - the number of nearest neighbours to search for.
  • $p$ - the choice of norm used to measure distance, or nearness.

These are human choices: the hyperparameters are not learned from the data, but need to be chosen upfront.

Data Set Bifurcation and Trifurcation¶

However, we don't necessarily need to worry about making a wrong choice of hyperparameters that cannot subsequently be changed. In practice we would be prepared to calibrate the model, tuning its performance by turning the dials on the hyperparameter values.

Usually the data set that we are working with will either be bifurcated into a training set and a test set, or trifurcated into a training set, a validation set and a test set.

We'll return to this as we go through, but briefly...

  • The training set: used to train, or fit, the machine learning model.
  • The validation set: used to tune the hyperparameters.
  • The test set: used as unseen data to derive final performance quality measurements after training and validation has been completed.

It is important to realise that the test set output should never be used to further tune and calibrate the model. It is a hold-out set that simulates how the model will perform in the real world on unseen data.

The data set is treated in all of these cases as ground truth - it is believed to be true, although in practice some data points might contain errors, or be missing. And there is almost certainly going to be some noise on any numerical values recorded in the data.

There are no hard and fast rules on the proportions to use to bifurcate or trifurcate the data set. We might bifurcate using 75%/25% for example, or trifurcate with 50%/25%/25%.
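For example, here is a sketch of a 50%/25%/25% trifurcation made with two successive calls to train_test_split (the data and the random_state are made up for illustration):

import numpy as np
from sklearn.model_selection import train_test_split

X_toy = np.arange(40).reshape(20, 2)   # dummy features
y_toy = np.arange(20)                  # dummy labels

# keep 50% for training, then split the remainder equally
X_trn, X_rest, y_trn, y_rest = train_test_split(X_toy, y_toy, test_size=0.5, random_state=0)
X_val, X_tst, y_val, y_tst = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)
print(X_trn.shape, X_val.shape, X_tst.shape)   # (10, 2) (5, 2) (5, 2)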

Introducing scikit-learn, our first visit¶

Let's now see how to use scikit-learn to do $k$-NN classification with the penguins data that we cleaned and prepared.

The following code was adapted in its early stages from Machine Learning with Python, tutorialspoint as found here https://www.tutorialspoint.com/machine_learning_with_python/index.htm or here https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_tutorial.pdf

You'll have seen a number of instances by now in these notebooks where external sources are liberally referenced. Feel free to do this - but make sure that you always acknowledge your sources.

We are going to work with the entire cleaned-up penguins data set that we originally stored in dfp2.

Let's remember what it looked like...

In [48]:
dfp2.head()
Out[48]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
4 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male

We want to use the numerical features (values) in each row to predict species.

Before we start using the sklearn python library we need to see how we can pick these data items out using array slicing.

First, we can pick out the value of the species with this command (the 0:1 slice is important - it refers to column zero only)

dfp2.iloc[2, 0:1].values

This refers to the entry in the third row (the 2 - remember, indexing starts at zero) and the first column (the 0:1 slice).

To refer to all rows we replace the 2 with a colon : - as we'll see below.

Let's see it in action...

In [49]:
dfp2.iloc[2, 0:1].values
Out[49]:
array(['Adelie'], dtype=object)

Second, we can refer to the four numerical features with this command

dfp2.iloc[1, 2:6].values

which refers to the second row, and to the third to sixth columns inclusive (the indices 2 to 5). Once again we will use a colon to refer to all rows.

Again, let's see this in action...

In [50]:
dfp2.iloc[1, 2:6].values
Out[50]:
array([39.5, 17.4, 186.0, 3800.0], dtype=object)

Using sklearn¶

We will now fit the $k$-NN model using the Manhattan, or taxicab, norm, which we also call the $p=1$ norm:

$$ \Vert\boldsymbol{x}^* - \boldsymbol{x}_i\Vert_1. $$

In addition, we will use two ($k=2$) nearest neighbours, and we will also obtain something called the confusion matrix, and will print some performance data.

The last two of these will be re-visited because they illustrate two very important means by which we can assess the performance of our model.

Typically we assign the data set features to a variable called X, and the data set labels to a variable called y. Using the array slicing that we saw above this is straightforward...

In [51]:
# We assign the numerical features to X
X = dfp2.iloc[:, 2:6].values
# And we assign the species label to y
y = dfp2.iloc[:, 0].values

We could bifurcate the data into a training and test set ourselves, but sklearn provides a helper function for this. It is called train_test_split.

First we have to import it. Then we give it X and y and specify the proportion of the data that we use for the hold-out, or test, set. We'll specify that 40% of the data should be reserved for testing.

In [52]:
# from the scikit-learn library we use 40% of the data to test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

The function returns four subsets of data:

X_train - 60% of the data set features, used to configure (train) the model
X_test  - 40% of the data set features, used to test the configured model
y_train - the labels matching the X_train features
y_test  - the labels matching the X_test features

We can look at the sizes of each of these by using shape as follows...

In [53]:
print('shape of X_train = ', X_train.shape,' and of X_test = ', X_test.shape)
print('shape of y_train = ', y_train.shape,' and of y_test = ', y_test.shape)
shape of X_train =  (199, 4)  and of X_test =  (134, 4)
shape of y_train =  (199,)  and of y_test =  (134,)

Normalization of Data¶

The next step is to normalize the feature data - the importance and role of this step is discussed in the recommended reading of pages 19 - 25 [MLFCES]. Again, sklearn provides a helper function for this called StandardScaler. This will remove the mean from the data and scale to unit variance. You can read more about this here: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [54]:
# import the helper and give it a name
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# initialise the scaler by feeding it the training data
scaler.fit(X_train)
# now carry out the transformation of all of the feature data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

REMARK: note that X_train is used to provide the scaling data, and not X_test. This is because X_test is hold out data. We must treat it as unseen. We can freely transform it though, because that can be done without actually looking at it.
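For the record, the transformation is just a subtract-the-mean, divide-by-the-std operation, with the statistics taken from the data given to fit. A self-contained sketch with made-up numbers:

import numpy as np
from sklearn.preprocessing import StandardScaler

A = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
sc = StandardScaler().fit(A)                 # learns the column means and stds
manual = (A - A.mean(axis=0)) / A.std(axis=0)
print(np.allclose(sc.transform(A), manual))  # True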

Fitting: Learning from Data¶

We can now bring in the $k$-NN classifier method from sklearn and obtain a classifier object that uses $k=2$ nearest neighbours and the $p=1$ norm.

In [55]:
# import the k-NN classifier
from sklearn.neighbors import KNeighborsClassifier
# assign it with k=2 and p=1
classifier = KNeighborsClassifier(n_neighbors=2, p=1)
# give the training data to the classifier
classifier.fit(X_train, y_train)
Out[55]:
KNeighborsClassifier(n_neighbors=2, p=1)

The last step above is just like the coloured cluster plots we made earlier, before we plotted the larger crosses. The model now has knowledge of these clusters - this is an example of machine learning.

By giving the model the unseen test data we are in effect telling it where the large crosses are. The model then finds the two nearest neighbours, using the Manhattan norm, to classify the species of those crosses. This produces predictions of the species in y_test, and we call these predicted species values y_pred.

So, with the crosses as the features in the test set, we feed this in to the classifier and obtain the predicted values as follows...

In [56]:
y_pred = classifier.predict(X_test)

Evaluation of Performance¶

Now we come to the real crux of the matter. We know what X_test should produce as species values - they are in y_test. What we actually get though are y_pred. If y_pred = y_test then we should be very happy because it indicates that the model works very well on unseen data.

In practice though, it is unlikely that each of the 134 elements in y_pred will match every one of the corresponding values in y_test.
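As a quick first check - ahead of the formal metrics below - we can compute the raw fraction of matching predictions directly; for these arrays this equals the accuracy score reported later:

# elementwise comparison gives booleans; the mean is the fraction of matches
print(np.mean(y_pred == y_test))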

We have several tools available to assess the quality of the model. We'll take a quick look at a couple of these now, with a brief explanation, and we'll return many times to them later and understand them in more detail.

First we import the helper functions. Then we obtain and print the confusion matrix, next some statistics in a classification report, and then an accuracy score.

In [57]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

clsrep = classification_report(y_test, y_pred)
print("Classification Report:",)
print(clsrep)

accsc = accuracy_score(y_test,y_pred)
print("Accuracy:", accsc)
Confusion Matrix:
[[55  1  0]
 [ 1 28  0]
 [ 0  0 49]]
Classification Report:
              precision    recall  f1-score   support

      Adelie       0.98      0.98      0.98        56
   Chinstrap       0.97      0.97      0.97        29
      Gentoo       1.00      1.00      1.00        49

    accuracy                           0.99       134
   macro avg       0.98      0.98      0.98       134
weighted avg       0.99      0.99      0.99       134

Accuracy: 0.9850746268656716

We'll come back to the classification report later, and for now just note that the accuracy score tells us the proportion of the test set for which the species was correctly predicted.

What we want to spend some time on here is the confusion matrix.

The Confusion Matrix¶

In [58]:
print(cm)
[[55  1  0]
 [ 1 28  0]
 [ 0  0 49]]

The confusion matrix is square with the same number of rows/columns as there are values for the label. In our case there are three possible label values: Adelie, Chinstrap, and Gentoo. We can refer to these as group 1, 2 and 3.

The entry in row $i$ and column $j$ of the confusion matrix tells us how many data points in X_test whose true group is $i$ were predicted by the model to be in group $j$.
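To make that definition concrete, here is a sketch that rebuilds the matrix entry by entry (it assumes the cells above have been run):

labels = list(classifier.classes_)    # ['Adelie', 'Chinstrap', 'Gentoo']
cm_manual = np.zeros((len(labels), len(labels)), dtype=int)
for truth, pred in zip(y_test, y_pred):
    # row = true group, column = predicted group
    cm_manual[labels.index(truth), labels.index(pred)] += 1
print(cm_manual)                      # should agree with cm above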

Now, the representation of the confusion matrix above is a numpy array and although it is useful for coding, it isn't very user friendly. The following code gives us something much nicer, and it is much easier to understand.

In [59]:
from sklearn.metrics import ConfusionMatrixDisplay
cmplot = ConfusionMatrixDisplay(cm, display_labels=classifier.classes_)
cmplot.plot()
plt.show()

We can now immediately get a feeling for how good the model is. The diagonal elements tell us how many species predictions match the true value. The off-diagonals tell us how many misses there are, and how they missed.

For example, the number in the middle of the top row tells us how many Adelie penguins were mistakenly predicted to be Chinstraps.

Also, the overall accuracy can be determined by adding all the numbers in the matrix to get a total, $B$, and adding all the diagonal elements together to get $A$. The value of $A/B$ then tells us the proportion of correct predictions - and that is the Accuracy score above.

We haven't properly reviewed the mathematical concept and notion of a matrix yet, although we will do so soon. We will be coming back to confusion matrices over and over again though.

Before moving on to some exercises we close with a comment about using the $k$-NN model for regression.

$k$-NN for regression¶

Above we saw how we can use $k$-NN for classification: given feature data from an observation, we predict the label - that is, the category the observation should be assigned to.

Regression is where the features and the labels vary in continuous sets of values. For example, we might want to predict the amount of rainfall given the number of hours of cloud, sun and daylight, along with air temperature, humidity and pressure.

These are all continuous variables, not discrete categorical ones.

The $k$-NN technique can also be used for regression by, in effect, turning the continuous variables into discrete ones. To get an idea of this imagine learning a function $y=f(x)$ as follows.

  • take a set of values (features) $x_1, x_2, x_3, \ldots$
  • take the corresponding labels $f(x_1), f(x_2), f(x_3), \ldots$
  • plot these points - treat each as a cluster, as above.

To predict the function value at a new point, $x^*$, we would:

  1. determine $i$ such that $\vert x^* - x_i\vert$ is minimal over all of $x_1, x_2, x_3, \ldots$.
  2. say that $x_i$ is the nearest neighbour to $x^*$
  3. estimate $f(x^*)$ by $f(x_i)$.

This will work well if $f$ is quite well-behaved: continuous and not very rapidly varying, for example.
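Here is a minimal numpy sketch of this 1-NN estimate of a function (the function and the sample points are made up for illustration):

import numpy as np

def f(x):                               # a well-behaved example function
    return np.sin(x)

xs = np.linspace(0, 2*np.pi, 50)        # the features x_1, x_2, x_3, ...
ys = f(xs)                              # the labels f(x_1), f(x_2), f(x_3), ...

x_star = 1.234                          # a new point
i = np.argmin(np.abs(x_star - xs))      # index of the nearest neighbour
print('estimate =', ys[i], ' true value =', f(x_star))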

We won't touch on this further - it isn't on our journey. If you are interested in seeing more though you can, for example, look here https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/ for a demonstration using the California house data set.

Exercise¶

Experiment with the $k$-NN classifier we just developed. For example,

  • Change the 60%/40% bifurcation
  • Change the value of $k$: decrease it to $1$, or increase it to $3,4,5,\ldots$
  • Change the norm from $p=1$ to $p>1$.
  • Does $p<1$ make any sense here?

Exercise¶

Look at the following scatter plots. Suppose we wished to predict gender from two features.

  • What two features would work best do you think?
  • Which pairs of features are unlikely to work well?
In [60]:
sns.scatterplot(data=dfp2, x="bill_length_mm", y="bill_depth_mm", style="species", hue="sex")
Out[60]:
<AxesSubplot:xlabel='bill_length_mm', ylabel='bill_depth_mm'>
In [61]:
sns.scatterplot(data=dfp2, x="flipper_length_mm", y="body_mass_g", style="species", hue="sex")
Out[61]:
<AxesSubplot:xlabel='flipper_length_mm', ylabel='body_mass_g'>
In [62]:
sns.scatterplot(data=dfp2, x="body_mass_g", y="bill_depth_mm", style="species", hue="sex")
Out[62]:
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_depth_mm'>
In [63]:
sns.scatterplot(data=dfp2, x="body_mass_g", y="bill_length_mm", style="species", hue="sex")
Out[63]:
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_length_mm'>
In [64]:
sns.scatterplot(data=dfp2, x="bill_length_mm", y="flipper_length_mm", style="species", hue="sex")
Out[64]:
<AxesSubplot:xlabel='bill_length_mm', ylabel='flipper_length_mm'>
In [65]:
sns.scatterplot(data=dfp2, x="bill_depth_mm", y="flipper_length_mm", style="species", hue="sex")
Out[65]:
<AxesSubplot:xlabel='bill_depth_mm', ylabel='flipper_length_mm'>

Exercise¶

The confusion matrix we generated above is a numpy array. We will be looking in much more detail at these objects - both mathematically and in code - soon, but first here is a warm up. Let's recall the matrix:

In [66]:
cm
Out[66]:
array([[55,  1,  0],
       [ 1, 28,  0],
       [ 0,  0, 49]])

We can use cm[0,0] to access the value in the first row and first column.

  • What do you think cm[1,1] and cm[2,2] refer to?
  • what do you think cm[0,0]+cm[1,1]+cm[2,2] produces?

Check your answers by using

  • print(cm[1,1],cm[2,2])
  • print(cm[0,0]+cm[1,1]+cm[2,2])
In [67]:
print(cm[1,1],cm[2,2])
print(cm[0,0]+cm[1,1]+cm[2,2])
28 49
132

What do you think cm.sum() produces? Check, or discover, with

  • print(cm.sum())
In [68]:
print(cm.sum())
134

How do you think cm[0,0]+cm[1,1]+cm[2,2] and cm.sum() relate to the Accuracy score given above? Print out your answer and check.

In [69]:
print((cm[0,0]+cm[1,1]+cm[2,2])/cm.sum())
0.9850746268656716

Compare np.trace(cm) to cm[0,0]+cm[1,1]+cm[2,2] - use your findings to shorten the command above

In [70]:
print(np.trace(cm)/cm.sum())
0.9850746268656716