https://variationalform.github.io/
https://github.com/variationalform
Simon Shaw https://www.brunel.ac.uk/people/simon-shaw.
This work is licensed under CC BY-SA 4.0 (Attribution-ShareAlike 4.0 International). Visit http://creativecommons.org/licenses/by-sa/4.0/ to see the terms.
This document uses python, and also makes use of LaTeX in Markdown.
You will see how to work with the SVD in numpy.
As usual our emphasis will be on doing rather than proving: just enough: progress at pace.
This worksheet is self-contained given the material we have already covered.
We're going to recall how we used the $k$-NN algorithm to predict penguin species from four columns of numerical data.
After we cleaned up the data set by removing the NaN's we were left with $333$ rows of data - one row for each penguin. The four columns therefore total to $4\times 333 = 1332$ individual items of data. Some of this is held back for testing, so the training data set size isn't actually this big, but we do need all of this for training and testing.
This is a very modest size when compared to some data sets. Later we will see the MNIST data set of digitized handwritten numerals, $0,1,2,\ldots,9$.
There are $70,000$ examples in numerical MNIST, and each example requires $28^2=784$ numbers. The dataset therefore comprises $70,000\times 28^2=54,880,000$ - almost 55 million - numbers.
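A quick check of that arithmetic in python:

```python
# verify the size of the MNIST data set: 70,000 images of 28-by-28 pixels
print(70_000 * 28**2)   # 54880000
```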
It is useful to be able to reduce the amount of data down to just its essence. This results in less computer memory needed, and faster computing times - because there is less to do.
So, the example below is hopefully quite easy to follow when set in the context of our previous sessions. The data set here is quite small, but the idea and technique we present is quite general.
Let's start with our standard imports
Will you be able to spot it?
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
We'll bring in the penguins data as before, clean it up, and re-implement our $k$-NN classifier.
dfp = sns.load_dataset('penguins')
dfp2 = dfp.dropna()
dfp2.isna().sum()
dfp2 = dfp2.reset_index(drop=True)
print(dfp2.shape)
(333, 7)
dfp.head()
|   | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---------|--------|----------------|---------------|-------------------|-------------|-----|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
dfp2.head()
|   | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---------|--------|----------------|---------------|-------------------|-------------|-----|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| 4 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
We are going to repeat our example where we used the $k$-NN classifier to predict the species in column zero from the numerical data in columns 3 - 6 (indexed as 2 - 5).
Here is that code again...
We fit the model using the Manhattan metric: $\Vert\boldsymbol{x}^* - \boldsymbol{x}_i\Vert_1$, and then plot the confusion matrix and performance data.
# create our labelled training and test data sets with 60/40 train/test split
X = dfp2.iloc[:, 2:6].values
y = dfp2.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
# scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# obtain the classifier and fit it using 2 nearest neighbours
# and the Manhattan norm
classifier = KNeighborsClassifier(n_neighbors=2, p=1)
classifier.fit(X_train, y_train)
# Now use the fitted model from the training data to predict
# from the test data.
y_pred = classifier.predict(X_test)
# create a confusion matrix to assess the performance
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:"); print(cm)
accsc = accuracy_score(y_test,y_pred);
print("Accuracy:", accsc)
Confusion Matrix:
[[59  0  0]
 [ 3 25  0]
 [ 0  0 47]]
Accuracy: 0.9776119402985075
from sklearn.metrics import ConfusionMatrixDisplay
cmplot = ConfusionMatrixDisplay(cm, display_labels=classifier.classes_)
cmplot.plot()
plt.show()
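As an aside, `classification_report` (imported above, although we haven't needed it yet) prints per-class precision and recall for the same predictions. A minimal extra cell, using the `y_test` and `y_pred` from above:

```python
# per-class precision, recall and F1 for the k-NN predictions above
print(classification_report(y_test, y_pred))
```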
In the above we extracted the columns of input data into a numpy array. The array is called `X`. Here are the details.
print(' The data type of X: ', type(X))
print(' The shape of X: ', X.shape)
print('The first four rows:\n', X[0:4,:])
print(' The last two rows:\n', X[-2:,:])
The data type of X:  <class 'numpy.ndarray'>
The shape of X:  (333, 4)
The first four rows:
[[ 39.1 18.7 181. 3750. ]
 [ 39.5 17.4 186. 3800. ]
 [ 40.3 18. 195. 3250. ]
 [ 36.7 19.3 193. 3450. ]]
The last two rows:
[[ 45.2 14.8 212. 5200. ]
 [ 49.9 16.1 213. 5400. ]]
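Incidentally, the Manhattan metric $\Vert\boldsymbol{x}^* - \boldsymbol{x}_i\Vert_1$ that we passed to the classifier (via `p=1`) is just the sum of absolute coordinate differences. Here is a quick sketch of that on the first two rows of `X` (the classifier itself works with the scaled data, but the idea is the same):

```python
# the Manhattan (L1) distance between the first two penguins, two ways
x0, x1 = X[0], X[1]
print(np.sum(np.abs(x0 - x1)))           # sum of absolute differences
print(np.linalg.norm(x0 - x1, ord=1))    # the same thing via numpy's norm
```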
Can we get by with less data?
We saw an example earlier, when we introduced binary classifiers, where we used just two columns, bill depth and body mass, to predict gender.
That's useful - four columns of data are instantly halved into just two.
But we may have lost valuable information that was present in those dropped columns.
Can we drop half the columns but keep all the information?
Well, not quite, but we have seen how we can approximate matrices using eigenvalues (square matrices only) or singular values.
Here, `X` is a non-square matrix so we can take its SVD... The SVD of `X` can be obtained from `np.linalg.svd()` as we now show. We also see below that `@` can be used for matrix multiplication: `A @ B`.
U, S, VT = np.linalg.svd(X, full_matrices=False) # full_matrices=False - discussed later
print(" U's shape: ", U.shape)
print(" VT's shape: ", VT.shape)
print(" S's shape: ", S.shape)
print(" S = ", S)
print("diag(S)'s shape: ", np.diag(S).shape)
print("allclose? (T/F): ", np.allclose(X, U @ np.diag(S) @ VT), end=' - ')
print("|| X-U @ np.diag(S) @ VT || = ", np.linalg.norm( X-U @ np.diag(S) @ VT ) )
 U's shape:  (333, 4)
VT's shape:  (4, 4)
 S's shape:  (4,)
 S =  [7.82505416e+04 4.99884015e+02 7.40457786e+01 4.18409792e+01]
diag(S)'s shape:  (4, 4)
allclose? (T/F):  True - || X-U @ np.diag(S) @ VT || =  1.862284009018728e-11
Look at the singular values. `numpy` gives them to us in descending order.

# in some contexts this is related to a scree plot
print("S = ", S)
plt.bar([1,2,3,4],S)
S = [7.82505416e+04 4.99884015e+02 7.40457786e+01 4.18409792e+01]
# a log scale is sometimes preferable...
print("S = ", S)
plt.bar([1,2,3,4],S,log=True)
S = [7.82505416e+04 4.99884015e+02 7.40457786e+01 4.18409792e+01]
We have seen for the SVD that we can write, in general, $$ \boldsymbol{B} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T =\sum_{j=1}^{p} \sigma_j \boldsymbol{u}_j\boldsymbol{v}_j^T $$
In this $p$ is the rank of the matrix $\boldsymbol{B}$.
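Before applying this to the penguins data, here is a quick sanity check of the expansion on a small made-up matrix (the names `B`, `Ub`, `Sb`, `VTb` are just for this illustration):

```python
# check that the rank-one terms sigma_j u_j v_j^T add back up to B
B = np.array([[3.0, 1.0], [2.0, 2.0], [1.0, 3.0]])
Ub, Sb, VTb = np.linalg.svd(B, full_matrices=False)
Bsum = np.zeros(B.shape)
for j in range(Sb.size):
    Bsum = Bsum + Sb[j] * np.outer(Ub[:, j], VTb[j, :])
print(np.allclose(B, Bsum))   # True
```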
Let's apply this to $\boldsymbol{X}$.
Note: this is one of the times where we're using a symmetric letter, $\boldsymbol{X}$, to denote a non-symmetric matrix. It can't be helped - this use of `X` is very standard in Machine Learning and it isn't wise to go against it.
Using the SVD for $\boldsymbol{X}$ we have, $$ \boldsymbol{X} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T =\sum_{j=1}^{p} \sigma_j \boldsymbol{u}_j\boldsymbol{v}_j^T $$ and we'll show below that $p=4$. We can examine the approximations $$ \boldsymbol{X}_n = \sum_{j=1}^{n} \sigma_j \boldsymbol{u}_j\boldsymbol{v}_j^T $$ for $n=1,2,\ldots$. Think of $n$ as being the number of columns we retain in our data set. We'll elaborate later.
In worksheet B you were encouraged to look up and use a python loop. We'll use that here; it has the form:

for k in range(0, 3):
    # do something with k = 0, 1, 2 in turn
    ...
# now carry on with something else
# set up a zero matrix to hold the approximations X1, X2, ...
Xc = np.zeros(X.shape)
print('The norm of X is ', np.linalg.norm( X ) )
# take more and more terms in the SVD expansion - starting with none
for nc in range(0, 1+S.shape[0]):
    Xc = U[:, :nc] @ np.diag(S[:nc]) @ VT[:nc, :]
    print('The norm of X-Xc is ', np.linalg.norm( X-Xc ) )
print('X-Xc is close to zero (T/F)...', np.allclose(X,Xc))
The norm of X is  78252.18454350013
The norm of X-Xc is  78252.18454350013
The norm of X-Xc is  507.06752386361745
The norm of X-Xc is  85.04966116997144
The norm of X-Xc is  41.84097917907918
The norm of X-Xc is  1.862284009018728e-11
X-Xc is close to zero (T/F)... True
We can see that the norm (size) of $\boldsymbol{X}-\boldsymbol{X}_1$ drops from $78,252$ to $507$; this is just $507/78252 \approx 0.65\%$ of the size of $\boldsymbol{X}$.
This suggests that almost all the information contained in the four columns of penguins data is captured in just the first rank one SVD approximation.
We see that $\boldsymbol{X}-\boldsymbol{X}_4$ is essentially zero, telling us that $\boldsymbol{X}$ is indeed a rank 4 matrix.
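As an aside (not something we need below), a common way to quantify how much of $\boldsymbol{X}$ is captured is the fraction of the total 'energy' $\sum_j \sigma_j^2$ accounted for by the leading singular values:

```python
# fraction of the total energy sum_j sigma_j^2 carried by each singular value,
# and the running (cumulative) total
energy = S**2 / np.sum(S**2)
print('fractions: ', energy)
print('cumulative:', np.cumsum(energy))
```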
Here is a simpler way to code it. Recall that
$$ \boldsymbol{X}_n = \sum_{j=1}^{n} \sigma_j \boldsymbol{u}_j\boldsymbol{v}_j^T = \sigma_1 \boldsymbol{u}_1\boldsymbol{v}_1^T + \sigma_2 \boldsymbol{u}_2\boldsymbol{v}_2^T + \cdots + \sigma_n \boldsymbol{u}_n\boldsymbol{v}_n^T $$

The full expansion on the right of that expression can be emulated like this...
Xc = np.zeros(X.shape)
Xc = Xc + S[0]*U[:,0:1] @ VT[0:1,:]
print(np.linalg.norm(X - Xc))
Xc = Xc + S[1]*U[:,1:2] @ VT[1:2,:]
print(np.linalg.norm(X - Xc))
Xc = Xc + S[2]*U[:,2:3] @ VT[2:3,:]
print(np.linalg.norm(X - Xc))
Xc = Xc + S[3]*U[:,3:4] @ VT[3:4,:]
print(np.linalg.norm(X - Xc))
507.06752386361745
85.04966116997144
41.84097917907918
1.8955341028962172e-11
Or we could use a loop - to emulate the summation symbol
Xc = np.zeros(X.shape)
nc = 4
for k in range(0, nc):
    Xc = Xc + S[k] * U[:,[k]] @ VT[[k],:]
    print(np.linalg.norm( X-Xc ) )
507.06752386361745
85.04966116997144
41.84097917907918
1.8955341028962172e-11
We can now go back to the $k$-NN code and use this reduced data set in place of the full four columns.
Remember: we expect that the SVD has provided the essence of the four columns in less space.

We will use the loop as above, and examine the performance of the classifier as it depends on `nc`.

We'll set up a `numpy` array to store the accuracy score for each choice of `nc`.
accarray = np.zeros([4])
#X = dfp2.iloc[:, 2:6].values # we don't use the raw data this time
Xc = np.zeros(X.shape)
nc = 3
for k in range(0, nc):
    Xc = Xc + S[k] * U[:,[k]] @ VT[[k],:]
y = dfp2.iloc[:, 0].values   # the labels; Xc is used below in place of X
X_train, X_test, y_train, y_test = train_test_split(Xc, y, test_size=0.40)
# scale the data
scaler = StandardScaler(); scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# obtain classifier, fit using 2 NN's and the Manhattan norm
classifier = KNeighborsClassifier(n_neighbors=2, p=1)
classifier.fit(X_train, y_train)
# predict from the test data.
y_pred = classifier.predict(X_test)
# create a confusion matrix to assess the performance
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:"); print(cm)
accsc = accuracy_score(y_test,y_pred); print("Accuracy:", accsc)
print('nc = ', nc, ', ||X-Xc|| = ', np.linalg.norm(X - Xc))
# store the accuracy score in the numpy array, indexed by nc-1 for nc=1,2,3,4
accarray[nc-1] = accsc
Confusion Matrix:
[[62  0  0]
 [ 2 26  0]
 [ 0  0 44]]
Accuracy: 0.9850746268656716
nc =  3 , ||X-Xc|| =  41.84097917907918
A plot of the accuracy dependence on the number of singular values in use would be easier on the eye...
NOTE: this will not make sense in the static PDF and HTML versions, as the cell above will only have been run once (for a single value of nc).
print(accarray)
plt.plot([1,2,3,4], accarray)
plt.xlabel('Number of Singular Values')
plt.ylabel('Accuracy estimate')
plt.ylim([0,1])
[0. 0. 0.98507463 0. ]
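If you don't want to re-run the cell above by hand for each value of `nc`, here is a sketch that wraps the same steps in a loop over `nc = 1, 2, 3, 4` and fills `accarray` in one go (the data is re-split each time, so the scores will fluctuate a little from run to run):

```python
# the same pipeline as above, automated over nc = 1, 2, 3, 4
accarray = np.zeros([4])
y = dfp2.iloc[:, 0].values
for nc in range(1, 5):
    Xc = U[:, :nc] @ np.diag(S[:nc]) @ VT[:nc, :]      # rank-nc approximation of X
    X_train, X_test, y_train, y_test = train_test_split(Xc, y, test_size=0.40)
    scaler = StandardScaler(); scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    classifier = KNeighborsClassifier(n_neighbors=2, p=1)
    classifier.fit(X_train, y_train)
    accarray[nc-1] = accuracy_score(y_test, classifier.predict(X_test))
print(accarray)
plt.plot([1,2,3,4], accarray)
plt.xlabel('Number of Singular Values'); plt.ylabel('Accuracy estimate')
plt.ylim([0,1])
```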
Comments?
We are now making a lot of progress. We have seen how our first machine learning algorithm, $k$-NN, can be configured and used, how to deal with data using `seaborn` (with `pandas` in the background), how to deal with plots using `matplotlib`, and how to deal with number crunching of vectors and matrices with `numpy`. All of this is within the convenient wrapper of the `python` programming language.
In the last example we are also starting to see how we can manipulate and transform data to ask whether we need all of it or not.
We have seen a very important technique, but what value of `nc` should we use?
To answer such questions we need to think about what it is we are most interested in knowing.
For example, here is our confusion matrix for the last computation:
cmplot = ConfusionMatrixDisplay(cm, display_labels=classifier.classes_)
cmplot.plot()
plt.show()
Consider these questions:
Have a think. How could you use the confusion matrix to answer these?
What we need here is probability and, later, statistics.
They are the means by which we assert a strength of belief (probability), as well as describe results and make inferences with confidence (statistics).
Next we'll get on to reviewing some essential parts of those areas but beforehand we want to close with some technical observations regarding the SVD.
Let $\boldsymbol{K}$ be an $n$-row by $m$-column matrix of real numbers. Then $\boldsymbol{K} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T$ - this is called the Singular Value Decomposition of $\boldsymbol{K}$. In this, $\boldsymbol{U}$ is an $n\times n$ orthogonal matrix, $\boldsymbol{\Sigma}$ is an $n\times m$ matrix which is zero except (possibly) on its leading diagonal, and $\boldsymbol{V}$ is an $m\times m$ orthogonal matrix.
The entries on the diagonal of $\boldsymbol{\Sigma}$ are called the singular values of $\boldsymbol{K}$ and the number of non-zero singular values gives the rank of $\boldsymbol{K}$. The columns of $\boldsymbol{U}$ (resp. $\boldsymbol{V}$) are called the left (resp. right) singular vectors of $\boldsymbol{K}$.
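As a quick illustration of that last point, using the singular values of $\boldsymbol{X}$ computed above (just a sketch; the tolerance `1e-10` is an arbitrary choice):

```python
# the rank is the number of (numerically) non-zero singular values
print(np.sum(S > 1e-10))           # count the non-negligible singular values of X
print(np.linalg.matrix_rank(X))    # numpy's own rank routine agrees
```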
Let's look at the shapes of the matrices we have been using: we had $\boldsymbol{X} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T$.
print('The shape of X is: ', X.shape, '\t we knew this')
print('The shape of U is: ', U.shape, '\t seems WRONG')
print('The shape of S is: ', S.shape, '\t seems WRONG')
print('The shape of VT is: ', VT.shape,'\t seems OK')
The shape of X is:   (333, 4)    we knew this
The shape of U is:   (333, 4)    seems WRONG
The shape of S is:   (4,)        seems WRONG
The shape of VT is:  (4, 4)      seems OK
What's going on? Pictures will help...
Given $\boldsymbol{K} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T$, with `S` for $\boldsymbol{\Sigma}$, and using `+` to denote the non-zero diagonal elements, the shapes of these depend on the shape of $\boldsymbol{K}$.
For $n=m$ they are,
.---------. .---------. +---------. .---------.
| | | | | + | | |
| K | = | U | | + | | V^T |
| | | | | S + | | |
'---------' '---------' '---------+ '---------'
This is the 'easy case' - it's just the eigenvalue problem ($\boldsymbol{K}$ need not be symmetric). There are two other cases...
For $n>m$ the shapes are,
.---------. .------------------. +---------. .---------.
| | | | | + | | |
| | | | | + | | V^T |
| | = | | | + | | |
| K | | U | | + | '---------'
| | | | | |
| | | | | |
| | | | | S |
'---------' '------------------' '---------'
For $n<m$ the shapes are,
.------------------. .---------. +------------------. .------------------.
| | | | | + S | | |
| K | = | U | | + | | |
| | | | | + | | |
'------------------' '---------' '-------+----------' | V^T |
| |
| |
| |
'------------------'
In each case $\boldsymbol{S}$ has a zero submatrix. We've seen this in our earlier example:
We have seen this example of $\boldsymbol{K} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T$ before:
$$ \text{If } \boldsymbol{K}=\left(\begin{array}{rrr} 1 & 2 & 5 \\ 5 & -6 & 1 \\ \end{array}\right) \text{ then } \boldsymbol{U}=\left(\begin{array}{rr} -0.06213\ldots & 0.99806\ldots \\ 0.99806\ldots & 0.06213\ldots \\ \end{array}\right), $$
$$ \boldsymbol{\Sigma}=\left(\begin{array}{lll} 7.88191\ldots & 0 & 0 \\ 0 & 5.46584\ldots & 0 \\ \end{array}\right) \text{ and } \boldsymbol{V}=\left(\begin{array}{rrr} 0.62525\ldots & 0.23944\ldots &-0.74278\ldots \\ -0.77553\ldots & 0.29699\ldots &-0.55708\ldots \\ 0.08720\ldots & 0.92437\ldots & 0.37139\ldots \\ \end{array}\right) $$

If we use these we can indeed check that
\begin{align} & \!\!\!\!\!\!\!\!\!\!\! \!\!\!\!\!\!\!\!\!\!\! \!\!\!\!\!\!\!\!\!\!\! \left(\begin{array}{rr} -0.062\ldots & 0.998\ldots \\ 0.998\ldots & 0.062\ldots \\ \end{array}\right) \left(\begin{array}{lll} 7.881\ldots & 0 & 0 \\ 0 & 5.465\ldots & 0 \\ \end{array}\right) \left(\begin{array}{rrr} 0.625\ldots & 0.239\ldots &-0.742\ldots \\ -0.775\ldots & 0.296\ldots &-0.557\ldots \\ 0.087\ldots & 0.924\ldots & 0.371\ldots \\ \end{array}\right)^T \\ &\qquad{} = \left(\begin{array}{rrr} 1 & 2 & 5 \\ 5 & -6 & 1 \\ \end{array}\right) \end{align}

But look at the last column of $\boldsymbol{S}$. It doesn't give us anything...
Let's remove it... It means we have to lose the last row of $\boldsymbol{V}^T$ (i.e. the last column of $\boldsymbol{V}$) as well...
\begin{align} & \!\!\!\!\!\!\!\!\!\!\! \!\!\!\!\!\!\!\!\!\!\! \!\!\!\!\!\!\!\!\!\!\! \left(\begin{array}{rr} -0.062\ldots & 0.998\ldots \\ 0.998\ldots & 0.062\ldots \\ \end{array}\right) \left(\begin{array}{ll} 7.881\ldots & 0 \\ 0 & 5.465\ldots \\ \end{array}\right) \left(\begin{array}{rr} 0.625\ldots & 0.239\ldots \\ -0.775\ldots & 0.296\ldots \\ 0.087\ldots & 0.924\ldots \\ \end{array}\right)^T \\ &\qquad{} = \left(\begin{array}{rrr} 1 & 2 & 5 \\ 5 & -6 & 1 \\ \end{array}\right) \end{align}

We still have $\boldsymbol{K}$. Surprised? Let's check this in `numpy`...
K = np.array([[1,2,5],[5,-6,1]])
U, S, VT = np.linalg.svd(K)
S1 = np.array([ [7.88191, 0], [0, 5.46584]])
V1 = np.array([ [0.625, 0.239],[-0.775, 0.297],[0.087,0.924]])
print( U @ S1 @ V1.T)
print(K)
[[ 0.99771032  1.99978285  4.99806744]
 [ 4.99784675 -5.99580514  0.99822226]]
[[ 1  2  5]
 [ 5 -6  1]]
Convinced? These numbers are just close because of the low precision we used. We'll see how to do this properly below.
What we have discovered here is the Thin SVD. It works like this...
For $n>m$ we lose columns on the right of $\boldsymbol{U}$ to form $\boldsymbol{U_1}$, and rows at the bottom of $\boldsymbol{S}$ to form $\boldsymbol{S}_1$
.---------. .------------------. +---------. .---------.
| | | " | | + | | |
| | | " | | + | | V^T |
| | = | " | | + | | |
| K | | U1 " | | S1 + | '---------'
| | | " | |"""""""""|
| | | " | | |
| | | " | | |
'---------' '------------------' '---------'
For $n<m$ we lose columns on the right of $\boldsymbol{S}$ to form $\boldsymbol{S_1}$, and columns at the right of $\boldsymbol{V}$ to form $\boldsymbol{V}_1$
.------------------. .---------. +------------------. .------------------.
| | | | | + S1 " | | |
| K | = | U | | + " | | V1^T |
| | | | | + " | | |
'------------------' '---------' '-------+"---------' | |
|""""""""""""""""""|
| |
| |
'------------------'
Let's look at this in code... First get the SVD and look at what is returned...
K = np.array([[1,2,5],[5,-6,1]])
U, S, VT = np.linalg.svd(K)
print(U)
print(S)
print(VT)
[[-0.06213744  0.9980676 ]
 [ 0.9980676   0.06213744]]
[7.88191065 5.4658471 ]
[[ 0.62525456 -0.77553283  0.08720987]
 [ 0.23944227  0.29699158  0.9243719 ]
 [-0.74278135 -0.55708601  0.37139068]]
Here `S` is already truncated so we just use `np.diag(S)` to make it square. Also, as it is $\boldsymbol{V}^T$ and not $\boldsymbol{V}$ that is returned, we just have to slice the top two rows.
print(K)
print(U @ np.diag(S) @ VT[0:2,:])
print('K - U @ np.diag(S) @ VT[0:2,:] is zero (T/F): ', np.allclose(K, U @ np.diag(S) @ VT[0:2,:]) )
[[ 1  2  5]
 [ 5 -6  1]]
[[ 1.  2.  5.]
 [ 5. -6.  1.]]
K - U @ np.diag(S) @ VT[0:2,:] is zero (T/F):  True
Suppose that we wanted to work with the full SVD, with all the zeros included. Well, we show this by example. First, note two things:

- `np.linalg.svd` returns $\boldsymbol{V}^T$, not $\boldsymbol{V}$.
- the shape of `S` doesn't agree with that of $\boldsymbol{\Sigma}$.

So, we'll need to pad `S` - and then we can check the reconstruction $\boldsymbol{K} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T$.

The padding is a bit awkward - here it is...
S = np.hstack(( np.diag(S), np.zeros((2,1)) ))
print(S)
[[7.88191065 0.         0.        ]
 [0.         5.4658471  0.        ]]
Now we can check the reconstruction $\boldsymbol{K} = \boldsymbol{U}\boldsymbol{\Sigma}\boldsymbol{V}^T$. It is zero (to machine precision), as expected...
print(K - U @ S @ VT)
[[-4.44089210e-16 -4.44089210e-16 -1.77635684e-15]
 [-8.88178420e-16  1.77635684e-15  1.11022302e-16]]
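For completeness, here is an equivalent way to build the same padded matrix (just a sketch; it recomputes the SVD so that `S` is the 1-D array of singular values again):

```python
# alternative padding: start from a zero matrix of K's shape and
# place the singular values on its leading diagonal
U, S, VT = np.linalg.svd(K)
Sigma = np.zeros(K.shape)
Sigma[:S.size, :S.size] = np.diag(S)
print(Sigma)
print(np.allclose(K, U @ Sigma @ VT))   # True - same reconstruction as before
```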
Lastly: earlier we said we would explain `full_matrices=False`...
U, S, VT = np.linalg.svd(K, full_matrices=False)
print(U)
print(S)
print(VT)
[[-0.06213744  0.9980676 ]
 [ 0.9980676   0.06213744]]
[7.88191065 5.4658471 ]
[[ 0.62525456 -0.77553283  0.08720987]
 [ 0.23944227  0.29699158  0.9243719 ]]
U, S, VT = np.linalg.svd(K, full_matrices=True)
print(U)
print(S)
print(VT)
[[-0.06213744  0.9980676 ]
 [ 0.9980676   0.06213744]]
[7.88191065 5.4658471 ]
[[ 0.62525456 -0.77553283  0.08720987]
 [ 0.23944227  0.29699158  0.9243719 ]
 [-0.74278135 -0.55708601  0.37139068]]
With `False` the unwanted columns (or rows) of $\boldsymbol{V}$ (or $\boldsymbol{V}^T$) aren't returned - but `S` is still not a matrix!
There is a great deal that can be said about the SVD, but we're going to stay narrowly focussed and leave it here.
We have indicated its value in data science and machine learning for dimensionality reduction.
We have shown how to work with the thin SVD and the full SVD in `numpy`.
We used the test data in the SVD. Is that allowed?
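That question is worth thinking about. Purely as an illustration of one possible way round it - not something we do in this worksheet - here is a sketch that computes the SVD on the training rows only and then uses the resulting right singular vectors to map both splits into a reduced space; the names `Xall`, `VTt`, `X_train_r` and so on are just for this sketch.

```python
# split first, then compute the SVD of the TRAINING rows only
Xall = dfp2.iloc[:, 2:6].values
yall = dfp2.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(Xall, yall, test_size=0.40)
nc = 2
Ut, St, VTt = np.linalg.svd(X_train, full_matrices=False)
X_train_r = X_train @ VTt[:nc, :].T   # training rows in the reduced space
X_test_r  = X_test  @ VTt[:nc, :].T   # the test rows never influence the SVD
print(X_train_r.shape, X_test_r.shape)
```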
In the lab we are going to see how the SVD can be used to compress data.
We'll use image compression as an example.
Take a good quality jpeg colour photo (e.g. on your phone) of something vivid, detailed and colourful and save it on your account (One Drive, for example) so that your Jupyter notebook in Anaconda can use it.
We are going to use the SVD to compress the image.
We may have already done this - it will depend on the timetable.