Artificial Neural Networks and Deep Learning¶

variationalform https://variationalform.github.io/¶

Just Enough: progress at pace¶

https://variationalform.github.io/

https://github.com/variationalform

Simon Shaw https://www.brunel.ac.uk/people/simon-shaw.

This work is licensed under CC BY-SA 4.0 (Attribution-ShareAlike 4.0 International)

Visit http://creativecommons.org/licenses/by-sa/4.0/ to see the terms.

This document uses python and also makes use of LaTeX in Markdown

What this is about:¶

  • Artificial Neural Networks

  • Deep Learning

  • MNIST digit recognition

As usual our emphasis will be on doing rather than proving: just enough: progress at pace

Assigned Reading¶

For this material you are recommended Chapter 3 of [UDL], then Chapter 3 of [NND], and Chapter 6 of [MLFCES].

  • UDL: Understanding Deep Learning, by Simon J.D. Prince. PDF draft available here: https://udlbook.github.io/udlbook/
  • NND: Neural Network Design by Martin T. Hagan, Howard B. Demuth, Mark Hudson Beale, Orlando De Jesús. https://hagan.okstate.edu/nnd.html and https://hagan.okstate.edu/NNDesign.pdf
  • MLFCES: Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön. Cambridge University Press. http://smlbook.org.
  • DL: Deep Learning, by Ian Goodfellow, Yoshua Bengio and Aaron Courville https://www.deeplearningbook.org

These can be accessed legally and without cost.

There are also these useful references for coding:

  • PT: python: https://docs.python.org/3/tutorial
  • NP: numpy: https://numpy.org/doc/stable/user/quickstart.html
  • MPL: matplotlib: https://matplotlib.org

The [DL] book has rapidly become something of a classic. It is rich in content, but it may not be an easy introductory read - that will depend on the individual.

Context¶

In the last section of this course we are going to take a look at the mathematical formulation of artificial neural networks.

We will be building on the feed forward algorithm we met in the section on perceptrons.

We know (or at least imagine) that, given the weights and biases, the network carves up the input space into compartments that can be used for classification.

But we need to start with training data, and use this to determine the weights and biases. We will cover the essentials of:

  • cost, error, loss
  • gradient descent
  • hyperparameters
  • back propagation
  • activation functions

We start by looking at the data we are going to use: The MNIST data set of handwritten digits.

This will all be done manually - we won't use sklearn to build or train the network in this section (we will borrow its metrics later for the confusion matrix).

In [1]:
# our usual imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas

MNIST Data Set of Handwritten Digits¶

The original source of these digitized images is here: http://yann.lecun.com/exdb/mnist/

This format isn't particularly easy to work with, so this site, https://pjreddie.com/projects/mnist-in-csv/, makes two CSV files available:

  • MNIST_train.csv - $60,000$ handwritten digit images, for training
  • MNIST_test.csv - $10,000$ handwritten digit images, for testing

Further, for testing and development it is useful to have small data sets, and so Rashid, for his book Make Your Own Neural Network (https://github.com/makeyourownneuralnetwork/makeyourownneuralnetwork), made these two smaller sets:

  • MNIST_train_100.csv - $100$, for training
  • MNIST_test_10.csv - $10$, for testing

This book was also used for these notes. Note that the test set here is not exhaustive, in that not all labels are included. This means that you'll get errors below for the confusion matrix if you use this one.

There are also these (home made), for intermediate use:

  • MNIST_train_1000.csv - $1000$, for training
  • MNIST_test_100.csv - $100$, for testing

Let's get the data - you may need to grab it and unzip it from brightspace (or use binder).

We'll make it easy to choose which data set with a choice variable.

NOTE: MNIST_train.csv is not available in this git repo, or the binder environment because it is too big

In [2]:
choice = 2
if choice == 0:
  df_train = pandas.read_csv(r'./data/MNIST/MNIST_train.csv', header=None)
  df_test  = pandas.read_csv(r'./data/MNIST/MNIST_test.csv', header=None)
elif choice == 1:
  df_train = pandas.read_csv(r'./data/MNIST/MNIST_train_1000.csv', header=None)
  df_test  = pandas.read_csv(r'./data/MNIST/MNIST_test_100.csv', header=None)
elif choice == 2:
  df_train = pandas.read_csv(r'./data/MNIST/MNIST_train_100.csv', header=None)
  df_test  = pandas.read_csv(r'./data/MNIST/MNIST_test_100.csv', header=None)
else:
  df_train = pandas.read_csv(r'./data/MNIST/MNIST_train_100.csv', header=None)
  df_test  = pandas.read_csv(r'./data/MNIST/MNIST_test_10.csv', header=None)
In [3]:
# it will take a bit of work to see what these data files hold
df_train.head()
Out[3]:
0 1 2 3 4 5 6 7 8 9 ... 775 776 777 778 779 780 781 782 783 784
0 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

This doesn't look too promising ... but we'll get there.

The first column contains the labels. The remaining columns contain $28^2$ pixel values for the digitized image of the label.

Let's push on ...

In [4]:
# We assign the pixel values to X_train and X_test
X_train = df_train.iloc[:, 1:].values
X_test  = df_test.iloc[:, 1:].values
N_train = X_train.shape[0]
N_test  = X_test.shape[0]
print(f'N_train = {N_train}, N_test = {N_test}')

# And we assign the first column labels 0,1,2,...,9 to ... 
train_labels = df_train.iloc[:, 0].values
test_labels  = df_test.iloc[:, 0].values

print(X_train.shape)
print(X_test.shape)
print(train_labels.shape)
print(test_labels.shape)
N_train = 100, N_test = 100
(100, 784)
(100, 784)
(100,)
(100,)

Here are the first nine labels in the training set...

In [5]:
print(train_labels[:9])
[5 0 4 1 9 2 1 3 1]

And now the first nine labels in the test set...

In [6]:
print(test_labels[:9])
[3 8 0 5 4 3 8 3 2]

We choose the third row (indexed as 2) in the training set to demonstrate

In [7]:
# Let's choose the third row (indexed as 2) 
row = 2
plt.imshow( X_train[row,:].reshape(28,28) , cmap='Greys', interpolation='None')
plt.title(f'Digitized Image With Label {train_labels[row]}')
print(f'There are 28x28 = {28*28} pixel values: 0,1,...,255')
print('0 is white, 255 is black, 1,2,...,254 are grays')
There are 28x28 = 784 pixel values: 0,1,...,255
0 is white, 255 is black, 1,2,...,254 are grays
In [8]:
# scale pixel values to [0,1] - this is recommended. 
X_train = X_train/255
X_test  = X_test/255
plt.imshow(X_train[row,:].reshape(28,28) , cmap='Greys', interpolation='None')
Out[8]:
<matplotlib.image.AxesImage at 0x7fd220c4a6d8>

Our Neural Network¶

We want a neural network that will accept a digitized image as input

Each image yields $28^2=784$ inputs, one for each pixel value.

There are $10$ possible outputs - corresponding to the digits $\{0,1,2,3,4,5,6,7,8,9\}$.

Our network will have $10$ outputs. For a given input we want the output to be all zeros except for one which is unity. This non-zero will be in the position of the label.

So, if the label is $7$ then we want the output to be $(0,0,0,0,0,0,0,1,0,0)^T$. (Note the transpose - column vectors only.)

This is called one hot encoding. Let's set up the output (label) data for the training and the test sets.

In [9]:
# make every entry zero to begin with ...
y_train = np.zeros((10, N_train))
y_test = np.zeros((10, N_test))
print(f'Shape of: y_train = {y_train.shape}, y_test = {y_test.shape}')
Shape of: y_train = (10, 100), y_test = (10, 100)

One-Hot Encoding¶

We will one-hot encode the labels ready for implementation in a neural network. There are 10 possible output values corresponding to the labels $\{0,1,2,3,4,5,6,7,8,9\}$.

We want to use the train_labels and test_labels data from above to create y_train and y_test.

These will be matrices with each column having $10$ entries.

In y_train there will be as many columns as there are training data points (i.e. N_train). And in y_test there will be as many columns as there are test data points (i.e. N_test).

Each column contains zeros except for a single one in the position (0,1,2,...,9) corresponding to the label (0,1,2,...,9) for that column.

Above we saw that with choice = 2, the third data point in the training set had label $4$.

Hence the third column of y_train will be $(0,0,0,0,1,0,0,0,0,0)^T$.

We loop through the two data sets, grab each label in turn, and set that position equal to unity.

In [10]:
for k in range(N_train):
  label = train_labels[k]
  y_train[label,k] = 1

for k in range(N_test):
  label = test_labels[k]
  y_test[label,k] = 1
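As an aside, numpy's fancy indexing can produce the same encoding without the explicit loops - a sketch equivalent to the two cells above:

y_train = np.zeros((10, N_train))
y_train[train_labels, np.arange(N_train)] = 1
y_test = np.zeros((10, N_test))
y_test[test_labels, np.arange(N_test)] = 1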

We can plot the sums to see how many of each label there are.

In [11]:
plt.bar(range(10),y_train.sum(axis=1), label='y train')
plt.bar(range(10),y_test.sum(axis=1), label='y test')
plt.legend()
Out[11]:
<matplotlib.legend.Legend at 0x7fd1f08fba58>

And, because we like to always be checking our work, we can add up all the ones in each of these and make sure the totals are the same as N_train and N_test.

In [12]:
# add up all the ones - across both dimensions
print(y_train.sum())
print(y_test.sum())
100.0
100.0

Recall that we've always insisted that our features vary along the columns, with our observations listed down the rows.

That's what we have done for X_train and X_test. Each row is an observation of a handwritten digit. Each column - pixel value - is a feature.

NOTE: the term one hot is an electrical analogy in which one terminal is considered hot and the others cold (i.e. on and off).

Our Neural Network Architecture¶

We have an input layer of $28^2=784$ nodes (neurons) accepting a pixel value per node, and an output layer of $10$ neurons capable of yielding a one-hot encoded output.

We'll choose two hidden layers with $500$ nodes on the first and $200$ on the second. These are all hyperparameters - we have to choose them and build them into our design. Once chosen they remain fixed.

We have already seen this type of network, along with the feed forward algorithm.

Here is an artificial neural network with two hidden layers.

\begin{align*} \boldsymbol{a}_0 & = \boldsymbol{x}, \\ \boldsymbol{n}_1 & = \boldsymbol{W}_1^T\boldsymbol{a}_0+\boldsymbol{b}_1, \\ \boldsymbol{a}_1 & = \sigma_1(\boldsymbol{n}_1), \\ \boldsymbol{n}_2 & = \boldsymbol{W}_2^T\boldsymbol{a}_1+\boldsymbol{b}_2, \\ \boldsymbol{a}_2 & = \sigma_2(\boldsymbol{n}_2), \\ \boldsymbol{n}_3 & = \boldsymbol{W}_3^T\boldsymbol{a}_2+\boldsymbol{b}_3, \\ \boldsymbol{a}_3 & = \sigma_3(\boldsymbol{n}_3), \\ \boldsymbol{y} & = \boldsymbol{a}_3. \end{align*}

DEEP LEARNING¶

The addition of extra hidden layers between the input and output layers gives rise to the deep in DEEP LEARNING.

We'll see where the learning fits in shortly. We'll need to learn the values of the weights and biases.

For the moment though we initialise our weights with fairly small random numbers, and set our biases to be zero. We'll use the training data to learn better values.

We initialise our network architecture along with the weights and biases as follows.

In [13]:
inn = 784  # number of nodes on input layer
h1n = 500  # number of nodes on first hidden layer
h2n = 200  # number of nodes on second hidden layer
onn = 10   # number of nodes on output layer

# weights and biases
W1 = 0.5 - np.random.rand(inn,h1n) # weights connecting input to first hidden
W2 = 0.5 - np.random.rand(h1n,h2n) # weights connecting first to second hidden
W3 = 0.5 - np.random.rand(h2n,onn) # weights connecting second hidden to output
b1 = np.zeros([h1n,1])             # bias on first hidden
b2 = np.zeros([h2n,1])             # bias on second hidden
b3 = np.zeros([onn,1])             # bias on output

print(f'W1 shape: {W1.shape}, W2 shape: {W2.shape}, W3 shape: {W3.shape}')
print(f'b1 shape: {b1.shape}, b2 shape: {b2.shape}, b3 shape: {b3.shape}')
W1 shape: (784, 500), W2 shape: (500, 200), W3 shape: (200, 10)
b1 shape: (500, 1), b2 shape: (200, 1), b3 shape: (10, 1)

The Feed Forward Algorithm¶

We have already seen this. The feed forward algorithm, for $L$ layers (not including the input layer), is,

\begin{align*} & \boldsymbol{a}_0 = \boldsymbol{x}, \\ & \text{for } k = 1,2,\ldots,L, \\ &\qquad \boldsymbol{n}_k = \boldsymbol{W}_k^T\boldsymbol{a}_{k-1}+\boldsymbol{b}_k, \\ &\qquad \boldsymbol{a}_k = \sigma_k(\boldsymbol{n}_k), \\ &\boldsymbol{y} = \boldsymbol{a}_L. \end{align*}

where $\sigma$ is the activation function. We have seen the Heaviside function for this, as well as the sigmoid and the ReLU.
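For concreteness, this loop transcribes almost directly into python. Here is a sketch in which Ws, bs and sigmas are assumed to be lists holding the per-layer weight matrices, bias vectors and activation functions:

def feed_forward(x, Ws, bs, sigmas):
  a = x                             # a_0 = x
  for W, b, sigma in zip(Ws, bs, sigmas):
    a = sigma(W.T @ a + b)          # n_k = W_k^T a_{k-1} + b_k and a_k = sigma_k(n_k)
  return a                          # y = a_L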

We'll define the sigmoid and the ReLU using python functions. We will also need their derivatives (and that's the main reason for not using the Heaviside step function - it isn't differentiable).

In [14]:
def sigmoid(x):
  return 1/(1+np.exp(-x))
def ReLU(x):
  return np.maximum(0,x)
In [15]:
def Diff_sigmoid(x):
  return sigmoid(x)*(1-sigmoid(x))
def Diff_ReLU(x):
  return np.heaviside(x,0)
In [16]:
# Let's plot them - just to remember what they look like
xvals = np.arange(-5,5+0.1,0.1)
plt.plot(xvals, sigmoid(xvals), color='blue', label='sigmoid')
xvals = np.arange(-5,2+0.1,0.1)
plt.plot(xvals, ReLU(xvals), color='red', label='ReLU')
plt.legend()
Out[16]:
<matplotlib.legend.Legend at 0x7fd24129b358>

Feeding Forward - Forward Propagation¶

Here is a typical forward pass through the network. We choose a random integer, $k$, from $\{0,1,2,\ldots,N_{\mathrm{train}}-1\}$, and use that to select a training point at random. Then we apply the algorithm from above.

In [17]:
k = np.random.randint(0, X_train.shape[0])
a0 = X_train[[k],:].T # column vector of shape (784,1); X_train[k,:].T would stay 1-D and broadcast wrongly with b1
# feed into first hidden layer and activate
n1 = W1.T @ a0 + b1
a1 = sigmoid(n1)
# feed into second hidden layer and activate
n2 = W2.T @ a1 + b2
a2 = sigmoid(n2)
# feed into output layer and activate
n3 = W3.T @ a2 + b3
a3 = sigmoid(n3)
# produce output
y = a3

BUT THIS IS USELESS - the weights are random and the biases are zero. Whatever the input, the output will be random.

Learning - Artificial Intelligence (AI)¶

We want to use the training data to learn better values of the weights and biases.

By better we mean that given an input with label, say, $5$, the output should be $(0,0,0,0,0,1,0,0,0,0)^T$ - a one-hot encoding of the label corresponding to the input.

In practice our weights and biases may never be perfect, and so we might not get perfect one-hot outputs.

But we would be happy with, for example,

$$ (0.01,0.04,0.1,0.34,0.07,0.89,0.02,0.11,0.21,0.04)^T $$

Here $0.89\approx 1$ in the index $5$ position and the other values we treat as $\approx 0$.

If it works then we have created AI for digit recognition. This could be used for ANPR (automatic number plate recognition) and handwriting, and it's not much of a conceptual step to move to digitized photos (face tagging) and voice (Alexa, Hi Siri, OK Google)...

No human needed...

But we need better weights and biases: we get them by setting up a cost and minimizing it.

Cost: Total Squared Error (TSE)¶

There are other choices, but we have seen this one before and so will use it.

For a given training point, indexed by $k$ say, we have two outputs. One we call the ground truth, and is stored in y_train. We will call this $\boldsymbol{t}_k$ for truth. It is perfectly one-hot encoded.

The other output is the prediction from the network, which is given by $\boldsymbol{y}_k = \boldsymbol{a}_L$ in the feed-forward algorithm above. This will not be perfectly one-hot encoded, but as we have seen above we can be happy with picking the position of the maximum value and using that as an approximation to a one-hot encoding.

We define the TSE cost as

$$ \mathcal{E}(\boldsymbol{W}_1,\boldsymbol{W}_2,\boldsymbol{W}_3, \boldsymbol{b}_1,\boldsymbol{b}_2,\boldsymbol{b}_3) = \sum_{k=1}^{N_{\mathrm{train}}} \mathscr{F}(\boldsymbol{t}_k,\boldsymbol{y}_k) $$

$$ \text{for the loss}\quad \mathscr{F}_k := \mathscr{F}(\boldsymbol{t}_k,\boldsymbol{y}_k) := \Vert\boldsymbol{t}_k-\boldsymbol{y}_k\Vert_2^2. $$

Normally we just write $\mathcal{E}$ for brevity, but we have to always bear in mind that it depends on all the weights and all the biases.
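As a quick sketch, take the approximate output from earlier as a hypothetical prediction for a point with label $5$; its loss is just the squared distance to the one-hot truth:

import numpy as np
t = np.zeros(10); t[5] = 1          # perfectly one-hot ground truth for label 5
y = np.array([0.01,0.04,0.1,0.34,0.07,0.89,0.02,0.11,0.21,0.04])
print(((t - y)**2).sum())           # the loss ||t - y||^2 = 0.2025
print(np.argmax(y))                 # 5 - the position we would read off as the prediction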

The Size of the Task¶

We want to choose all the weights and biases so as to minimize the error.

Recall the size of the weight matrices and bias vectors.

$$ \boldsymbol{W}_1 \in\mathbb{R}^{784\times500}, \ \boldsymbol{W}_2 \in\mathbb{R}^{500\times200}, \ \boldsymbol{W}_3 \in\mathbb{R}^{200\times10}, $$

$$ \boldsymbol{b}_1 \in\mathbb{R}^{500}, \ \boldsymbol{b}_2 \in\mathbb{R}^{200}, \ \boldsymbol{b}_3 \in\mathbb{R}^{10}. $$
In [18]:
Wvals = W1.size + W2.size + W3.size
bvals = b1.size + b2.size + b3.size
print('Number of values to optimize = ', Wvals + bvals)
Number of values to optimize =  494710

There are nearly half a million values to optimize in the three full weight matrices, and the three bias vectors.

Gradient Descent¶

If you have ever studied multivariable calculus and looked at optimization problems you might have seen examples of how to optimize a function of two variables. For example,

$$ f(x,y) = 3\,{\mathrm{e}}^{-{\left(y+1\right)}^2-x^2}\,{\left(x-1\right)}^2 -\frac{{\mathrm{e}}^{-{\left(x+1\right)}^2-y^2}}{3} +{\mathrm{e}}^{-x^2-y^2}\,\left(10\,x^3-2\,x+10\,y^5\right). $$

You may remember that we need to calculate the gradient, $\nabla f$, set it to zero, $\nabla f=\boldsymbol{0}$, solve this for the optimal points, and then check the Hessian for the nature of these optimal points.

That isn't an option here. Instead the usual choice for minimizing the cost is Stochastic Gradient Descent.

Gradient Descent in Outline¶

The idea is to consider $\nabla f$ and note that this gives the direction in which $f$ increases most rapidly.

Therefore $-\nabla f$ tells us in which direction $f$ decreases most rapidly.

That's what we want - we want to get to the minimum: the bottom of the hill.

We choose a point, $\boldsymbol{x}_0$ say, and then move to a new point by moving down the gradient. We then iterate:

\begin{align} \boldsymbol{x}_1 & = \boldsymbol{x}_0 - \alpha\nabla f(\boldsymbol{x}_0), \\ \boldsymbol{x}_2 & = \boldsymbol{x}_1 - \alpha\nabla f(\boldsymbol{x}_1), \\ \boldsymbol{x}_3 & = \boldsymbol{x}_2 - \alpha\nabla f(\boldsymbol{x}_2), \\ \vdots & \qquad\vdots\qquad \vdots \end{align}

Once we find a $k$ for which $\nabla f(\boldsymbol{x}_k)=\boldsymbol{0}$ (at least approximately) we stop - we'll be at a minimum (approximately). Note that it might not be a global minimum, and it could even be a saddle point.

Here, and in this context, $\alpha$ is called the learning rate. It is another hyperparameter. We choose it - it does not get learned by the algorithm.

Let's see some examples of how this might work in 2D.

 Gradient Descent Demonstrations - Global and Local Minima¶

Diagrams will not show in PDF version.
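The interactive diagrams don't reproduce here, but this minimal sketch captures the idea: it runs the descent iteration above on the two-variable $f$ from earlier, with a finite-difference approximation to $\nabla f$ (an assumption made only to keep the sketch short - it is not how we will differentiate the network). Try different starting points: some lead only to local minima.

import numpy as np

def f(x, y):
  return ( 3*np.exp(-(y+1)**2 - x**2)*(x-1)**2
         - np.exp(-(x+1)**2 - y**2)/3
         + np.exp(-x**2 - y**2)*(10*x**3 - 2*x + 10*y**5) )

def grad_f(x, y, h=1e-6):
  # central finite differences - avoids working out the messy derivatives
  return np.array([ (f(x+h,y) - f(x-h,y))/(2*h),
                    (f(x,y+h) - f(x,y-h))/(2*h) ])

alpha = 0.05                     # the learning rate - a hyperparameter we choose
p = np.array([0.0, 1.0])         # the starting point - try others!
for it in range(200):
  g = grad_f(p[0], p[1])
  if np.linalg.norm(g) < 1e-8:   # (approximately) at a stationary point
    break
  p = p - alpha*g
print(p, f(p[0], p[1]))          # where we stopped - possibly only a local minimum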

 Gradient Descent Demonstrations - Saddle Points¶

Diagrams will not show in PDF version.

 Stochastic Gradient (Descent?)¶

The Gradient Descent process described above is too computationally expensive for training big neural nets.

Ours has nearly half a million independent variables, but that is pretty modest by some standards.

Each of these independent variables contributes one component to the gradient.

Also N_train tells us how many loss terms there are in the cost function - these would need to be simultaneously minimized.

A variant, called Stochastic Gradient Descent, or SGD for short, is often used to save computer time and resources.

We'll use this in its simplest form (so-called mini-batch approaches also exist), which is to pick at random, and without replacement, one loss term at a time and update with that.

Some people call this Stochastic Gradient instead of Stochastic Gradient Descent because it is not guaranteed that descent actually occurs at a given step.

Like so much of what we have done, there is so much more we could be saying on this topic.

The Calculus of Learning - part 1¶

We're going to just outline the main steps. A deep understanding of this is beyond our scope.

Consider the weight matrix connecting the second hidden layer to the output layer: $\boldsymbol{W}_3$ and let $w_{rc}$ denote the entry in row $r$ and column $c$. Also, consider $\boldsymbol{b}_3$ with $b_r$ in row $r$.

We choose a loss term at random - the $k$-th term say, $\mathscr{F}_k$ - and then an SGD update is written like this

$$ w_{rc} \leftarrow w_{rc} -\alpha\frac{\partial\mathscr{F}_k}{\partial w_{rc}} \qquad\text{ and }\qquad b_{r} \leftarrow b_{r} -\alpha\frac{\partial\mathscr{F}_k}{\partial b_{r}} $$

where $\leftarrow$ means is replaced by. Now, $\mathscr{F}_k = \Vert\boldsymbol{t}_k-\boldsymbol{y}_k\Vert_2^2$ and so, concentrating on the weights,

$$ \frac{\partial\mathscr{F}_k}{\partial w_{rc}} = \frac{\partial}{\partial w_{rc}} \Vert\boldsymbol{t}_k-\boldsymbol{y}_k\Vert_2^2 = \frac{\partial}{\partial w_{rc}} \sum_{\ell=1}^{10} (t_{\ell k}- y_{\ell k})^2. $$

In this $\boldsymbol{t}_k$ is a constant, so we need to think about $y_{\ell k}$.

The Calculus of Learning - part 2¶

We recall the forward prop algorithm:

\begin{align*} \boldsymbol{a}_0 & = \boldsymbol{x}, \\ \boldsymbol{n}_1 & = \boldsymbol{W}_1^T\boldsymbol{a}_0+\boldsymbol{b}_1, \\ \boldsymbol{a}_1 & = \sigma_1(\boldsymbol{n}_1), \\ \boldsymbol{n}_2 & = \boldsymbol{W}_2^T\boldsymbol{a}_1+\boldsymbol{b}_2, \\ \boldsymbol{a}_2 & = \sigma_2(\boldsymbol{n}_2), \\ \boldsymbol{n}_3 & = \boldsymbol{W}_3^T\boldsymbol{a}_2+\boldsymbol{b}_3, \\ \boldsymbol{a}_3 & = \sigma_3(\boldsymbol{n}_3), \\ \boldsymbol{y} & = \boldsymbol{a}_3. \end{align*}

In this it is the output $\boldsymbol{y} = \boldsymbol{a}_3$ for the $k$-th training point that we are dealing with; its components are the $y_{\ell k}$.

The Calculus of Learning - part 3¶

So, we look at the components of the $k$-th output and calculate...

$$ \frac{\partial\mathscr{F}_k}{\partial w_{rc}} = \frac{\partial}{\partial w_{rc}} \sum_{\ell=1}^{10} (t_{\ell k}- y_{\ell k})^2 = -2\sum_{\ell=1}^{10} (t_{\ell k}- y_{\ell k}) \frac{\partial y_{\ell k}}{\partial w_{rc}} $$

But, with $L=10$ outputs, $H=200$ neurons on the second hidden layer, $\boldsymbol{a}_2 = (a_1, a_2, \ldots)^T$, and $\boldsymbol{n}_3 = (n_{1k}, n_{2k}, \ldots)^T$ for this $k$-th training point,

$$ y_{\ell k} = \sigma(n_{\ell k}) \quad\text{for}\quad n_{\ell k} = \sum_{h=1}^{H} w_{h\ell}a_h + b_\ell \quad\text{ and }\quad \frac{\partial y_{\ell k}}{\partial w_{rc}} = \sigma'(n_{\ell k}) \frac{\partial n_{\ell k}}{\partial w_{rc}}. $$

It follows that

$$ \frac{\partial n_{\ell k}}{\partial w_{rc}} = \frac{\partial}{\partial w_{rc}}\left(\sum_{h=1}^{H} w_{h\ell}a_h + b_\ell\right) \quad\text{ hence }\quad \frac{\partial n_{\ell k}}{\partial w_{rc}} = a_r\delta_{c\ell} $$

where $\delta_{ij} = 1$ if $i=j$ and $\delta_{ij} = 0$ otherwise (called the Kronecker delta).

The Calculus of Learning - part 4¶

Putting this together we then get, first,

$$ \frac{\partial y_{\ell k}}{\partial w_{rc}} = \sigma'(n_{\ell k}) \frac{\partial n_{\ell k}}{\partial w_{rc}} = \sigma'(n_{\ell k}) a_r\delta_{c\ell}. $$

Therefore, for $e_{\ell k} = t_{\ell k}- y_{\ell k}$,

$$ \frac{\partial\mathscr{F}_k}{\partial w_{rc}} = -2\sum_{\ell=1}^{10} (t_{\ell k}- y_{\ell k}) \sigma'(n_{\ell k}) a_r\delta_{c\ell} = -2\sum_{\ell=1}^{10} e_{\ell k} \sigma'(n_{\ell k}) a_r\delta_{c\ell} $$

Let $\displaystyle\frac{\partial\mathscr{F}_k}{\partial \boldsymbol{W}_3}$ be the matrix with $\displaystyle\frac{\partial\mathscr{F}_k}{\partial w_{rc}}$ in row $r$ and column $c$. Then, after some manipulations, it can be shown that,

$$ \frac{\partial\mathscr{F}_k}{\partial \boldsymbol{W}_3} = \boldsymbol{a}_2 \boldsymbol{S}_3^T \text{ for } \boldsymbol{S}_3 = -2 \boldsymbol{A}_3\boldsymbol{e}_k \text{ and } \boldsymbol{A}_3 = \left(\begin{array}{llll} \sigma'(n_1) & 0 & 0 & \cdots \\ 0 & \sigma'(n_2) & 0 & \cdots \\ 0 & 0 & \sigma'(n_3) & \cdots \\ \vdots & \vdots & \vdots & \ddots \end{array}\right). $$
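If you want to check this formula without working through the manipulations, here is a sketch comparing $\boldsymbol{a}_2\boldsymbol{S}_3^T$ with a finite-difference approximation of the gradient on a tiny made-up output layer (the sizes $H=4$ and $L=3$ are hypothetical, chosen only to keep the check quick):

import numpy as np

def sigmoid(x):                      # as defined earlier in these notes
  return 1/(1+np.exp(-x))

rng = np.random.default_rng(1)
H, L = 4, 3                          # hypothetical small layer sizes
a2 = rng.random((H,1))               # activations feeding the output layer
t  = rng.random((L,1))               # a stand-in ground truth
W3 = rng.random((H,L)) - 0.5
b3 = rng.random((L,1))

def F(W):                            # the loss as a function of W3 alone
  y = sigmoid(W.T @ a2 + b3)
  return ((t - y)**2).sum()

# the formula above: S3 = -2 A3 e, with gradient a2 @ S3.T
n3 = W3.T @ a2 + b3
e  = t - sigmoid(n3)
A3 = np.diagflat(sigmoid(n3)*(1-sigmoid(n3)))
G_formula = a2 @ (-2*A3 @ e).T

# central finite differences, entry by entry
eps, G_fd = 1e-6, np.zeros_like(W3)
for r in range(H):
  for c in range(L):
    Wp = W3.copy(); Wp[r,c] += eps
    Wm = W3.copy(); Wm[r,c] -= eps
    G_fd[r,c] = (F(Wp) - F(Wm))/(2*eps)

print(np.abs(G_formula - G_fd).max())   # should be tiny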

The Calculus of Learning - part 5¶

With similar manipulations it can also be shown that $\displaystyle\frac{\partial\mathscr{F}_k}{\partial \boldsymbol{b}_3} = \boldsymbol{S}_3$.

The gradient updates from earlier,

$$ w_{rc} \leftarrow w_{rc} -\alpha\frac{\partial\mathscr{F}_k}{\partial w_{rc}} \qquad\text{ and }\qquad b_{r} \leftarrow b_{r} -\alpha\frac{\partial\mathscr{F}_k}{\partial b_{r}} $$

can now be written in explicit and computable form as,

$$ \boldsymbol{W}_3 \leftarrow \boldsymbol{W}_3 - \alpha \boldsymbol{a}_2 \boldsymbol{S}_3^T \qquad\text{ and }\qquad \boldsymbol{b}_3 \leftarrow \boldsymbol{b}_3 - \alpha \boldsymbol{S}_3 $$

These tell us how to update the weights and biases at the output end of the network.

But what about $\boldsymbol{W}_2$, $\boldsymbol{b}_2$ and $\boldsymbol{W}_1$, $\boldsymbol{b}_1$?

The Calculus of Learning - part 6¶

To update the weights and biases further down towards the start of the network we need

$$ w_{rc} \leftarrow w_{rc} -\alpha\frac{\partial\mathscr{F}_k}{\partial w_{rc}} \qquad\text{ and }\qquad b_{r} \leftarrow b_{r} -\alpha\frac{\partial\mathscr{F}_k}{\partial b_{r}} $$

where $w_{rc}$ and $b_r$ now refer to $\boldsymbol{W}_2$, $\boldsymbol{b}_2$ first, and then to $\boldsymbol{W}_1$, $\boldsymbol{b}_1$.

We could attempt a direct calculation, as above, but this would get quite involved as the number of layers increases. Instead we use the back propagation formula:

$$ \boldsymbol{S}_{i-1}=\boldsymbol{A}_{i-1}\boldsymbol{W}_i\boldsymbol{S}_i $$

which is applied for $i=L, L-1, \ldots, 2$.

The derivation of this is quite involved.

The Calculus of Learning - part 7¶

We consider two consecutive layers, $i-1$ and $i$, connected with the weights in $\boldsymbol{W}_i$ and with biases $\boldsymbol{b}_i$ added on layer $i$. Assume there are $H$ nodes on layer $i-1$ and $L$ on layer $i$. We use hats to denote quantities on layer $i-1$, and then the key formulae are,

$$ n_c = \sum_{\ell=1}^H w_{\ell c} \hat{a}_\ell + b_c, \quad \hat{\boldsymbol{a}} = \sigma_{i-1}(\hat{\boldsymbol{n}}) \quad\text{ and }\quad \boldsymbol{a} = \sigma_i(\boldsymbol{n}). $$

Then (and similarly for the biases),

$$ w_{rc} \leftarrow w_{rc} - \alpha \frac{\partial\mathscr{F}}{\partial w_{rc}} \quad\text{ uses }\quad \frac{\partial\mathscr{F}}{\partial w_{rc}} = \frac{\partial\mathscr{F}}{\partial n_{c}} \frac{\partial n_c}{\partial w_{rc}} $$

$$ \text{with } \frac{\partial n_c}{\partial w_{rc}} = \frac{\partial }{\partial w_{rc}}\left(\sum_{\ell=1}^H w_{\ell c} \hat{a}_\ell + b_c\right) = \hat{a}_r \Longrightarrow \frac{\partial\mathscr{F}}{\partial w_{rc}} = \hat{a}_r S_c \text{ with } \boldsymbol{S} = \left(\begin{array}{c} \partial\mathscr{F}/\partial n_1 \\ \partial\mathscr{F}/\partial n_2 \\ \partial\mathscr{F}/\partial n_3 \\ \vdots \end{array}\right) $$

The Calculus of Learning - part 8¶

We can conclude from this that

$$ \frac{\partial\mathscr{F}}{\partial\boldsymbol{W}_i} = \hat{\boldsymbol{a}}\boldsymbol{S}^T \quad\text{ and } \frac{\partial\mathscr{F}}{\partial\boldsymbol{b}_i} = \boldsymbol{S} $$

just as earlier at the output layer.

A key step is now to introduce the Jacobian Matrix

$$ \frac{\partial\boldsymbol{n}}{\partial\hat{\boldsymbol{n}}} = \left(\begin{array}{cccc} \partial n_1/\partial\hat{n}_1 & \partial n_1/\partial\hat{n}_2 & \partial n_1/\partial\hat{n}_3 & \cdots \\ \partial n_2/\partial\hat{n}_1 & \partial n_2/\partial\hat{n}_2 & \partial n_2/\partial\hat{n}_3 & \cdots \\ \partial n_3/\partial\hat{n}_1 & \partial n_3/\partial\hat{n}_2 & \partial n_3/\partial\hat{n}_3 & \cdots \\ \vdots & \vdots & \vdots & \ddots \end{array}\right) $$

and then use $\hat{\boldsymbol{a}} = \hat{\sigma}(\hat{\boldsymbol{n}})$ to calculate

$$ \frac{\partial n_c}{\partial\hat{n}_r} = \frac{\partial }{\partial\hat{n}_r} \sum_{\ell=1}^H w_{\ell c} \hat{a}_\ell + b_c = w_{rc}\frac{\partial\hat{a}_r}{\partial\hat{n}_r} = w_{rc}\hat{\sigma}'(\hat{n}_r). $$

The Calculus of Learning - part 9¶

It follows that

$$ \frac{\partial\boldsymbol{n}}{\partial\hat{\boldsymbol{n}}} = \boldsymbol{W}^T\hat{\boldsymbol{A}} \quad\text{ for }\quad \hat{\boldsymbol{A}} = \left(\begin{array}{llll} \hat{\sigma}'(\hat{n}_1) & 0 & 0 & \cdots \\ 0 & \hat{\sigma}'(\hat{n}_2) & 0 & \cdots \\ 0 & 0 & \hat{\sigma}'(\hat{n}_3) & \cdots \\ \vdots & \vdots & \vdots & \ddots \end{array}\right). $$

From above we define $\hat{\boldsymbol{S}}$ analogously to $\boldsymbol{S}$ as

$$ \hat{\boldsymbol{S}} = \left(\begin{array}{c} \partial\mathscr{F}/\partial \hat{n}_1 \\ \partial\mathscr{F}/\partial \hat{n}_2 \\ \partial\mathscr{F}/\partial \hat{n}_3 \\ \vdots \end{array}\right) \quad\text{ then }\quad \hat{S}_r = \frac{\partial\mathscr{F}}{\partial\hat{n}_r} = \sum_\ell\frac{\partial n_\ell}{\partial\hat{n}_r}\frac{\partial\mathscr{F}}{\partial n_\ell} = \sum_\ell\frac{\partial n_\ell}{\partial\hat{n}_r}S_\ell $$

$$ \text{and so } \hat{\boldsymbol{S}} = \left(\frac{\partial\boldsymbol{n}}{\partial\hat{\boldsymbol{n}}}\right)^T\boldsymbol{S} = \left(\boldsymbol{W}^T\hat{\boldsymbol{A}}\right)^T\boldsymbol{S} = \hat{\boldsymbol{A}}\boldsymbol{W}\boldsymbol{S} \text{ because } \hat{\boldsymbol{A}}=\hat{\boldsymbol{A}}^T. $$

The Calculus of Learning - part 10¶

Layer-by-layer this recursion is $\boldsymbol{S}_{i-1}=\boldsymbol{A}_{i-1}\boldsymbol{W}_i\boldsymbol{S}_i$ and is called back propagation.

From above we have

$$ \frac{\partial\mathscr{F}}{\partial\boldsymbol{W}_i} = \boldsymbol{a}_{i-1}\boldsymbol{S}_i^T \quad\text{ and } \frac{\partial\mathscr{F}}{\partial\boldsymbol{b}_i} = \boldsymbol{S}_i $$

We have $\boldsymbol{S}_L = -2\boldsymbol{A}_L\boldsymbol{e}$ at the output layer, and we can compute $\boldsymbol{S}_{L-1}$, $\boldsymbol{S}_{L-2}$, $\boldsymbol{S}_{L-3}, \ldots$, recursively from the 'SAWS' backprop recursion $\boldsymbol{S}_{i-1}=\boldsymbol{A}_{i-1}\boldsymbol{W}_i\boldsymbol{S}_i$

We now have everything we need...

The Forward and Backward Propagation ('backprop') Algorithm - Learning from Data¶

\begin{align*} \begin{array}{rl} \text{forward} &\text{prop} \\ \\ \boldsymbol{a}_0 & = \boldsymbol{x}, \\ \boldsymbol{n}_1 & = \boldsymbol{W}_1^T\boldsymbol{a}_0+\boldsymbol{b}_1, \\ \boldsymbol{a}_1 & = \sigma_1(\boldsymbol{n}_1), \\ \boldsymbol{n}_2 & = \boldsymbol{W}_2^T\boldsymbol{a}_1+\boldsymbol{b}_2, \\ \boldsymbol{a}_2 & = \sigma_2(\boldsymbol{n}_2), \\ \boldsymbol{n}_3 & = \boldsymbol{W}_3^T\boldsymbol{a}_2+\boldsymbol{b}_3, \\ \boldsymbol{a}_3 & = \sigma_3(\boldsymbol{n}_3), \\ \boldsymbol{y} & = \boldsymbol{a}_3. \end{array} \qquad &\qquad \begin{array}{rl} \text{back} &\text{prop} \\ \\ \boldsymbol{e}_k & = \boldsymbol{t}_k-\boldsymbol{y}_k, \\ \boldsymbol{S}_3 & = -2\boldsymbol{A}_3\boldsymbol{e}_k, \\ \boldsymbol{W}_3 & \leftarrow \boldsymbol{W}_3 - \alpha \boldsymbol{a}_2 \boldsymbol{S}_3^T \text{ and } \boldsymbol{b}_3 \leftarrow \boldsymbol{b}_3 - \alpha \boldsymbol{S}_3 \\ \boldsymbol{S}_2 & = \boldsymbol{A}_2\boldsymbol{W}_3\boldsymbol{S}_3 \\ \boldsymbol{W}_2 & \leftarrow \boldsymbol{W}_2 - \alpha \boldsymbol{a}_1 \boldsymbol{S}_2^T \text{ and } \boldsymbol{b}_2 \leftarrow \boldsymbol{b}_2 - \alpha \boldsymbol{S}_2 \\ \boldsymbol{S}_1 & = \boldsymbol{A}_1\boldsymbol{W}_2\boldsymbol{S}_2 \\ \boldsymbol{W}_1 & \leftarrow \boldsymbol{W}_1 - \alpha \boldsymbol{a}_0 \boldsymbol{S}_1^T \text{ and } \boldsymbol{b}_1 \leftarrow \boldsymbol{b}_1 - \alpha \boldsymbol{S}_1 \end{array} \end{align*}

 Our Neural Network - training and testing¶

Here is the basic implementation algorithm for training and testing.

  • Choose hyperparameters such as learning rate $\alpha$ and network architecture.

  • Choose a positive integer $N_{\mathrm{epochs}}$ as the number of epochs to use in training. An epoch is a single loop through the whole training set, updating weights and biases for each data point.

  • For each epoch, loop through the training points by choosing integers $k$ at random from $\{0,1,2,\ldots,N_{\mathrm{train}}-1\}$ without replacement (in code the indices start at zero).

    • Forward prop that training data point and calculate the error $\boldsymbol{e}_k = \boldsymbol{t}_k - \boldsymbol{y}_k$

    • Use the error to initiate the backprop and gradient descent updates.

    • Repeat for the next $k$

  • At the end of each epoch update the cost $\mathcal{E}$ for later plotting.

  • At the end of training, run the test data through one point at a time, and use the approximate one-hot encoding in the outputs $\boldsymbol{y}$ to assess the accuracy of the network.

The random selection is done without replacement. We use

ransel = np.random.permutation(N_train)

which gives us an array containing a random permutation of the indices in [0,1,2,...,N_train-1].
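For example (the ordering changes from run to run):

print(np.random.permutation(5))   # e.g. [3 0 4 1 2] - each index appears exactly once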

Here's the code...

In [19]:
# select a learning rate for SGD
alpha = 0.3
# loop through this many epochs
N_ep = 50
# initialise the TSE cost
TSEcost = np.zeros([N_ep,1])

We train by looping N_ep times through the training set.

Each training set loop randomly selects a loss term in the total cost with which to learn (update) new values for the weights and biases.

In the slides view the next cell will not fully display - it is too long.

In [20]:
for ep in range(N_ep): 
  ransel = np.random.permutation(N_train)
  for k in range(N_train):
# select a random index without replacement
    j = ransel[k]
    # forward prop
    a0 = X_train[[j],:].T 
    n1 = W1.T @ a0 + b1
    a1 = sigmoid(n1)
    n2 = W2.T @ a1 + b2
    a2 = sigmoid(n2)
    n3 = W3.T @ a2 + b3
    a3 = sigmoid(n3)
    y = a3
    # backprop and update
    error = y_train[:,[j]] - y
    A3 = np.diagflat(Diff_sigmoid(n3))
    A2 = np.diagflat(Diff_sigmoid(n2))
    A1 = np.diagflat(Diff_sigmoid(n1))
    S3 = -2*A3@error
    S2 = A2@W3@S3
    S1 = A1@W2@S2

    W3 = W3 - alpha * a2@S3.T
    W2 = W2 - alpha * a1@S2.T
    W1 = W1 - alpha * a0@S1.T

    b3 = b3 - alpha * S3
    b2 = b2 - alpha * S2
    b1 = b1 - alpha * S1

  # update cost - loop through training set
  for j in range(N_train):
    a0 = X_train[[j],:].T
    n1 = W1.T @ a0 + b1
    a1 = sigmoid(n1)
    n2 = W2.T @ a1 + b2
    a2 = sigmoid(n2)
    n3 = W3.T @ a2 + b3
    a3 = sigmoid(n3)
    y = a3
    error = y_train[:,[j]] - y
    TSEcost[ep] += (error * error).sum()
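As an aside, this forward pass is typed out three times in this notebook (training, cost update and testing). A small helper like this sketch, which assumes the global W1, W2, W3, b1, b2, b3 from above, would remove the duplication:

def forward(a0):
  # returns everything the backprop update needs, plus the output a3
  n1 = W1.T @ a0 + b1; a1 = sigmoid(n1)
  n2 = W2.T @ a1 + b2; a2 = sigmoid(n2)
  n3 = W3.T @ a2 + b3; a3 = sigmoid(n3)
  return n1, a1, n2, a2, n3, a3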

It is usual to plot the cost against epochs. We want to see the cost decrease to zero, or at least near zero. If it doesn't, then our training is not effective and we would consider adjusting the hyperparameters. These are:

  • network architecture: number of hidden layers, nodes per layer
  • learning rate
  • number of epochs
In [21]:
plt.plot(range(N_ep), TSEcost)
plt.xlabel('epoch'); plt.ylabel('cost')
Out[21]:
Text(0, 0.5, 'cost')

 Testing¶

Once it is trained we want to test the network's predictions on unseen data. We'll store all the predictions for a confusion matrix, and also create scorecards.

In [22]:
# test - create a matrix to hold the predictions
y_pred = np.zeros((10, X_test.shape[0]))
print(f'Shape of y_test = {y_test.shape}')
print(f'Shape of y_pred = {y_pred.shape}')

# create scorecards...
success = np.zeros(10)
failure = np.zeros(10)
Shape of y_test = (10, 100)
Shape of y_pred = (10, 100)
In [23]:
for k in range(N_test):
  # forward prop
  a0 = X_test[[k],:].T   # a0 = X_test[k,:].reshape(-1,1)
  n1 = W1.T @ a0 + b1
  a1 = sigmoid(n1)
  n2 = W2.T @ a1 + b2
  a2 = sigmoid(n2)
  n3 = W3.T @ a2 + b3
  a3 = sigmoid(n3)
  y_pred[:,[k]] = a3  
  if np.argmax(a3) == test_labels[k]:
    success[test_labels[k]] += 1
  else:
    failure[test_labels[k]] += 1
In [24]:
print(success)
print(failure)
[10. 10.  8.  9.  7.  2.  9.  6.  5.  6.]
[0. 0. 2. 1. 3. 8. 1. 4. 5. 4.]

Confusion Matrices¶

We have seen these before. We need to determine the positions of the maximum entries in the computed approximate one-hot encodings.

In [25]:
y_test_cm = np.argmax(y_test, axis=0)   # positions of the ones - the true labels
y_pred_cm = np.argmax(y_pred, axis=0)   # positions of the maxima - the predicted labels
In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test_cm, y_pred_cm)
print("Confusion Matrix:")
print(cm)
accsc = accuracy_score(y_test_cm,y_pred_cm)
print("Accuracy:", accsc)
Confusion Matrix:
[[10  0  0  0  0  0  0  0  0  0]
 [ 0 10  0  0  0  0  0  0  0  0]
 [ 0  1  8  0  0  0  1  0  0  0]
 [ 1  0  0  9  0  0  0  0  0  0]
 [ 0  0  0  0  7  1  0  0  0  2]
 [ 1  0  0  3  2  2  1  1  0  0]
 [ 0  0  0  0  0  0  9  0  0  1]
 [ 0  0  0  0  0  0  0  6  1  3]
 [ 0  2  0  1  0  0  1  0  5  1]
 [ 0  1  0  1  2  0  0  0  0  6]]
Accuracy: 0.72
In [27]:
from sklearn.metrics import ConfusionMatrixDisplay
cmplot = ConfusionMatrixDisplay(cm, display_labels=range(10))
#plt.figure(figsize=(15, 15))
#cmplot.plot()
fig, ax = plt.subplots(figsize=(6,6))
cmplot.plot(ax=ax);

Review¶

We got modest accuracy. Can you do better?

You can obtain the MNIST data from

https://variationalform.github.io/downloads/MNIST.zip

There are some handwritten notes that expand further on the maths of backprop. They are here:

https://variationalform.github.io/downloads/Backprop.pdf

We covered just enough to make progress at pace. We looked at

  • neural network architecture
  • training: cost, and backprop
  • testing and evaluation

There is much, much more - as usual.

Let's finish with a few observations.

Activation Functions and Cost¶

We used the sigmoid rather than the Heaviside because we needed its derivative in the backprop algorithm. It also gives us output values between zero and one which is useful for the approximate one-hot encoding.

However, we could have used the ReLU on the hidden layers. You can try this - you just need to alter those activations in the training and testing forward prop, and also in the backprop, as sketched below.
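Here is a sketch of those substitutions against the training loop from earlier (not standalone code - the surrounding lines stay as they are; ReLU and Diff_ReLU were defined above):

# forward prop: hidden layers switch to ReLU; keep the sigmoid on the
# output layer so the outputs stay in [0,1] for the approximate one-hot
a1 = ReLU(n1)                      # was: a1 = sigmoid(n1)
a2 = ReLU(n2)                      # was: a2 = sigmoid(n2)
# backprop: the diagonal derivative matrices must change to match
A2 = np.diagflat(Diff_ReLU(n2))    # was: np.diagflat(Diff_sigmoid(n2))
A1 = np.diagflat(Diff_ReLU(n1))    # was: np.diagflat(Diff_sigmoid(n1))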

There is also a cost function that can be described as a cross entropy. This is used with a softmax activation on the final layer. We won't cover it in detail here, but be aware that this approach is often claimed to be superior for deep learning classification tasks; a minimal sketch of the pairing follows.
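This sketch is self-contained and illustrative only - none of it is used elsewhere in these notes:

import numpy as np

def softmax(z):
  z = z - z.max()                       # subtract the max for numerical stability
  e = np.exp(z)
  return e/e.sum()

def cross_entropy(t, y):
  return -(t*np.log(y + 1e-12)).sum()   # small guard against log(0)

z = np.array([1.0, 2.0, 0.5])           # hypothetical final-layer values
t = np.array([0.0, 1.0, 0.0])           # one-hot ground truth
y = softmax(z)
print(y, cross_entropy(t, y))           # y sums to 1; the loss shrinks as y approaches t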

Deep Learning can also be used for regression.

Power Consumption¶

There are some live discussions out there regarding the carbon footprint of intensive computation - Machine Learning and AI in particular need a lot of computing power.

It is worth giving it some thought.

Here are the results of a crude experiment.

On a Mac, in the Terminal app, this command gives details of the battery.

system_profiler SPPowerDataType

With a fully charged battery, the charger was disconnected, the display dimmed, and this notebook was run five times up to the point where the cost is plotted against epochs.

The command above reported $6439\,\mathrm{mAh}$ at the start, and $6132\,\mathrm{mAh}$ after the five runs, and also reported a $12.5\,\mathrm{V}$ battery.

This was for 50 epochs, with a [784, 500, 200, 10] architecture.

Let's compare this energy cost to having a standard $28\,\mathrm{W}$ light bulb switched on.

In [28]:
print( (6439 - 6132)/5, ' mAh per training cycle')
print( (6439 - 6132)/5/1000, ' Ah per training cycle')
print( 12.5*(6439 - 6132)/5/1000, ' Wh per training cycle')
print( 28 / (12.5*(6439 - 6132)/5/1000), ' training cycles for 1 hour of a 28 watt bulb')
61.4  mAh per training cycle
0.061399999999999996  Ah per training cycle
0.7675  Wh per training cycle
36.48208469055375  training cycles for 1 hour of a 28 watt bulb

Comments? Thoughts? What about those big server farms?

The Tensorflow Playground¶

Tensorflow is a widely used deep learning library. This web page http://playground.tensorflow.org allows you to configure your own neural net, play with the hyperparameters, and see the effect for various classification problems.

PyTorch is similarly widely used. There are others too, but these are the ones we seem to hear most about.

Deep Learning in a few lines¶

The code that follows came from https://twitter.com/svpino/status/1582703127651721217 on 21 Oct 2023.

It shows that you can implement deep learning with just a few lines if the problem and data are simple enough.

This code learns logic gates. You can read about the Exclusive OR in particular in the [DL] book, Chapter 6.

This code might not display properly in the slides view - it's too long.

In [29]:
import numpy as np

def sigmoid(x):
  return 1 / (1+np.exp(-x))

def neural_network(X,y):
    learning_rate = 0.1
    W1 = np.random.rand(2,4)
    W2 = np.random.rand(4,1)
    
    for epoch in range(1000):
        layer1 = sigmoid(np.dot(X, W1))
        output = sigmoid(np.dot(layer1, W2))
        error = y-output
        delta2 = 2 * error * (output * (1 - output))
        delta1 = delta2.dot(W2.T) * (layer1 * (1 - layer1))
        W2 += learning_rate * layer1.T.dot(delta2)
        W1 += learning_rate * X.T.dot(delta1)
    
    return np.round(output).flatten()

X = np.array([[0,0],[0,1],[1,0],[1,1]])

print(  "OR", neural_network( X, np.array([[0,1,1,1]]).T ) )
print( "AND", neural_network( X, np.array([[0,0,0,1]]).T ) )
print( "XOR", neural_network( X, np.array([[0,1,1,0]]).T ) )
print("NAND", neural_network( X, np.array([[1,1,1,0]]).T ) )
print( "NOR", neural_network( X, np.array([[1,0,0,0]]).T ) )
OR [1. 1. 1. 1.]
AND [0. 0. 0. 1.]
XOR [0. 1. 1. 1.]
NAND [1. 1. 1. 0.]
NOR [1. 0. 0. 0.]