https://variationalform.github.io/
https://github.com/variationalform
Simon Shaw https://www.brunel.ac.uk/people/simon-shaw.
This work is licensed under CC BY-SA 4.0 (Attribution-ShareAlike 4.0 International). Visit http://creativecommons.org/licenses/by-sa/4.0/ to see the terms.
This document uses python and also makes use of LaTeX in Markdown.
This is a very quick recap of essential (for us) concepts in probability and statistics.
As usual our emphasis will be on doing rather than proving: just enough: progress at pace
For this worksheet you are recommended Chapter 6 of [MML] and Appendix C of [DSML].
MML: Mathematics for Machine Learning, by Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong. Cambridge University Press. https://mml-book.github.io.
DSML: Data Science and Machine Learning, Mathematical and Statistical Methods by Dirk P. Kroese, Zdravko I. Botev, Thomas Taimre, Radislav Vaisman https://people.smp.uq.edu.au/DirkKroese/DSML/DSML.pdf
There are also various resources here: https://stats.libretexts.org/Bookshelves
These can be accessed legally and without cost. NOTE: we haven't referred to the second of these before.
There are also these useful references for coding:
- python: https://docs.python.org/3/tutorial
- numpy: https://numpy.org/doc/stable/user/quickstart.html
- matplotlib: https://matplotlib.org

And, DSML (as above): Appendix D has a very useful python primer.
Probability is a subtle topic, and also not without its interpretational controversy.
Suppose you flip a coin $S$ times and it lands heads $n$ times.
We assign a probability to the event $H$, 'the coin lands heads', by looking at the relative frequency and say:

$$ \mathrm{P}(H) \approx \frac{n}{S}, \qquad\text{with the approximation becoming exact as } S\to\infty. $$

This seems fine - we can flip a coin as many times as we like to approximate $S\to\infty$.
We can also introduce prior beliefs. If the coin is fair, a judgement we make by appeal to its physical symmetry and the laws of physics, then we can assert that
$\mathrm{P}(H) = 1/2$, and
$\mathrm{P}(T) = 1/2$.
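For illustration, here is a minimal sketch, assuming a fair coin and using `numpy`'s random number generator (the seed and variable names are just for illustration), showing the relative frequency $n/S$ settling near $1/2$ as $S$ grows.

import numpy as np
# simulate S coin flips and look at the relative frequency of heads
rng = np.random.default_rng(seed=42)                  # seeded for reproducibility
for S in (10, 100, 10_000, 1_000_000):
    flips = rng.integers(0, 2, size=S)                # 1 = heads, 0 = tails
    n = flips.sum()                                   # number of heads
    print(f'S = {S:8d},  n/S = {n/S:.4f}')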
Sometimes though we can't appeal to this type of simple intuitive interpretation of probability.
There is a 70% chance of rain tomorrow.
Really? What does that mean? It isn't like the coin toss. We can't repeat 'tomorrow' $S$ times and count the number $n$ of times it rains.
What this means is that for $10$ meteorologically similar days we can expect to need an umbrella on 7 of them.
There is a lot of history and lively debate around these questions of interpretation. See for example, https://plato.stanford.edu/entries/probability-interpret/
We're fortunate though. We will usually be able to run our codes many times on large enough data sets, and so we can think about relative frequencies.
We think of running an experiment. We will have a sample space $\Omega$ of all the possible outcomes of the experiment, and an event space $\mathcal{E}$ whose elements, the events, are sets of outcomes. The event space is the power set of $\Omega$ (the set of all subsets of $\Omega$).
For example, if we toss a coin three times there are $2^3$ possible outcomes.
$\Omega = \{HHH, HHT, HTH, THH, HTT, THT, TTH, TTT\}$.
Examples of events are

(i) 'Two heads and one tail': $\{ HHT,HTH,THH \}$.

(ii) 'Head on first fall': $\{ HHH, HHT, HTH, HTT \}$.
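As a quick sketch we can enumerate $\Omega$ with `itertools` (the helper names below are just for illustration) and count the outcomes in each event:

from itertools import product
# enumerate the sample space for three coin tosses and count outcomes in each event
omega = [''.join(t) for t in product('HT', repeat=3)]   # all 2**3 = 8 outcomes
print('Omega =', omega)
two_heads_one_tail = [w for w in omega if w.count('H') == 2]
head_on_first_fall = [w for w in omega if w[0] == 'H']
print('P(two heads and one tail) =', len(two_heads_one_tail), '/', len(omega))
print('P(head on first fall)     =', len(head_on_first_fall), '/', len(omega))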
There is a function $\mathrm{P}\colon\mathcal{E}\to [0,1]$ that assigns a probability $\mathrm{P}(E)$ to each event $E\in\mathcal{E}$.
On the assumption that this is revision, we won't work through examples here.
Suppose you want to know the probability that $A$ occurs given that $B$ does occur.

We write this conditional probability as $\mathrm{P}(A\mid B)$.
To understand it, suppose that in $S$ trials $A$ and $B$ have simultaneously occurred $m$ times while $B$ has occurred $n$ times. We must have that $n\ge m$ and so the probability that $A$ and $B$ both occurred given that $B$ occurred is reasoned out like this:
$$ \mathrm{P}(A\mid B) \approx \frac{m}{n} = \frac{m}{S}\frac{S}{n} = \frac{m}{S}\left(\frac{n}{S}\right)^{-1} \to \frac{\mathrm{P}(A\mathrm{\ and\ }B)}{\mathrm{P}(B)}. $$

We take the right hand side as the definition of the left hand side, given the intuitive calculation in the middle.
This is very useful. It allows us to switch the conditioning around.
It is useful to recognise that $\mathrm{P}(A\mathrm{\ and\ }B) = \mathrm{P}(A\mid B)\mathrm{P}(B) = \mathrm{P}(B\mid A)\mathrm{P}(A)$.
$\mathrm{P}(A\mathrm{\ or\ }B) = \mathrm{P}(A)+\mathrm{P}(B)-\mathrm{P}(A\mathrm{\ and\ }B)$.
$\mathrm{P}(A\mathrm{\ or\ }B) = \mathrm{P}(A)+\mathrm{P}(B)$ if $A$ and $B$ are mutually exclusive.
$\mathrm{P}(A\mathrm{\ and\ }B) = \mathrm{P}(A)\mathrm{P}(B)$ if $A$ and $B$ are independent.
$\mathrm{P}(A)+\mathrm{P}(\neg A) = 1$ where $\neg A$ ('not' $A$) means that $A$ did not occur.
The Partition Theorem (or the Law of Total Probability)
$\mathrm{P}(A) = \mathrm{P}(A\mathrm{\ and\ }B) + \mathrm{P}(A\mathrm{\ and\ }\neg B)$
$\mathrm{P}(A) = \mathrm{P}(A\mid B)\mathrm{P}(B) + \mathrm{P}(A\mid \neg B)\mathrm{P}(\neg B)$
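Here is a small numerical check of the partition theorem, a sketch using the three-coin-toss events above and assuming equally likely outcomes (the helper functions are just for illustration):

from itertools import product
# A = 'head on first fall', B = 'two heads and one tail'
omega = [''.join(t) for t in product('HT', repeat=3)]     # equally likely outcomes
A = {w for w in omega if w[0] == 'H'}
B = {w for w in omega if w.count('H') == 2}
notB = set(omega) - B

def prob(E):                    # probability of an event (equally likely outcomes)
    return len(E) / len(omega)

def cond(E, F):                 # conditional probability P(E | F)
    return len(E & F) / len(F)

lhs = prob(A)
rhs = cond(A, B) * prob(B) + cond(A, notB) * prob(notB)
print('P(A) =', lhs, 'and P(A|B)P(B) + P(A|notB)P(notB) =', rhs)    # both 0.5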
Bayes formula (reprised)
$$ \mathrm{P}(B\mid A) = \frac{\mathrm{P}(A\mathrm{\ and\ }B)}{\mathrm{P}(A)} = \frac{\mathrm{P}(A\mid B)\mathrm{P}(B)} {\mathrm{P}(A\mid B)\mathrm{P}(B) + \mathrm{P}(A\mid \neg B)\mathrm{P}(\neg B)} $$

We won't have too much need for these, but we are interested in the connection to confusion matrices...
Recall that for a binary classifier our confusion matrices took the very specific form:
$$ \begin{array}{rcc} \begin{array}{r} \text{target, or true} \\ \text{label/class} \end{array}\quad & \begin{array}{c} Y \\ N \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & & \begin{array}{cc} + & - \end{array} \\ & & \text{output, or predicted} \\ & & \text{label/class} \\ \end{array} $$

These numbers represent estimates (that get better as $S\to\infty$) of conditional probabilities...
Example: suppose we have this set of results where $Y$ or $N$ are the known labels and $+$ and $-$ are the predictions:
$$ \begin{array}{rcc} \begin{array}{r} \text{label} \\ \text{} \end{array}\quad & \begin{array}{c} Y \\ N \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & & \begin{array}{cc} + & - \end{array} \\ & & \text{predicted} \\ \end{array} \qquad\text{ with, specifically,}\qquad \left( \begin{array}{cc} \mathrm{62} & \mathrm{5} \\ \mathrm{9} & \mathrm{44} \\ \end{array} \right). $$

There are $S=120$ results. Look along the first row - these are the actual numbers in the sample which are labelled as `Y` (healthy, innocent, passed, safe, ...) as opposed to `N` (sick, guilty, failed, unsafe, ...). So $62+5$ are in the `Y` class out of a total of $120$. If this sample represents the population then we can estimate...
$\mathrm{P}(Y)=67/120$. Similarly, $\mathrm{P}(+)=(62+9)/120$.

Further, in the second row, we know that `N` occurs, for these are all in the `N` class. So, with similar reasoning ...

$\mathrm{P}(+\mid N)=9/(9+44)$. Similarly, $\mathrm{P}(Y\mid -)=5/(5+44)$.
Let's see all the calculations...
import numpy as np
cm = np.array([[62,5],[9,44]])
N = cm.sum()
print('Number of samples: ', N,' with base rates...')
print('P(Y) = (62+5)/120 = ', (62+5)/120, end=' and ')
print('P(N) = (9+44)/120 = ', (9+44)/120, ' = 1-P(Y)')
print('P(+) = (62+9)/120 = ', (62+9)/120, end=' and ')
print('P(-) = (5+44)/120 = ', (5+44)/120, ' = 1-P(+)')
print('Conditionals...')
print('P(Y|+) = 62/(62+9) = ', 62/(62+9), end=' and ')
print('P(N|+) = 9/(62+9) = ', 9/(62+9))
print('P(Y|-) = 5/(5+44) = ', 5/(5+44), end=' and ')
print('P(N|-) = 44/(5+44) = ', 44/(5+44))
print('P(+|Y) = 62/(62+5) = ', 62/(62+5), end=' and ')
print('P(-|Y) = 5/(62+5) = ', 5/(62+5))
print('P(+|N) = 9/(9+44) = ', 9/(9+44), end=' and ')
print('P(-|N) = 44/(9+44) = ', 44/(9+44))
Number of samples:  120  with base rates...
P(Y) = (62+5)/120 =  0.5583333333333333 and P(N) = (9+44)/120 =  0.44166666666666665  = 1-P(Y)
P(+) = (62+9)/120 =  0.5916666666666667 and P(-) = (5+44)/120 =  0.4083333333333333  = 1-P(+)
Conditionals...
P(Y|+) = 62/(62+9) =  0.8732394366197183 and P(N|+) = 9/(62+9) =  0.1267605633802817
P(Y|-) = 5/(5+44) =  0.10204081632653061 and P(N|-) = 44/(5+44) =  0.8979591836734694
P(+|Y) = 62/(62+5) =  0.9253731343283582 and P(-|Y) = 5/(62+5) =  0.07462686567164178
P(+|N) = 9/(9+44) =  0.16981132075471697 and P(-|N) = 44/(9+44) =  0.8301886792452831
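As a quick cross-check of Bayes' formula against these numbers, here is a minimal sketch (the variable names are just for illustration) showing that $\mathrm{P}(Y\mid +)$ obtained by switching the conditioning around agrees with the direct value:

# Bayes: P(Y|+) = P(+|Y)P(Y) / ( P(+|Y)P(Y) + P(+|N)P(N) )
P_Y, P_N = 67/120, 53/120                      # base rates from the confusion matrix
P_plus_given_Y, P_plus_given_N = 62/67, 9/53   # conditionals from the rows
bayes = P_plus_given_Y * P_Y / (P_plus_given_Y * P_Y + P_plus_given_N * P_N)
print('P(Y|+) via Bayes =', bayes)             # should equal 62/(62+9)
print('P(Y|+) directly  =', 62/(62+9))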
Earlier for a binary classifier we defined some useful terms for measuring performance. Some of these can be related to conditional probabilities.
$$ \begin{array}{rcc} \begin{array}{r} \text{target, or true} \\ \text{label/class} \end{array}\quad & \begin{array}{c} Y \\ N \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & & \begin{array}{cc} + & - \end{array} \\ & & \text{output, or predicted} \\ & & \text{label/class} \\ \end{array} $$

Recall that we used $\mathrm{P}$ and $\mathrm{N}$ for the number of positives and negatives overall in the test set.
Prevalence: $\mathrm{Prevalence} = \frac{\mathrm{P}}{\mathrm{P}+\mathrm{N}} = \mathrm{P}(Y)$
TPR: True Positive Rate, sensitivity, recall: $\mathrm{TPR} = \frac{\mathrm{TP}}{\mathrm{P}} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}} = \mathrm{P}(+\mid Y)$
And also...
$$ \begin{array}{rcc} \begin{array}{r} \text{target, or true} \\ \text{label/class} \end{array}\quad & \begin{array}{c} Y \\ N \end{array}\!\! & \left( \begin{array}{cc} \mathrm{TP} & \mathrm{FN} \\ \mathrm{FP} & \mathrm{TN} \\ \end{array} \right) \\ & & \begin{array}{cc} + & - \end{array} \\ & & \text{output, or predicted} \\ & & \text{label/class} \\ \end{array} $$

FPR: False Positive Rate: $\mathrm{FPR} = \frac{\mathrm{FP}}{\mathrm{N}} = \frac{\mathrm{FP}}{\mathrm{FP}+\mathrm{TN}} = \mathrm{P}(+\mid N)$
FNR: False Negative Rate: $\mathrm{FNR} = \frac{\mathrm{FN}}{\mathrm{P}} = \frac{\mathrm{FN}}{\mathrm{FN}+\mathrm{TP}} = \mathrm{P}(-\mid Y)$
PPV: Positive Predictive Value, precision: $\mathrm{PPV} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FP}} = \mathrm{P}(Y\mid +)$
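To make the link explicit, here is a short sketch (reusing the confusion matrix numbers from above; the variable names are just for illustration) that computes each rate and compares it with the corresponding conditional probability:

import numpy as np
cm = np.array([[62, 5], [9, 44]])     # rows: true Y, N; columns: predicted +, -
TP, FN = cm[0]
FP, TN = cm[1]
P_, N_ = TP + FN, FP + TN             # totals of true positives and true negatives
print('Prevalence = P/(P+N)  =', P_ / (P_ + N_), ' = P(Y)')
print('TPR = TP/(TP+FN)      =', TP / (TP + FN), ' = P(+|Y)')
print('FPR = FP/(FP+TN)      =', FP / (FP + TN), ' = P(+|N)')
print('FNR = FN/(FN+TP)      =', FN / (FN + TP), ' = P(-|Y)')
print('PPV = TP/(TP+FP)      =', TP / (TP + FP), ' = P(Y|+)')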
In addition to needing an understanding of how to infer probabilities from our results, we'll also need to understand some related concepts from Mathematical Statistics.
We review these terms:
A random variable is a function $Z\colon \Omega \to\mathbb{R}$. We can then talk about the probability that $Z$ takes a particular value.

For example, a die is thrown and $Z$ is assigned the value shown on the upward face.
The Expected Value of the random variable is the sum of all its possible values, each weighted by its probability:

$$ \mathbb{E}(Z) = \sum_{k} Z_k\,\mathrm{P}(Z = Z_k). $$
This coincides with the notion of average or mean value. Why?
The numerical data we work with will typically be lists of samples of the random variable, with each sampled value occurring with equal probability.
For example, if the random variable, $Z$, takes one of $N$ equally probable values $Z_1, Z_2, \ldots, Z_N$, then the probability that a given value is taken is $N^{-1}$ and then the expected value of $Z$ is,
$$ \mathbb{E}(Z) = \sum_{k=1}^N Z_k\,\mathrm{P}(Z_k) = \frac{1}{N}\sum_{k=1}^N Z_k = \bar{Z}. $$

This, the expected value of $Z$, is called the mean, or average, value of $Z$.
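For instance, a minimal sketch for a fair six-sided die, where each face is equally likely, gives $\mathbb{E}(Z) = 3.5$:

import numpy as np
faces = np.arange(1, 7)               # possible values 1..6
probs = np.full(6, 1/6)               # fair die: each face has probability 1/6
EZ = np.sum(faces * probs)            # sum of values weighted by probabilities
print('E(Z) for a fair die =', EZ)    # 3.5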
We use $\bar{Z}$ to denote the sample mean. It is common to denote the population mean by $\mu_Z$, but this isn't usually accessible to us - we'll almost always be working with samples and so we write $\bar{Z}\approx\mu_Z$.
The mean is a measure of the centre of a distribution. Two other measures are also in common use. Confining ourselves to the discrete case these are...
median: this is the value in the middle of an ordered set. For example $\{1,3,4,78,90\}$ has median $4$. When the set has an even number of elements the median can be taken as the average of the two centre elements.
mode: this is the most frequently occurring value. The set above doesn't have a mode (or all elements are modes). The set $\{1,3,3,78,90\}$ has mode $3$.
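Both are easy to compute, as in this small sketch using `np.median` and `collections.Counter` (one simple way, among several, to find a mode):

import numpy as np
from collections import Counter
print('median of {1,3,4,78,90} =', np.median([1, 3, 4, 78, 90]))   # 4.0
vals = [1, 3, 3, 78, 90]
mode, count = Counter(vals).most_common(1)[0]                       # most frequent value
print('mode of {1,3,3,78,90}   =', mode, '(occurs', count, 'times)')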
The variance of the random variable, $X$, is defined (for our purposes) as
$\mathrm{Var}(X) = \mathbb{E}(X^2) - \big(\mathbb{E}(X)\big)^2$.
For us, with sample size $N$, this is,
$$ \mathrm{Var}(X) = \frac{1}{N}\sum_{k=1}^N\big(X_k-\mathbb{E}(X)\big)^2. $$

Also, the standard deviation is given by
$\sigma_X = \mathrm{Std}(X) = \sqrt{\mathrm{Var}(X)}$.
These formulae are sometimes altered slightly for smaller sample sizes, with the denominator $N$ replaced by $N-1$ to get an unbiased estimate. When $N$ is large this has negligible effect.
Let's see concrete examples with the data set $X\in\{1,3,4,5,7\}$.
We'll see that `numpy` can make life easy for us...
X = np.array([1,3,4,5,7])
N = X.shape[0]
Xbar = X.sum()/N
print('E(X) = mean = ', Xbar, ' or with numpy: ', X.mean())
# centre X using mean, then sum of squares using dot product
Xc = X-Xbar
VarX = Xc.T.dot(Xc) / N
print('Var(X) = variance = ', VarX, ' or with numpy: ', X.var())
print('SD(X) = Std Dev = ', np.sqrt(VarX), ' or with numpy: ', X.std())
# or, the unbiased result..
VarX = Xc.T.dot(Xc) / (N-1)
print('Var(X) = variance = ', VarX, ' or with numpy: ', X.var(ddof=1))
print('SD(X) = Std Dev = ', np.sqrt(VarX), ' or with numpy: ', X.std(ddof=1))
E(X) = mean =  4.0  or with numpy:  4.0
Var(X) = variance =  4.0  or with numpy:  4.0
SD(X) = Std Dev =  2.0  or with numpy:  2.0
Var(X) = variance =  5.0  or with numpy:  5.0
SD(X) = Std Dev =  2.23606797749979  or with numpy:  2.23606797749979
Often we have more than one random variable in play. We saw four numerical columns in the penguins data set for example. We can calculate stats for each column as shown above, but how can we assess how related these variables might be?
We define the covariance of two random variables as
$$ \mathrm{Cov}(X,Y) = \mathbb{E}\Big(\big(X-\mathbb{E}(X)\big)\big(Y-\mathbb{E}(Y)\big)\Big) = \frac{1}{N}\sum_{k=1}^N \big(X_k-\bar{X}\big)\big(Y_k-\bar{Y}\big) $$

and the correlation coefficient of two random variables as
$$ \rho_{XY} = \frac{\mathrm{Cov}(X,Y)}{\sigma_X\,\sigma_Y} $$

It is easy to see that $\mathrm{Cov}(X,X)=\mathrm{Var}(X)$ and that $\rho_{XX} = 1$.
These measurements indicate how strongly related the random variables are to each other: positive correlations indicate that both tend to grow or diminish together, while negative correlations indicate that one grows as the other shrinks. A zero correlation indicates that there is no linear relationship between the variables (they may still be related in a nonlinear way).
We'll normally work with covariance rather than correlation. Let's see an example - using penguins again...
Grab the data and clean it up just like before.
import numpy as np
import seaborn as sns
dfp = sns.load_dataset('penguins')
dfp.head()
dfp = dfp.dropna()
dfp = dfp.reset_index(drop=True)
dfp.head()
 | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
4 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
Now assign the numerical data in columns 3 to 6 (zero-based columns 2 to 5) to `X`
X = dfp.iloc[:, 2:6].values
X[:4,:]
array([[  39.1,   18.7,  181. , 3750. ],
       [  39.5,   17.4,  186. , 3800. ],
       [  40.3,   18. ,  195. , 3250. ],
       [  36.7,   19.3,  193. , 3450. ]])
Each column represents a random variable: $X_0$, $X_1$, $X_2$, $X_3$. We can calculate means, variances and covariances. For example...
print('Mean of column 1 (indexed at 0) : ', X[:,0].mean())
print('Std Dev of column 3 (population): ', X[:,2].std())
print('Std Dev of column 3 (unbiased) : ', X[:,2].std(ddof=1))
Mean of column 1 (indexed at 0) :  43.99279279279279
Std Dev of column 3 (population):  13.994704772576716
Std Dev of column 3 (unbiased) :  14.015765288287879
# remember that we can access some summary stats like this...
dfp.describe()
 | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g |
---|---|---|---|---|
count | 333.000000 | 333.000000 | 333.000000 | 333.000000 |
mean | 43.992793 | 17.164865 | 200.966967 | 4207.057057 |
std | 5.468668 | 1.969235 | 14.015765 | 805.215802 |
min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 |
25% | 39.500000 | 15.600000 | 190.000000 | 3550.000000 |
50% | 44.500000 | 17.300000 | 197.000000 | 4050.000000 |
75% | 48.600000 | 18.700000 | 213.000000 | 4775.000000 |
max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 |
What about the covariance? Let's calculate $\mathrm{Cov}(X_1,X_2)$...
# first center the data using the column means...
X1 = X[:,[1]] - X[:,[1]].mean()
X2 = X[:,[2]] - X[:,[2]].mean()
# then multiply, sum and take the unbiased average
N = X.shape[0]
CV12 = np.sum(X1*X2)/(N-1)
print("Cov(X1,X2) = ", CV12)
Cov(X1,X2) = -15.94724845327255
Rather than `np.sum()`, we can use the dot product, $\boldsymbol{X}_1\cdot\boldsymbol{X}_2 = \boldsymbol{X}_1^T\boldsymbol{X}_2$, like this...
CV12 = X1.T @ X2 / (N-1)
print("Cov(X1,X2) = ", CV12, " or as a scalar Cov(X1,X2) = ", float(CV12) )
Cov(X1,X2) = [[-15.94724845]] or as a scalar Cov(X1,X2) = -15.94724845327255
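Dividing the covariance by the two standard deviations gives the correlation coefficient $\rho_{X_1 X_2}$. Here is a minimal sketch using the unbiased `ddof=1` estimates (so the $N-1$ factors match), with a cross-check against `np.corrcoef`:

# rho_{12} = Cov(X1, X2) / (sigma_1 * sigma_2), using the unbiased (ddof=1) estimates
sigma1 = X[:, 1].std(ddof=1)
sigma2 = X[:, 2].std(ddof=1)
rho12 = float(CV12) / (sigma1 * sigma2)
print('rho(X1,X2) =', rho12)
# cross-check with numpy's built-in correlation coefficient matrix
print('np.corrcoef =', np.corrcoef(X[:, 1], X[:, 2])[0, 1])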
A useful concept is the covariance matrix. For our four variables it takes this form:
$$ \boldsymbol{M} = \left(\begin{array}{llll} \mathrm{Var}(X_0) & \mathrm{Cov}(X_0,X_1) & \mathrm{Cov}(X_0,X_2) & \mathrm{Cov}(X_0,X_3) \\ \mathrm{Cov}(X_1,X_0) & \mathrm{Var}(X_1) & \mathrm{Cov}(X_1,X_2) & \mathrm{Cov}(X_1,X_3) \\ \mathrm{Cov}(X_2,X_0) & \mathrm{Cov}(X_2,X_1) & \mathrm{Var}(X_2) & \mathrm{Cov}(X_2,X_3) \\ \mathrm{Cov}(X_3,X_0) & \mathrm{Cov}(X_3,X_1) & \mathrm{Cov}(X_3,X_2) & \mathrm{Var}(X_3) \\ \end{array}\right) $$

Recall that $\mathrm{Cov}(X,X)=\mathrm{Var}(X)$ and note that $\mathrm{Cov}(X,Y)=\mathrm{Cov}(Y,X)$. This matrix is therefore symmetric and so has real eigenvalues.
The covariance matrix is also positive semidefinite. This means that
$$ \boldsymbol{u}\cdot\boldsymbol{M}\boldsymbol{u} \ge 0 $$

for all vectors $\boldsymbol{u}$. This in turn means that the eigenvalues of the covariance matrix are non-negative. To see this inequality, assume without loss of generality that the $X_i$'s are already centered and collect the observed values into the column vectors $\boldsymbol{X}_i$. Then,
$$ (N-1)\boldsymbol{M} = \left(\begin{array}{llll} \boldsymbol{X}_0\cdot\boldsymbol{X}_0 & \boldsymbol{X}_0\cdot\boldsymbol{X}_1 & \boldsymbol{X}_0\cdot\boldsymbol{X}_2 & \boldsymbol{X}_0\cdot\boldsymbol{X}_3 \\ \boldsymbol{X}_1\cdot\boldsymbol{X}_0 & \boldsymbol{X}_1\cdot\boldsymbol{X}_1 & \boldsymbol{X}_1\cdot\boldsymbol{X}_2 & \boldsymbol{X}_1\cdot\boldsymbol{X}_3 \\ \boldsymbol{X}_2\cdot\boldsymbol{X}_0 & \boldsymbol{X}_2\cdot\boldsymbol{X}_1 & \boldsymbol{X}_2\cdot\boldsymbol{X}_2 & \boldsymbol{X}_2\cdot\boldsymbol{X}_3 \\ \boldsymbol{X}_3\cdot\boldsymbol{X}_0 & \boldsymbol{X}_3\cdot\boldsymbol{X}_1 & \boldsymbol{X}_3\cdot\boldsymbol{X}_2 & \boldsymbol{X}_3\cdot\boldsymbol{X}_3 \\ \end{array}\right) = \left(\begin{array}{l} \boldsymbol{X}_0^T \\ \boldsymbol{X}_1^T \\ \boldsymbol{X}_2^T \\ \boldsymbol{X}_3^T \\ \end{array}\right) \left(\begin{array}{llll} \boldsymbol{X}_0 & \boldsymbol{X}_1 & \boldsymbol{X}_2 & \boldsymbol{X}_3 \\ \end{array}\right) $$

Write this as $(N-1)\boldsymbol{M} = \boldsymbol{K}^T\boldsymbol{K}$ and then, for arbitrary $\boldsymbol{u}$,
$$ \boldsymbol{u}\cdot\boldsymbol{M}\boldsymbol{u} = \frac{1}{N-1} \boldsymbol{u}^T\boldsymbol{K}^T\boldsymbol{K}\boldsymbol{u} = \frac{1}{N-1} \big(\boldsymbol{K}\boldsymbol{u}\big)^T\boldsymbol{K}\boldsymbol{u} \ge 0. $$

We have seen how to get a covariance matrix entry using `numpy`, but there are a lot more - and this is for just four columns in the data set. Lots of work... Fortunately `numpy` can do the heavy lifting for us...
# note the transpose...
print(np.cov(X.T))
[[ 2.99063334e+01 -2.46209134e+00  5.00581949e+01  2.59562330e+03]
 [-2.46209134e+00  3.87788831e+00 -1.59472485e+01 -7.48456122e+02]
 [ 5.00581949e+01 -1.59472485e+01  1.96441677e+02  9.85219165e+03]
 [ 2.59562330e+03 -7.48456122e+02  9.85219165e+03  6.48372488e+05]]
We can see that in the third column, second row we have $\mathrm{Cov}(X_1,X_2) = -15.94724845\ldots$ as expected.
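As a quick numerical sketch of the claims above, we can form $\boldsymbol{K}$ from the centered columns, check that $\boldsymbol{K}^T\boldsymbol{K}/(N-1)$ reproduces `np.cov`, and confirm that the eigenvalues are non-negative (up to rounding):

# centre each column, form K, and compare K^T K / (N-1) with np.cov
K = X - X.mean(axis=0)
M = K.T @ K / (N - 1)
print('max difference from np.cov :', np.abs(M - np.cov(X.T)).max())
# M is symmetric, so use eigvalsh; all eigenvalues should be >= 0
print('eigenvalues of M           :', np.linalg.eigvalsh(M))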
But, the `pandas` library that gives us the data frames has already thought of both covariance and correlation, like this:
# note: with newer versions of pandas you may need dfp.cov(numeric_only=True) to skip the non-numeric columns
dfp.cov()
 | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g |
---|---|---|---|---|
bill_length_mm | 29.906333 | -2.462091 | 50.058195 | 2595.623304 |
bill_depth_mm | -2.462091 | 3.877888 | -15.947248 | -748.456122 |
flipper_length_mm | 50.058195 | -15.947248 | 196.441677 | 9852.191649 |
body_mass_g | 2595.623304 | -748.456122 | 9852.191649 | 648372.487699 |
dfp.corr()
 | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g |
---|---|---|---|---|
bill_length_mm | 1.000000 | -0.228626 | 0.653096 | 0.589451 |
bill_depth_mm | -0.228626 | 1.000000 | -0.577792 | -0.472016 |
flipper_length_mm | 0.653096 | -0.577792 | 1.000000 | 0.872979 |
body_mass_g | 0.589451 | -0.472016 | 0.872979 | 1.000000 |
THINK ABOUT: do you need both `flipper_length_mm` and `body_mass_g` in your analysis?
We covered just enough, to make progress at pace. We looked at `python` tools. Now we can start putting all of this material to work.