MA5634: Fundamentals of Machine Learning¶

variationalform https://variationalform.github.io/¶

Just Enough: progress at pace¶

https://variationalform.github.io/

https://github.com/variationalform

Simon Shaw https://www.brunel.ac.uk/people/simon-shaw.

This work is licensed under CC BY-SA 4.0 (Attribution-ShareAlike 4.0 International)

Visit http://creativecommons.org/licenses/by-sa/4.0/ to see the terms.

This document uses python and also makes use of LaTeX in Markdown

What this is about:¶

You will be introduced to ...

  • fundamental techniques used in data science, like:
    • $k$-NN: $k$-Nearest Neighbours;
    • data reduction with SVD and PCA;
    • linear and polynomial regression;
    • perceptrons and support vector machines;
    • neural networks and deep learning.
  • essential mathematical concepts: just enough: progress at pace
    • you are not expected to be a mathematician ...
    • ... but you will be expected to either recall or learn basic facts and techniques in
      • vectors, matrices, and differential calculus
  • essential python programming: just enough: progress at pace
    • you are not expected to be a computer scientist ...
    • ... but python will be introduced and used as a tool
      • only the necessary python syntax, tools and techniques will be taught
      • our emphasis will be on doing rather than proving

Assessment¶

  • 40% coursework (details to follow in a few weeks)
  • 60% examination (revision and reflection time will be allocated)

Study Guide¶

The Quality Assurance Agency for Higher Education (QAA, https://www.qaa.ac.uk) defines one academic credit as nominally equal to 10 hours of study (see https://www.qaa.ac.uk/docs/qaa/quality-code/higher-education-credit-framework-for-england.pdf).

Therefore, this 15 credit block requires nominally 150 hours of your time. Although every one of us is different and may choose to spend our time in different ways, the following sketch of these 150 hours is fairly accurate.

There will be $33$ contact hours: two lectures plus one seminar/lab in each of eleven weeks. There will be a two hour exam, to which you could assign $25$ hours of preparation/revision time. This accounts for $33+2+25 = 60$ hours.

In addition there is an assignment to which you could allocate $20$ hours, making $80$ hours in total. This leaves $70$ of the $150$ hours. In each of the $10$ weeks of term you will be required to engage with set tasks and problems, and to read sections of set books and other sources, in order to strengthen your understanding of the material and to prepare for the next topics. These $70$ hours average out to $7$ hours per week over those $10$ weeks.

Note that as a full-time equivalent student you study a $4\times 15 = 60$ credit week. Using the figures above you can think of this as $4\times 3 = 12$ contact hours plus $4 \times 7 = 28$ private study hours per week. This is a $40$ hour week.

Note that engaging at this level does not guarantee any outcome, whether that be a bare pass or an A grade. It is a guideline only. If despite engaging at this level you are struggling to progress and achieve in the module then seek help and advice.

Further, these '$40$ hours' have to be high quality inquisitive engagement. Writing and re-writing notes, procrastinating, and looking at but not engaging with learning materials don't really count. You'll know when you're actually working - you'll feel it. Have a read of this https://en.wikipedia.org/wiki/Flow_(psychology) and make learning a daily rewarding habit.

Key Concepts: Glossary of Relevant Terms¶

The first few of these are debatable, evolving, and subject to change and interpretation. It's worth searching and reading for yourself. These are fast-growing areas.

Data Science¶

A blend of mathematics, computer science and statistics brought to bear with some form of domain expertise.

Data Analytics¶

Systematic computational analysis of data, used typically to discover value and insights.

Data Engineering¶

The stewardship, cleaning, warehousing and preparation of data to support its pipelining to its exploitation.

Artificial Intelligence¶

The development and deployment of digital systems that can effectively substitute for humans in tasks beyond the routine application of fixed rules. When you talk to your home assistant, your phone, your satellite TV receiver, your car, your laptop, and so on, it has no idea what you are going to say. It doesn't have a bank of pre-answered questions; instead it responds dynamically to what it hears. It has been trained on data, and it has learned how to respond. Incidentally, how do you think these systems even understand what you said? As a child, it took you months to begin to understand human speech...

Machine Learning¶

The development and deployment of algorithms that are able to learn from data without explicit instructions, and then analyze, predict, or otherwise draw inferences, from unseen data. These algorithms would typically be expected to add measurable value by their performance.

Consider, for example, an algorithm that predicts tails for every coin flip. It will be right about half the time - but there's no value in that.
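To make this concrete, here is a minimal sketch (using numpy, which we will meet properly later) that simulates an "always predict tails" rule on fair coin flips; the seed and sample size are just illustrative.

import numpy as np

rng = np.random.default_rng(42)           # a fixed seed so the run is repeatable
flips = rng.integers(0, 2, size=10_000)   # 0 = tails, 1 = heads
predictions = np.zeros_like(flips)        # always predict tails (0)
accuracy = (predictions == flips).mean()
print('accuracy of the always-tails rule:', accuracy)  # close to 0.5 - right half the time, but worthless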

Learning¶

Machine learning models do not have intrinsic knowledge but instead learn from data. Typically a data set comprises a list of items, each of which has one or more features and a corresponding label. We'll see some examples of this below.

We think of the features as being inputs to the machine learning model, and the label as being the output. Typically we want to be able to feed in new features, and have the model predict the label.

To do this we need a training data set so that the model can learn how to map the features to the label: the input to the output.
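As an illustration only (the numbers here are made up), a tiny training set might be stored in a pandas data frame like this, with the feature columns as the inputs and the label column as the output:

import pandas as pd

# two features (hours_studied, hours_slept) and one label (passed)
train = pd.DataFrame({
    'hours_studied': [1, 4, 7, 9],
    'hours_slept':   [8, 6, 7, 5],
    'passed':        ['no', 'no', 'yes', 'yes'],
})
X = train[['hours_studied', 'hours_slept']]   # the features - the model's inputs
y = train['passed']                           # the label - the model's output
print(X)
print(y)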

There are three basic learning paradigms:

  • Supervised Learning: Here the data is labelled. This means that for a given set of features, or inputs, we also know their labels, or outputs. Examples of this are where...
    • We could have a list of features of insured drivers, such as age, time since they passed their driving test, type of car, locality, and along with those features a monetary value on their accident claim. The task would be to learn how much of an insurance premium to charge to a new customer once those features have been determined.
    • We might have a bank of images of handwritten digits, and for each image we know what digit is represented. The MNIST database of handwritten digits, see http://yann.lecun.com/exdb/mnist/ or https://en.wikipedia.org/wiki/MNIST_database for example, is a well known example of this. The task is to learn how to predict what digit is captured by a new image. This could be used in ANPR systems for example, https://en.wikipedia.org/wiki/Automatic_number-plate_recognition.
  • Unsupervised Learning: This is where we only know the features and we want to cluster the data in such a way that a set of similar features can be associated with some common characteristic (the label).
    • This can be used on data where the analyst doesn't initially know what they are looking for. For example, a retailer might have a mass of data on customer age, locale, average spend, types of purchased item, time of day of purchase, day of week of purchase, time of year, etc. What characteristics can be used to group these customers? How can advertising be targeted?
    • Principal component analysis seeks to re-orient data so that its dominant statistical properties are revealed. We'll see this later.
  • Reinforcement Learning: This seeks to strike a balance between the two above. There are no labels, but instead, as time progresses the learning algorithm has a reward variable which is increased when an action it has learned has resulted in a measurable benefit. Over time the algorithm develops a policy to inform its actions.

This last is a major topic and will not be covered in these lectures. We will see examples of the first two.
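As a quick preview of the first two paradigms, the sketch below uses scikit-learn's built-in iris data; the particular choices here ($3$ neighbours for the supervised classifier, $3$ clusters for the unsupervised one) are purely illustrative, and these tools will be introduced properly later.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

X, y = load_iris(return_X_y=True)   # features X and labels y

# supervised: the labels y are used during training
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)
print('predicted class of the first flower:', knn.predict(X[:1]))

# unsupervised: only the features X are used - the algorithm finds clusters itself
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print('cluster assigned to the first flower:', km.labels_[0])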

Regression and Classification¶

Our algorithms will be developed to perform one of the following tasks:

  • Regression: here the output, the label, can take any value in a continuous set. For example, the height of a tree, given local climate, soil type, genus and age since planting, could be any non-negative real number (although not with equal probability).

  • Classification: in this case the label will be deemed to be one of a certain class. For example, in the handwritten digits example above, the output will be one of the digits $\{0,1,2,3,\ldots,9\}$.

Some of the algorithms we study will be able to perform both the regression and classification tasks, although we won't always delve deeply into both capabilities.
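To see the difference in code, here is a minimal sketch with made-up numbers: linear regression (covered later) for a continuous label, and scikit-learn's logistic regression, used here purely to illustrate a classifier.

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

X = np.array([[1.0], [2.0], [3.0], [4.0]])   # a single feature

# regression: the label is a continuous value
y_reg = np.array([1.1, 2.3, 2.9, 4.2])
print(LinearRegression().fit(X, y_reg).predict([[2.5]]))    # some real number

# classification: the label is one of a finite set of classes
y_cls = np.array([0, 0, 1, 1])
print(LogisticRegression().fit(X, y_cls).predict([[2.5]]))  # either 0 or 1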

Reading List¶

For the data science, our main sources of information are as follows:

  • MML: Mathematics for Machine Learning, by Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong. Cambridge University Press. https://mml-book.github.io.
  • MLFCES: Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön. Cambridge University Press. http://smlbook.org.
  • FCLA: A First Course in Linear Algebra, by Ken Kuttler, https://math.libretexts.org/Bookshelves/Linear_Algebra/A_First_Course_in_Linear_Algebra_(Kuttler)
  • AP: Applied Probability, by Paul Pfeiffer https://stats.libretexts.org/Bookshelves/Probability_Theory/Applied_Probability_(Pfeiffer)
  • IPDS: Introduction to Probability for Data Science, by Stanley H. Chan, https://probability4datascience.com
  • SVMS: Support Vector Machines Succinctly, by Alexandre Kowalczyk, https://www.syncfusion.com/succinctly-free-ebooks/support-vector-machines-succinctly
  • VMLS: Introduction to Applied Linear Algebra - Vectors, Matrices, and Least Squares, by Stephen Boyd and Lieven Vandenberghe, https://web.stanford.edu/~boyd/vmls/

All of the above can be accessed legally and without cost.

There are also these useful references for coding:

  • PT: python: https://docs.python.org/3/tutorial
  • NP: numpy: https://numpy.org/doc/stable/user/quickstart.html
  • MPL: matplotlib: https://matplotlib.org

The capitalized abbreviations will be used throughout to refer to these sources. For example, we could say 'See [MLFCES, Chap. 2, Sec. 1] for more discussion of supervised learning'. This would just be a quick way of saying:

Look in Section 1, of Chapter 2, of Machine Learning: A First Course for Engineers and Scientists, by Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten, Thomas B. Schön, for more discussion of supervised learning.

There will be other sources shared as we go along. For now these will get us a long way.

Coding: python and some data sets¶

For each of our main topics we will see some example data, discuss a means of working with it, and then implement those means in code. We will develop enough theory to understand how the codes work, but our main focus will be the intuition behind each method and effective problem solving using code.

We choose python because its use in both the commercial and academic data science arena seems to be pre-eminent.

The data science techniques and algorithms we will study, and the supporting technology like graphics and number crunching, are implemented in well-known and well-documented python libraries. These are the main ones we will use:

  • matplotlib: used to create visualizations, plotting 2D graphs in particular.
  • numpy: numerical python, used for array processing, which for us will usually mean numerical calculations involving vectors and matrices.
  • scikit-learn: a set of well documented and easy to use tools for predictive data analysis.
  • pandas: a data analysis tool, used for the storing and manipulation of data.
  • seaborn: a data visualization library for attractive and informative statistical graphics.

There will be others, but these are the main ones. Let's look at some examples of how to use these.
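For reference, these libraries are usually imported with the following standard aliases (a small sketch; the printed version numbers will vary with your installation).

import matplotlib.pyplot as plt   # plotting
import numpy as np                # arrays: vectors and matrices
import pandas as pd               # data frames
import seaborn as sns             # statistical graphics
import sklearn                    # scikit-learn: machine learning tools

print(np.__version__, pd.__version__, sns.__version__, sklearn.__version__)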

Binder, Anaconda, Jupyter - a first look at some data¶

Eventually we will use the anaconda distribution to access python and the libraries we need. The coding itself will be carried out in a Jupyter notebook. We'll go through this in an early lab session. We'll start though with Binder: click here:

https://mybinder.org/v2/gh/variationalform/FML.git/HEAD

Let's see some code and some data. In the following cell we import seaborn and look at the names of the built in data sets. The seaborn library, https://seaborn.pydata.org, is designed for data visualization. It uses matplotlib, https://matplotlib.org, which is a graphics library for python.

If you want to dig deeper, you can look at https://blog.enterprisedna.co/how-to-load-sample-datasets-in-python/ and https://github.com/mwaskom/seaborn-data for the background - but you don't need to.

In [55]:
import seaborn as sns
# we can now refer to the seaborn library functions using 'sns'
# note that you can use another character string - but 'sns' is standard.

# note that # is used to write 'comments'
# Now let's get the names of the built-in data sets.
sns.get_dataset_names()

# type SHIFT+RETURN to execute the highlighted (active) cell
Out[55]:
['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

The taxis data set¶

In [56]:
# let's take a look at 'taxis'
dft = sns.load_dataset('taxis')
# this just displays the first few lines of the data
dft.head()
Out[56]:
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
3 2019-03-10 01:23:59 2019-03-10 01:49:51 1 7.70 27.0 6.15 0.0 36.95 yellow credit card Hudson Sq Yorkville West Manhattan Manhattan
4 2019-03-30 13:27:42 2019-03-30 13:37:14 3 2.16 9.0 1.10 0.0 13.40 yellow credit card Midtown East Yorkville West Manhattan Manhattan
In [57]:
# this will display the last few lines... There are 6433 records (Why?)
dft.tail()
Out[57]:
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
6428 2019-03-31 09:51:53 2019-03-31 09:55:27 1 0.75 4.5 1.06 0.0 6.36 green credit card East Harlem North Central Harlem North Manhattan Manhattan
6429 2019-03-31 17:38:00 2019-03-31 18:34:23 1 18.74 58.0 0.00 0.0 58.80 green credit card Jamaica East Concourse/Concourse Village Queens Bronx
6430 2019-03-23 22:55:18 2019-03-23 23:14:25 1 4.14 16.0 0.00 0.0 17.30 green cash Crown Heights North Bushwick North Brooklyn Brooklyn
6431 2019-03-04 10:09:25 2019-03-04 10:14:29 1 1.12 6.0 0.00 0.0 6.80 green credit card East New York East Flatbush/Remsen Village Brooklyn Brooklyn
6432 2019-03-13 19:31:22 2019-03-13 19:48:02 1 3.85 15.0 3.36 0.0 20.16 green credit card Boerum Hill Windsor Terrace Brooklyn Brooklyn

What we are seeing here is a data frame. It is furnished by the pandas library: https://pandas.pydata.org which is used by the seaborn library to store its example data sets.

Each row of the data frame corresponds to a single data point, which we could also call an observation or measurement (depending on context).

Each column (except the left-most) corresponds to a feature of the data point. The first column is just an index giving the row number. Note that this index starts at zero - so, for example, the third row will be labelled/indexed as $2$. Be careful of this - it can be confusing.

Here the variable dft is a pandas data frame: dft stands for 'data frame, taxis'.
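For example (a small sketch, assuming the cell above has been run so that dft still holds the taxis data):

# the row labelled 2 is the *third* row, because the index starts at zero
print(dft.iloc[2])   # select the third row by position
print(dft.loc[2])    # select the row whose index label is 2 (here, the same row)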

In [58]:
# let's print the data frame...
print(dft)
                   pickup              dropoff  passengers  distance  fare  \
0     2019-03-23 20:21:09  2019-03-23 20:27:24           1      1.60   7.0   
1     2019-03-04 16:11:55  2019-03-04 16:19:00           1      0.79   5.0   
2     2019-03-27 17:53:01  2019-03-27 18:00:25           1      1.37   7.5   
3     2019-03-10 01:23:59  2019-03-10 01:49:51           1      7.70  27.0   
4     2019-03-30 13:27:42  2019-03-30 13:37:14           3      2.16   9.0   
...                   ...                  ...         ...       ...   ...   
6428  2019-03-31 09:51:53  2019-03-31 09:55:27           1      0.75   4.5   
6429  2019-03-31 17:38:00  2019-03-31 18:34:23           1     18.74  58.0   
6430  2019-03-23 22:55:18  2019-03-23 23:14:25           1      4.14  16.0   
6431  2019-03-04 10:09:25  2019-03-04 10:14:29           1      1.12   6.0   
6432  2019-03-13 19:31:22  2019-03-13 19:48:02           1      3.85  15.0   

       tip  tolls  total   color      payment            pickup_zone  \
0     2.15    0.0  12.95  yellow  credit card        Lenox Hill West   
1     0.00    0.0   9.30  yellow         cash  Upper West Side South   
2     2.36    0.0  14.16  yellow  credit card          Alphabet City   
3     6.15    0.0  36.95  yellow  credit card              Hudson Sq   
4     1.10    0.0  13.40  yellow  credit card           Midtown East   
...    ...    ...    ...     ...          ...                    ...   
6428  1.06    0.0   6.36   green  credit card      East Harlem North   
6429  0.00    0.0  58.80   green  credit card                Jamaica   
6430  0.00    0.0  17.30   green         cash    Crown Heights North   
6431  0.00    0.0   6.80   green  credit card          East New York   
6432  3.36    0.0  20.16   green  credit card            Boerum Hill   

                          dropoff_zone pickup_borough dropoff_borough  
0                  UN/Turtle Bay South      Manhattan       Manhattan  
1                Upper West Side South      Manhattan       Manhattan  
2                         West Village      Manhattan       Manhattan  
3                       Yorkville West      Manhattan       Manhattan  
4                       Yorkville West      Manhattan       Manhattan  
...                                ...            ...             ...  
6428              Central Harlem North      Manhattan       Manhattan  
6429  East Concourse/Concourse Village         Queens           Bronx  
6430                    Bushwick North       Brooklyn        Brooklyn  
6431      East Flatbush/Remsen Village       Brooklyn        Brooklyn  
6432                   Windsor Terrace       Brooklyn        Brooklyn  

[6433 rows x 14 columns]

Visualization¶

Rows and rows of numbers aren't that helpful.

seaborn makes visualization easy - here is a scatter plot of the data.

In [59]:
sns.scatterplot(data=dft, x="distance", y="fare")
Out[59]:
<AxesSubplot:xlabel='distance', ylabel='fare'>

THINK ABOUT: it looks like fare is roughly proportional to distance. But what could cause the outliers?

In [60]:
# here's another example
sns.scatterplot(data=dft, x="pickup_borough", y="tip")
Out[60]:
<AxesSubplot:xlabel='pickup_borough', ylabel='tip'>
In [61]:
# is the tip proportional to the fare?
sns.scatterplot(data=dft, x="fare", y="tip")
Out[61]:
<AxesSubplot:xlabel='fare', ylabel='tip'>
In [62]:
# is the tip proportional to the distance?
sns.scatterplot(data=dft, x="distance", y="tip")
Out[62]:
<AxesSubplot:xlabel='distance', ylabel='tip'>

The tips data set¶

Let's look now at the tips data set. Along the way we'll see a few more ways we can use the data frame object.

In [63]:
# load the data - dft: data frame tips
# note that this overwrites the previous 'value/meaning' of dft
dft = sns.load_dataset('tips')
dft.head()
Out[63]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

An extensive list of data frame methods/functions can be found here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html#pandas.DataFrame. Let's look at some of them.

In [64]:
# note: dft.info (without parentheses) prints the bound method itself, which here shows
# the whole frame; calling dft.info() would print a concise summary of the columns
print(dft.info)
print('The shape of the data frame is: ', dft.shape)
print('The size of the data frame is: ', dft.size)
print('Note that 244*7 =', 244*7)
<bound method DataFrame.info of      total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]>
The shape of the data frame is:  (244, 7)
The size of the data frame is:  1708
Note that 244*7 = 1708

Visualization¶

Again, numbers aren't always that helpful. Plots often give us more insight.

In [65]:
dft.plot()
Out[65]:
<AxesSubplot:>
In [66]:
sns.scatterplot(data=dft, x="total_bill", y="tip")
Out[66]:
<AxesSubplot:xlabel='total_bill', ylabel='tip'>

Statistics and Probability¶

You're assumed to be familiar with basic terms and concepts in these areas, but we will revise and review those that we need later.

We can get some basic stats for our data set with the describe() method...

In [67]:
# here are some descriptive statistics
dft.describe()
Out[67]:
total_bill tip size
count 244.000000 244.000000 244.000000
mean 19.785943 2.998279 2.569672
std 8.902412 1.383638 0.951100
min 3.070000 1.000000 1.000000
25% 13.347500 2.000000 2.000000
50% 17.795000 2.900000 2.000000
75% 24.127500 3.562500 3.000000
max 50.810000 10.000000 6.000000

The anscombe data set¶

This is pretty famous. There are four sets of 11 coordinate pairs. When plotted they look completely different. But they have the same summary statistics (at least the common ones).

See https://en.wikipedia.org/wiki/Anscombe%27s_quartet

Image Credit: https://upload.wikimedia.org/wikipedia/commons/7/7e/Julia-anscombe-plot-1.png

Let's load the data set and take a look at it - we can look at the head and tail of the table just as we did above.

In [68]:
dfa = sns.load_dataset('anscombe')
# look at how we get an apostrophe in the string...
print("The size of Anscombe's data set is:", dfa.shape)
The size of Anscombe's data set is: (44, 3)
In [69]:
dfa.head()
Out[69]:
dataset x y
0 I 10.0 8.04
1 I 8.0 6.95
2 I 13.0 7.58
3 I 9.0 8.81
4 I 11.0 8.33
In [70]:
dfa.tail()
Out[70]:
dataset x y
39 IV 8.0 5.25
40 IV 19.0 12.50
41 IV 8.0 5.56
42 IV 8.0 7.91
43 IV 8.0 6.89

It looks like the four data sets are in the dataset column. How can we extract them as separate items?

Well, one way is to print the whole dataset and see which rows correspond to each dataset. Like this...

In [71]:
print(dfa)
   dataset     x      y
0        I  10.0   8.04
1        I   8.0   6.95
2        I  13.0   7.58
3        I   9.0   8.81
4        I  11.0   8.33
5        I  14.0   9.96
6        I   6.0   7.24
7        I   4.0   4.26
8        I  12.0  10.84
9        I   7.0   4.82
10       I   5.0   5.68
11      II  10.0   9.14
12      II   8.0   8.14
13      II  13.0   8.74
14      II   9.0   8.77
15      II  11.0   9.26
16      II  14.0   8.10
17      II   6.0   6.13
18      II   4.0   3.10
19      II  12.0   9.13
20      II   7.0   7.26
21      II   5.0   4.74
22     III  10.0   7.46
23     III   8.0   6.77
24     III  13.0  12.74
25     III   9.0   7.11
26     III  11.0   7.81
27     III  14.0   8.84
28     III   6.0   6.08
29     III   4.0   5.39
30     III  12.0   8.15
31     III   7.0   6.42
32     III   5.0   5.73
33      IV   8.0   6.58
34      IV   8.0   5.76
35      IV   8.0   7.71
36      IV   8.0   8.84
37      IV   8.0   8.47
38      IV   8.0   7.04
39      IV   8.0   5.25
40      IV  19.0  12.50
41      IV   8.0   5.56
42      IV   8.0   7.91
43      IV   8.0   6.89

From this and the head and tail output above we can infer that there are four data sets: I, II, III and IV. They each contain $11$ pairs $(x,y)$.

  • The first set occupies rows $0,1,2,\ldots,10$
  • The second set occupies rows $11,12,\ldots,21$
  • The third set occupies rows $22,23,\ldots,32$
  • The fourth set occupies rows $33,34,\ldots,43$

However, this kind of technique is not going to be useful if we have a data set with millions of data points (rows). We certainly won't want to print them all like we did above.

Is there another way to determine the number of distinct feature values in a given column of the data frame?

Fortunately, yes. We want to know how many different values the dataset column has. We can do it like this.

In [72]:
dfa.dataset.unique()
Out[72]:
array(['I', 'II', 'III', 'IV'], dtype=object)

We can count the number of different ones automatically too, by asking for the shape of the returned value. Here we go:

In [73]:
dfa.dataset.unique().shape
Out[73]:
(4,)

This tells us that there are 4 items - as expected. Don't worry too much about it saying (4,) rather than just 4. We'll come to that later when we discuss numpy (Numerical python: https://numpy.org).
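If you want the count as a plain integer, either of these works (a small aside; nunique is a standard pandas method):

print(len(dfa.dataset.unique()))   # 4, as a plain integer
print(dfa.dataset.nunique())       # 4 - pandas counts the distinct values directly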

Now we want to extract the four datasets as separate data frames so that we can work with them. We can do that by using loc to select the rows where the dataset feature takes a given value.

For example, using the hints here https://stackoverflow.com/questions/17071871/how-do-i-select-rows-from-a-dataframe-based-on-column-values, to get the data for the sub-data-set I we can do this:

In [74]:
dfa.loc[dfa['dataset'] == 'I']
Out[74]:
dataset x y
0 I 10.0 8.04
1 I 8.0 6.95
2 I 13.0 7.58
3 I 9.0 8.81
4 I 11.0 8.33
5 I 14.0 9.96
6 I 6.0 7.24
7 I 4.0 4.26
8 I 12.0 10.84
9 I 7.0 4.82
10 I 5.0 5.68

Now that we have this subset of data we can examine it - with a scatter plot, for example.

In [75]:
sns.scatterplot(data=dfa.loc[dfa['dataset'] == 'I'], x="x", y="y")
Out[75]:
<AxesSubplot:xlabel='x', ylabel='y'>

To work properly with each subset we should extract them and give each a meaningful name.

In [76]:
dfa1 = dfa.loc[dfa['dataset'] == 'I']
dfa2 = dfa.loc[dfa['dataset'] == 'II']
dfa3 = dfa.loc[dfa['dataset'] == 'III']
dfa4 = dfa.loc[dfa['dataset'] == 'IV']

Now let's look at each of the four data sets in a scatter plot, and use the describe method to examine the summary statistics.

The outcome is quite surprising...

dataset 1¶

In [77]:
sns.scatterplot(data=dfa1, x="x", y="y")
dfa1.describe()
Out[77]:
x y
count 11.000000 11.000000
mean 9.000000 7.500909
std 3.316625 2.031568
min 4.000000 4.260000
25% 6.500000 6.315000
50% 9.000000 7.580000
75% 11.500000 8.570000
max 14.000000 10.840000

dataset 2¶

In [78]:
sns.scatterplot(data=dfa2, x="x", y="y")
dfa2.describe()
Out[78]:
x y
count 11.000000 11.000000
mean 9.000000 7.500909
std 3.316625 2.031657
min 4.000000 3.100000
25% 6.500000 6.695000
50% 9.000000 8.140000
75% 11.500000 8.950000
max 14.000000 9.260000

dataset 3¶

In [79]:
sns.scatterplot(data=dfa3, x="x", y="y")
dfa3.describe()
Out[79]:
x y
count 11.000000 11.000000
mean 9.000000 7.500000
std 3.316625 2.030424
min 4.000000 5.390000
25% 6.500000 6.250000
50% 9.000000 7.110000
75% 11.500000 7.980000
max 14.000000 12.740000

dataset 4¶

In [80]:
sns.scatterplot(data=dfa4, x="x", y="y")
dfa4.describe()
Out[80]:
x y
count 11.000000 11.000000
mean 9.000000 7.500909
std 3.316625 2.030579
min 8.000000 5.250000
25% 8.000000 6.170000
50% 8.000000 7.040000
75% 8.000000 8.190000
max 19.000000 12.500000
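As a cross-check, the same comparison can be made in one step by grouping on the dataset column (a sketch using pandas' groupby, which we haven't formally introduced):

# mean and standard deviation of x and y for each of the four datasets at once
print(dfa.groupby('dataset')[['x', 'y']].agg(['mean', 'std']))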

Exercises¶

For the taxis data set:

  1. Produce a scatterplot of "dropoff_borough" vs. "tip"
  2. Plot the dependence of fare on distance.
1: sns.scatterplot(data=ds, x="dropoff_borough", y="tip")
2: sns.scatterplot(data=ds, x="distance", y="tip")

For the tips data set:

  1. What is the standard deviation of the tips?
  2. Plot the scatter of tip against the total bill
  3. Plot the scatter of total bill against day
  4. Plot the scatter of tip against gender
1: ds['tip'].std() - or read off the std of tip from ds.describe()
2: sns.scatterplot(data=ds, x="total_bill", y="tip")
3: sns.scatterplot(data=ds, x="day", y="total_bill")
4: sns.scatterplot(data=ds, x="sex", y="tip")
(Here ds denotes the tips data frame, e.g. ds = sns.load_dataset('tips').)