diff --git a/_quarto.yml b/_quarto.yml
index f1e12be9..57ad1769 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -79,6 +79,37 @@ website:
- text: ' 12.1. Exercises'
href: modules/module1/module1-32-practicing_bar_charts.qmd
- href: modules/module1/module1-34-what_did_we_just_learn.qmd
+ - section: "**M2. Not So Scary Wrangling (Table Manipulation and Chaining)**"
+ contents:
+ - href: modules/module2/module2-00-module_learning_outcomes.qmd
+ - href: modules/module2/module2-01-reading_in_different_file_types.qmd
+ - text: ' 1.1. Exercises'
+ href: modules/module2/module2-02-delimiter.qmd
+ - href: modules/module2/module2-06-arguments_for_reading_data.qmd
+ - text: ' 2.1. Exercises'
+ href: modules/module2/module2-07-name_that_argument.qmd
+ - href: modules/module2/module2-09-column_renaming_and_dropping.qmd
+ - text: ' 3.1. Exercises'
+ href: modules/module2/module2-10-column_editing_questions.qmd
+ - href: modules/module2/module2-13-column_arithmetic_and_creation.qmd
+ - text: ' 4.1. Exercises'
+ href: modules/module2/module2-14-column_arithmetic_questions.qmd
+ - href: modules/module2/module2-16-data_filtering.qmd
+ - text: ' 5.1. Exercises'
+ href: modules/module2/module2-17-filtering_question.qmd
+ - href: modules/module2/module2-20-conditional_value_replacement.qmd
+ - text: ' 6.1. Exercises'
+ href: modules/module2/module2-21-practice_replacing_values.qmd
+ - href: modules/module2/module2-22-chaining_notation.qmd
+ - text: ' 7.1. Exercises'
+ href: modules/module2/module2-23-chaining_true_false.qmd
+ - href: modules/module2/module2-25-grouping_and_aggregating.qmd
+ - text: ' 8.1. Exercises'
+ href: modules/module2/module2-26-fruit_salad_grouping_and_aggregating.qmd
+ - href: modules/module2/module2-29-plotting_with_altair.qmd
+ - text: ' 9.1. Exercises'
+ href: modules/module2/module2-30-plotting_a_groupby_object.qmd
+ - href: modules/module2/module2-31-what_did_we_just_learn.qmd
# Since we are declaring options for two formats here (html and revealjs)
# each qmd file needs to include a yaml block including which format to use for that file.
diff --git a/data/candybars-h.csv b/data/candybars-h.csv
new file mode 100644
index 00000000..62dc1006
--- /dev/null
+++ b/data/candybars-h.csv
@@ -0,0 +1,28 @@
+This dataset was created by Hayley Boyce in February 2020.,,,,,,,,,,
+Note this is not a complete dataset and there are many other candybars that are in existence ,,,,,,,,,,
+name,weight,chocolate,peanuts,caramel,nougat,cookie_wafer_rice,coconut,white_chocolate,multi,available_canada_america
+Coffee Crisp,50,1,0,0,0,1,0,0,0,Canada
+Butterfinger,184,1,1,1,0,0,0,0,0,America
+Skor,39,1,0,1,0,0,0,0,0,Both
+Smarties,45,1,0,0,0,0,0,0,1,Canada
+Twix,58,1,0,1,0,1,0,0,1,Both
+Reeses Peanutbutter Cups ,43,1,1,0,0,0,0,0,1,Both
+3 Musketeers,54,1,0,0,1,0,0,0,0,America
+Kinder Surprise,20,1,0,0,0,0,0,1,0,Canada
+M & M,48,1,1,0,0,0,0,0,1,Both
+Glosettes,50,1,0,0,0,0,0,0,1,Canada
+KitKat,45,1,0,0,0,1,0,0,1,Both
+Babe Ruth,60,1,1,1,1,0,0,0,0,America
+Caramilk,52,1,0,1,0,0,0,0,0,Canada
+Aero,42,1,0,0,0,0,0,0,0,Canada
+Mars,51,1,0,1,1,0,0,0,0,Both
+Payday,52,0,1,1,0,0,0,0,0,America
+Snickers,48,1,1,1,1,0,0,0,0,Both
+Crunchie,26,1,0,0,0,0,0,0,0,Canada
+Wonderbar ,58,1,1,1,0,0,0,0,0,Canada
+100 Grand ,43,1,0,1,0,1,0,0,0,America
+Take 5,43,1,1,1,0,1,0,0,0,America
+Whatchamacallits,45,1,1,0,0,1,0,0,0,America
+Almond Joy,46,1,0,0,0,0,1,0,0,America
+Oh Henry,51,1,1,1,0,0,0,0,0,Both
+Cookies and Cream,43,0,0,0,0,1,0,1,0,Both
\ No newline at end of file
diff --git a/data/candybars-text.txt b/data/candybars-text.txt
new file mode 100644
index 00000000..385f2240
--- /dev/null
+++ b/data/candybars-text.txt
@@ -0,0 +1,26 @@
+name weight chocolate peanuts caramel nougat cookie_wafer_rice coconut white_chocolate multi available_canada_america
+Coffee Crisp 50 1 0 0 0 1 0 0 0 Canada
+Butterfinger 184 1 1 1 0 0 0 0 0 America
+Skor 39 1 0 1 0 0 0 0 0 Both
+Smarties 45 1 0 0 0 0 0 0 1 Canada
+Twix 58 1 0 1 0 1 0 0 1 Both
+Reeses Peanutbutter Cups 43 1 1 0 0 0 0 0 1 Both
+3 Musketeers 54 1 0 0 1 0 0 0 0 America
+Kinder Surprise 20 1 0 0 0 0 0 1 0 Canada
+M & M 48 1 1 0 0 0 0 0 1 Both
+Glosettes 50 1 0 0 0 0 0 0 1 Canada
+KitKat 45 1 0 0 0 1 0 0 1 Both
+Babe Ruth 60 1 1 1 1 0 0 0 0 America
+Caramilk 52 1 0 1 0 0 0 0 0 Canada
+Aero 42 1 0 0 0 0 0 0 0 Canada
+Mars 51 1 0 1 1 0 0 0 0 Both
+Payday 52 0 1 1 0 0 0 0 0 America
+Snickers 48 1 1 1 1 0 0 0 0 Both
+Crunchie 26 1 0 0 0 0 0 0 0 Canada
+Wonderbar 58 1 1 1 0 0 0 0 0 Canada
+100 Grand 43 1 0 1 0 1 0 0 0 America
+Take 5 43 1 1 1 0 1 0 0 0 America
+Whatchamacallits 45 1 1 0 0 1 0 0 0 America
+Almond Joy 46 1 0 0 0 0 1 0 0 America
+Oh Henry 51 1 1 1 0 0 0 0 0 Both
+Cookies and Cream 43 0 0 0 0 1 0 1 0 Both
\ No newline at end of file
diff --git a/data/foods.xlsx b/data/foods.xlsx
new file mode 100644
index 00000000..b65540d9
Binary files /dev/null and b/data/foods.xlsx differ
diff --git a/environment.yaml b/environment.yaml
index 2485822f..dcb82d02 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -9,7 +9,6 @@ dependencies:
- scipy
- matplotlib
- jupyter
- - quarto
+ - quarto=1.6.43
- pip
-
-
+ - openpyxl
\ No newline at end of file
diff --git a/modules/module1/slides/module1_29.qmd b/modules/module1/slides/module1_29.qmd
index 0d09b3a7..1e30150b 100644
--- a/modules/module1/slides/module1_29.qmd
+++ b/modules/module1/slides/module1_29.qmd
@@ -77,27 +77,19 @@ Now we can use `.value_counts()` on this `mfr_column` variable to reference it,
---
```{python}
-mfr_col_wrong = cereal[['mfr']]
-mfr_col_wrong
+mfr_col_dataframe = cereal[['mfr']]
+mfr_col_dataframe
```
-```python
-mfr_col_wrong.value_counts()
+```{python}
+mfr_col_dataframe.value_counts()
```
-```out
-AttributeError: 'DataFrame' object has no attribute 'value_counts'
-
-Detailed traceback:
- File "", line 1, in
- File "/usr/local/lib/python3.7/site-packages/pandas/core/generic.py", line 5274, in __getattr__
- return object.__getattribute__(self, name)
-```
:::{.notes}
-If we did instead use double square brackets with `pd.value_counts()`, we would get an error. So it is important to take care and remember when you are using `value_counts()`, you only use one set of square brackets.
+`value_counts()` can also be called on a DataFrame to count unique rows.
:::
---
@@ -105,6 +97,7 @@ If we did instead use double square brackets with `pd.value_counts()`, we would
## Saving a dataframe
```{python}
+# | eval: false
mfr_freq.to_csv('data/mfr_frequency.csv', index=False)
```
diff --git a/modules/module2/module2-00-module_learning_outcomes.qmd b/modules/module2/module2-00-module_learning_outcomes.qmd
new file mode 100644
index 00000000..0d8579ad
--- /dev/null
+++ b/modules/module2/module2-00-module_learning_outcomes.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 0. Module Learning Outcomes
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-01-reading_in_different_file_types.qmd b/modules/module2/module2-01-reading_in_different_file_types.qmd
new file mode 100644
index 00000000..1f99abdd
--- /dev/null
+++ b/modules/module2/module2-01-reading_in_different_file_types.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 1. Reading in Different File Types
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-02-delimiter.qmd b/modules/module2/module2-02-delimiter.qmd
new file mode 100644
index 00000000..cbcfc5a3
--- /dev/null
+++ b/modules/module2/module2-02-delimiter.qmd
@@ -0,0 +1,259 @@
+---
+format: live-html
+---
+
+
+
+# 1.1. Exercises
+
+## Delimiter
+
+
+
+
+
+
+
+
+## Coding questions
+
+**Instructions:**
+Running a coding exercise for the first time could take a bit of time for everything to load. Be patient, it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+
+### Reading in a URL
+
+Let's try reading in some data from a URL using `pd.read_csv()`.
+
+**Tasks:**
+
+- Use `pd.read_csv()` to read in the data from [this url](https://raw.githubusercontent.com/UBC-MDS/MCL-DSCI-511-programming-in-python/master/data/pokemon.csv).
+- Save the resulting dataframe as `pokemon_df`.
+- Display the first 10 rows of the dataframe.
+
+
+```{pyodide}
+#| exercise: reading_in_a_url
+import pandas as pd
+
+url = 'https://raw.githubusercontent.com/UBC-MDS/MCL-DSCI-511-programming-in-python/master/data/pokemon.csv'
+
+# Read in the data from the URL
+____ = ____(____)
+
+# Display the first 10 rows
+____.____
+```
+
+```{pyodide}
+#| exercise: reading_in_a_url
+#| check: true
+from src.utils import print_correct_msg
+
+url = 'https://raw.githubusercontent.com/UBC-MDS/MCL-DSCI-511-programming-in-python/master/data/pokemon.csv'
+solution = pd.read_csv(url).head(10)
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert solution.shape == result.shape, "The dimensions are incorrect."
+assert sorted(list(solution.columns)) == sorted(list(result.columns)), "Your columns do not seem correct."
+print_correct_msg()
+```
+
+:::: { .hint exercise="reading_in_a_url"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as `pokemon_df`?
+- Are you using `pd.read_csv()`?
+
+:::
+::::
+
+:::: { .solution exercise="reading_in_a_url" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+url = 'https://raw.githubusercontent.com/UBC-MDS/MCL-DSCI-511-programming-in-python/master/data/pokemon.csv'
+
+# Read in the data from the URL
+pokemon_df = pd.read_csv(url)
+
+# Display the first 10 rows
+pokemon_df.head(10)
+```
+
+:::
+::::
+
+
+
+### Reading in a Text File
+
+Let's try reading in a `.txt` file.
+
+**Tasks:**
+
+- Read in the data from a text file named `pokemon-text.txt` located in the `data` folder.
+- Save the resulting dataframe as `pokemon_df`.
+- It's a good idea to see what the [delimiter](https://github.com/UBC-MDS/MCL-DSCI-511-programming-in-python/blob/binder/data/pokemon-text.txt) is.
+- Display the first 10 rows of `pokemon_df`.
+
+
+```{pyodide}
+#| exercise: reading_in_a_text_file
+import pandas as pd
+
+# Read in the data from the text file using the full pathway
+____ = ____(____, ____)
+
+# Display the first 10 rows
+____
+```
+
+```{pyodide}
+#| exercise: reading_in_a_text_file
+#| check: true
+from src.utils import print_correct_msg
+solution = pd.read_csv('data/pokemon-text.txt', delimiter=";").head(10)
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert solution.shape == result.shape, "The dimensions are incorrect."
+assert sorted(list(solution.columns)) == sorted(list(result.columns)), "Your columns do not seem correct."
+print_correct_msg()
+```
+
+:::: { .hint exercise="reading_in_a_text_file"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as the correct object names?
+- Are you using `pd.read_csv()`?
+- Did you check to see what the [delimiter](https://github.com/UBC-MDS/MCL-DSCI-511-programming-in-python/blob/binder/data/pokemon-text.txt) is?
+- Are you including the full path through the `data/` folder when calling the file name?
+- Check that your delimiter argument is correct.
+
+:::
+::::
+
+:::: { .solution exercise="reading_in_a_text_file" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+# Read in the data from the text file using the full pathway
+pokemon_df = pd.read_csv('data/pokemon-text.txt', delimiter=";")
+
+# Display the first 10 rows
+pokemon_df.head(10)
+```
+
+:::
+::::
+
+
+
+### Reading in an Excel File
+
+Let's try reading in an Excel file.
+
+**Tasks:**
+
+- Read in the data from the sheet named `pokemon` from the Excel file `pokemon.xlsx` located in the `data` folder.
+- Save the resulting dataframe as `pokemon_df`.
+- Display the first 10 rows of `pokemon_df`.
+
+```{pyodide}
+#| setup: true
+#| exercise: reading_in_an_excel_file
+import micropip
+await micropip.install('openpyxl')
+```
+
+```{pyodide}
+#| exercise: reading_in_an_excel_file
+import pandas as pd
+
+# Read in the data from the Excel file using the full pathway
+____ = ____(____, ____)
+
+# Display the first 10 rows
+____
+```
+
+```{pyodide}
+#| exercise: reading_in_an_excel_file
+#| check: true
+from src.utils import print_correct_msg
+solution = pd.read_excel('data/pokemon.xlsx', sheet_name="pokemon").head(10)
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert solution.shape == result.shape, "The dimensions are incorrect."
+assert sorted(list(solution.columns)) == sorted(list(result.columns)), "Your columns do not seem correct."
+print_correct_msg()
+```
+
+:::: { .hint exercise="reading_in_an_excel_file"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as the correct object names?
+- Are you using `pd.read_excel()`?
+- Check that you are using `sheet_name="pokemon"`.
+- Are you including the full path through the `data/` folder when calling the file name?
+
+:::
+::::
+
+:::: { .solution exercise="reading_in_an_excel_file" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+# Read in the data from the Excel file using the full pathway
+pokemon_df = pd.read_excel('data/pokemon.xlsx', sheet_name="pokemon")
+
+# Display the first 10 rows
+pokemon_df.head(10)
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-06-arguments_for_reading_data.qmd b/modules/module2/module2-06-arguments_for_reading_data.qmd
new file mode 100644
index 00000000..ffa083fd
--- /dev/null
+++ b/modules/module2/module2-06-arguments_for_reading_data.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 2. Arguments for Reading Data
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-07-name_that_argument.qmd b/modules/module2/module2-07-name_that_argument.qmd
new file mode 100644
index 00000000..668855e0
--- /dev/null
+++ b/modules/module2/module2-07-name_that_argument.qmd
@@ -0,0 +1,121 @@
+---
+format: live-html
+---
+
+
+
+# 2.1. Exercises
+
+## Name that Argument!
+
+
+
+
+
+
+
+
+## Using Arguments when Reading in Files
+
+**Instructions:**
+Running a coding exercise for the first time could take a bit of time for everything to load. Be patient, it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+Load in the data using the most suitable arguments.
+
+**Tasks:**
+
+- Read in the first 100 rows and columns `name`, `total_bs` and `type` from the file `pokemon.csv`, which is located in the data directory.
+- Save the resulting dataframe as `pokemon_sample`.
+- Display `pokemon_sample`.
+
+
+```{pyodide}
+#| exercise: using_arguments_when_reading_in_files
+import pandas as pd
+
+# Read in the data from the csv file using the full pathway
+# Save it as pokemon_sample
+# Only load in the first 100 rows and only load in columns: name, total_bs, type
+____ = ____(____,
+ ____,
+ ____)
+
+# Display the dataframe
+____
+```
+
+```{pyodide}
+#| exercise: using_arguments_when_reading_in_files
+#| check: true
+from src.utils import print_correct_msg
+
+solution = pd.read_csv('data/pokemon.csv', nrows=100, usecols=['name', 'total_bs', 'type'])
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert solution.shape == result.shape, "The dimensions are incorrect."
+assert sorted(list(solution.columns)) == sorted(list(result.columns)), "Your columns do not seem correct."
+print_correct_msg()
+```
+
+:::: { .hint exercise="using_arguments_when_reading_in_files"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as `pokemon_sample`?
+- Are you using `pd.read_csv()`?
+- Are you including the full path through the `data/` folder when calling the file name?
+- Did you use the argument `nrows=100`?
+- Are you loading in the specified column index labels?
+- Perhaps you are using `index_col=0` when it was not required?
+
+:::
+::::
+
+:::: { .solution exercise="using_arguments_when_reading_in_files" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+# Read in the data from the csv file using the full pathway
+# Save it as pokemon_sample
+# Only load in the first 100 rows and only load in columns: name, total_bs, type
+pokemon_sample = pd.read_csv('data/pokemon.csv',
+ nrows=100,
+ usecols=['name', 'total_bs', 'type'])
+
+# Display the dataframe
+pokemon_sample
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-09-column_renaming_and_dropping.qmd b/modules/module2/module2-09-column_renaming_and_dropping.qmd
new file mode 100644
index 00000000..3049a938
--- /dev/null
+++ b/modules/module2/module2-09-column_renaming_and_dropping.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 3. Column Renaming and Dropping
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-10-column_editing_questions.qmd b/modules/module2/module2-10-column_editing_questions.qmd
new file mode 100644
index 00000000..a9628ae7
--- /dev/null
+++ b/modules/module2/module2-10-column_editing_questions.qmd
@@ -0,0 +1,226 @@
+---
+format: live-html
+---
+
+
+
+# 3.1. Exercises
+
+## Column Editing Questions
+
+Here is our `fruit_salad` dataframe once again.
+
+```out
+ name colour location seed shape sweetness water-content weight
+0 apple red canada True round True 84 100
+1 banana yellow mexico False long True 75 120
+2 cantaloupe orange spain True round True 90 1360
+3 dragon-fruit magenta china True round False 96 600
+4 elderberry purple austria False round True 80 5
+5 fig purple turkey False oval False 78 40
+6 guava green mexico True oval True 83 450
+7 huckleberry blue canada True round True 73 5
+8 kiwi brown china True round True 80 76
+9 lemon yellow mexico False oval False 83 65
+```
+
+
+
+Let's say we run the following code:
+
+```python
+fruit_salad.drop(columns = ['colour', 'shape', 'sweetness'])
+fruit_salad = fruit_salad.rename(columns={'location':'country',
+ 'weight':'weight_g'})
+```
+
+Use the dataframe and code above to answer the next 2 questions.
+
+
+
+
+
+
+
+
+## Coding questions
+
+**Instructions:**
+Running a coding exercise for the first time, could take a bit of time for everything to load. Be patient, it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+
+
+### Renaming a Column Index
+
+Let's rename one of the columns in our `pokemon.csv` data.
+
+**Tasks:**
+
+- Rename the column `sp_attack` to `special_a` and `sp_defense` to `special_d` using `.rename()` only once.
+- Save the new dataframe as `pokemon_special`.
+- Display the first 5 rows of the dataframe.
+
+```{pyodide}
+#| exercise: renaming_a_column_index
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Rename the column sp_attack to special_a and
+# sp_defense to special_d using df.rename() once
+# Save the new dataframe as pokemon_special
+____ = ____
+
+# Display the first 5 rows of the dataframe
+____
+```
+
+```{pyodide}
+#| exercise: renaming_a_column_index
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+solution = pokemon.rename(columns={'sp_attack':'special_a', 'sp_defense':'special_d'}).head()
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert not {"sp_attack"}.issubset(set(result.columns)), "Are you changing 'sp_attack' to 'special_a' using ':'?"
+assert not {"sp_defense"}.issubset(set(result.columns)), "Are you changing 'sp_defense' to 'special_d' using ':'?"
+assert sorted(list(solution.columns)) == sorted(list(result.columns)), "Your columns do not seem correct."
+print_correct_msg()
+```
+
+:::: { .hint exercise="renaming_a_column_index"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you using `pokemon.rename()`?
+- Are you saving the new dataframe as the correct name?
+- Are you using the argument `columns={'sp_attack':'special_a', 'sp_defense':'special_d'}`?
+
+:::
+::::
+
+:::: { .solution exercise="renaming_a_column_index" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Rename the column sp_attack to special_a and
+# sp_defense to special_d using df.rename() once
+# Save the new dataframe as pokemon_special
+pokemon_special = pokemon.rename(columns={'sp_attack':'special_a',
+ 'sp_defense':'special_d'})
+
+# Display the first 5 rows of the dataframe
+pokemon_special.head()
+```
+
+:::
+::::
+
+
+
+### Dropping Columns in a Dataframe
+
+Some of the columns in `pokemon.csv` we have deemed not useful. Let's get rid of them!
+
+**Tasks:**
+
+- Drop the columns `deck_no`, `capture_rt`, and `legendary`.
+- Make sure to overwrite the new dataframe to object `pokemon`.
+- Display the first 5 rows of the dataframe.
+
+```{pyodide}
+#| exercise: dropping_columns_in_a_dataframe
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Drop the columns deck_no, capture_rt, and legendary
+# Make sure to overwrite the new dataframe to object pokemon
+____
+
+# Display the first 5 rows of the dataframe
+____
+```
+
+```{pyodide}
+#| exercise: dropping_columns_in_a_dataframe
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+solution = pokemon.drop(columns=['deck_no', 'capture_rt', 'legendary']).head()
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert not {'deck_no'}.issubset(set(result.columns)) , "Have you dropped the 'deck_no' column?"
+assert not {"capture_rt"}.issubset(set(result.columns)) , "Have you dropped the 'capture_rt' column?"
+assert not {"legendary"}.issubset(set(result.columns)) , "Have you dropped the 'legendary' column?"
+assert sorted(list(solution.columns)) == sorted(list(result.columns)), "Your columns do not seem correct."
+print_correct_msg()
+```
+
+:::: { .hint exercise="dropping_columns_in_a_dataframe"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you using `pokemon.drop()`?
+- Are you overwriting the new dataframe to object `pokemon`?
+- Are you using square brackets in the argument `columns`?
+
+:::
+::::
+
+:::: { .solution exercise="dropping_columns_in_a_dataframe" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Drop the columns deck_no, capture_rt, and legendary
+# Make sure to overwrite the new dataframe to object pokemon
+pokemon = pokemon.drop(columns=['deck_no', 'capture_rt', 'legendary'])
+
+# Display the first 5 rows of the dataframe
+pokemon.head()
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-13-column_arithmetic_and_creation.qmd b/modules/module2/module2-13-column_arithmetic_and_creation.qmd
new file mode 100644
index 00000000..334ff24d
--- /dev/null
+++ b/modules/module2/module2-13-column_arithmetic_and_creation.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 4. Column Arithmetic and Creation
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-14-column_arithmetic_questions.qmd b/modules/module2/module2-14-column_arithmetic_questions.qmd
new file mode 100644
index 00000000..a0a32e65
--- /dev/null
+++ b/modules/module2/module2-14-column_arithmetic_questions.qmd
@@ -0,0 +1,126 @@
+---
+format: live-html
+---
+
+
+
+# 4.1. Exercises
+
+## Column Arithmetic Questions
+
+**Question 1**
+
+What is the result if we multiply 2 columns together using the syntax
+
+```
+df[['Column_A']] * df[['Column_B']]
+```
+
+
+
+
+
+
+
+
+## Creating a New Column
+
+**Instructions:**
+Running a coding exercise for the first time, could take a bit of time for everything to load. Be patient, it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+For this exercise, we are going to create and drop some columns from our dataframe.
+
+**Tasks:**
+
+- Create a new column named `total_special` that is the sum of column `sp_attack` and `sp_defense`.
+- Save it, overwriting the dataframe named `pokemon`.
+- Display the first 5 rows of the dataframe.
+
+```{pyodide}
+#| exercise: creating_a_new_column
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Create a new column named total_special
+# that is the sum of column sp_attack and sp_defense
+# Save it, overwriting the dataframe named pokemon
+____
+
+# Display the first 5 rows of the dataframe
+____
+```
+
+```{pyodide}
+#| exercise: creating_a_new_column
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+solution = pokemon.assign(total_special = pokemon['sp_attack'] + pokemon['sp_defense']).head()
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert 'total_special' in list(result.columns) ,"Are you naming the new column 'total_special'?"
+assert sum(solution.total_special) == sum(result.total_special), "Values in the column 'total_special' is wrong. Are you adding 'sp_attack' and 'sp_defense'?"
+print_correct_msg()
+```
+
+:::: { .hint exercise="creating_a_new_column"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you using `pokemon.assign()`?
+- Are you saving the new dataframes as the correct names?
+- For the new column does `total_special = pokemon['sp_attack'] + pokemon['sp_defense']`?
+
+:::
+::::
+
+:::: { .solution exercise="creating_a_new_column" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Create a new column named total_special
+# that is the sum of column sp_attack and sp_defense
+# Save it, overwriting the dataframe named pokemon
+pokemon = pokemon.assign(total_special = pokemon['sp_attack'] + pokemon['sp_defense'])
+
+# Display the first 5 rows of the dataframe
+pokemon.head()
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-16-data_filtering.qmd b/modules/module2/module2-16-data_filtering.qmd
new file mode 100644
index 00000000..facdd988
--- /dev/null
+++ b/modules/module2/module2-16-data_filtering.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 5. Data Filtering
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-17-filtering_question.qmd b/modules/module2/module2-17-filtering_question.qmd
new file mode 100644
index 00000000..300ee2ba
--- /dev/null
+++ b/modules/module2/module2-17-filtering_question.qmd
@@ -0,0 +1,229 @@
+---
+format: live-html
+---
+
+
+
+# 5.1. Exercises
+
+## Filtering Question
+
+**Question 1**
+
+If the output of
+
+```python
+df['location'] == 'Canada'
+```
+
+is
+
+```out
+[True, False, False, True]
+```
+
+
+
+What would be the output of
+
+```python
+~(df['location'] == 'Canada')
+```
+
+
+
+
+
+## Coding questions
+
+**Instructions:**
+Running a coding exercise for the first time, could take a bit of time for everything to load. Be patient, it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+
+### Single Condition Filtering
+
+Try to filter the dataframe to obtain only a certain Pokemon type using single condition filtering.
+
+**Tasks:**
+
+- Create a new dataframe named `fire_pokemon` containing only the rows of `type` "fire".
+
+```{pyodide}
+#| exercise: single_condition_filtering
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Create a new dataframe named fire_pokemon containing only the rows of type "fire"
+____ = ____
+
+fire_pokemon
+```
+
+```{pyodide}
+#| exercise: single_condition_filtering
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+solution = pokemon[pokemon['type'] == 'fire']
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert solution.shape == result.shape, "Your dataframe dimensions are incorrect. Are you selecting only the fire pokemons?"
+assert set(list(result.type)) == {'fire'} , "Are you selecting only fire pokemons?"
+print_correct_msg()
+```
+
+:::: { .hint exercise="single_condition_filtering"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as the correct object names?
+- Are you using `pokemon['type'] == 'fire'` as your condition?
+
+:::
+::::
+
+:::: { .solution exercise="single_condition_filtering" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Create a new dataframe named fire_pokemon containing only the rows of type "fire"
+fire_pokemon = pokemon[pokemon['type'] == 'fire']
+
+fire_pokemon
+```
+
+:::
+::::
+
+
+
+
+
+
+
+
+### Filtering using "and"
+
+Let's find all the pokemon that meet multiple conditions.
+
+**Tasks:**
+
+- Filter the dataframe for the pokemon that have `attack` and `defense` values both greater than 100.
+- Save this dataframe as an object named `mighty_pokemon`.
+
+```{pyodide}
+#| exercise: filtering_using_and
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Filter the dataframe for the pokemon that have attack and
+# defense values both greater than 100
+# Save this dataframe as an object named mighty_pokemon
+____ = ____
+
+mighty_pokemon
+```
+
+```{pyodide}
+#| exercise: filtering_using_and
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+solution = pokemon[(pokemon['defense'] > 100) & (pokemon['attack'] > 100)]
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert min(list(solution.defense)) == min(list(result.defense)), "Are you selecting pokemons with attack and defense > 100?"
+assert solution.defense.sum() == result.defense.sum() , "Some values in the 'defense' column are wrong. \n Are you selecting pokemons with defense > 100?"
+assert max(list(solution.attack)) == max(list(result.attack)), "Are you selecting pokemons with attack and defense > 100?"
+assert solution.attack.sum() == result.attack.sum() , "Some values in the 'attack' column are wrong. \n Are you selecting pokemons with attack > 100?"
+print_correct_msg()
+```
+
+:::: { .hint exercise="filtering_using_and"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as the correct object names?
+- Are you separating your conditions with brackets?
+- Are you using the symbol` & ` to get the intersect?
+- Are you using `pokemon['defense'] > 100` and `pokemon['attack'] > 100` as your conditions?
+
+:::
+::::
+
+:::: { .solution exercise="filtering_using_and" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Filter the dataframe for the pokemon that have attack and
+# defense values both greater than 100
+# Save this dataframe as an object named mighty_pokemon
+mighty_pokemon = pokemon[(pokemon['defense'] > 100) & (pokemon['attack'] > 100)]
+
+mighty_pokemon
+```
+
+:::
+::::
+
+
+
+
+
diff --git a/modules/module2/module2-20-conditional_value_replacement.qmd b/modules/module2/module2-20-conditional_value_replacement.qmd
new file mode 100644
index 00000000..fa580252
--- /dev/null
+++ b/modules/module2/module2-20-conditional_value_replacement.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 6. Conditional Value Replacement
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-21-practice_replacing_values.qmd b/modules/module2/module2-21-practice_replacing_values.qmd
new file mode 100644
index 00000000..787a4df3
--- /dev/null
+++ b/modules/module2/module2-21-practice_replacing_values.qmd
@@ -0,0 +1,167 @@
+---
+format: live-html
+---
+
+
+
+# 6.1. Exercises
+
+## Practice Replacing Values
+
+**Instructions:**
+Running a coding exercise for the first time could take a bit of time for everything to load. Be patient; it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+Let's make a new column by classifying each pokemon's base score as either "strong" or "weak".
+
+**Tasks:**
+
+- Create a new column in the dataframe named `base_score` by assigning values 500 or greater from the column `total_bs` as 'strong' pokemon and values less than 500 as 'weak' pokemon.
+- Display the first 10 rows of the dataframe.
+
+```{pyodide}
+#| exercise: practice_replacing_values
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Create a new column in the dataframe Name the column base_score,
+# by assigning values 500 or greater from the column total_bs
+# as 'strong' pokemon and values less than 500 as 'weak' pokemon
+____
+____
+
+# Display the first 10 rows of the dataframe
+____
+```
+
+```{pyodide}
+#| exercise: practice_replacing_values
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon.loc[pokemon['total_bs'] >= 500, 'base_score'] = 'strong'
+pokemon.loc[pokemon['total_bs'] < 500, 'base_score'] = 'weak'
+solution = pokemon.head(10)
+
+assert isinstance(result, pd.DataFrame), "Your result should be a dataframe."
+assert "base_score" in list(result.columns), "Are you creating a 'base_score' column?"
+assert list(solution.base_score).count("strong") == list(result.base_score).count("strong"), "The number of strong pokemons is incorrect. \nAre you selecting pokemons with total_bs >= 500?"
+assert list(solution.base_score).count("weak") == list(result.base_score).count("weak"), "The number of weak pokemons is incorrect. \nAre you selecting pokemons with total_bs < 500?"
+print_correct_msg()
+```
+
+:::: { .hint exercise="practice_replacing_values"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you naming the new column `base_score`?
+- Are you using `.loc[df['total_bs'] >= 500, 'base_score']` and assigning it to the correct value?
+- Are you using single equality signs for the assignment?
+
+:::
+::::
+
+:::: { .solution exercise="practice_replacing_values" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# Create a new column in the dataframe Name the column base_score,
+# by assigning values 500 or greater from the column total_bs
+# as 'strong' pokemon and values less than 500 as 'weak' pokemon
+pokemon.loc[pokemon['total_bs'] >= 500, 'base_score'] = 'strong'
+pokemon.loc[pokemon['total_bs'] < 500, 'base_score'] = 'weak'
+
+# Display the first 10 rows of the dataframe
+pokemon.head(10)
+```
+
+:::
+::::
+
+
+
+Using the new column `base_score` we made above, make a bar graph showing the frequency of the `strong` and `weak` pokemon.
+
+**Tasks:**
+
+- Create an object using single brackets to obtain the column `base_score` and name it `bs_column`.
+- Plot the object `bs_column` using `.mark_bar()` and save this graph as `score_plot`.
+
+```{pyodide}
+#| exercise: practice_replacing_values_b
+import pandas as pd
+import altair as alt
+
+pokemon = pd.read_csv('data/pokemon_sw.csv')
+
+# Create an object using single brackets to obtain the column base_score and name it bs_column
+____ = pd.DataFrame(____['____'])
+
+# Plot the object bs_column using .mark_bar() and save this graph as score_plot
+____ = alt.____(____, width=500, height=300).____().____(
+ x='____',
+ y='____()')
+
+score_plot
+```
+
+```{pyodide}
+#| exercise: practice_replacing_values_b
+#| check: true
+from src.utils import assert_chart_equal
+
+pokemon = pd.read_csv('data/pokemon_sw.csv')
+solution = alt.Chart(pd.DataFrame(pokemon['base_score']), width=500, height=300).mark_bar().encode(
+ x='base_score',
+ y='count()')
+
+assert isinstance(result, type(solution)), "The final check needs a chart."
+assert_chart_equal(solution, result)
+```
+
+:::: { .hint exercise="practice_replacing_values_b"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you using single square brackets to obtain the column `base_score`?
+- Are you using `count()` to count the occurrences of the base scores?
+- Are you saving the objects with the correct names?
+
+:::
+::::
+
+:::: { .solution exercise="practice_replacing_values_b" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+import altair as alt
+
+pokemon = pd.read_csv('data/pokemon_sw.csv')
+
+# Create an object using single brackets to obtain the column base_score and name it bs_column
+bs_column = pd.DataFrame(pokemon['base_score'])
+
+# Plot the object bs_column using .mark_bar() and save this graph as score_plot
+score_plot = alt.Chart(bs_column, width=500, height=300).mark_bar().encode(
+ x='base_score',
+ y='count()')
+
+score_plot
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-22-chaining_notation.qmd b/modules/module2/module2-22-chaining_notation.qmd
new file mode 100644
index 00000000..ad3a4d4f
--- /dev/null
+++ b/modules/module2/module2-22-chaining_notation.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 7. Chaining Notation
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-23-chaining_true_false.qmd b/modules/module2/module2-23-chaining_true_false.qmd
new file mode 100644
index 00000000..21c3d00b
--- /dev/null
+++ b/modules/module2/module2-23-chaining_true_false.qmd
@@ -0,0 +1,138 @@
+---
+format: live-html
+---
+
+
+
+# 7.1. Exercises
+
+## Chaining True/False
+
+
+
+
+
+
+
+
+## Practice Chaining
+
+**Instructions:**
+Running a coding exercise for the first time could take a bit of time for everything to load. Be patient; it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+Make a plot using our Pokemon dataset by chaining actions sequentially.
+
+**Tasks:**
+
+- Chain the following methods in the order specified.
+- First, rename the column `capture_rt` to `capture_rate`.
+- Then, create a new column named `AD_total` by adding the `attack` and `defense` columns from the pokemon dataset.
+- Save this in a dataframe object called `plot_df`.
+- Finally use `.mark_circle()` to plot `AD_total` on the x-axis and `capture_rate` on the y-axis.
+- Name this plot `pokemon_plot`.
+- Use a new line for each method.
+
+```{pyodide}
+#| exercise: practice_chaining
+import pandas as pd
+import altair as alt
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# First, rename the column capture_rt to capture_rate.
+# Then, create a new column named AD_total by adding
+# the attack and defense columns from the pokemon dataset.
+# Save this in a dataframe object called plot_df.
+____ = pd.DataFrame(____.____(columns={'____': '____'})
+ .____(____=pokemon['____'] + pokemon['____'])
+ )
+
+# Use .mark_circle() to plot AD_total on the x-axis and capture_rate on the y-axis
+# Name the plot pokemon_plot
+pokemon_plot = alt.Chart(____, width=500, height=300).____().____(
+ x='____',
+ y='____')
+
+pokemon_plot
+```
+
+```{pyodide}
+#| exercise: practice_chaining
+#| check: true
+from src.utils import assert_chart_equal
+
+pokemon = pd.read_csv('data/pokemon.csv')
+plot_df = pd.DataFrame(pokemon.rename(columns={'capture_rt': 'capture_rate'}).assign(AD_total=pokemon['defense'] + pokemon['attack']))
+solution = alt.Chart(plot_df, width=500, height=300).mark_circle().encode(x='AD_total', y='capture_rate')
+
+assert isinstance(result, type(solution)), "The final check needs a chart."
+assert_chart_equal(solution, result)
+```
+
+:::: { .hint exercise="practice_chaining"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you sure you are saving your dataframe as the correct object names?
+- Are you using the chaining commands in the correct order?
+- Are you using the `.mark_circle()` function?
+
+:::
+::::
+
+:::: { .solution exercise="practice_chaining" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+import altair as alt
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+# First, rename the column capture_rt to capture_rate.
+# Then, create a new column named AD_total by adding
+# the attack and defense columns from the pokemon dataset.
+# Save this in a dataframe object called plot_df.
+plot_df = pd.DataFrame(pokemon.rename(columns={'capture_rt': 'capture_rate'})
+ .assign(AD_total=pokemon['defense'] + pokemon['attack'])
+ )
+
+# Use .mark_circle() to plot AD_total on the x-axis and capture_rate on the y-axis
+# Name the plot pokemon_plot
+pokemon_plot = alt.Chart(plot_df, width=500, height=300).mark_circle().encode(
+ x='AD_total',
+ y='capture_rate')
+
+pokemon_plot
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-25-grouping_and_aggregating.qmd b/modules/module2/module2-25-grouping_and_aggregating.qmd
new file mode 100644
index 00000000..80a09d79
--- /dev/null
+++ b/modules/module2/module2-25-grouping_and_aggregating.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 8. Grouping and Aggregating
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-26-fruit_salad_grouping_and_aggregating.qmd b/modules/module2/module2-26-fruit_salad_grouping_and_aggregating.qmd
new file mode 100644
index 00000000..9a66b68a
--- /dev/null
+++ b/modules/module2/module2-26-fruit_salad_grouping_and_aggregating.qmd
@@ -0,0 +1,240 @@
+---
+format: live-html
+---
+
+
+
+# 8.1. Exercises
+
+## Fruit Salad Grouping and Aggregating
+
+Remember the fruit salad dataframe named `fruit_salad`? Refer to it for the next two questions.
+
+```out
+ name colour location seed shape sweetness water-content weight
+0 apple red canada True round True 84 100
+1 banana yellow mexico False long True 75 120
+2 cantaloupe orange spain True round True 90 1360
+3 dragon-fruit magenta china True round False 96 600
+4 elderberry purple austria False round True 80 5
+5 fig purple turkey False oval False 78 40
+6 guava green mexico True oval True 83 450
+7 huckleberry blue canada True round True 73 5
+8 kiwi brown china True round True 80 76
+9
+```
+
+
+
+
+
+Consider this output made from the `fruit_salad` dataframe:
+
+{fig-align="center" fig-alt="404 image"}
+
+
+
+
+
+## Coding questions
+
+**Instructions:**
+Running a coding exercise for the first time could take a bit of time for everything to load. Be patient; it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+_**Make sure you remove the hash (`#`) symbol in the coding portions of this question. We have commented them so that the line won't execute and you can test your code after each step.**_
+
+### Practice Grouping
+
+Find the mean value of each column for every Pokemon type using `.mean()` and `.groupby()`, then extract the mean speed.
+
+**Tasks:**
+
+- Make a groupby object on the column `type`.
+- Find the mean value of each column for each pokemon `type` using `.mean()` and save the resulting dataframe as `type_means`.
+- Obtain the mean speed of each pokemon type from the dataframe `type_means` by using `.loc[]`.
+- Save it in an object named `mean_speed`.
+- Display it.
+
+```{pyodide}
+#| exercise: practice_grouping
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon = pokemon.loc[ : , 'attack': 'type']
+
+# Make a groupby object on the column type
+# Find the mean value of each column for each pokemon type using .mean()
+# Save the resulting dataframe as type_means
+____ = ____.____(____).____()
+____
+
+# Obtain the mean speed of each pokemon type from the dataframe
+# type_means by using .loc[]
+# Save it in an object named mean_speed
+# ____ = ____.____[____]
+
+# Display it
+# ____
+```
+
+```{pyodide}
+#| exercise: practice_grouping
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon = pokemon.loc[ : , 'attack': 'type']
+solution = pokemon.groupby(by='type').mean().loc[:, 'speed']
+
+assert isinstance(result, type(solution)), "Are you displaying mean_speed?"
+assert round(solution.values.mean()) == round(result.values.mean()), "The average speed values are incorrect. Are you taking the mean after grouping by type?"
+assert round(max(solution.values)) == round(max(result.values)), "The maximum average speed is incorrect. Are you taking the mean after grouping by type?"
+print_correct_msg()
+```
+
+:::: { .hint exercise="practice_grouping"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you grouping by the column named `type`?
+- Are you using `.mean()` on the grouped dataframe?
+- Are you naming the mean speed objects correctly?
+- Are you obtaining the mean values using `.loc[]`?
+
+:::
+::::
+
+:::: { .solution exercise="practice_grouping" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon = pokemon.loc[ : , 'attack': 'type']
+
+# Make a groupby object on the column type
+# Find the mean value of each column for each pokemon type using .mean()
+# Save the resulting dataframe as type_means
+type_means = pokemon.groupby(by='type').mean()
+type_means
+
+# Obtain the mean speed of each pokemon type from the dataframe
+# type_means by using .loc[]
+# Save it in an object named mean_speed
+mean_speed = type_means.loc[:, 'speed']
+
+# Display it
+mean_speed
+```
+
+:::
+::::
+
+
+
+### Practice Aggregating
+
+Let's practice using `.agg()`
+
+**Tasks:**
+
+- Make a groupby object on the column `legendary`.
+- Find the maximum and minimum value of each column for each legendary groups using `.agg()` and save the resulting dataframe as `legendary_stats`.
+- Display it.
+
+
+```{pyodide}
+#| exercise: practice_aggregating
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon = pokemon.loc[ : , ['attack', 'defense', 'capture_rt', 'total_bs', 'legendary']]
+
+# Make a groupby object on the column legendary
+# Find the maximum and minimum value of each column for each legendary groups using
+# .agg() and save the resulting dataframe as legendary_stats
+____ = ____.____(____).____(____)
+
+# Display it
+____
+```
+
+```{pyodide}
+#| exercise: practice_aggregating
+#| check: true
+from src.utils import print_correct_msg
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon = pokemon.loc[ : , ['attack', 'defense', 'capture_rt', 'total_bs', 'legendary']]
+solution = pokemon.groupby(by='legendary').agg(['max', 'min'])
+
+assert isinstance(result, type(solution)), "Are you using groupby?"
+assert sum(max(solution.values.tolist())) == sum(max(result.values.tolist())), "Your maximum values are incorrect. Are you aggregating by 'min' and 'max'?"
+assert sum(min(solution.values.tolist())) == sum(min(result.values.tolist())), "Your minimum values are incorrect. Are you aggregating by 'min' and 'max'?"
+print_correct_msg()
+```
+
+:::: { .hint exercise="practice_aggregating"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you grouping by the column named `legendary`?
+- Are you using `.agg()` on the grouped dataframe and saving the result as `legendary_stats`?
+- Are you naming the objects correctly?
+
+:::
+::::
+
+:::: { .solution exercise="practice_aggregating" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon = pokemon.loc[ : , ['attack', 'defense', 'capture_rt', 'total_bs', 'legendary']]
+
+# Make a groupby object on the column legendary
+# Find the maximum and minimum value of each column for each legendary groups using
+# .agg() and save the resulting dataframe as legendary_stats
+legendary_stats = pokemon.groupby(by='legendary').agg(['max', 'min'])
+
+# Display it
+legendary_stats
+```
+
+:::
+::::
\ No newline at end of file
diff --git a/modules/module2/module2-29-plotting_with_altair.qmd b/modules/module2/module2-29-plotting_with_altair.qmd
new file mode 100644
index 00000000..db44f646
--- /dev/null
+++ b/modules/module2/module2-29-plotting_with_altair.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 9. Plotting with Altair
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/module2-30-plotting_a_groupby_object.qmd b/modules/module2/module2-30-plotting_a_groupby_object.qmd
new file mode 100644
index 00000000..2c319237
--- /dev/null
+++ b/modules/module2/module2-30-plotting_a_groupby_object.qmd
@@ -0,0 +1,126 @@
+---
+format: live-html
+---
+
+
+
+# 9.1. Exercises
+
+## Plotting a Groupby Object
+
+**Instructions:**
+Running a coding exercise for the first time could take a bit of time for everything to load. Be patient; it could take a few minutes.
+
+**When you see `____` in a coding exercise, replace it with what you assume to be the correct code. Run it and see if you obtain the desired output. Submit your code to validate if you were correct.**
+
+Let's attempt to answer the question ***"Which pokemon type has the highest mean attack score?"*** by making a bar chart from a groupby object.
+
+**Tasks:**
+
+Create a plot by chaining the following actions.
+
+- Make a groupby object on the column `type` and name it pokemon_type.
+- Use `.mean()` on the new groupby object.
+- Use `reset_index()` so `type` is no longer the index and becomes a regular column again.
+- Sort the pokemon mean attack values in descending order using the `sort` argument.
+- Name the y-axis "Mean attack scores".
+- Name the object `attack_plot`.
+
+```{pyodide}
+#| exercise: practice_replacing_values
+import pandas as pd
+import altair as alt
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+____ = pd.DataFrame(____.____('____').____(numeric_only=True).____[:, '____'])
+
+____ = ____.____()
+
+# ____ = alt.Chart(____, width=500,
+# height=300).____().____(x=alt.X('____',sort='____',
+# title='____'), y=alt.Y('____',
+# title='____'
+# )).____(title='____')
+
+# attack_plot
+```
+
+```{pyodide}
+#| exercise: practice_replacing_values
+#| check: true
+from src.utils import assert_chart_equal, remove_keys_inplace
+
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon_type = pd.DataFrame(pokemon.groupby('type').mean(numeric_only=True).loc[:, 'attack']).reset_index()
+solution = alt.Chart(pokemon_type, width=500,
+ height=300).mark_bar().encode(x=alt.X('type:N', sort='-y',
+ title='Pokemon type'), y=alt.Y('attack:Q',
+ title='Mean attack score'
+ )).properties(title='Mean attack value among Pokemon types')
+
+assert isinstance(result,type(solution)), "The final check needs a chart."
+remove_keys_inplace(solution, "title")
+remove_keys_inplace(result, "title")
+assert_chart_equal(solution, result)
+```
+
+:::: { .hint exercise="practice_replacing_values"}
+::: { .callout-note collapse="false"}
+
+## Hint 1
+
+- Are you grouping by the column named `type`?
+- Are you using `.loc[:, 'attack']`?
+- While sorting, are you using the argument `ascending=False`?
+- Are you resetting the index?
+- Are you giving your plot a title?
+
+:::
+::::
+
+:::: { .solution exercise="practice_replacing_values" }
+::: { .callout-tip collapse="false"}
+
+## Fully worked solution:
+
+```{pyodide}
+import pandas as pd
+import altair as alt
+
+pokemon = pd.read_csv('data/pokemon.csv')
+
+pokemon_type = pd.DataFrame(pokemon.groupby('type').mean(numeric_only=True).loc[:, 'attack'])
+
+pokemon_type = pokemon_type.reset_index()
+
+attack_plot = alt.Chart(pokemon_type, width=500,
+ height=300).mark_bar().encode(x=alt.X('type:N', sort='-y',
+ title='Pokemon type'), y=alt.Y('attack:Q',
+ title='Mean attack score'
+ )).properties(title='Mean attack value among Pokemon types')
+
+attack_plot
+```
+
+:::
+::::
+
+
+
+
+
diff --git a/modules/module2/module2-31-what_did_we_just_learn.qmd b/modules/module2/module2-31-what_did_we_just_learn.qmd
new file mode 100644
index 00000000..6f06b6f5
--- /dev/null
+++ b/modules/module2/module2-31-what_did_we_just_learn.qmd
@@ -0,0 +1,29 @@
+---
+format:
+ html:
+ page-layout: full
+---
+
+# 10. What Did We Just Learn?
+
+::: {.panel-tabset .nav-pills}
+
+## Video
+
+
+
+## Slides
+
+
+
+:::
diff --git a/modules/module2/slides/module2_00.qmd b/modules/module2/slides/module2_00.qmd
new file mode 100644
index 00000000..1c4f9207
--- /dev/null
+++ b/modules/module2/slides/module2_00.qmd
@@ -0,0 +1,26 @@
+---
+format: revealjs
+title: Module Learning Outcomes
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+## Module Learning Outcomes
+
+By the end of the module, students are expected to:
+
+- Demonstrate how to rename columns of a dataframe using [`.rename()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html).
+- Create new columns or modify existing ones in a dataframe using [`.assign()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.assign.html) notation.
+- Drop columns in a dataframe using [`.drop()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)
+- Use `df[]` notation to filter rows of a dataframe.
+- Calculate summary statistics on grouped objects using [`.groupby()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html) and [`.agg()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html).
+- Explain when chaining is appropriate.
+- Demonstrate chaining over multiple lines and verbs.
+
+
+# Let's Start!
diff --git a/modules/module2/slides/module2_01.qmd b/modules/module2/slides/module2_01.qmd
new file mode 100644
index 00000000..41075d2c
--- /dev/null
+++ b/modules/module2/slides/module2_01.qmd
@@ -0,0 +1,181 @@
+---
+format: revealjs
+title: Reading in Different File Types
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+## Reading in Different File Types
+
+`pandas` facilitates the loading of data from many different file types including:
+
+- A URL
+- A `txt` file
+- An `xlsx` file
+
+:::{.notes}
+In the last module, we learned how to read in a `csv` file but loading in data is not restricted to this file type.
+
+There are several types such as:
+
+- A URL: If the data is stored publicly on a webpage, pandas can read it directly in from the page address.
+- A `txt` file: We saw what a plain text file looked like in the last module and it is generally a simple manner of storing data.
+- An `xlsx` file: This is a Microsoft Excel spreadsheet. This is different than a regular `csv` file as an Excel file can contain many different sheets and can be formatted uniquely and specifically for an individual’s needs.
+
+Of course, there are many other file types but we will focus on these for this course.
+:::
+
+---
+
+## Reading from a URL
+
+https://raw.githubusercontent.com/UBC-MDS/MCL-DSCI-011-programming-in-python/master/data/candybars.csv.
+
+
+{fig-alt="404 image" width="50%" fig-align="center"}
+
+
+
+```{python}
+candybars = pd.read_csv('https://raw.githubusercontent.com/UBC-MDS/MCL-DSCI-511-programming-in-python/master/data/candybars.csv')
+candybars.head()
+```
+
+
+:::{.notes}
+If the data is accessible publicly on a website, you can read in data directly from the webpage it is stored on.
+For example, this code and all the files that make up this course are all openly available and can be viewed online.
+
+The `candybar.csv` file that we used in the last module, is stored at this URL.
+
+You can see that it looks like a plain text file with each line being a row and each column value separated with a comma.
+
+The code required to read in this URL looks like this.
+
+It uses the same `pd.read_csv()` function we saw when reading in csv files locally.
+:::
+
+---
+
+## Reading in a Text File
+
+```{python}
+candybars = pd.read_csv('data/candybars-text.txt')
+candybars.head()
+```
+
+
+
+```{python}
+candybars = pd.read_csv('data/candybars-text.txt', delimiter='\t')
+candybars.head()
+```
+
+:::{.notes}
+Reading in `txt` files can be a little less standard.
+
+Sometimes the character separating column values are not always commas like we saw before.
+
+There are many different options and when we read in the data, we need to specify how the data should be recognized.
+
+Let's load in the `candybars-text.txt` file.
+
+This is the same as the `candybars.csv` data but saved as a `txt` file.
+
+Look what happens when we load it in using the same syntax we are used to.
+
+This is not ideal.
+
+What you should notice is instead of each column value being separated by a comma, it is now separated by `\t`.
+
+This is called the **delimiter**.
+
+In this specific case, a `\t` delimiter is a "tab".
+
+We need to tell `pd.read_csv()` to separate each value on our delimiter `\t`.
+
+That's much better.
+
+The delimiter won't always be `\t` for `txt` files. The most common delimiters are `;`, `,`, `\t`, and sometimes even just spaces.
+:::
+
+---
+
+## Reading in an Excel File (`xlsx`)
+
+```{python}
+candybars = pd.read_excel('data/foods.xlsx', sheet_name='chocolate')
+candybars
+```
+
+:::{.notes}
+Excel files need special attention because they give the user the capability of additional formatting including saving multiple dataframes on different "sheets" within a single file.
+
+If this is the case, we need to specify which sheet we want.
+
+Since this is a new type of animal, we also need a new verb. Enter `read_excel()`.
+
+Our candybars dataframe is now saved as an excel spreadsheet named `foods.xlsx` on a sheet named `chocolate`.
+
+Here is how we would read it in.
+:::
+
+---
+
+## Reading in Data from a Different File
+
+{fig-alt="404 image" width="70%" fig-align="center"}
+
+This translates to the syntax `data/canucks.csv`.
+
+
+:::{.notes}
+Something you have seen in Module 1’s exercises is that when reading in the data there is always a `data/` before the file name.
+
+This is because we are running the current code in a file that is located in a different folder than the data.
+
+The `data` is specifying a folder in our current directory (folder).
+
+We need to specify the path to the `csv` file through the subdirectory.
+
+This translates to the syntax `data/canucks.csv`.
+:::
+
+---
+
+
+_*Example:*_
+
+`data/module3/question2/candybars.csv`
+
+
+
+
+{fig-alt="404 image" width="80%" fig-align="center"}
+
+
+:::{.notes}
+This syntax is not restricted to a single subdirectory and could even have multiple folders between the current location and the final file name.
+:::
+
+---
+
+
+{fig-alt="404 image" width="80%" fig-align="center"}
+
+
+:::{.notes}
+You can see the whole course structure and its subdirectories openly online.
+
+In this course, we save all our data in a folder called `data` so when asked to read in data, take care in future exercises to add the full path to the required file.
+
+It may be a good idea to look in the data folder to see exactly where the data you are loading in the exercises is coming from.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_06.qmd b/modules/module2/slides/module2_06.qmd
new file mode 100644
index 00000000..91fe8e3f
--- /dev/null
+++ b/modules/module2/slides/module2_06.qmd
@@ -0,0 +1,181 @@
+---
+format: revealjs
+title: Reading arguments
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+## Arguments
+
+Here, we are going to introduce different arguments for `pd.read_csv()` and `pd.read_excel()`:
+
+- `index_col`
+- `header`
+- `nrows`
+- `usecols`
+
+If you wish to know more, you can find the documentation at the following links:
+
+- [pd.read_csv()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
+- [pd.read_excel()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html)
+
+
+:::{.notes}
+When we load in our data we use different arguments to make sure it's organized how we want it.
+
+`delimiter` is an argument we have already discussed that instructs on how to separate each value in the data.
+
+This is only the tip of the iceberg.
+
+There are many others that are helpful when reading in our data, such as `index_col`, `header`, `nrows`, and `usecols`.
+:::
+
+---
+
+## index_col
+
+```{python}
+df = pd.read_csv('data/cereal.csv', index_col="name")
+df.head(3)
+```
+
+
+
+```{python}
+df = pd.read_csv('data/cereal.csv', index_col=0)
+df.head(3)
+```
+
+
+:::{.notes}
+`index_col` is an argument that indicates which column will be acting as the index label.
+
+In most of the cases we have encountered, we did not use this argument and instead relied on the pandas default, which is to use ascending integers for the index.
+
+We can, however, specify a column in the data to become the index.
+
+It's in our best interest that the column we choose have unique values.
+
+For our `cereal.csv` let's specify the `name` column as our index.
+
+The `index_col` argument also take in positions.
+
+The `name` column in our data is in the 0th position so we can also specify the index like we show here with `index_col=0`.
+:::
+
+---
+
+## header
+
+{fig-alt="404 image"}
+
+
+:::{.notes}
+We have been lucky up until now that all the data we have loaded in has been particularly straightforward.
+
+Sometimes with data, there are a few lines of text explaining important points about the file.
+
+We do not want to include this in our dataframe and therefore we need to specify exactly when our dataframe begins.
+
+This is where `header` comes in.
+
+Take a look at candybars-h.csv as an example.
+
+If we look at the data with a regular text editor, the data doesn't start until the 3rd line which would be the equivalent of position 2 (since we begin counting from 0).
+:::
+
+---
+
+```{python}
+# | include: false
+pd.set_option('display.max_rows', 4)
+```
+
+```{python}
+candybars = pd.read_csv('data/candybars-h.csv')
+candybars
+```
+
+
+:::{.notes}
+If we load this dataset without any arguments, we get this as the output.
+
+We see that there are no clear column names and things are quite a mess!
+:::
+
+---
+
+```{python}
+# | include: false
+pd.set_option('display.max_rows', 6)
+```
+
+```{python}
+candybars = pd.read_csv('data/candybars-h.csv', header=2)
+candybars
+```
+
+
+
+:::{.notes}
+If we use `header=2` to indicate that the data actually begins at position 2, then things start to look much better.
+:::
+
+---
+
+## nrows
+
+```{python}
+candybars = pd.read_csv('data/candybars.csv', nrows=7)
+candybars
+```
+
+
+:::{.notes}
+`nrows` is an argument in `pd.read_csv()` that is useful when you only want to load in part of the dataframe.
+
+Perhaps the file you have is large and you only want a sample of it.
+
+`nrows` will limit the number of rows that you read in.
+
+This code loads in only the first 7 rows of our candybar dataset.
+:::
+
+---
+
+## usecols
+
+```{python}
+candybars = pd.read_csv('data/candybars.csv', usecols=[0, 1, 10])
+candybars
+```
+
+:::{.notes}
+Similarly to how `nrows` specifies how many rows to read in, `usecols` selects which columns to load from the data.
+
+Perhaps the only columns relevant to our analysis are the columns `name`, `weight` and `available_canada_america`.
+
+We can forgo the other columns when reading the data in.
+
+In a similar way to selecting columns using `.iloc[]`, we specify the desired column indices in square brackets.
+:::
+
+---
+
+```{python}
+candybars = pd.read_csv('data/candybars.csv', usecols=['name', 'weight', 'available_canada_america'])
+candybars
+```
+
+
+:::{.notes}
+The `usecols` argument accepts either index positions or labels so we could also use the column names in square brackets as shown here.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_09.qmd b/modules/module2/slides/module2_09.qmd
new file mode 100644
index 00000000..b407886c
--- /dev/null
+++ b/modules/module2/slides/module2_09.qmd
@@ -0,0 +1,125 @@
+---
+format: revealjs
+title: Column renaming and column dropping
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+```{python}
+# | include: false
+pd.set_option('display.max_columns', 15)
+```
+
+```{python}
+candy = pd.read_csv('data/candybars.csv')
+candy
+```
+
+:::{.notes}
+Remember our `candybars.csv` dataframe?
+
+Let's bring it back and save it as an object named `candy`.
+:::
+
+---
+
+## Column Renaming
+
+```{python}
+candy = candy.rename(columns={'available_canada_america':'availability'})
+candy
+```
+
+
+
+```python
+ columns={'old column name':'new column name'}
+```
+
+:::{.notes}
+There will be times where you are unsatisfied with the column names and you may want to change them.
+
+The proper syntax to do that is with `.rename()`.
+
+The column name `available_canada_america` is a bit long.
+
+Perhaps it would be a good idea to change it to something shorter like `availability`.
+
+Here is how we can accomplish that.
+
+This code uses something we've never seen before - `{}` curly braces, also called curly brackets.
+
+These have a special meaning but for now, you only need to concentrate your attention on the fact that the argument `columns` needs to have the format shown on the slide.
+:::
+
+---
+
+```{python}
+candy = candy.rename(columns={'available_canada_america':'availability',
+ 'weight':'weight_g'})
+candy.head()
+```
+
+
+
+:::{.notes}
+You can also rename multiple columns at once by adding a comma between the new and old column pairs within the curly brackets.
+
+It's important that we always save the dataframe to an object when making column changes or the changes will not be saved in our dataframe.
+:::
+
+---
+
+## Column Dropping
+
+```{python}
+candy.drop(columns='coconut')
+```
+
+
+:::{.notes}
+`.drop()` is the verb we use to delete columns in a dataframe.
+
+Let's delete the column `coconut` by specifying it in the `columns` argument of the `drop` verb.
+:::
+
+---
+
+```{python}
+# | eval: false
+candy.drop(columns='coconut')
+```
+
+
+
+```{python}
+candy.head()
+```
+
+
+
+```{python}
+candy = candy.drop(columns=['nougat', 'coconut'])
+candy.head()
+```
+
+
+:::{.notes}
+If you look again at the code we just wrote you'll notice we didn't save over the dataframe object, so the dataframe `candy` still will contain the `coconut` column.
+
+Let's overwrite the dataframe and remove multiple columns at the same time.
+
+Let's drop `nougat` and `coconut` together.
+
+We put the columns we want to drop in square brackets and this time we will remember to overwrite over the `candy` object.
+
+Now when we call `candy.head()` it reflects the dropped columns. They're no longer there.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_13.qmd b/modules/module2/slides/module2_13.qmd
new file mode 100644
index 00000000..ba5f7323
--- /dev/null
+++ b/modules/module2/slides/module2_13.qmd
@@ -0,0 +1,221 @@
+---
+format: revealjs
+title: Column Arithmetic and Creation
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+```{python}
+cereal = pd.read_csv('data/cereal.csv')
+cereal.head()
+```
+
+**Attribution:**
+_“[80 Cereals](https://www.kaggle.com/crawford/80-cereals/)” (c) by [Chris Crawford](https://www.linkedin.com/in/crawforc3/) is licensed under [Creative Commons Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0/)_
+
+
+:::{.notes}
+Doing some sort of transformation on the columns of a dataframe will most likely come up in your analysis somewhere and it's not always straightforward.
+
+Let's welcome back the `cereal.csv` data we have been working with.
+:::
+
+---
+
+```{python}
+cereal= cereal.iloc[:5]
+cereal
+```
+
+:::{.notes}
+To make things especially clear, for the next few scenarios let's only use the first 5 rows of the dataset.
+:::
+
+---
+
+{fig-align="center" fig-alt="404 image"}
+
+
+:::{.notes}
+Take this next scenario.
+
+Perhaps we recently read the cereal data's documentation explaining that the `fat` column is expressed in grams and we are interested in milligrams.
+
+How can we rectify this?
+
+We need to multiply each of the row's fat values by 1000.
+:::
+
+---
+
+```{python}
+cereal['fat']
+```
+
+
+
+Is transformed to this:
+
+```{python}
+cereal['fat'] * 1000
+```
+
+
+:::{.notes}
+Here is where some magic happens.
+
+Python doesn't require us to make a whole column filled with 1000s to get the result we want.
+
+It simply multiplies each value by 1000.
+(In Python we use `*` for multiplication.)
+
+So our original fat column in the cereal dataframe is transformed!
+
+See how each value has changed?
+
+Note that when we do any type of operations on columns, we use single square brackets.
+:::
+
+---
+
+```{python}
+cereal['rating']
+```
+
+
+
+```{python}
+cereal['rating'] / 10
+```
+
+
+:::{.notes}
+We can do the same thing with most operations.
+Let's divide the rating of each cereal by 10 so that it lies on a 10 point scale.
+
+The ratings column gets transformed to single digits instead of double digits now.
+:::
+
+---
+
+{fig-align="center" fig-alt="404 image"}
+
+
+:::{.notes}
+Every row's value is changed by the operation.
+:::
+
+---
+
+{width="60%" fig-alt="404 image" fig-align="center"}
+
+
+
+```{python}
+cereal['sugars'] / cereal['cups']
+```
+
+
+:::{.notes}
+We are not limited to simply taking a column and transforming it by a single number, say by multiplying or dividing.
+
+We can do operations involving multiple columns as well. Perhaps we wanted to know the amount of sugar (`sugar`) per cup of cereal (`cups`).
+
+The expected result would look something like this diagram.
+
+Remember that with any column operation we use only single square brackets on our columns.
+
+To get our desired output of sugar content per cup our code looks like this.
+
+Each sugar row value is divided by its corresponding cups value.
+:::
+
+---
+
+```{python}
+cereal[['sugars']] / cereal[['cups']]
+```
+
+
+:::{.notes}
+Just to stress the point of why we use single square brackets for our operations, here is what happens when we use double square brackets.
+
+This doesn't appear very useful.
+:::
+
+---
+
+```{python}
+cereal = pd.read_csv('data/cereal.csv', usecols=['name', 'mfr','type', 'fat', 'sugars', 'weight', 'cups','rating'])
+cereal
+```
+
+
+:::{.notes}
+Up until now, all of these operations have been done without being added to our cereal dataframe.
+
+Let's explore how we can add new columns to a less detailed version of our cereal dataframe.
+
+We'll be working with a smaller dataframe containing only a few columns so that it's easier to follow the examples.
+:::
+
+---
+
+## Column Creation
+
+```{python}
+oz_to_g = 28.3495
+cereal['weight'] * oz_to_g
+```
+
+
+
+```{python}
+cereal = cereal.assign(weight_g=cereal['weight'] * oz_to_g)
+cereal.head()
+```
+
+:::{.notes}
+In the next scenario, we have decided that our `weight` column should show the weight of each cereal in grams instead of ounces.
+
+We are going to save the conversion factor of grams to ounces in an object named `oz_to_g`.
+
+Let's start with just the operation for this.
+
+Next, we combine our operation with the implementation of adding it as a new column to the dataframe.
+The verb `.assign()` allows us to specify a column name to our result using an equal sign `=`.
+
+We are going to name our new column `weight_g` (for grams).
+
+Just like we did earlier in the module, we need to save the dataframe to an object when making changes involving columns. This will permanently save the column `weight_g` to the dataframe `cereal`.
+:::
+
+---
+
+```{python}
+cereal['sugars'] / cereal['cups']
+```
+
+
+
+```{python}
+cereal = cereal.assign(sugar_per_cup=cereal['sugars'] / cereal['cups'])
+cereal.head()
+```
+
+
+:::{.notes}
+Let's try another example.
+
+This time we want to save our sugar content per cereal cup as a column in our existing dataframe.
+
+At the top you can see the operation by itself, just for teaching purposes. Then, below, we combine our calculation with `assign()`, naming the column `sugar_per_cup`.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_16.qmd b/modules/module2/slides/module2_16.qmd
new file mode 100644
index 00000000..70a0111a
--- /dev/null
+++ b/modules/module2/slides/module2_16.qmd
@@ -0,0 +1,269 @@
+---
+format: revealjs
+title: Data Filtering
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+```{python}
+# | include: false
+pd.set_option('display.max_columns', 20)
+```
+
+```{python}
+cereal = pd.read_csv('data/cereal.csv')
+cereal.head()
+```
+
+
+:::{.notes}
+Filtering is probably one of the most frequent data manipulations you will do in data analysis.
+
+Filtering is often used when we are either trying to rid the dataframe of unwanted rows or trying to analyze rows with a particular column value.
+
+Let's try to filter the `cereal.csv` dataset.
+:::
+
+---
+
+## Conditions
+
+```{python}
+cereal['protein'] > 4
+```
+
+
+:::{.notes}
+Suppose you are trying to find the information for cereals with a protein content greater than 4g per serving.
+
+Our first instinct would be to write code that looks somewhat like this.
+
+This can be translated as
+
+*"From the `protein` column in the dataframe `cereal`, which have values greater than 4?"*
+
+The output shows all the index labels and a column with `True` or `False` values depending on if the row meets the condition.
+Cereals with `True` have a protein content greater than 4 and `False` if they do not.
+
+But we want a dataframe with all the information that only contains the rows with protein above 4.
+
+How can this be achieved?
+:::
+
+---
+
+```{python}
+cereal[cereal['protein'] > 4]
+```
+
+
+:::{.notes}
+To achieve this, we index into our `cereal` dataframe using this column of True/False values. The result is a smaller dataframe that only contains the rows corresponding to the `True` values.
+
+This code can be translated to:
+
+*Select the rows from the `cereal` dataframe that, according to the `cereal` dataframe, have a `protein` value greater than 4.*
+
+We can see from the output that only the rows meeting the condition are displayed.
+
+By the way, it is a common pattern that we're using the same dataframe twice, namely `cereal`, but it's not strictly required by pandas.
+:::
+
+---
+
+```{python}
+cereal[cereal['protein'] == 4]
+```
+
+
+:::{.notes}
+We can do this with equalities as well.
+
+Now we get all the cereals with a protein content of exactly 4g per serving.
+
+The key point to remember here is that we use **two** equal signs.
+
+In Python, a single `=` is used as an assignment operator. We are setting something to equal something else.
+
+The double equal sign operator is used for comparison. We check if certain values are equivalent to one another.
+
+By the way, these conventions were set a long time ago when people made the early programming languages. In hindsight, maybe something like `=?` would have been more clear, but the double equal sign for comparison is now a standard.
+:::
+
+---
+
+```{python}
+cereal[cereal['mfr'] == 'Q']
+```
+
+
+:::{.notes}
+We can filter categorical columns too. In this example, we only want cereals from the manufacturer "Q" (For Quaker):
+
+Here, we are using the double equal sign operator that we saw in the last slide.
+:::
+
+---
+
+## Multiple Condition Filtering - "and"
+
+```{python}
+cereal[cereal['protein'] >= 4]
+```
+
+
+:::{.notes}
+We now know how to filter on one condition but how do we filter if we have many?
+
+Perhaps we only want cereals with protein content between 4 and 5 grams?
+
+To find the cereals that meet protein contents greater or equal to 4, we use the code shown here.
+:::
+
+---
+
+```{python}
+cereal[cereal['protein'] <= 5]
+```
+
+
+:::{.notes}
+And the cereals that meet the condition of protein content below or equal to 5 would be obtained as shown here.
+:::
+
+---
+
+```python
+cereal[cereal['protein'] >= 4]
+```
+
+```python
+cereal[cereal['protein'] <= 5]
+```
+
+
+
+```{python}
+cereal[(cereal['protein'] >= 4) & (cereal['protein'] <= 5)]
+```
+
+
+:::{.notes}
+We can combine the two conditions using the `&` operator. This allows us to obtain cereals that meet **both** conditions.
+
+The `&` indicates "and". This means that both conditions must hold for a row to be included in the new dataframe.
+
+Each condition is wrapped with parentheses to keep them clearly separated.
+:::
+
+---
+
+{width="85%" fig-alt="404 image" fig-align="center"}
+
+
+:::{.notes}
+Only rows present in **both** dataframes will be selected.
+:::
+
+---
+
+```{python}
+cereal[(cereal['mfr'] == 'Q') & (cereal['protein'] > 4)]
+```
+
+
+:::{.notes}
+Next, we will look at a case where we filter on two different columns.
+
+Let's say we only want cereals from the Quaker manufacturer, with a protein content greater than 4.
+
+The same coding syntax can be applied to two different column conditions.
+:::
+
+---
+
+
+## Multiple Condition Filtering - "or"
+
+```{python}
+cereal[(cereal['mfr'] == 'Q') | (cereal['protein'] > 4)]
+```
+
+
+:::{.notes}
+Suppose that we are interested in cereals that either are made by the Quaker manufacturer **OR** have a protein content above 4.
+
+For a row to be included in the output, we only require one or the other condition to hold.
+
+Instead of using the `&` symbol, we use `|` which is called the "pipe operator". This means "or" in the Python programming language (and many other languages).
+:::
+
+---
+
+{width="85%" fig-alt="404 image" fig-align="center"}
+
+
+:::{.notes}
+When we filter using "or" this time, it resulted in 10 cereals that met either of the conditions.
+
+When we filtered using "and", only 1 cereal met both conditions.
+:::
+
+---
+
+## Tilde
+
+```{python}
+cereal['protein'] > 4
+```
+
+
+:::{.notes}
+We saw that when we filter, the condition produces an underlying column of `True` or `False` values indicating whether the condition has been met in each row of the dataframe.
+
+But what if I wanted the rows that were the complement (or opposite) of this?
+
+The opposite of `cereal['protein'] > 4` is `cereal['protein'] <= 4`, so that one isn't too tricky. But sometimes taking the opposite is not so straightforward. This is where the `~` ("tilde") operator can be helpful.
+:::
+
+---
+
+```{python}
+(cereal['protein'] > 4).head()
+```
+
+
+
+Tilde converts all the `True` values to `False` and all the `False` values to `True`.
+
+```{python}
+(~(cereal['protein'] > 4)).head()
+```
+
+
+:::{.notes}
+_Tilde_ (`~`) gives us the ability to return the complement of the code following it.
+:::
+
+---
+
+```{python}
+cereal[~(cereal['protein'] > 4)]
+```
+
+
+:::{.notes}
+We can obtain the complete dataframe by putting the entire condition within our square brackets like we did before.
+
+What we have here, is taking the rows where the protein content is not greater than four.
+
+This gives us more versatility when filtering, especially when we want the inverse of more complicated conditions and verbs (you'll see this in Module 3).
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_20.qmd b/modules/module2/slides/module2_20.qmd
new file mode 100644
index 00000000..6b056f85
--- /dev/null
+++ b/modules/module2/slides/module2_20.qmd
@@ -0,0 +1,289 @@
+---
+format: revealjs
+title: Conditional value replacement and assignment
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+
+## Building on things we know
+
+```{python}
+cereal = pd.read_csv('data/cereal.csv',
+ usecols=['name', 'mfr', 'type', 'calories', 'protein', 'weight', 'rating'])
+cereal.head()
+```
+
+
+:::{.notes}
+So far, we have accumulated many different skills to wrangle our data.
+
+One type of transformation that you may use often is replacing values within a column depending on a certain condition.
+
+Let's bring in a smaller version of our cereal dataset.
+
+In the dataframe, the manufacturer value "Q" isn't that informative and it might be easier to understand our data if we change all these values to something more clear like "Quaker".
+
+This leads us to our task:
+
+***Replace the `Q` manufacturer values with a new value of `Quaker`.***
+:::
+
+---
+
+```{python}
+q_cereal = cereal[cereal['mfr'] == 'Q']
+q_cereal.assign(mfr = 'Quaker')
+```
+
+
+:::{.notes}
+Our first instinct may be to first filter those rows using the technique we learned in our last section.
+
+From our new filtered selection, perhaps we could assign values of "Quaker" to column `mfr` using similar code to this.
+
+The output looks like it did what we wanted, but what happened to the rest of our dataframe?
+
+Remember that we only want to replace the values in our existing dataframe and not create a new one.
+
+When we use the `.assign()` verb like this, it creates a new dataframe with only the rows that meet the condition ```cereal['mfr'] == 'Q'``` .
+
+This is problematic since we still want the original dataframe and the rows with `mfr` values not equal to `Q`.
+
+So what do we do?
+:::
+
+---
+
+## Building on more things we know
+
+```{python}
+cereal.loc[73]
+```
+
+
+
+```{python}
+cereal.loc[cereal['mfr'] == 'Q']
+```
+
+
+:::{.notes}
+Remember our friend `.loc[]`? We are going to get reacquainted with it.
+
+Similarly, to how `.loc[]` can select and return specified columns and rows of the dataframe, it can filter on conditions too.
+
+We are used to seeing code involving `.loc[]` like this.
+
+But now we'll get introduced to a new side of it when we use it to filter as well.
+
+We can use the same syntax, `cereal['mfr'] == 'Q'`, we normally would when filtering. However, this time we wrap the whole thing within `.loc[]`.
+:::
+
+---
+
+```{python}
+cereal.loc[cereal['mfr'] == 'Q', 'mfr']
+```
+
+
+
+```{python}
+cereal.loc[cereal['mfr'] == 'Q', 'mfr'] = 'Quaker'
+```
+
+
+:::{.notes}
+Some people may be asking, "Why don't we do all our filtering like this then?" Well, the answer is, you can, but we prefer not to.
+
+Filtering without `.loc[]` is a bit more readable.
+
+Let's concentrate back on our task of only replacing `mfr` values equal to `Q` to `Quaker`.
+
+How can `.loc[]` help us with this?
+
+Unlike our earlier approach, `.loc[]` accepts more arguments within it.
+
+We have the ability to specify not only the target rows matching a specific condition but our column of interest as well.
+
+Once we have that, we can then assign these rows the new values `Quaker` in the `mfr` column.
+
+Wait! Nothing was outputted with our code! What happened?
+:::
+
+---
+
+```{python}
+cereal
+```
+
+
+:::{.notes}
+Let's take a look at the original dataframe.
+
+We can now see that the `Q` manufacturer values have changed to `Quaker`.
+
+When we use this syntax, we do not need to save the results in a new object like we had to with `.assign()` and `.drop()`.
+
+Let's discuss what really is happening behind the scenes.
+:::
+
+---
+
+```{python}
+cereal['mfr'] == 'Q'
+```
+
+
+:::{.notes}
+Remember what the condition `cereal['mfr'] == 'Q'` returns?
+
+It produces an object containing all the rows with True/False values depending on whether or not the row meets the condition.
+:::
+
+---
+
+{width="95%" fig-align="center" fig-alt="404 image"}
+
+
+:::{.notes}
+Essentially our code is finding the rows with `True` values and replacing the values in the `mfr` column with the new value of `Quaker`.
+:::
+
+---
+
+1.
+```python
+cereal.loc[cereal['mfr'] == 'Q']
+```
+
+
+
+2.
+
+```python
+cereal.loc[cereal['mfr'] == 'Q', 'mfr']
+```
+
+
+
+3.
+```python
+cereal.loc[cereal['mfr'] == 'Q', 'mfr'] = 'Quaker'
+```
+
+
+:::{.notes}
+You can split up how this code works into 3 steps:
+
+1. We use `.loc[]` to find the rows meeting certain conditions.
+
+2. We next indicate which column we wish to access.
+
+3. Once we have obtained our desired rows and the column which we are editing, we assign a value.
+:::
+
+---
+
+```python
+cereal[cereal['mfr'] == 'Q', 'mfr'] = 'Quaker'
+```
+
+```out
+TypeError: unhashable type: 'Series'
+
+Detailed traceback:
+ File "", line 1, in
+ File "/usr/local/lib/python3.12/site-packages/pandas/core/frame.py", line 4311, in __setitem__
+ self._set_item(key, value)
+ File "/usr/local/lib/python3.12/site-packages/pandas/core/frame.py", line 4527, in _set_item
+ key in self.columns
+ File "/usr/local/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 5358, in __contains__
+ hash(key)
+```
+
+
+:::{.notes}
+Does this work without using `.loc[]`?
+
+Let's give it a try.
+
+Unfortunately, we are not able to replace values in this manner and it results in an error since filtering this way does not allow us to specify a column.
+:::
+
+---
+
+## Replacing with inequalities
+
+```{python}
+cereal.loc[cereal['protein'] >= 3, 'protein_level'] = 'high'
+```
+
+
+
+```{python}
+cereal.loc[cereal['protein'] < 3, 'protein_level'] = 'low'
+```
+
+
+:::{.notes}
+This syntax using `.loc[]` also works for inequality conditions.
+
+If we are replacing numerical values with characters or words (or vice versa) we need to assign our desired values to a **new column** and not the existing one, because the column type will be different.
+
+Perhaps we want just two categories for protein levels - "high" and "low".
+
+Any cereal with 3 or more grams of protein will be considered a "high" protein level and anything less, a "low" protein level.
+
+Let's assign the "high" protein values first.
+
+The only difference here from earlier is we now use an inequality for our condition and we designate a new column name instead of an existing one.
+
+Let's save the values in a column named `protein_level`.
+
+Next, the "low" values.
+:::
+
+---
+
+```{python}
+cereal
+```
+
+
+:::{.notes}
+Let's take a look at the dataframe now.
+:::
+
+---
+
+## Creating new columns
+
+```{python}
+oz_to_g = 28.3495
+cereal['weight_g'] = cereal['weight'] * oz_to_g
+cereal
+```
+
+
+:::{.notes}
+You may have noticed we did not use `.assign()` to create our new column.
+
+That's because as we mentioned earlier, when we use `.assign()` it creates a brand new dataframe.
+
+When we are replacing values, we don't want a new dataframe and instead, we just want to alter the current values in the existing dataframe.
+
+When we are not doing conditional value replacement, we could create new columns with a similar syntax. Take the example of converting the weight from ounces into grams and making a new column named `weight_g`.
+
+This code edits the existing dataframe `cereal` instead of creating a new one.
+
+We prefer to use `.assign()` where possible as it can help avoid unexpected errors and performance issues.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_22.qmd b/modules/module2/slides/module2_22.qmd
new file mode 100644
index 00000000..9606241a
--- /dev/null
+++ b/modules/module2/slides/module2_22.qmd
@@ -0,0 +1,141 @@
+---
+format: revealjs
+title: Chaining Notation
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+
+## What is Chaining?
+
+
+
+{width="110%" fig-alt="404 image"}
+
+
+[Attribution](https://unsplash.com/photos/42ui88Qrxhw)
+
+
+:::{.notes}
+Up until now, when we perform multiple actions on an object, we have been saving the results with the `=` operator after each line.
+
+Chaining allows us to do multiple actions in a single line of code without the need to save each action in an intermediate object.
+
+You can imagine that we are linking verbs together with a chain.
+:::
+
+---
+
+```{python}
+# | include: false
+cereal = pd.read_csv('data/cereal.csv')
+```
+
+```{python}
+manufacturer_column = cereal['mfr']
+manufacturer_column.value_counts()
+```
+
+
+
+```{python}
+cereal['mfr'].value_counts()
+```
+
+
+:::{.notes}
+When we made our frequency table in Module 1, we first saved the single column as an object before we used `value_counts()` like we show you here.
+
+Instead of saving the column as an intermediate value, we can skip this step and make the frequency table in one line, with chaining.
+
+The convenience doesn't stop there either.
+:::
+
+---
+
+```{python}
+mfr_k = cereal[cereal['mfr'] == 'K']
+csr_df = mfr_k.loc[:, ["calories", "sugars", "rating"]]
+cereal_mean = csr_df.mean()
+cereal_mean
+```
+
+
+
+```{python}
+cereal_mean = cereal[cereal['mfr'] == 'K'].loc[:, ["calories", "sugars", "rating"]].mean()
+cereal_mean
+```
+
+
+:::{.notes}
+Let's say we want to perform three actions:
+
+1. Filter the dataframe for cereals only from manufacturer "K".
+
+2. Select the columns `calories`, `sugars` and `rating` using the verb `loc`.
+
+3. Find the mean of each column using `.mean()`.
+
+Previously we would need 3 different lines to code this.
+
+Instead we can chain them, as shown here.
+
+This chain avoided the use of the intermediate objects; `mfr_k` and `csr_df`.
+
+We cut out creating intermediate variables which is great but now we have a really long line of code and it's a bit hard to read.
+
+How can we make this easier to read?
+:::
+
+---
+
+```{python}
+cereal_mean = cereal[cereal['mfr'] == 'K'].loc[:, ["calories", "sugars", "rating"]].mean()
+
+cereal_mean
+```
+
+
+
+```{python}
+cereal_mean = (cereal[cereal['mfr'] == 'K'].loc[:, ["calories", "sugars", "rating"]]
+ .mean()
+ )
+
+cereal_mean.head()
+```
+
+
+:::{.notes}
+In this course, we suggest using a new line for each verb.
+
+We can do this by wrapping all our code (to the right of the equals sign) in parentheses and inserting a new line before each period (`.`).
+
+It's a good habit to indent and have the verbs lined up for additional clarity.
+:::
+
+---
+
+## Coding Preferences
+
+- Chaining has advantages and disadvantages.
+- Increases the readability of our code.
+- Comments are extremely important with or without chaining.
+
+
+:::{.notes}
+Although we have seen how chaining has advantages, it's a coding style that is adopted by the person writing the code.
+
+Someone else (or more often, future you) must be able to understand what is being accomplished.
+
+This is why comments (`#`) are so important. If a lot is going on in your code, it's a good habit to explain it whether it's with chaining, or without.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_25.qmd b/modules/module2/slides/module2_25.qmd
new file mode 100644
index 00000000..53b08a42
--- /dev/null
+++ b/modules/module2/slides/module2_25.qmd
@@ -0,0 +1,278 @@
+---
+format: revealjs
+title: Grouping and Aggregating
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+```{python}
+# | include: false
+cereal = pd.read_csv('data/cereal.csv')
+```
+
+_*Which manufacturer has the highest mean sugar content?*_
+
+```{python}
+cereal['mfr'].value_counts()
+```
+
+
+:::{.notes}
+Often, we are interested in examining specific groups in our data.
+
+Perhaps the question we want to answer from the cereal dataset is:
+
+_*Which manufacturer has the highest mean sugar content?*_
+
+We found in Module 1 using `.value_counts()` that there are 7 different manufacturers; K, G, P, R, Q, N and A.
+:::
+
+---
+
+Let's start with "K":
+
+```{python}
+cereal[cereal['mfr'] == 'K'].mean(numeric_only=True)[['sugars']]
+```
+
+
+
+Next "G":
+
+```{python}
+cereal[cereal['mfr'] == 'G'].mean(numeric_only=True)[['sugars']]
+```
+
+
+:::{.notes}
+To find the mean sugar content of each manufacturer, we could filter on each manufacturer and calculate the mean sugar content using `.mean()`. We can chain to make this process a little faster too.
+
+Let's start with "K" and then next "G".
+
+We could do this for the remaining 5 manufacturers. However, it's obvious that this is time-consuming and a lot of work to do this repeatedly.
+
+Imagine how tedious this would be if we had 100 different manufacturers?
+:::
+
+---
+
+## Using groupby
+
+
+```{python}
+mfr_group = cereal.groupby(by='mfr')
+mfr_group
+```
+
+
+:::{.notes}
+Pandas has a solution for this.
+
+It's not uncommon to be interested in examining specific groups in our data. Hence there is a verb that is helpful in grouping like-rows together.
+
+`.groupby()` allows us to group our data based on a specified column.
+
+Let's group our cereal dataframe on the `mfr` column and save it as object `mfr_group`.
+
+This returns a `DataFrame GroupBy` object.
+:::
+
+---
+
+{fig-alt="404 image" width="85%" fig-align="center"}
+
+
+:::{.notes}
+For example if we only had two manufacturers, the output would look like this.
+
+What exactly is a groupby object though?
+:::
+
+---
+
+```{python}
+mfr_group.groups
+```
+
+
+:::{.notes}
+A `DataFrame GroupBy` object contains information about the groups of the dataframe.
+
+We can access it with the `.groups` attribute (noun).
+
+Reading carefully, we can see there are 7 groups: `A`, `G`, `K`, `N`, `P`, `Q` and `R`, and it lists the index labels (cereal names) in each group.
+:::
+
+---
+
+```{python}
+mfr_group.groups['K']
+```
+
+
+:::{.notes}
+We can obtain all the row index names of a group by specifying the group name in square brackets after the `groups` attribute.
+
+Take the group `K` as an example.
+:::
+
+---
+
+```{python}
+mfr_group.get_group('K')
+```
+
+
+:::{.notes}
+We can get the full dataframe of the group `K` alone using the method `.get_group()`.
+:::
+
+---
+
+```{python}
+cereal['mfr'].value_counts()
+```
+
+
+
+```{python}
+mfr_group.size()
+```
+
+
+:::{.notes}
+Similarly to how we made frequency tables using `.value_counts()`, we can now use `.size()` to obtain the number of rows in each group.
+:::
+
+---
+
+## Summary Statistics with Groups
+
+```{python}
+# | include: false
+pd.set_option('display.max_rows', 4)
+```
+
+```{python}
+mfr_group = cereal.drop(columns=["name", "type"]).groupby(by='mfr')
+mfr_group.mean()
+```
+
+```{python}
+mfr_group.max()
+```
+
+
+:::{.notes}
+What now?
+
+Grouping doesn't answer our initial question of ***Which manufacturer has the highest mean sugar content?***
+
+Where do we go from here?
+
+We need to calculate the mean sugar content in each manufacturing group! With a groupby object, this is super simple, as shown here.
+
+Using `.mean()` on our groupby object answers the initial question and confirms that manufacturer "P" has the highest mean sugar content across cereals.
+
+See how convenient this was to do in comparison to our initial method? Not only does this give us the result quicker, but it also gives us the mean of each column of the dataframe.
+
+Think of how many filtering and mean calculations would have to be done if we were to do this using our initial approach.
+
+Of course, using groups is not limited to finding only the mean. We can do the same thing for other statistics too like `.min()` and `.max()`, or many other operations.
+:::
+
+---
+
+## Aggregating dataframes
+
+```{python}
+# | include: false
+pd.set_option('display.max_rows', 6)
+```
+
+```{python}
+cereal.select_dtypes(include=np.number).agg('mean')
+```
+
+
+
+```{python}
+cereal.mean(numeric_only=True)
+```
+
+
+:::{.notes}
+In situations where we want to collect multiple statistics together, we can aggregate them in one step using a verb called `.agg()`.
+
+`.agg()` can be used on its own using a single measurement, without `.groupby()`.
+
+Using `.agg()` with only a `mean` input is essentially the same thing as calling the statistic `mean()` on the dataframe.
+:::
+
+---
+
+```{python}
+cereal.select_dtypes(include=np.number).agg(['max', 'min', 'median'])
+```
+
+
+:::{.notes}
+`.agg()` gets a chance to really shine when we want several specific measures.
+
+Let's say we want the `max`, `min` and `median`. We specify them in square brackets within our `.agg()` method.
+
+This produces a convenient dataframe giving the value for each statistic, for each column.
+:::
+
+---
+
+## Aggregating groupby objects
+
+```{python}
+mfr_group.agg(['max', 'min', 'median'])
+```
+
+
+:::{.notes}
+`.agg()` is particularly useful with groupby objects.
+
+Let's try it on our manufacturer `groupby` object named `mfr_group`.
+
+This gives us a value for each group and for each statistic we specified.
+
+For example:
+
+Look at the '150' in the bottom row on the far left under `calories`.
+The interpretation is that, for cases where the manufacturer is 'R', the max number of calories is 150.
+
+In a similar manner if the manufacturer is 'P' the minimum amount of sodium is 45.
+:::
+
+---
+
+## Extra Fancy Aggregation
+
+```{python}
+mfr_group.agg({"calories":['max', 'min'],
+ "rating":['sum'],
+ "sugars":['mean', 'median']})
+```
+
+
+:::{.notes}
+You might have noticed that when we used `.agg()`, we calculated the same 3 statistics for every column in the dataframe. But we can also calculate different statistics for different columns.
+
+Let's say we are concerned about the `max` and `min` calorie values, the total `sum` of the ratings and the `mean` and `median` sugar content for each manufacturing group.
+
+We can achieve this by wrapping everything in curly brackets and using a colon to separate the column name from the statistics values. We need to put the statistics within square brackets.
+
+The code is a bit more complicated, but the result is a bit easier to read.
+:::
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_29.qmd b/modules/module2/slides/module2_29.qmd
new file mode 100644
index 00000000..1e1d90bd
--- /dev/null
+++ b/modules/module2/slides/module2_29.qmd
@@ -0,0 +1,474 @@
+---
+format: revealjs
+title: More plotting tricks using Altair
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+```{python}
+# | include: false
+# Load the cereal dataset used throughout this deck.
+# (Cell option was misspelled "inlcude", which Quarto ignores,
+# leaking this cell's output into the rendered slides.)
+cereal = pd.read_csv('data/cereal.csv')
+```
+
+```{python}
+# | output: false
+import altair as alt
+
+chart0 = alt.Chart(cereal, width=500, height=300).mark_circle().encode(
+ x='mfr',
+ y='calories'
+).properties(title="Scatter plot of manufacturer calorie content")
+
+chart0
+```
+
+```{python}
+# | include: false
+chart0.save('static/module2/chart0.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+:::{.notes}
+Let's build on the Altair skills we learned in the previous module.
+
+At this point, we are familiar with writing basic plotting code similar to what is shown here.
+
+However, it's important that we start specifying what kind of variable type we use for our `x` and `y` values with the `encode(..)` verb.
+
+Before, Altair would guess what type of data it was plotting. Usually it's pretty smart and guesses correctly like we saw in our previous plots, but unfortunately this is not always the case.
+:::
+
+---
+
+```{python}
+# | include: false
+# Convert the calories column to a string to demonstrate Altair
+# mis-detecting the column type.
+# Use .copy(): plain assignment only aliases the dataframe, so
+# astype(str) would also clobber the original `cereal`'s numeric
+# calories column, which later cells (groupby().mean()) depend on.
+cereal_modified = cereal.copy()
+cereal_modified['calories'] = cereal_modified['calories'].astype(str)
+```
+
+```{python}
+# | output: false
+chart1 = alt.Chart(cereal_modified, width=500, height=300).mark_circle().encode(
+ x='mfr',
+ y='calories'
+ ).properties(title="Scatter plot of manufacturer calorie content")
+
+chart1
+```
+
+
+```{python}
+# | include: false
+chart1.save('static/module2/chart1.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+Let's see an example where `Altair` fails to determine the correct data type.
+
+For this example, we have modified the `calories` column in the `cereal` dataframe.
+
+We will now generate a scatter plot of `mfr` and `calories` from this modified cereal dataset.
+
+Notice how 150 comes before 100 on the y-axis? It seems we have a problem here, which is due to Altair failing to recognize that `calories` is a numerical type.
+
+Even Altair can’t always get it right every time, which is why it’s so important we specify the data type when plotting.
+:::
+
+---
+
+```{python}
+# | output: false
+chart2 = alt.Chart(cereal_modified, width=500, height=300).mark_circle().encode(
+ x='mfr:N',
+ y='calories:Q'
+ ).properties(title="Scatter plot of manufacturer calorie content")
+
+chart2
+```
+
+```{python}
+# | include: false
+chart2.save('static/module2/chart2.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+We can help Altair by giving it clear instructions on what type of columns our x and y values are.
+
+In this case we are going to specify `N` for the *nominal* column `mfr` and `Q` for the *quantitative* column `calories`.
+
+That's better!
+:::
+
+---
+
+| Data Type | Shorthand Code | Description | Examples |
+| ----------- | ----------- | ----------- | ----------- |
+| Ordinal | `O` | a discrete ordered quantity | “dislike”, “neutral”, “like” |
+| Nominal | `N` | a discrete un-ordered quantity | eye color, postal code, university |
+| Quantitative | `Q` | a continuous quantity | 5, 5.0, 5.011 |
+| Temporal| `T` | a time or date value | date (August 13 2020), time (12:00 pm) |
+
+
+:::{.notes}
+Altair recognizes the following column types and it's best practice that we specify this when we plot going forward.
+
+Ordinal values imply that there is some natural ordering to the values.
+
+For example, the ratings of a movie could be on an ordinal scale since a five star rating is better than a single star rating.
+
+In contrast, there is no such natural ordering for nominal values.
+An example of this would be someone's eye colour, their country location or the university they attended.
+
+Anything numeric is considered a `quantitative` variable and `time` or `date` values are considered as `temporal`.
+:::
+
+---
+
+```{python}
+# | output: false
+chart3 = alt.Chart(cereal, width=500, height=300).mark_circle().encode(
+ x='sugars:Q', # set the sugars column as quantitative
+ y='rating:Q' # set the rating column as quantitative
+ ).properties(title="Scatter plot of cereal rating vs sugar content")
+
+chart3
+```
+
+
+```{python}
+# | include: false
+chart3.save('static/module2/chart3.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+Let's practice this.
+
+Maybe we are interested in plotting the rating of cereals vs the amount of sugar they contain from `cereal` dataframe.
+
+We do this using a scatter plot which uses `.mark_circle()`. We can assign `sugars` as the `x` variable and `rating` as the `y` variable from the `cereal` dataframe we have been using.
+
+Here, `sugars` and `rating` are both quantitative columns so we specify `Q` as variable types in our plot.
+:::
+
+---
+
+## Variable types
+
+```{python}
+# | output: false
+chart4 = alt.Chart(cereal, width=500, height=300).mark_circle().encode(
+ x=alt.X('sugars:Q'), # use alt.X() to map the x-axis
+ y=alt.Y('rating:Q') # use alt.Y() to map the y-axis
+ ).properties(title="Scatter plot of cereal rating vs sugar content")
+
+chart4
+```
+
+```{python}
+# | include: false
+chart4.save('static/module2/chart4.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+So far when plotting with Altair, we have been mapping our `x` and `y` in the `encode(x=..,y=..)` verb.
+
+However, doing so gives us very little control over how exactly we would like to map our x and y values.
+
+In order to have more control, we can map our x and y values using `x=alt.X(...)` and `y=alt.Y(...)` respectively.
+
+This gives us a lot more control over the customization of our plot.
+
+You'll see this coming up.
+:::
+
+---
+
+## Histograms
+
+```{python}
+# | output: false
+chart5 = alt.Chart(cereal, width=500, height=300).mark_bar().encode(
+ x=alt.X('calories:Q', bin=True), # set x-axis as calories
+ y=alt.Y('count():Q') # set the y-axis as the occurrence count for each calorie value
+ ).properties(title="Histogram plot of cereal calorie content")
+chart5
+```
+
+
+```{python}
+# | include: false
+chart5.save('static/module2/chart5.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+Another type of plot we can make using Altair is called a **histogram**.
+
+A histogram would be an ideal plot if we were interested in seeing how many cereals in our dataframe have calories within a certain range. A histogram is a `bar` chart where the height of each bar shows the frequency of something occurring. When applied to quantitative data, it groups the values into **ranges**, and the height of each bar shows the frequency of each range.
+
+We can generate a histogram plot of the `calories` values in the cereal dataframe, which is quantitative. This will enable us to see the various values of calories and how many times they occur.
+
+To make a histogram, we use `mark_bar()`.
+
+In the `encode()` verb, we specify the x-axis as `calories` and use the argument `bin=True`. We assign the y-axis as `count():Q` to get the number of cereals that have values within each of the ranges.
+
+This is the same `count()` argument we used in Module 1 when we made bar charts.
+:::
+
+---
+
+## Bins
+
+```{python}
+# | output: false
+chart6 = alt.Chart(cereal, width=500, height=300).mark_bar().encode(
+ x=alt.X('calories:Q', bin=alt.Bin(maxbins=20)), # set max number of bins to 20
+ y=alt.Y('count():Q')
+ ).properties(title="Histogram of cereal calorie content with bins = 20")
+chart6
+```
+
+
+```{python}
+# | include: false
+chart6.save('static/module2/chart6.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+We have the ability to change the number of bars (bins) in our histogram by using the `bin` argument and the `alt.Bin()` verb.
+
+Within `alt.Bin()`, we can specify `maxbins` which is the maximum allowed number of bins in our plot.
+
+This may be useful when viewing a dataset with lots of different values.
+
+Having control over the number of bins in a histogram can help to make visualization easier to extract insights from.
+
+Here, we set the number of max bins in the plot to `20` by setting `bin=alt.Bin(maxbins=20)` inside `alt.X()`.
+:::
+
+---
+
+{fig-alt="404 image" width="70%"}
+
+
+:::{.notes}
+When plotting with Altair, the `x` and `y` axis are labelled with the default column names.
+
+This may not always be ideal since column names may not always be informative.
+In this plot, the x axis label `calories (binned)` is a little messy.
+
+Luckily Altair allows us to customize our axis labels.
+:::
+
+---
+
+```{python}
+# | output: false
+chart7 = alt.Chart(cereal, width=500, height=300).mark_bar().encode(
+ x=alt.X('calories:Q', bin=alt.Bin(maxbins=20), title="Calorie Content"), # use alt.X() to label the x-axis
+ y=alt.Y('count():Q', title="Number of Cereals") # use alt.Y() to label the y-axis
+ ).properties(title="Histogram plot of cereal calorie content")
+chart7
+```
+
+```{python}
+# | include: false
+chart7.save('static/module2/chart7.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+We can change these axis labels using the `title=""` argument within the respective `alt.X()` and `alt.Y()` verbs that we talked about earlier.
+
+This is a big help for the clarity of our analysis.
+:::
+
+---
+
+```{python}
+mfr_mean = cereal.groupby(by='mfr').mean(numeric_only=True)
+mfr_mean
+```
+
+
+:::{.notes}
+In the previous slide deck, we asked the following question regarding our cereal data:
+
+***Which manufacturer has the highest mean sugar content?***
+
+A nice way of answering this would be to plot the results using a bar chart!
+
+Before doing this, we need a few more tricks.
+
+We can start using the mean statistics we calculated from the `groupby(by='mfr')` object from the last section.
+
+Here, we seem to have lost our index column of numbers that we usually have. It also appears that `mfr` has now moved to the left of the dataframe with its label `mfr` lower than the other column labels.
+
+This is because when you apply `groupby()` to a column, the grouping column becomes the new dataframe index.
+
+Although this is a useful feature in many cases, Altair cannot access the index column.
+:::
+
+---
+
+```{python}
+mfr_mean
+```
+
+```{python}
+mfr_mean = mfr_mean.reset_index()
+mfr_mean
+```
+
+
+:::{.notes}
+To deal with this, we use `reset_index()` which will convert `mfr` to a regular column again.
+
+We can see that `mfr` column has now moved right and our index column of integers has returned on the left!
+:::
+
+---
+
+```{python}
+# | output: false
+chart8 = alt.Chart(mfr_mean, width=500, height=300).mark_bar().encode(
+ x=alt.X('mfr:N', title="Manufacturer"),
+ y=alt.Y('sugars:Q', title="Mean sugar content")
+ ).properties(title="Bar plot of manufacturers mean sugar content")
+chart8
+```
+
+
+```{python}
+# | include: false
+chart8.save('static/module2/chart8.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+Now that we have our `mfr_mean` in the correct format, we can proceed.
+
+Using Altair we can plot the `mfr` column on the x axis which we've identified to contain nominal values and `sugars` which we agreed was a quantitative value on the y axis. (Also, let's not forget our title!)
+:::
+
+---
+
+
+
+
+1. Groupby object and calculated the mean
+2. Reset index
+3. Plot using Altair
+
+
+:::{.notes}
+Let's go through the steps that were needed to make the plot in the previous slide.
+
+First, we created a groupby object and calculated the mean for each column in the resulting dataframe.
+Second, since `.groupby()` made `mfr` the new index, we had to use `reset_index()` to make `mfr` a regular column again.
+And finally, we generated a bar plot using Altair.
+:::
+
+---
+
+## Sorting
+
+```{python}
+# | output: false
+chart9 = alt.Chart(mfr_mean, width=500, height=300).mark_bar().encode(
+ x=alt.X('mfr:N', sort="y", title="Manufacturer"), # use sort="y" to sort in ascending order
+ y=alt.Y('sugars:Q', title="Mean sugar content")
+ ).properties(title="Bar plot of manufacturers mean sugar content in ascending order")
+chart9
+```
+
+```{python}
+# | include: false
+chart9.save('static/module2/chart9.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+Sometimes sorting a dataframe by quantity helps us obtain insights more easily.
+
+For example, if we sorted the mean sugar content for the manufacturers before generating the previous plot, it would be easier to identify which manufacturer produces cereals with the highest mean sugar content.
+
+Altair allows us to sort a column while plotting.
+
+Sorting can be done on either the x or y axis using the `sort=` in the `alt.X` or `alt.Y` verb.
+
+The sort argument takes in either `x` or `y` to specify which axis to sort by.
+
+Here we are sorting in ascending order of which manufacturers have the largest mean sugar content.
+
+This plot shows us immediately that manufacturer `P` has the highest mean cereal sugar content.
+:::
+
+---
+
+```{python}
+# | output: false
+chart10 = alt.Chart(mfr_mean, width=500, height=300).mark_bar().encode(
+ x=alt.X('mfr:N', sort="-y", title="Manufacturer"), # use sort="-y" to sort in descending order
+ y=alt.Y('sugars:Q', title="Mean sugar content")
+).properties(title="Bar plot of manufacturers mean sugar content sorted in descending order")
+chart10
+```
+
+```{python}
+# | include: false
+chart10.save('static/module2/chart10.png')
+```
+
+{fig-alt="404 image" width="60%"}
+
+
+:::{.notes}
+To generate a bar plot of mean sugar content sorted in `descending` order, we recycle the code from the previous slide.
+
+This time, we add `-y` in the `sort` argument to specify that we would like to sort the y variable in descending order.
+:::
+
+---
+
+
+
+
+
+If you enjoyed this part of the module and you wish to learn more advanced visualizations using Altair, take a look at our
+Data Visualization course
+
+
+
+
+# Let’s apply what we learned!
\ No newline at end of file
diff --git a/modules/module2/slides/module2_31.qmd b/modules/module2/slides/module2_31.qmd
new file mode 100644
index 00000000..8dacb382
--- /dev/null
+++ b/modules/module2/slides/module2_31.qmd
@@ -0,0 +1,40 @@
+---
+format: revealjs
+title: What Did We Learn and What to Expect in Assignment 2
+title-slide-attributes:
+ data-notes: |
+---
+
+```{python}
+# | echo: false
+%run src/utils.py
+```
+
+## Summary
+
+Students are now expected to be able to:
+
+- Demonstrate how to rename columns of a dataframe using [`.rename()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html).
+- Create new columns in a dataframe using [`.assign()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.assign.html) notation.
+- Drop columns in a dataframe using [`.drop()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)
+- Use `df[]` notation to filter rows of a dataframe.
+- Calculate summary statistics on grouped objects using [`.groupby()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html) and [`.agg()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html).
+- Explain when chaining is appropriate.
+- Demonstrate chaining over multiple lines and verbs.
+
+
+:::{.notes}
+The assignment will concentrate on the learning objectives as well as building knowledge on existing concepts.
+:::
+
+---
+
+## Attribution
+
+The cereal dataset:
+
+ “[80 Cereals](https://www.kaggle.com/crawford/80-cereals/)” (c) by [Chris Crawford](https://www.linkedin.com/in/crawforc3/) is licensed
+under [Creative Commons Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0/)
+
+
+# On to Assignment 2!