Skip to content

Commit db8bb30

Browse files
author
Justine Wezenaar
committed
init
0 parents  commit db8bb30

20 files changed

+898
-0
lines changed

README.md

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# ghc2024-vectorization-workshop
2+
3+
## Set-Up
4+
5+
### Option 1: From your local Python3 shell:
6+
7+
0. Clone this repo (optionally: fork it to your github first)
8+
9+
```
10+
git clone <>
11+
```
12+
13+
1. Enter the directory and make your virtual environment (any Python3 version should be fine. Here I use 3.9)
14+
15+
```
16+
cd ghc2024-vectorization-workshop
17+
python3.9 -m venv venv
18+
```
19+
20+
2. Activate your new virtual environment
21+
```
22+
source venv/bin/activate
23+
```
24+
25+
3. Install required packages from the requirements.txt
26+
```
27+
python -m pip install -r requirements.txt
28+
```
29+
30+
### Option 2: In a Google Colab Notebook
31+
32+
1. Open a new Google Colab Notebook (https://colab.research.google.com). You will need to be signed into your own Google account
33+
34+
2. Copy-paste the code from `util.py` into the first cell and run it
35+
36+
## Writing and testing the functions
37+
38+
### Option 1: From your local Python3 shell:
39+
40+
1. Open the `q#.py` in your text editor, according to the question (q1 through q7). For the given question, put your optimized code in the `vec_*` function where it says `pass # insert your code here`.
41+
42+
2. To test that your function works and to compare the speed (for relevant questions), run the `test_run.py` script:
43+
44+
```
45+
python test_run.py
46+
```
47+
48+
Note: if you're having trouble with one function and want to skip to the next one, just use `#` to comment out that line in the `test_run.py`
49+
50+
If your function matches the desired output, then you should see either a "Success" message, or a printout of the timing differences.
51+
52+
### Option 2: In a Google Colab Notebook
53+
54+
1. Copy the code from the `q#.py` for your corresponding question (q1 through q7). Be sure to include the imports too, EXCEPT **do not include the line `from util import print_time_results, time_funcs`**. You have already pasted these functions into your notebook during the Setup.
55+
56+
2. Update the `vec_*` function where it says `pass # insert your code here`.
57+
58+
3. Run the `test_*` function to determine if your output is satisfactory, and what the speedup is (for relevant questions). You won't use the `test_run.py` script if you're working in a Notebook; just copy-paste the `test_` functions directly and run them.
59+
60+
## Brag about your gains!
61+
62+
1. Go to the Google Form https://forms.gle/9xrjTUSEDozPm4kJ6 and report the speed-up factor you achieved for each function! We'll shout-out the biggest speed-ups live in the workshop.
63+

q1.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Unoptimized functions to be vectorized."""
2+
3+
import math
4+
import time
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from util import print_time_results, time_funcs
10+
11+
# Q1: Convert a list to np.array
12+
13+
14+
def convert_list_to_array(input_list: list):
    """Q1 exercise stub: convert ``input_list`` to an ``np.ndarray``.

    Intentionally left unimplemented. While the body is only ``pass`` the
    function returns ``None``, which ``test_convert_list`` reports as
    "not implemented" instead of failing.
    """
    pass # insert your code here
16+
17+
18+
def make_test_list(size: int = 100):
    """Return the integers ``0 .. size - 1`` as a plain Python list."""
    return list(range(size))
20+
21+
22+
def test_convert_list(size: int = 1000):
    """Check ``convert_list_to_array`` against a list of ``size`` integers.

    Prints a success message when the result is an ``np.ndarray`` holding the
    same values as the generated list, and a "not implemented" notice when the
    function still returns ``None``.

    Args:
        size: Number of elements in the generated test list.

    Raises:
        AssertionError: If the result is not an array or its contents differ
            from the input list.
    """
    print("\n\nQ1: Running test_convert_list...\n")
    # Named ``source_list`` rather than ``input`` so the builtin is not shadowed.
    source_list = make_test_list(size)
    output = convert_list_to_array(source_list)
    if output is None:
        print(" convert_list_to_array is not implemented")
        return
    # isinstance (not ``type(...) ==``) also accepts ndarray subclasses, and
    # np.array_equal compares shape and values in one vectorized call instead
    # of a per-element Python loop.
    assert isinstance(output, np.ndarray) and np.array_equal(
        output, source_list
    ), "Whoops! input and output do not match"
    print(" Success!")

q2.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Unoptimized functions to be vectorized."""
2+
3+
import math
4+
import time
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from util import print_time_results, time_funcs
10+
11+
# Q2: Convert a dict to pd.DataFrame. Each key in the dict should become a column in the DataFrame.
12+
13+
14+
def convert_dict_to_df(input_dict: dict):
    """Q2 exercise stub: convert ``input_dict`` to a ``pd.DataFrame``.

    Each key of the dict should become a column of the DataFrame.
    Intentionally left unimplemented; while the body is only ``pass`` the
    function returns ``None``, which ``test_convert_dict`` reports as
    "not implemented".
    """
    pass # insert your code here
16+
17+
18+
def make_test_dict(size: int = 100):
    """Build a dict of ``size`` random columns keyed by repeated letters.

    Keys cycle through ``a``..``z`` and then repeat the letter once more per
    completed cycle (``aa``, ``bb``, ...), so every key is unique. Each value
    is an array of five random floats scaled by 100.
    """
    columns = {}
    for idx in range(size):
        cycle, letter_offset = divmod(idx, 26)
        key = chr(97 + letter_offset) * (cycle + 1)
        columns[key] = np.random.rand(5) * 100
    return columns
22+
23+
24+
def test_convert_dict(size: int = 1000):
    """Check ``convert_dict_to_df`` against a generated dict of ``size`` keys.

    Prints a success message when the result is a ``pd.DataFrame`` whose
    columns include every key of the input dict, and a "not implemented"
    notice when the function still returns ``None``.

    Args:
        size: Number of keys in the generated test dict.

    Raises:
        AssertionError: If the result is not a DataFrame or a key is missing
            from its columns.
    """
    print("\n\nQ2: Running test_convert_dict...\n")
    # Named ``source_dict`` rather than ``input`` so the builtin is not shadowed.
    source_dict = make_test_dict(size)
    output = convert_dict_to_df(source_dict)
    if output is None:
        print(" convert_dict_to_df is not implemented")
        return
    # isinstance (not ``type(...) ==``) also accepts DataFrame subclasses; the
    # generator lets ``all`` stop at the first missing key.
    assert isinstance(output, pd.DataFrame) and all(
        key in output.columns for key in source_dict
    ), "Whoops! input and output do not match"
    print(" Success!")

q3.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""Unoptimized functions to be vectorized."""
2+
3+
import math
4+
import time
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from util import print_time_results, time_funcs
10+
11+
# Q3: Write a vectorized function vec_power which has the same arguments and returns as slow_power. Run the function on data1 and report your speedup on 20 reps
12+
13+
14+
def slow_power(x, m=4):
    """Raise every element of ``x`` to the power ``m``, one element at a time.

    This is the deliberately unvectorized baseline for Q3: the exponentiation
    happens in a Python-level loop and only the final result is turned into
    an ``np.ndarray``.
    """
    powered = [value**m for value in x]
    return np.array(powered)
19+
20+
21+
def vec_power(x, m=4):
    """Q3 exercise stub: vectorized counterpart of ``slow_power``.

    Should take the same arguments and return the same result as
    ``slow_power``. Intentionally left unimplemented; while the body is only
    ``pass`` the function returns ``None``, which ``test_power`` reports as
    "not implemented".
    """
    pass # insert your new code here
23+
24+
25+
def make_data1(size=1000):
    """Return a reproducible single-column DataFrame of ``size`` random floats.

    The global NumPy RNG is reseeded with 4 on every call, so repeated calls
    with the same ``size`` return identical data.
    """
    np.random.seed(4)
    return pd.DataFrame(np.random.rand(size, 1))
30+
31+
32+
def test_power(size=1000):
    """Compare ``vec_power`` with ``slow_power`` and report timings.

    Both functions are run on column 0 of ``make_data1(size)``. If
    ``vec_power`` still returns ``None`` a "not implemented" notice is
    printed; otherwise the two results must match as DataFrames (dtype
    ignored) before the timing helpers from ``util`` are called with 20 reps.
    """
    print("\n\nQ3: Running test_power...\n")
    column = make_data1(size=size)[0]
    expected = pd.DataFrame(slow_power(column))
    candidate = vec_power(column)
    if candidate is None:
        print(" vec_power is not implemented")
        return
    pd.testing.assert_frame_equal(
        expected, pd.DataFrame(candidate), check_dtype=False
    )
    print(" Success!")
    timings, _ = time_funcs(
        [slow_power, vec_power],
        [[column], [column]],
        ["slow_power", "vec_power"],
        reps=20,
    )
    print_time_results(timings, size)

q4.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Unoptimized functions to be vectorized."""
2+
3+
import math
4+
import time
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from util import print_time_results, time_funcs
10+
11+
# Q4: Write a vectorized function, vec_addition, which adds two vectors of the same size
12+
13+
14+
def slow_addition(arr1, arr2):
    """Add two equal-length sequences element by element in a Python loop.

    This is the deliberately unvectorized baseline for Q4. Returns the sums
    as an ``np.ndarray``; raises ``AssertionError`` when the lengths differ.
    """
    assert len(arr1) == len(arr2)
    pairs = zip(arr1, arr2)
    return np.array([left + right for left, right in pairs])
18+
19+
20+
def vec_addition(arr1, arr2):
    """Q4 exercise stub: vectorized addition of two vectors of the same size.

    Intentionally left unimplemented; while the body is only ``pass`` the
    function returns ``None``, which ``test_addition`` reports as
    "not implemented".
    """
    pass # insert your code here
22+
23+
24+
def make_data1(size=1000):
    """Return a reproducible single-column DataFrame of ``size`` random floats.

    The global NumPy RNG is reseeded with 4 on every call, so repeated calls
    with the same ``size`` return identical data.
    """
    np.random.seed(4)
    values = np.random.rand(size, 1)
    return pd.DataFrame(values)
29+
30+
31+
def test_addition(size=1000):
    """Compare ``vec_addition`` with ``slow_addition`` and report timings.

    Two distinct random vectors of length ``size`` are added by both
    functions. If ``vec_addition`` still returns ``None`` a "not implemented"
    notice is printed; otherwise the results must match as DataFrames (dtype
    ignored) before the timing helpers from ``util`` are called with 20 reps.

    Args:
        size: Length of each input vector.

    Raises:
        AssertionError: If the two results differ.
    """
    print("\n\nQ4: Running test_addition...\n")
    input_1 = make_data1(size=size)
    # make_data1 reseeds NumPy with the same seed on every call, so calling it
    # twice yields two identical frames and the test would only check x + x
    # (a wrong ``arr1 * 2`` would pass). Draw the second operand from the RNG
    # stream that make_data1 just seeded: still reproducible, but different.
    input_2 = pd.DataFrame(np.random.rand(size, 1))
    output_slow = pd.DataFrame(slow_addition(input_1[0], input_2[0]))
    output_vec = vec_addition(input_1[0], input_2[0])
    if output_vec is None:
        print(" vec_addition is not implemented")
        return
    output_vec_df = pd.DataFrame(output_vec)
    pd.testing.assert_frame_equal(output_slow, output_vec_df, check_dtype=False)
    timings, _ = time_funcs(
        [slow_addition, vec_addition],
        [(input_1[0], input_2[0]), (input_1[0], input_2[0])],
        ["slow_addition", "vec_addition"],
        reps=20,
    )
    print_time_results(timings, size)

q5.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""Unoptimized functions to be vectorized."""
2+
3+
import math
4+
import time
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from util import print_time_results, time_funcs
10+
11+
# Q5: Write a vectorized function, vec_grade, which has the same arguments and return as slow_grade.
12+
# Note that when evaluating multiple boolean conditions over a vector, bitwise operators must be used
13+
# Example: indices_of_nums_bt_1_and_5 = (some_other_vector > 1) & (some_other_vector < 5)
14+
15+
16+
def slow_grade(grades):
    """Map numeric grades to letter grades one element at a time.

    This is the deliberately unvectorized baseline for Q5. Scores of 90 and
    above become "A", 80-89 "B", 70-79 "C" and anything below 70 "F". A value
    that satisfies none of the comparisons (e.g. NaN) contributes no letter.
    Returns the letters as an ``np.ndarray``.
    """
    letters = []
    for score in grades:
        # Checked from the top band down, so each test only needs a lower bound.
        if score >= 90:
            letter = "A"
        elif score >= 80:
            letter = "B"
        elif score >= 70:
            letter = "C"
        elif score < 70:
            letter = "F"
        else:
            continue
        letters.append(letter)
    return np.array(letters)
29+
30+
31+
def vec_grade(grades):
    """Q5 exercise stub: vectorized counterpart of ``slow_grade``.

    Should take the same argument and return the same result as
    ``slow_grade``. Intentionally left unimplemented; while the body is only
    ``pass`` the function returns ``None``, which ``test_grades`` reports as
    "not implemented".
    """
    pass # insert your code here
33+
34+
35+
def random_grades(num_grades: int):
    """Return ``num_grades`` random integer grades drawn from 0 to 99."""
    return np.random.randint(low=0, high=100, size=num_grades)
37+
38+
39+
def test_grades(num_grades: int = 1000):
    """Compare ``vec_grade`` with ``slow_grade`` and report timings.

    Both functions are run on ``num_grades`` random integer grades. If
    ``vec_grade`` still returns ``None`` a "not implemented" notice is
    printed; otherwise the results must match as DataFrames (dtype ignored)
    before the timing helpers from ``util`` are called with 20 reps.
    """
    print("\n\nQ5: Running test_grades...\n")
    scores = random_grades(num_grades)
    expected = pd.DataFrame(slow_grade(scores))
    candidate = vec_grade(scores)
    if candidate is None:
        print(" vec_grade is not implemented")
        return
    pd.testing.assert_frame_equal(
        expected, pd.DataFrame(candidate), check_dtype=False
    )
    timings, _ = time_funcs(
        [slow_grade, vec_grade],
        [[scores], [scores]],
        ["slow_grade", "vec_grade"],
        reps=20,
    )
    print_time_results(timings, num_grades)

q6.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""Unoptimized functions to be vectorized."""
2+
3+
import math
4+
import time
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from util import print_time_results, time_funcs
10+
11+
# Q6: Write a vectorized function, vec_pass_fail, which has the same arguments and returns as slow_pass_fail.
12+
# (Hint: try using np.where and np.isin)
13+
14+
15+
def slow_pass_fail(grades):
    """Map letter grades to pass/fail marks one element at a time.

    This is the deliberately unvectorized baseline for Q6. "A", "B" and "C"
    become "P"; every other value becomes "F". Returns the marks as an
    ``np.ndarray``.
    """
    marks = []
    for letter in grades:
        passed = letter == "A" or letter == "B" or letter == "C"
        marks.append("P" if passed else "F")
    return np.array(marks)
28+
29+
30+
def vec_pass_fail(grades):
    """Q6 exercise stub: vectorized counterpart of ``slow_pass_fail``.

    Should take the same argument and return the same result as
    ``slow_pass_fail`` (hint from the exercise text: ``np.where`` and
    ``np.isin``). Intentionally left unimplemented; while the body is only
    ``pass`` the function returns ``None``, which ``test_pass_fail`` reports
    as "not implemented".
    """
    pass # insert your code here
32+
33+
34+
def random_grades(num_grades: int):
    """Return ``num_grades`` random integer grades drawn from 0 to 99."""
    return np.random.randint(low=0, high=100, size=num_grades)
36+
37+
38+
def test_pass_fail(num_grades: int = 1000):
    """Compare ``vec_pass_fail`` with ``slow_pass_fail`` and report timings.

    Both functions are run on ``num_grades`` random letter grades. If
    ``vec_pass_fail`` still returns ``None`` a "not implemented" notice is
    printed; otherwise the results must match as DataFrames (dtype ignored)
    before the timing helpers from ``util`` are called with 20 reps.

    Args:
        num_grades: Number of letter grades to generate.

    Raises:
        AssertionError: If the two results differ.
    """
    print("\n\nQ6: Running test_pass_fail...\n")
    # slow_pass_fail compares each element with "A"/"B"/"C". Feeding it the
    # integer scores from random_grades made every expected value "F", so the
    # pass branch was never exercised. Sample letter grades instead, including
    # two non-passing letters so the fall-through branch sees more than one value.
    letter_grades = np.random.choice(["A", "B", "C", "D", "F"], size=num_grades)
    output_slow = pd.DataFrame(slow_pass_fail(letter_grades))
    output_vec = vec_pass_fail(letter_grades)
    if output_vec is None:
        print(" vec_pass_fail is not implemented")
        return
    output_vec_df = pd.DataFrame(output_vec)
    pd.testing.assert_frame_equal(output_slow, output_vec_df, check_dtype=False)
    timings, _ = time_funcs(
        [slow_pass_fail, vec_pass_fail],
        [[letter_grades], [letter_grades]],
        ["slow_pass_fail", "vec_pass_fail"],
        reps=20,
    )
    print_time_results(timings, num_grades)

0 commit comments

Comments
 (0)