Skip to content

Commit 7520bd7

Browse files
committed
fix: the banking example dataset's link was dead; replaced it with the original source
1 parent e91019f commit 7520bd7

File tree

3 files changed

+43
-10
lines changed

3 files changed

+43
-10
lines changed

examples/bank_marketing_data/banking_data.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55
import pandas as pd
66

77
from pandas_profiling import ProfileReport
8-
from pandas_profiling.utils.cache import cache_file
8+
from pandas_profiling.utils.cache import cache_zipped_file
99

1010
if __name__ == "__main__":
11-
file_name = cache_file(
11+
file_name = cache_zipped_file(
1212
"bank-full.csv",
13-
"https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
13+
"https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
1414
)
1515

1616
# Download the UCI Bank Marketing Dataset

src/pandas_profiling/utils/cache.py

+37-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Dataset cache utility functions"""
2+
import zipfile
23
from pathlib import Path
34

45
import requests
@@ -20,9 +21,41 @@ def cache_file(file_name: str, url: str) -> Path:
2021
data_path = get_data_path()
2122
data_path.mkdir(exist_ok=True)
2223

24+
file_path = data_path / file_name
25+
2326
# If not exists, download and create file
24-
if not (data_path / file_name).exists():
25-
data = requests.get(url)
26-
(data_path / file_name).write_bytes(data.content)
27+
if not file_path.exists():
28+
response = requests.get(url)
29+
file_path.write_bytes(response.content)
30+
31+
return file_path
32+
33+
34+
def cache_zipped_file(file_name: str, url: str) -> Path:
35+
"""Check if file_name already is in the data path, otherwise download it from url.
36+
37+
Args:
38+
file_name: the file name
39+
url: the URL of the dataset
40+
41+
Returns:
42+
The relative path to the dataset
43+
"""
44+
45+
data_path = get_data_path()
46+
data_path.mkdir(exist_ok=True)
47+
48+
file_path = data_path / file_name
49+
50+
# If not exists, download and create file
51+
if not file_path.exists():
52+
response = requests.get(url)
53+
tmp_path = data_path / "tmp.zip"
54+
tmp_path.write_bytes(response.content)
55+
56+
with zipfile.ZipFile(tmp_path, "r") as zip_file:
57+
zip_file.extract(file_path.name, data_path)
58+
59+
tmp_path.unlink()
2760

28-
return data_path / file_name
61+
return file_path

tests/issues/test_issue377.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@
88
import pytest
99

1010
import pandas_profiling
11-
from pandas_profiling.utils.cache import cache_file
11+
from pandas_profiling.utils.cache import cache_zipped_file
1212

1313

1414
@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher")
1515
def test_issue377():
16-
file_name = cache_file(
16+
file_name = cache_zipped_file(
1717
"bank-full.csv",
18-
"https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
18+
"https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
1919
)
2020

2121
# Download the UCI Bank Marketing Dataset

0 commit comments

Comments
 (0)