Skip to content

Commit 5eeb4c1

Browse files
committed
Added settings file to hold constants
1 parent 7a2be60 commit 5eeb4c1

6 files changed

+28
-22
lines changed

__init__.py

Whitespace-only changes.

data_preprocessor.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
class DataPreprocessor(object):
2-
pass
2+
def __init__(self, train_df=None, test_df=None):
3+
self.train_df = train_df
4+
self.test_df = test_df

settings.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
TEST_DATA_PATH = "data/data_example_for_tests.csv"
2+
COLUMN_NAMES = ["age",
3+
"workclass",
4+
"fnlwgt",
5+
"education",
6+
"education_num",
7+
"marital_status",
8+
"occupation",
9+
"relationship",
10+
"race",
11+
"sex",
12+
"capital_gain",
13+
"capital_loss",
14+
"hours_per_week",
15+
"native_country",
16+
"income"]

tests/test_data_loader.py

+1-18
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,7 @@
22
from pyspark.sql import DataFrame
33

44
from data_loader import DataLoader
5-
6-
COLUMN_NAMES = ["age",
7-
"workclass",
8-
"fnlwgt",
9-
"education",
10-
"education_num",
11-
"marital_status",
12-
"occupation",
13-
"relationship",
14-
"race",
15-
"sex",
16-
"capital_gain",
17-
"capital_loss",
18-
"hours_per_week",
19-
"native_country",
20-
"income"]
21-
22-
TEST_DATA_PATH = "data/data_example_for_tests.csv"
5+
from settings import TEST_DATA_PATH, COLUMN_NAMES
236

247

258
@pytest.fixture

tests/test_data_preprocessor.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33

44
import pytest
55

6+
from data_loader import DataLoader
67
from data_preprocessor import DataPreprocessor
8+
from settings import TEST_DATA_PATH, COLUMN_NAMES
79

810

911
@pytest.fixture
1012
def preprocessor():
11-
return DataPreprocessor()
13+
df = DataLoader().load_relative(path=TEST_DATA_PATH, columns=COLUMN_NAMES)
14+
return DataPreprocessor(train_df=df, test_df=df)
1215

1316

14-
def test_data_preprocessor_x(preprocessor):
17+
def test_data_preprocessor_explore_factors(preprocessor):
1518
pass

todo_list.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
- [x] start and configure Spark
22
- [x] load data from the files
33
- [ ] prepare data for classification models
4+
- [ ] explore the data without knowing distributions
5+
- [ ] standardize and clean the data
46
- [ ] string encode the factor columns
57
- [ ] one-hot encode the numerically encoded factor columns
68
- [ ] build dataframe with features vector and labels column
@@ -11,4 +13,4 @@
1113
- [ ] prepare data for regression
1214
- [ ] fit regression model(s)
1315
- [ ] obtain the regression metrics and compare the models
14-
- [ ] improve README.md
16+
- [ ] improve README.md

0 commit comments

Comments
 (0)