Added settings file to hold constants

wtsimple · wtsimple · commit 5eeb4c1e789b · 2020-07-14T08:55:05.000-04:00
diff --git a/__init__.py b/__init__.py
diff --git a/data_preprocessor.py b/data_preprocessor.py
@@ -1,2 +1,4 @@
 class DataPreprocessor(object):
-    pass
+    def __init__(self, train_df=None, test_df=None):
+        self.train_df = train_df
+        self.test_df = test_df
diff --git a/settings.py b/settings.py
@@ -0,0 +1,16 @@
+TEST_DATA_PATH = "data/data_example_for_tests.csv"
+COLUMN_NAMES = ["age",
+                "workclass",
+                "fnlwgt",
+                "education",
+                "education_num",
+                "marital_status",
+                "occupation",
+                "relationship",
+                "race",
+                "sex",
+                "capital_gain",
+                "capital_loss",
+                "hours_per_week",
+                "native_country",
+                "income"]
diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py
@@ -2,24 +2,7 @@
 from pyspark.sql import DataFrame
 
 from data_loader import DataLoader
-
-COLUMN_NAMES = ["age",
-                "workclass",
-                "fnlwgt",
-                "education",
-                "education_num",
-                "marital_status",
-                "occupation",
-                "relationship",
-                "race",
-                "sex",
-                "capital_gain",
-                "capital_loss",
-                "hours_per_week",
-                "native_country",
-                "income"]
-
-TEST_DATA_PATH = "data/data_example_for_tests.csv"
+from settings import TEST_DATA_PATH, COLUMN_NAMES
 
 
 @pytest.fixture
diff --git a/tests/test_data_preprocessor.py b/tests/test_data_preprocessor.py
@@ -3,13 +3,16 @@
 
 import pytest
 
+from data_loader import DataLoader
 from data_preprocessor import DataPreprocessor
+from settings import TEST_DATA_PATH, COLUMN_NAMES
 
 
 @pytest.fixture
 def preprocessor():
-    return DataPreprocessor()
+    df = DataLoader().load_relative(path=TEST_DATA_PATH, columns=COLUMN_NAMES)
+    return DataPreprocessor(train_df=df, test_df=df)
 
 
-def test_data_preprocessor_x(preprocessor):
+def test_data_preprocessor_explore_factors(preprocessor):
     pass
diff --git a/todo_list.md b/todo_list.md
@@ -1,6 +1,8 @@
 - [x] start and configure Spark
 - [x] load data from the files
 - [ ] prepare data for classification models
+    - [ ] explore the data without knowing distributions
+    - [ ] standardize and clean the data
     - [ ] string encode the factor columns
     - [ ] one-hot encode the numerically encoded factor columns
     - [ ] build dataframe with features vector and labels column
@@ -11,4 +13,4 @@
 - [ ] prepare data for regression
 - [ ] fit regression model(s)
 - [ ] obtain the regression metrics and compare the models
-- [ ]  improve README.md
+- [ ] improve README.md