Skip to content

Commit 3a652f0

Browse files
committed
data loader seems to be working OK
1 parent 40a55f9 commit 3a652f0

File tree

4 files changed

+78
-1
lines changed

4 files changed

+78
-1
lines changed

data/data_example_for_tests.csv

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
2+
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
3+
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
4+
53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K
5+
28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K
6+
37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K
7+
49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
8+
52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K
9+
31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K
10+
42, Private, 159449, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 5178, 0, 40, United-States, >50K

data_loader.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import inspect
2+
import os
3+
4+
from spark_launcher import SparkLauncher
5+
6+
7+
class DataLoader(object):
    """Load CSV files, addressed relative to this class's module, into Spark DataFrames."""

    def __init__(self):
        # SparkLauncher is project-local; ``load_relative`` reads its
        # ``session`` attribute to get at the DataFrame reader.
        self.spark = SparkLauncher()

    def load_relative(self, path='', columns=None):
        """Read a header-less CSV file and optionally name its columns.

        :param path: location of the CSV file, relative to the directory of
            the file that defines this class.
        :param columns: optional sequence of column names, applied by
            position. When empty or ``None`` the reader's default names are kept.
        :return: the loaded DataFrame, with column types inferred by the reader.
        """
        absolute_path = self._get_absolute_path(path)
        # header=False: the first line is data, not column names.
        df = self.spark.session.read.csv(absolute_path, header=False, inferSchema=True)
        if columns:
            df = self._rename_columns(columns, df)
        return df

    @staticmethod
    def _rename_columns(columns, df):
        """Rename the columns of ``df`` by position.

        The i-th entry of ``columns`` becomes the name of the i-th column.
        Surplus entries in ``columns`` are ignored, and columns without a
        matching entry keep their current name.

        :param columns: iterable of new column names.
        :param df: DataFrame whose columns should be renamed.
        :return: a DataFrame with the new column names.
        """
        existing = list(df.columns)
        names = list(columns)[:len(existing)]
        names.extend(existing[len(names):])
        # A single positional rename: chaining withColumnRenamed() is keyed by
        # name, so a new name that equals a not-yet-renamed column would make
        # a later step rename both columns.
        return df.toDF(*names)

    def _get_absolute_path(self, relative_path=''):
        """Resolve ``relative_path`` against the directory of this class's source file.

        The base directory is taken from ``self.__class__``, so a subclass
        defined in another module resolves paths next to that module.

        :param relative_path: path relative to the defining file's directory.
        :return: the joined, absolute path.
        """
        # abspath() guarantees an absolute base even if the module was loaded
        # through a relative file path.
        class_file = os.path.abspath(inspect.getfile(self.__class__))
        return os.path.join(os.path.dirname(class_file), relative_path)

tests/test_data_loader.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import pytest
2+
from pyspark.sql import DataFrame
3+
4+
from data_loader import DataLoader
5+
6+
# Names given to the 15 positional columns of the header-less example CSV.
COLUMN_NAMES = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]
21+
22+
23+
@pytest.fixture
def data_loader():
    """Provide a newly constructed ``DataLoader`` to each test that requests it."""
    loader = DataLoader()
    return loader
26+
27+
28+
def test_data_loader_loads_data(data_loader):
    """The example CSV loads as a DataFrame with the requested column names."""
    loaded = data_loader.load_relative(
        path="data/data_example_for_tests.csv",
        columns=COLUMN_NAMES,
    )
    assert isinstance(loaded, DataFrame)
    assert loaded.columns == COLUMN_NAMES

    # Spot-check the first row. The file has a space after every comma and
    # the loader keeps it, hence the leading space in the income value.
    head = loaded.first()
    assert head.age == 39
    assert head.income == ' <=50K'

todo_list.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
- [x] start and configure Spark
2-
- [ ] load data from the files
2+
- [ ] load data from the files
33
- [ ] prepare data for classification models
44
- [ ] string encode the factor columns
55
- [ ] one-hot encode the numerically encoded factor columns

0 commit comments

Comments
 (0)