|
1 | 1 | """Data transformer integration testing module."""
|
2 | 2 |
|
3 |
| - |
4 |
| -# Data Transformer tests that should be implemented in the future. |
5 |
| -def test_constant(): |
6 |
| - """Test transforming a dataframe containing constant values.""" |
7 |
| - |
8 |
| - |
9 |
| -def test_df_continuous(): |
10 |
| - """Test transforming a dataframe containing only continuous values.""" |
11 |
| - # validate output ranges [0, 1] |
12 |
| - # validate output shape (# samples, # output dims) |
13 |
| - # validate that forward transform is **not** deterministic |
14 |
| - # make sure it can be inverted |
15 |
| - |
16 |
| - |
17 |
| -def test_df_categorical(): |
18 |
| - """Test transforming a dataframe containing only categorical values.""" |
19 |
| - # validate output ranges [0, 1] |
20 |
| - # validate output shape (# samples, # output dims) |
21 |
| - # validate that forward transform is deterministic |
22 |
| - # make sure it can be inverted |
23 |
| - |
24 |
| - |
25 |
| -def test_df_mixed(): |
26 |
| - """Test transforming a dataframe containing mixed data types.""" |
27 |
| - |
28 |
| - |
29 |
| -def test_df_mixed_nan(): |
30 |
| - """Test transforming a dataframe containing mixed data types + NaN for categoricals.""" |
31 |
| - |
32 |
| - |
33 |
| -def test_np_continuous(): |
34 |
| - """Test transforming a np.array containing only continuous values.""" |
35 |
| - |
36 |
| - |
37 |
| -def test_np_categorical(): |
38 |
| - """Test transforming a np.array containing only categorical values.""" |
39 |
| - |
40 |
| - |
41 |
| -def test_np_mixed(): |
42 |
| - """Test transforming a np.array containing mixed data types.""" |
| 3 | +from unittest import TestCase |
| 4 | + |
| 5 | +import numpy as np |
| 6 | +import pandas as pd |
| 7 | + |
| 8 | +from ctgan.data_transformer import DataTransformer |
| 9 | + |
| 10 | + |
class TestDataTransformer(TestCase):
    """End-to-end tests for ``DataTransformer``: fit, transform and inverse_transform.

    Each test fits a transformer on 1000 rows, transforms them, inverts the
    transformed matrix and then inspects the transformed output. Continuous
    columns are expected to yield one normalized scalar column followed by
    one-hot indicator columns; categorical columns are expected to yield
    one-hot indicator columns only.
    """

    @staticmethod
    def _assert_normalized(values):
        """Assert ``values`` lie strictly inside (-1, 1) and average close to 0."""
        assert (values > -1).all()
        assert (values < 1).all()
        assert -.1 < np.mean(values) < .1

    @staticmethod
    def _assert_binary(columns):
        """Assert every entry of ``columns`` is either 0 or 1."""
        assert np.isin(columns, [0, 1]).all()

    def test_constant(self):
        """Test transforming a dataframe containing constant values."""
        # Setup
        data = pd.DataFrame({'cnt': [123] * 1000})
        transformer = DataTransformer()

        # Run
        transformer.fit(data, [])
        new_data = transformer.transform(data)
        recovered = transformer.inverse_transform(new_data)

        # Assert the inverse transform gives back as many rows/columns as went in
        assert recovered.shape == data.shape

        # Assert transformed values are between -1 and 1, centered in 0
        self._assert_normalized(new_data[:, 0])

        # Assert the spread is ~ 0, since every input value is identical
        assert 0 <= np.std(new_data[:, 0]) < .1

        # Assert there are at most `max_columns=10` one hot columns
        assert new_data.shape[0] == 1000
        assert new_data.shape[1] <= 11
        self._assert_binary(new_data[:, 1:])

    def test_df_continuous(self):
        """Test transforming a dataframe containing only continuous values."""
        # Setup
        data = pd.DataFrame({'col': np.random.normal(size=1000)})
        transformer = DataTransformer()

        # Run
        transformer.fit(data, [])
        new_data = transformer.transform(data)
        recovered = transformer.inverse_transform(new_data)

        # Assert the inverse transform gives back as many rows/columns as went in
        assert recovered.shape == data.shape

        # Assert transformed values are between -1 and 1, centered in 0
        self._assert_normalized(new_data[:, 0])

        # Assert transformed values have std = 1/4
        assert .2 < np.std(new_data[:, 0]) < .3

        # Assert there are at most `max_columns=10` one hot columns
        assert new_data.shape[0] == 1000
        assert new_data.shape[1] <= 11
        self._assert_binary(new_data[:, 1:])

    def test_df_categorical_constant(self):
        """Test transforming a dataframe containing only constant categorical values."""
        # Setup
        data = pd.DataFrame({'cnt': [123] * 1000})
        transformer = DataTransformer()

        # Run
        transformer.fit(data, ['cnt'])
        new_data = transformer.transform(data)
        recovered = transformer.inverse_transform(new_data)

        # Assert the inverse transform gives back as many rows/columns as went in
        assert recovered.shape == data.shape

        # Assert there is only 1 one hot vector
        assert np.array_equal(new_data, np.ones((len(data), 1)))

    def test_df_categorical(self):
        """Test transforming a dataframe containing only categorical values."""
        # Setup
        data = pd.DataFrame({'cat': np.random.choice(['a', 'b', 'c'], size=1000)})
        transformer = DataTransformer()

        # Run
        transformer.fit(data, ['cat'])
        new_data = transformer.transform(data)
        recovered = transformer.inverse_transform(new_data)

        # Assert the inverse transform gives back as many rows/columns as went in
        assert recovered.shape == data.shape

        # Assert there are 3 one hot vectors
        assert new_data.shape[0] == 1000
        assert new_data.shape[1] == 3

        # There is no scalar column here, so column 0 is a one hot column too
        # and the whole matrix must be binary.
        self._assert_binary(new_data)

    def test_df_mixed(self):
        """Test transforming a dataframe containing mixed data types."""
        # Setup
        data = pd.DataFrame({
            'num': np.random.normal(size=1000),
            'cat': np.random.choice(['a', 'b', 'c'], size=1000)
        })
        transformer = DataTransformer()

        # Run
        transformer.fit(data, ['cat'])
        new_data = transformer.transform(data)
        recovered = transformer.inverse_transform(new_data)

        # Assert the inverse transform gives back as many rows/columns as went in
        assert recovered.shape == data.shape

        # Assert transformed numerical values are between -1 and 1, centered in 0
        self._assert_normalized(new_data[:, 0])

        # Assert transformed numerical values have std = 1/4
        assert .2 < np.std(new_data[:, 0]) < .3

        # Assert there are at most `max_columns=10` one hot columns for the numerical values
        # and 3 for the categorical ones
        assert new_data.shape[0] == 1000
        assert 5 <= new_data.shape[1] <= 17
        self._assert_binary(new_data[:, 1:])

    def test_numpy(self):
        """Test transforming a numpy array."""
        # Setup
        data = pd.DataFrame({
            'num': np.random.normal(size=1000),
            'cat': np.random.choice(['a', 'b', 'c'], size=1000)
        })
        data = np.array(data)
        transformer = DataTransformer()

        # Run: the discrete column is identified by position, not by name
        transformer.fit(data, [1])
        new_data = transformer.transform(data)
        recovered = transformer.inverse_transform(new_data)

        # Assert the inverse transform gives back as many rows/columns as went in
        assert recovered.shape == data.shape

        # Assert transformed numerical values are between -1 and 1, centered in 0
        self._assert_normalized(new_data[:, 0])

        # Assert transformed numerical values have std = 1/4
        assert .2 < np.std(new_data[:, 0]) < .3

        # Assert there are at most `max_columns=10` one hot columns for the numerical values
        # and 3 for the categorical ones
        assert new_data.shape[0] == 1000
        assert 5 <= new_data.shape[1] <= 17
        self._assert_binary(new_data[:, 1:])
0 commit comments