Commit 7bca5fa

Add integration tests for DataTransformer, remove unnecessary code and fix max_clusters bug (#314)
* Remove unnecessary tests
* Lots of minor changes
* Fix lint
* Fix lint
* Fix lint...
* Add weight parameter
* Fix test
* Fix typos

1 parent 2848a42 · commit 7bca5fa

File tree

7 files changed: +154 additions, -312 deletions


ctgan/data_transformer.py

Lines changed: 6 additions & 3 deletions

@@ -18,8 +18,8 @@
 class DataTransformer(object):
     """Data Transformer.
 
-    Model continuous columns with a BayesianGMM and normalized to a scalar [0, 1] and a vector.
-    Discrete columns are encoded using a scikit-learn OneHotEncoder.
+    Model continuous columns with a BayesianGMM and normalize them to a scalar between [-1, 1]
+    and a vector. Discrete columns are encoded using a OneHotEncoder.
     """
 
     def __init__(self, max_clusters=10, weight_threshold=0.005):

@@ -47,7 +47,10 @@ def _fit_continuous(self, data):
         """
         column_name = data.columns[0]
         gm = ClusterBasedNormalizer(
-            missing_value_generation='from_column', max_clusters=min(len(data), 10))
+            missing_value_generation='from_column',
+            max_clusters=min(len(data), self._max_clusters),
+            weight_threshold=self._weight_threshold
+        )
         gm.fit(data, column_name)
         num_components = sum(gm.valid_component_indicator)
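For context, the practical effect of this change is that the `max_clusters` and `weight_threshold` arguments accepted by `DataTransformer.__init__` are now forwarded to the `ClusterBasedNormalizer`, rather than the cluster count being capped at a hard-coded 10. A minimal sketch of how a caller can rely on the fix (the column name and sizes here are illustrative, not part of this commit):

    import numpy as np
    import pandas as pd

    from ctgan.data_transformer import DataTransformer

    # Illustrative continuous column.
    data = pd.DataFrame({'num': np.random.normal(size=500)})

    # With this fix, max_clusters=3 limits the BayesianGMM to 3 components,
    # so the output is 1 scalar column plus at most 3 one-hot columns.
    transformer = DataTransformer(max_clusters=3, weight_threshold=0.01)
    transformer.fit(data, [])
    transformed = transformer.transform(data)
    assert transformed.shape[1] <= 4  # before the fix, the cap was always min(len(data), 10)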

tests/integration/synthesizer/test_ctgan.py

Lines changed: 0 additions & 50 deletions

@@ -231,56 +231,6 @@ def test_fixed_random_seed():
     np.testing.assert_array_equal(sampled_0_1, sampled_1_1)
 
 
-# Below are CTGAN tests that should be implemented in the future
-def test_continuous():
-    """Test training the CTGAN synthesizer on a continuous dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using kstest:
-    # - uniform (assert p-value > 0.05)
-    # - gaussian (assert p-value > 0.05)
-    # - inversely correlated (assert correlation < 0)
-    pass
-
-
-def test_categorical():
-    """Test training the CTGAN synthesizer on a categorical dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using cstest:
-    # - uniform (assert p-value > 0.05)
-    # - very skewed / biased? (assert p-value > 0.05)
-    # - inversely correlated (assert correlation < 0)
-    pass
-
-
-def test_categorical_log_frequency():
-    """Test training the CTGAN synthesizer on a small categorical dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using cstest:
-    # - uniform (assert p-value > 0.05)
-    # - very skewed / biased? (assert p-value > 0.05)
-    # - inversely correlated (assert correlation < 0)
-    pass
-
-
-def test_mixed():
-    """Test training the CTGAN synthesizer on a small mixed-type dataset."""
-    # assert the distribution of the samples is close to the distribution of the data
-    # using a kstest for continuous + a cstest for categorical.
-    pass
-
-
-def test_conditional():
-    """Test training the CTGAN synthesizer and sampling conditioned on a categorical."""
-    # verify that conditioning increases the likelihood of getting a sample with the specified
-    # categorical value
-    pass
-
-
-def test_batch_size_pack_size():
-    """Test that if batch size is not a multiple of pack size, it raises a sane error."""
-    pass
-
-
 def test_ctgan_save_and_load(tmpdir):
     """Test that the ``CTGAN`` model can be saved and loaded."""
     # Setup
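The placeholders removed above sketched a distribution-comparison approach: compare real and synthetic continuous columns with a KS test and require a p-value above 0.05. For reference, a hedged sketch of that idea using scipy's two-sample KS test; the CTGAN fit/sample calls follow the public API, while the column name, sizes, and epoch count are illustrative:

    import numpy as np
    import pandas as pd
    from scipy.stats import ks_2samp

    from ctgan import CTGAN

    # Illustrative continuous training data; a real test would need enough
    # epochs for the synthetic distribution to match the real one.
    data = pd.DataFrame({'num': np.random.normal(size=1000)})

    ctgan = CTGAN(epochs=1)
    ctgan.fit(data, discrete_columns=[])
    samples = ctgan.sample(1000)

    # Two-sample KS test between real and synthetic values; the removed
    # placeholder proposed asserting p_value > 0.05 once the model is trained.
    statistic, p_value = ks_2samp(data['num'], samples['num'])
    print(f'KS statistic={statistic:.3f}, p-value={p_value:.3f}')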

tests/integration/synthesizer/test_tvae.py

Lines changed: 0 additions & 22 deletions

@@ -56,28 +56,6 @@ def test_drop_last_false():
     assert correct >= 95
 
 
-# TVAE tests that should be implemented in the future.
-def test_continuous():
-    """Test training the TVAE synthesizer on a small continuous dataset."""
-    # verify that the distribution of the samples is close to the distribution of the data
-    # using a kstest.
-    pass
-
-
-def test_categorical():
-    """Test training the TVAE synthesizer on a small categorical dataset."""
-    # verify that the distribution of the samples is close to the distribution of the data
-    # using a cstest.
-    pass
-
-
-def test_mixed():
-    """Test training the TVAE synthesizer on a small mixed-type dataset."""
-    # verify that the distribution of the samples is close to the distribution of the data
-    # using a kstest for continuous + a cstest for categorical.
-    pass
-
-
 def test__loss_function():
     """Test the TVAE produces average values similar to the training data."""
     data = pd.DataFrame({
Lines changed: 144 additions & 40 deletions

@@ -1,42 +1,146 @@
 """Data transformer intergration testing module."""
 
-
-# Data Transformer tests that should be implemented in the future.
-def test_constant():
-    """Test transforming a dataframe containing constant values."""
-
-
-def test_df_continuous():
-    """Test transforming a dataframe containing only continuous values."""
-    # validate output ranges [0, 1]
-    # validate output shape (# samples, # output dims)
-    # validate that forward transform is **not** deterministic
-    # make sure it can be inverted
-
-
-def test_df_categorical():
-    """Test transforming a dataframe containing only categorical values."""
-    # validate output ranges [0, 1]
-    # validate output shape (# samples, # output dims)
-    # validate that forward transform is deterministic
-    # make sure it can be inverted
-
-
-def test_df_mixed():
-    """Test transforming a dataframe containing mixed data types."""
-
-
-def test_df_mixed_nan():
-    """Test transforming a dataframe containing mixed data types + NaN for categoricals."""
-
-
-def test_np_continuous():
-    """Test transforming a np.array containing only continuous values."""
-
-
-def test_np_categorical():
-    """Test transforming a np.array containing only categorical values."""
-
-
-def test_np_mixed():
-    """Test transforming a np.array containing mixed data types."""
+from unittest import TestCase
+
+import numpy as np
+import pandas as pd
+
+from ctgan.data_transformer import DataTransformer
+
+
+class TestDataTransformer(TestCase):
+
+    def test_constant(self):
+        """Test transforming a dataframe containing constant values."""
+        # Setup
+        data = pd.DataFrame({'cnt': [123] * 1000})
+        transformer = DataTransformer()
+
+        # Run
+        transformer.fit(data, [])
+        new_data = transformer.transform(data)
+        transformer.inverse_transform(new_data)
+
+        # Assert transformed values are between -1 and 1
+        assert (new_data[:, 0] > -np.ones(len(new_data))).all()
+        assert (new_data[:, 0] < np.ones(len(new_data))).all()
+
+        # Assert transformed values are a gaussian centered in 0 and with std ~ 0
+        assert -.1 < np.mean(new_data[:, 0]) < .1
+        assert 0 <= np.std(new_data[:, 0]) < .1
+
+        # Assert there are at most `max_columns=10` one hot columns
+        assert new_data.shape[0] == 1000
+        assert new_data.shape[1] <= 11
+        assert np.isin(new_data[:, 1:], [0, 1]).all()
+
+    def test_df_continuous(self):
+        """Test transforming a dataframe containing only continuous values."""
+        # Setup
+        data = pd.DataFrame({'col': np.random.normal(size=1000)})
+        transformer = DataTransformer()
+
+        # Run
+        transformer.fit(data, [])
+        new_data = transformer.transform(data)
+        transformer.inverse_transform(new_data)
+
+        # Assert transformed values are between -1 and 1
+        assert (new_data[:, 0] > -np.ones(len(new_data))).all()
+        assert (new_data[:, 0] < np.ones(len(new_data))).all()
+
+        # Assert transformed values are a gaussian centered in 0 and with std = 1/4
+        assert -.1 < np.mean(new_data[:, 0]) < .1
+        assert .2 < np.std(new_data[:, 0]) < .3
+
+        # Assert there are at most `max_columns=10` one hot columns
+        assert new_data.shape[0] == 1000
+        assert new_data.shape[1] <= 11
+        assert np.isin(new_data[:, 1:], [0, 1]).all()
+
+    def test_df_categorical_constant(self):
+        """Test transforming a dataframe containing only constant categorical values."""
+        # Setup
+        data = pd.DataFrame({'cnt': [123] * 1000})
+        transformer = DataTransformer()
+
+        # Run
+        transformer.fit(data, ['cnt'])
+        new_data = transformer.transform(data)
+        transformer.inverse_transform(new_data)
+
+        # Assert there is only 1 one hot vector
+        assert np.array_equal(new_data, np.ones((len(data), 1)))
+
+    def test_df_categorical(self):
+        """Test transforming a dataframe containing only categorical values."""
+        # Setup
+        data = pd.DataFrame({'cat': np.random.choice(['a', 'b', 'c'], size=1000)})
+        transformer = DataTransformer()
+
+        # Run
+        transformer.fit(data, ['cat'])
+        new_data = transformer.transform(data)
+        transformer.inverse_transform(new_data)
+
+        # Assert there are 3 one hot vectors
+        assert new_data.shape[0] == 1000
+        assert new_data.shape[1] == 3
+        assert np.isin(new_data[:, 1:], [0, 1]).all()
+
+    def test_df_mixed(self):
+        """Test transforming a dataframe containing mixed data types."""
+        # Setup
+        data = pd.DataFrame({
+            'num': np.random.normal(size=1000),
+            'cat': np.random.choice(['a', 'b', 'c'], size=1000)
+        })
+        transformer = DataTransformer()
+
+        # Run
+        transformer.fit(data, ['cat'])
+        new_data = transformer.transform(data)
+        transformer.inverse_transform(new_data)
+
+        # Assert transformed numerical values are between -1 and 1
+        assert (new_data[:, 0] > -np.ones(len(new_data))).all()
+        assert (new_data[:, 0] < np.ones(len(new_data))).all()
+
+        # Assert transformed numerical values are a gaussian centered in 0 and with std = 1/4
+        assert -.1 < np.mean(new_data[:, 0]) < .1
+        assert .2 < np.std(new_data[:, 0]) < .3
+
+        # Assert there are at most `max_columns=10` one hot columns for the numerical values
+        # and 3 for the categorical ones
+        assert new_data.shape[0] == 1000
+        assert 5 <= new_data.shape[1] <= 17
+        assert np.isin(new_data[:, 1:], [0, 1]).all()
+
+    def test_numpy(self):
+        """Test transforming a numpy array."""
+        # Setup
+        data = pd.DataFrame({
+            'num': np.random.normal(size=1000),
+            'cat': np.random.choice(['a', 'b', 'c'], size=1000)
+        })
+        data = np.array(data)
+        transformer = DataTransformer()
+
+        # Run
+        transformer.fit(data, [1])
+        new_data = transformer.transform(data)
+        transformer.inverse_transform(new_data)
+
+        # Assert transformed numerical values are between -1 and 1
+        assert (new_data[:, 0] > -np.ones(len(new_data))).all()
+        assert (new_data[:, 0] < np.ones(len(new_data))).all()
+
+        # Assert transformed numerical values are a gaussian centered in 0 and with std = 1/4
+        assert -.1 < np.mean(new_data[:, 0]) < .1
+        assert .2 < np.std(new_data[:, 0]) < .3
+
+        # Assert there are at most `max_columns=10` one hot columns for the numerical values
+        # and 3 for the categorical ones
+        assert new_data.shape[0] == 1000
+        assert 5 <= new_data.shape[1] <= 17
+        assert np.isin(new_data[:, 1:], [0, 1]).all()
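One behaviour the new tests exercise only implicitly is the `inverse_transform` round trip: it is called in every test, but its output is never checked. A small sketch of how that round trip could be asserted for a categorical column; the exact-recovery expectation is an assumption based on the one-hot encoding being deterministic, not something this commit verifies:

    import numpy as np
    import pandas as pd

    from ctgan.data_transformer import DataTransformer

    data = pd.DataFrame({'cat': np.random.choice(['a', 'b', 'c'], size=200)})
    transformer = DataTransformer()
    transformer.fit(data, ['cat'])

    # Assumption: one-hot encoding is deterministic, so inverting the
    # transform should recover the original categories exactly.
    recovered = transformer.inverse_transform(transformer.transform(data))
    assert (recovered['cat'].to_numpy() == data['cat'].to_numpy()).all()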

tests/unit/synthesizer/test_ctgan.py

Lines changed: 0 additions & 46 deletions

@@ -295,49 +295,3 @@ def test__validate_discrete_columns(self):
         ctgan = CTGAN(epochs=1)
         with pytest.raises(ValueError, match=r'Invalid columns found: {\'doesnt exist\'}'):
             ctgan.fit(data, discrete_columns)
-
-    def test_sample(self):
-        """Test `sample` correctly sets `condition_info` and `global_condition_vec`.
-
-        Tests the first 7 lines of sample by mocking the DataTransformer and DataSampler
-        and checking that they are being correctly used.
-
-        Setup:
-            - Create and fit the synthesizer
-            - Mock DataTransformer, DataSampler
-
-        Input:
-            - n = integer
-            - condition_column = string (not None)
-            - condition_value = string (not None)
-
-        Output:
-            Not relevant
-
-        Note:
-            - I'm not sure we need this test
-        """
-
-    def test_set_device(self):
-        """Test 'set_device' if a GPU is available.
-
-        Check that decoder/encoder can successfully be moved to the device.
-        If the machine doesn't have a GPU, this test shouldn't run.
-
-        Setup:
-            - Move decoder/encoder to device
-
-        Input:
-            - device = string
-
-        Output:
-            None
-
-        Side Effects:
-            - Set `self._device` to `device`
-            - Moves `self.decoder` to `self._device`
-
-        Note:
-            - Need to be careful when checking whether the encoder is actually set
-            to the right device, since it's not saved (it's only used in fit).
-        """
