Commit 076ff26

Replace DataFrame.drop_duplicates with dictionary_encode and np.unique
PiperOrigin-RevId: 296922505
1 parent 776881e commit 076ff26
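As a rough, self-contained illustration of the change named in the commit message (toy arrays only; none of these variables come from the TFDV code itself), the old pandas-based deduplication and its NumPy replacement look roughly like this:

import numpy as np
import pandas as pd

# Toy (example_index, value) pairs; the middle two columns are duplicates.
example_indices = np.array([0, 1, 1, 2])
values = np.array([3, 5, 5, 4])

# Old approach: materialize a DataFrame and drop duplicate rows.
df = pd.DataFrame({'example_indices': example_indices, 'values': values})
old_result = df.drop_duplicates().set_index('example_indices')['values']

# New approach: stack the pairs into a 2 x N array and deduplicate columns.
# Note that np.unique returns the surviving columns in sorted order.
pairs = np.vstack([example_indices, values])
unique_pairs = np.unique(pairs, axis=1)
new_result = pd.Series(unique_pairs[1, :], name='values',
                       index=pd.Index(unique_pairs[0, :],
                                      name='example_indices'))

# Both yield values [3, 5, 4] at example indices [0, 1, 2] for this input.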

3 files changed: +99 -10 lines changed


RELEASE.md

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
 ## Bug Fixes and Other Changes
 
 * Fix facets visualization.
+* Optimize LiftStatsGenerator for string features.
 
 ## Breaking Changes
 
tensorflow_data_validation/statistics/generators/lift_stats_generator.py

Lines changed: 24 additions & 10 deletions
@@ -133,21 +133,35 @@ def _get_example_value_presence(
     return None
 
   arr_flat = arr.flatten()
+  is_binary_like = arrow_util.is_binary_like(arr_flat.type)
+  assert boundaries is None or not is_binary_like, (
+      'Boundaries can only be applied to numeric columns')
+  if is_binary_like:
+    # Use dictionary_encode so we can use np.unique on object arrays.
+    dict_array = arr_flat.dictionary_encode()
+    arr_flat = dict_array.indices
+    arr_flat_dict = np.asarray(dict_array.dictionary)
   example_indices_flat = example_indices[
       array_util.GetFlattenedArrayParentIndices(arr).to_numpy()]
   if boundaries is not None:
     element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
-    df = pd.DataFrame({
-        'example_indices': example_indices_flat[element_indices],
-        'values': bins
-    })
+    pairs = np.vstack([example_indices_flat[element_indices], bins])
   else:
-    df = pd.DataFrame({
-        'example_indices': example_indices_flat,
-        'values': np.asarray(arr_flat)
-    })
-  df_unique = df.drop_duplicates()
-  return df_unique.set_index('example_indices')['values']
+    pairs = np.vstack([example_indices_flat, np.asarray(arr_flat)])
+  if not pairs.size:
+    return None
+  # Deduplicate values which show up more than once in the same example. This
+  # makes P(X=x|Y=y) in the standard lift definition behave as
+  # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
+  unique_pairs = np.unique(pairs, axis=1)
+  example_indices = unique_pairs[0, :]
+  values = unique_pairs[1, :]
+  if is_binary_like:
+    # Return binary-like values as a pd.Categorical wrapped in a Series. This
+    # makes subsequent operations like pd.merge cheaper.
+    values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
+  return pd.Series(values, name='values',
+                   index=pd.Index(example_indices, name='example_indices'))
 
 
 def _to_partial_copresence_counts(
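A minimal standalone sketch of the new string-feature path follows; the inputs below are toy data mirroring the new unit test, not calls into this module, and should behave as described given standard pyarrow, NumPy, and pandas semantics.

import numpy as np
import pandas as pd
import pyarrow as pa

# Flattened string values and their parent example indices, mimicking
# examples [['a'], ['a', 'a'], ['a', 'b'], ['b']].
arr_flat = pa.array(['a', 'a', 'a', 'a', 'b', 'b'])
example_indices_flat = np.array([0, 1, 1, 2, 2, 3])

# dictionary_encode maps each string to an integer code plus a dictionary of
# distinct values, so deduplication runs on integers instead of objects.
dict_array = arr_flat.dictionary_encode()
codes = np.asarray(dict_array.indices)
dictionary = np.asarray(dict_array.dictionary)

# Stack (example_index, code) pairs and drop duplicate columns; this is the
# np.unique(..., axis=1) replacement for DataFrame.drop_duplicates.
pairs = np.vstack([example_indices_flat, codes])
unique_pairs = np.unique(pairs, axis=1)

# Decode the surviving codes back to strings as a pd.Categorical.
values = pd.Categorical.from_codes(unique_pairs[1, :], categories=dictionary)
presence = pd.Series(values, name='values',
                     index=pd.Index(unique_pairs[0, :],
                                    name='example_indices'))
# For this input, presence mirrors the expected series constructed in
# test_example_value_presence_string_value below.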

tensorflow_data_validation/statistics/generators/lift_stats_generator_test.py

Lines changed: 74 additions & 0 deletions
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 from absl.testing import absltest
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 
@@ -46,6 +47,21 @@ def test_example_value_presence(self):
         lift_stats_generator._get_example_value_presence(
             t, types.FeaturePath(['x']), boundaries=None))
 
+  def test_example_value_presence_string_value(self):
+    t = pa.Table.from_arrays([
+        pa.array([['a'], ['a', 'a'], ['a', 'b'], ['b']]),
+    ], ['x'])
+    expected_cat = pd.Categorical.from_codes([0, 0, 0, 1, 1],
+                                             categories=['a', 'b'])
+    expected_series = pd.Series(expected_cat,
+                                name='values',
+                                index=pd.Index([0, 1, 2, 2, 3],
+                                               name='example_indices'))
+    pd.testing.assert_series_equal(
+        expected_series,
+        lift_stats_generator._get_example_value_presence(
+            t, types.FeaturePath(['x']), boundaries=None))
+
   def test_example_value_presence_none_value(self):
     t = pa.Table.from_arrays([
         pa.array([[1], None]),
@@ -709,6 +725,64 @@ def test_lift_null_y(self):
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
 
+  def test_lift_missing_x_and_y(self):
+    examples = [
+        pa.Table.from_arrays([
+            # explicitly construct type to avoid treating as null type
+            pa.array([], type=pa.list_(pa.binary())),
+            pa.array([], type=pa.list_(pa.binary())),
+        ], ['categorical_x', 'string_y']),
+    ]
+    schema = text_format.Parse(
+        """
+        feature {
+          name: 'categorical_x'
+          type: BYTES
+        }
+        feature {
+          name: 'string_y'
+          type: BYTES
+        }
+        """, schema_pb2.Schema())
+    expected_result = []
+    generator = lift_stats_generator.LiftStatsGenerator(
+        schema=schema, y_path=types.FeaturePath(['string_y']))
+    self.assertSlicingAwareTransformOutputEqual(
+        examples,
+        generator,
+        expected_result,
+        add_default_slice_key_to_input=True,
+        add_default_slice_key_to_output=True)
+
+  def test_lift_float_y_is_nan(self):
+    # after calling bin_array, this is effectively an empty array.
+    examples = [
+        pa.Table.from_arrays([
+            pa.array([['a']]),
+            pa.array([[np.nan]]),
+        ], ['categorical_x', 'float_y']),
+    ]
+    schema = text_format.Parse(
+        """
+        feature {
+          name: 'categorical_x'
+          type: BYTES
+        }
+        feature {
+          name: 'float_y'
+          type: FLOAT
+        }
+        """, schema_pb2.Schema())
+    expected_result = []
+    generator = lift_stats_generator.LiftStatsGenerator(
+        schema=schema, y_path=types.FeaturePath(['float_y']), y_boundaries=[1])
+    self.assertSlicingAwareTransformOutputEqual(
+        examples,
+        generator,
+        expected_result,
+        add_default_slice_key_to_input=True,
+        add_default_slice_key_to_output=True)
+
   def test_lift_min_x_count(self):
     examples = [
         pa.Table.from_arrays([
