|
19 | 19 | from __future__ import print_function
|
20 | 20 |
|
21 | 21 | from absl.testing import absltest
|
| 22 | +import numpy as np |
22 | 23 | import pandas as pd
|
23 | 24 | import pyarrow as pa
|
24 | 25 |
|
@@ -46,6 +47,21 @@ def test_example_value_presence(self):
|
46 | 47 | lift_stats_generator._get_example_value_presence(
|
47 | 48 | t, types.FeaturePath(['x']), boundaries=None))
|
48 | 49 |
|
def test_example_value_presence_string_value(self):
  """Checks value presence for a string-valued feature with no boundaries.

  The input column 'x' holds four examples: ['a'], ['a', 'a'], ['a', 'b']
  and ['b']. The expected series carries one row per (example, value) pair,
  so example 1 appears once even though it contains 'a' twice, and
  example 2 appears twice (once for 'a', once for 'b').
  """
  x_path = types.FeaturePath(['x'])
  table = pa.Table.from_arrays(
      [pa.array([['a'], ['a', 'a'], ['a', 'b'], ['b']])], ['x'])

  # Categorical codes 0 and 1 refer to categories 'a' and 'b' respectively.
  expected_values = pd.Categorical.from_codes(
      [0, 0, 0, 1, 1], categories=['a', 'b'])
  expected_index = pd.Index([0, 1, 2, 2, 3], name='example_indices')
  expected = pd.Series(expected_values, name='values', index=expected_index)

  actual = lift_stats_generator._get_example_value_presence(
      table, x_path, boundaries=None)
  pd.testing.assert_series_equal(expected, actual)
| 64 | + |
49 | 65 | def test_example_value_presence_none_value(self):
|
50 | 66 | t = pa.Table.from_arrays([
|
51 | 67 | pa.array([[1], None]),
|
@@ -709,6 +725,64 @@ def test_lift_null_y(self):
|
709 | 725 | add_default_slice_key_to_input=True,
|
710 | 726 | add_default_slice_key_to_output=True)
|
711 | 727 |
|
def test_lift_missing_x_and_y(self):
  """Runs the lift generator on a zero-row table and expects no output.

  Both 'categorical_x' and 'string_y' are empty list<binary> columns, and
  the schema declares both features as BYTES.
  """

  def _empty_bytes_column():
    # Construct the type explicitly so the empty array is not treated as a
    # null-typed array.
    return pa.array([], type=pa.list_(pa.binary()))

  empty_table = pa.Table.from_arrays(
      [_empty_bytes_column(), _empty_bytes_column()],
      ['categorical_x', 'string_y'])

  schema_text = """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'string_y'
        type: BYTES
      }
      """
  parsed_schema = text_format.Parse(schema_text, schema_pb2.Schema())

  lift_generator = lift_stats_generator.LiftStatsGenerator(
      schema=parsed_schema, y_path=types.FeaturePath(['string_y']))

  # No statistics are expected for an input without any examples.
  self.assertSlicingAwareTransformOutputEqual(
      [empty_table],
      lift_generator,
      [],
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
| 756 | + |
def test_lift_float_y_is_nan(self):
  """Runs the lift generator when the only float y value is NaN.

  The single example has categorical_x = ['a'] and float_y = [nan]; the
  generator is configured with y_boundaries=[1] and is expected to emit
  nothing.
  """
  # After calling bin_array, the NaN-only y column is effectively an empty
  # array.
  x_column = pa.array([['a']])
  y_column = pa.array([[np.nan]])
  nan_table = pa.Table.from_arrays(
      [x_column, y_column], ['categorical_x', 'float_y'])

  schema_text = """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'float_y'
        type: FLOAT
      }
      """
  parsed_schema = text_format.Parse(schema_text, schema_pb2.Schema())

  lift_generator = lift_stats_generator.LiftStatsGenerator(
      schema=parsed_schema,
      y_path=types.FeaturePath(['float_y']),
      y_boundaries=[1])

  self.assertSlicingAwareTransformOutputEqual(
      [nan_table],
      lift_generator,
      [],
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
| 785 | + |
712 | 786 | def test_lift_min_x_count(self):
|
713 | 787 | examples = [
|
714 | 788 | pa.Table.from_arrays([
|
|
0 commit comments