Project import generated by Copybara.

tf-transform-team · zoyahav · commit 8d9423dbbb0b · 2018-06-28T16:26:42.000-04:00
PiperOrigin-RevId: 202529631
diff --git a/README.md b/README.md
@@ -53,7 +53,8 @@ other *untested* combinations may also work.
 
 |tensorflow-transform                                                            |tensorflow    |apache-beam[gcp]|
 |--------------------------------------------------------------------------------|--------------|----------------|
-|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.4.0           |
+|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.5.0           |
+|[0.8.0](https://github.com/tensorflow/transform/blob/v0.8.0/RELEASE.md)         |1.8           |2.5.0           |
 |[0.6.0](https://github.com/tensorflow/transform/blob/v0.6.0/RELEASE.md)         |1.6           |2.4.0           |
 |[0.5.0](https://github.com/tensorflow/transform/blob/v0.5.0/RELEASE.md)         |1.5           |2.3.0           |
 |[0.4.0](https://github.com/tensorflow/transform/blob/v0.4.0/RELEASE.md)         |1.4           |2.2.0           |
diff --git a/RELEASE.md b/RELEASE.md
@@ -1,4 +1,4 @@
-# Current version (not yet released; still in development)
+# Release 0.8.0
 
 ## Major Features and Improvements
 * Add TFTransformOutput utility class that wraps the output of tf.Transform for
@@ -25,12 +25,7 @@
   e.g. `tft.coders.ExampleProtoCoder`.
 * Setting dtypes for numpy arrays in `tft.coders.ExampleProtoCoder` and
   `tft.coders.CsvCoder`.
-* tft.mean now supports SparseTensor when reduce_instance_dimensions=True.
-  In this case it returns a scalar mean computed over the non-missing values of
-  the SparseTensor.
-* tft.mean now supports SparseTensor when reduce_instance_dimensions=False.
-  In this case it returns a vector mean computed over the non-missing values of
-  the SparseTensor.
+* `tft.mean`, `tft.max` and `tft.var` now support `tf.SparseTensor`.
 * Update examples to use "core" TensorFlow estimator API (`tf.estimator`).
 * Depends on `protobuf>=3.6.0<4`.
 
diff --git a/examples/census_example.py b/examples/census_example.py
@@ -208,7 +208,7 @@ def convert_label(label):
           | 'FixCommasTestData' >> beam.Map(
               lambda line: line.replace(', ', ','))
           | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
-          | 'DecodeTestData' >> beam.Map(converter.decode))
+          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))
 
       raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)
 
diff --git a/examples/sentiment.md b/examples/sentiment.md
@@ -30,7 +30,7 @@ example, the data in this example uses a single feature for the full text of a
 movie review. This is split into sentences using the `tf.string_split`
 function. The `tf.string_split` function takes a rank 1 tensor and converts it
 to a rank 2 `SparseTensor` that contains the individual tokens. Then, using
-`tft.string_to_int`, this `SparseTensor` is converted to a
+`tft.compute_and_apply_vocabulary`, this `SparseTensor` is converted to a
 `SparseTensor` of `int64`s with the same shape.
 
 During the training and evaluation phase, the `SparseTensor` that represents
diff --git a/examples/sentiment_example.md b/examples/sentiment_example.md
diff --git a/setup.py b/setup.py
@@ -11,25 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Package Setup script for the tf.Transform binary.
+"""Package Setup script for tf.Transform.
 """
 from setuptools import find_packages
 from setuptools import setup
 
 # Tensorflow transform version.
-__version__ = '0.8.0dev'
+__version__ = '0.8.0'
 
 
 def _make_required_install_packages():
   return [
       'absl-py>=0.1.6',
-      'apache-beam[gcp]>=2.4,<3',
-      'numpy>=1.10,<2',
+      'apache-beam[gcp]>=2.5,<3',
+      'numpy>=1.13.3,<2',
 
       # TF now requires protobuf>=3.6.0.
       'protobuf>=3.6.0,<4',
 
-      'six>=1.9,<2',
+      'six>=1.10,<2',
 
   ]
 
diff --git a/tensorflow_transform/analyzers.py b/tensorflow_transform/analyzers.py
@@ -382,7 +382,32 @@ def max(x, reduce_instance_dims=True, name=None):  # pylint: disable=redefined-b
   Returns:
     A `Tensor`. Has the same type as `x`.
   """
-  return _numeric_combine([x], np.max, reduce_instance_dims, name)[0]
+  combine_fn = np.max
+  if isinstance(x, tf.SparseTensor):
+    if reduce_instance_dims:
+      x = x.values
+    else:
+      sparse_ones = tf.SparseTensor(
+          indices=x.indices,
+          values=tf.ones_like(x.values),
+          dense_shape=x.dense_shape)
+      ones_values = tf.sparse_reduce_sum(sparse_ones, axis=0, keep_dims=True)
+      # sparse_reduce_max returns 0 when all elements are missing along
+      # axis 0, which would be mistaken for a real maximum of 0. Replace
+      # those entries with NaN for float32/float64 inputs and with
+      # dtype.min for every other dtype.
+      batch_has_no_values = tf.equal(ones_values, tf.cast(0, x.dtype))
+      x = tf.sparse_reduce_max(x, axis=0, keep_dims=True)
+      if x.dtype == tf.float32:
+        missing_value = np.nan
+        combine_fn = np.nanmax
+      elif x.dtype == tf.float64:
+        missing_value = np.float64(np.nan)
+        combine_fn = np.nanmax
+      else:
+        missing_value = x.dtype.min
+      x = tf.where(batch_has_no_values, tf.fill(tf.shape(x), missing_value), x)
+  return _numeric_combine([x], combine_fn, reduce_instance_dims, name)[0]
 
 
 def _min_and_max(x, reduce_instance_dims=True, name=None):  # pylint: disable=redefined-builtin
@@ -500,8 +525,8 @@ def var(x, reduce_instance_dims=True, name=None, output_dtype=None):
   (x - mean(x))**2 / length(x).
 
   Args:
-    x: A `Tensor`. Its type must be floating point (float{16|32|64}), or
-        integral ([u]int{8|16|32|64}).
+    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+        (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
         to arrive at a single scalar output. If False, only collapses the batch
         dimension and outputs a vector of the same shape as the input.
@@ -517,23 +542,34 @@ def var(x, reduce_instance_dims=True, name=None, output_dtype=None):
     TypeError: If the type of `x` is not supported.
   """
   with tf.name_scope(name, 'var'):
-    # Note: Calling `mean`, `sum`, and `size` as defined in this module, not the
-    # builtins.
-    x_mean = mean(x, reduce_instance_dims, output_dtype=output_dtype)
-    # x_mean will be float16, float32, or float64, depending on type of x.
-    squared_deviations = tf.square(tf.cast(x, x_mean.dtype) - x_mean)
-    return mean(
-        squared_deviations, reduce_instance_dims, output_dtype=output_dtype)
+    return _mean_and_var(x, reduce_instance_dims, name, output_dtype)[1]
 
 
 def _mean_and_var(x, reduce_instance_dims=True, name=None, output_dtype=None):
   """More efficient combined `mean` and `var`.  See `var`."""
+  if output_dtype is None:
+    output_dtype = _MEAN_OUTPUT_DTYPE_MAP.get(x.dtype)
+    if output_dtype is None:
+      raise TypeError('Tensor type %r is not supported' % x.dtype)
   with tf.name_scope(name, 'mean_and_var'):
     # Note: Calling `mean`, `sum`, and `size` as defined in this module, not the
     # builtins.
     x_mean = mean(x, reduce_instance_dims, output_dtype=output_dtype)
-    # x_mean will be float16, float32, or float64, depending on type of x.
-    squared_deviations = tf.square(tf.cast(x, x_mean.dtype) - x_mean)
+    if isinstance(x, tf.SparseTensor):
+      if reduce_instance_dims:
+        squared_deviations = tf.square(tf.cast(x.values, x_mean.dtype) - x_mean)
+      else:
+        # Only rank-2 `SparseTensor`s are supported.
+        x.get_shape().assert_has_rank(2)
+        mean_values = tf.gather(x_mean, x.indices[:, 1])
+        squared_deviation_values = tf.square(
+            tf.cast(x.values, x_mean.dtype) - mean_values)
+        squared_deviations = tf.SparseTensor(
+            indices=x.indices,
+            values=squared_deviation_values,
+            dense_shape=x.dense_shape)
+    else:
+      squared_deviations = tf.square(tf.cast(x, x_mean.dtype) - x_mean)
     x_var = mean(
         squared_deviations, reduce_instance_dims, output_dtype=output_dtype)
     return x_mean, x_var
diff --git a/tensorflow_transform/beam/impl_test.py b/tensorflow_transform/beam/impl_test.py
@@ -1282,6 +1282,66 @@ def analyzer_fn(inputs):
     self.assertAnalyzerOutputs(
         input_data, input_metadata, analyzer_fn, expected_outputs)
 
+  def testMaxWithSparseTensorReduceTrue(self):
+
+    def analyzer_fn(inputs):
+      return {'max': tft.max(inputs['sparse'])}
+
+    input_data = [{
+        'sparse': ([0, 1], [0., 1.])
+    }, {
+        'sparse': ([1, 3], [2., 3.])
+    }]
+    input_metadata = dataset_metadata.DatasetMetadata({
+        'sparse':
+            sch.ColumnSchema(
+                tf.float32, [4],
+                sch.SparseColumnRepresentation(
+                    'val', [sch.SparseIndexField('idx', False)]))
+    })
+    expected_outputs = {'max': np.array(3., np.float32)}
+    self.assertAnalyzerOutputs(input_data, input_metadata, analyzer_fn,
+                               expected_outputs)
+
+  @tft_unit.parameters(
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testMaxWithSparseTensorReduceFalse(self, input_dtype):
+
+    def analyzer_fn(inputs):
+      return {'max': tft.max(inputs['sparse'], False)}
+
+    input_data = [{
+        'sparse': ([0, 1], [-1., 1.])
+    }, {
+        'sparse': ([1, 3], [2., 3.])
+    }]
+    input_metadata = dataset_metadata.DatasetMetadata({
+        'sparse':
+            sch.ColumnSchema(
+                input_dtype, [4],
+                sch.SparseColumnRepresentation(
+                    'val', [sch.SparseIndexField('idx', False)]))
+    })
+    if input_dtype == tf.float32 or input_dtype == tf.float64:
+      expected_outputs = {
+          'max':
+              np.array([-1., 2., float('nan'), 3.], input_dtype.as_numpy_dtype)
+      }
+    else:
+      expected_outputs = {
+          'max':
+              np.array(
+                  [-1, 2, np.iinfo(input_dtype.as_numpy_dtype).min, 3],
+                  input_dtype.as_numpy_dtype)
+      }
+
+    self.assertAnalyzerOutputs(input_data, input_metadata, analyzer_fn,
+                               expected_outputs)
+
   def testNumericMeanWithSparseTensorReduceTrue(self):
 
     def analyzer_fn(inputs):
@@ -1341,6 +1401,70 @@ def analyzer_fn(inputs):
     self.assertAnalyzerOutputs(input_data, input_metadata, analyzer_fn,
                                expected_outputs)
 
+  @tft_unit.parameters(
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testVarWithSparseTensorReduceInstanceDimsTrue(self, input_dtype):
+
+    def analyzer_fn(inputs):
+      return {'var': tft.var(inputs['sparse'])}
+
+    input_data = [{
+        'sparse': ([0, 1], [0., 1.])
+    }, {
+        'sparse': ([1, 3], [2., 3.])
+    }]
+    input_metadata = dataset_metadata.DatasetMetadata({
+        'sparse':
+            sch.ColumnSchema(
+                input_dtype, [4],
+                sch.SparseColumnRepresentation(
+                    'val', [sch.SparseIndexField('idx', False)]))
+    })
+    if input_dtype == tf.float64:
+      expected_outputs = {'var': np.array(1.25, np.float64)}
+    else:
+      expected_outputs = {'var': np.array(1.25, np.float32)}
+    self.assertAnalyzerOutputs(input_data, input_metadata, analyzer_fn,
+                               expected_outputs)
+
+  @tft_unit.parameters(
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testVarWithSparseTensorReduceInstanceDimsFalse(self, input_dtype):
+
+    def analyzer_fn(inputs):
+      return {'var': tft.var(inputs['sparse'], reduce_instance_dims=False)}
+
+    input_data = [{
+        'sparse': ([0, 1], [0., 1.])
+    }, {
+        'sparse': ([1, 3], [2., 3.])
+    }]
+    input_metadata = dataset_metadata.DatasetMetadata({
+        'sparse':
+            sch.ColumnSchema(
+                input_dtype, [4],
+                sch.SparseColumnRepresentation(
+                    'val', [sch.SparseIndexField('idx', False)]))
+    })
+    if input_dtype == tf.float64:
+      expected_outputs = {
+          'var': np.array([0., .25, float('nan'), 0.], np.float64)
+      }
+    else:
+      expected_outputs = {
+          'var': np.array([0., .25, float('nan'), 0.], np.float32)
+      }
+    self.assertAnalyzerOutputs(input_data, input_metadata, analyzer_fn,
+                               expected_outputs)
+
   def testNumericAnalyzersWithSparseInputs(self):
     def repeat(in_tensor, value):
       batch_size = tf.shape(in_tensor)[0]
@@ -1358,11 +1482,6 @@ def min_fn(inputs):
           return {'min': repeat(inputs['a'], tft.min(inputs['a']))}
         _ = input_dataset | beam_impl.AnalyzeDataset(min_fn)
 
-      with self.assertRaises(TypeError):
-        def max_fn(inputs):
-          return {'max': repeat(inputs['a'], tft.max(inputs['a']))}
-        _ = input_dataset | beam_impl.AnalyzeDataset(max_fn)
-
       with self.assertRaises(TypeError):
         def sum_fn(inputs):
           return {'sum': repeat(inputs['a'], tft.sum(inputs['a']))}
@@ -1373,11 +1492,6 @@ def size_fn(inputs):
           return {'size': repeat(inputs['a'], tft.size(inputs['a']))}
         _ = input_dataset | beam_impl.AnalyzeDataset(size_fn)
 
-      with self.assertRaises(TypeError):
-        def var_fn(inputs):
-          return {'var': repeat(inputs['a'], tft.var(inputs['a']))}
-        _ = input_dataset | beam_impl.AnalyzeDataset(var_fn)
-
   def testStringToTFIDF(self):
     def preprocessing_fn(inputs):
       inputs_as_ints = tft.compute_and_apply_vocabulary(