Skip to content

Commit 0c3e662

Browse files
iindyktfx-copybara
authored and committed
Adding telemetry for TensorRepresentations in input and output schema.
PiperOrigin-RevId: 441026555
1 parent 40adfeb commit 0c3e662

File tree

4 files changed

+68
-3
lines changed

4 files changed

+68
-3
lines changed

tensorflow_transform/beam/impl.py

+51
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
from tensorflow_transform.tf_metadata import metadata_io
7979
from tensorflow_transform.tf_metadata import schema_utils
8080
from tfx_bsl.telemetry import collection as telemetry
81+
from tfx_bsl.telemetry import util as telemetry_util
8182
from tfx_bsl.tfxio import tensor_representation_util
8283
from tfx_bsl.tfxio import tensor_to_arrow
8384
from tfx_bsl.tfxio import tf_example_record
@@ -1078,6 +1079,15 @@ def expand(self, dataset):
10781079
>> telemetry.TrackRecordBatchBytes(beam_common.METRICS_NAMESPACE,
10791080
'analysis_input_bytes'))
10801081

1082+
# Gather telemetry on types of input features.
1083+
_ = (
1084+
self.pipeline | 'CreateAnalyzeInputTensorRepresentations' >>
1085+
beam.Create([input_tensor_adapter_config.tensor_representations])
1086+
|
1087+
'InstrumentAnalyzeInputTensors' >> telemetry.TrackTensorRepresentations(
1088+
telemetry_util.AppendToNamespace(beam_common.METRICS_NAMESPACE,
1089+
['analyze_input_tensors'])))
1090+
10811091
asset_map = annotators.get_asset_annotations(graph)
10821092
# TF.HUB can error when unapproved collections are present. So we explicitly
10831093
# clear out the collections in the graph.
@@ -1351,6 +1361,20 @@ def _remove_columns_from_metadata(metadata, excluded_columns):
13511361
new_feature_spec, new_domains)
13521362

13531363

1364+
class _MaybeInferTensorRepresentationsDoFn(beam.DoFn):
1365+
"""Tries to infer TensorRepresentations from a Schema."""
1366+
1367+
def process(
1368+
self, schema: schema_pb2.Schema
1369+
) -> Iterable[Dict[str, schema_pb2.TensorRepresentation]]:
1370+
try:
1371+
yield (tensor_representation_util
1372+
.InferTensorRepresentationsFromMixedSchema(schema))
1373+
except ValueError:
1374+
# Ignore any inference errors since the output is only used for metrics.
1375+
yield {}
1376+
1377+
13541378
@beam.typehints.with_input_types(Union[_DatasetElementType, pa.RecordBatch],
13551379
Union[dataset_metadata.DatasetMetadata,
13561380
TensorAdapterConfig,
@@ -1446,11 +1470,20 @@ def expand(self, dataset_and_transform_fn):
14461470
self.pipeline
14471471
| 'CreateDeferredSchema' >> beam.Create([output_metadata.schema]))
14481472

1473+
# Increment input metrics.
14491474
_ = (
14501475
input_values
14511476
| 'InstrumentInputBytes[Transform]' >> telemetry.TrackRecordBatchBytes(
14521477
beam_common.METRICS_NAMESPACE, 'transform_input_bytes'))
14531478

1479+
_ = (
1480+
self.pipeline | 'CreateTransformInputTensorRepresentations' >>
1481+
beam.Create([input_tensor_adapter_config.tensor_representations])
1482+
| 'InstrumentTransformInputTensors' >>
1483+
telemetry.TrackTensorRepresentations(
1484+
telemetry_util.AppendToNamespace(beam_common.METRICS_NAMESPACE,
1485+
['transform_input_tensors'])))
1486+
14541487
tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
14551488
type(self.pipeline.runner))
14561489
output_batches = (
@@ -1471,20 +1504,38 @@ def expand(self, dataset_and_transform_fn):
14711504
converter_pcol = (
14721505
deferred_schema | 'MakeTensorToArrowConverter' >> beam.Map(
14731506
impl_helper.make_tensor_to_arrow_converter))
1507+
1508+
output_tensor_representations = (
1509+
converter_pcol
1510+
| 'MapToTensorRepresentations' >>
1511+
beam.Map(lambda converter: converter.tensor_representations()))
1512+
14741513
output_data = (
14751514
output_batches | 'ConvertToRecordBatch' >> beam.Map(
14761515
_convert_to_record_batch,
14771516
schema=beam.pvalue.AsSingleton(deferred_schema),
14781517
converter=beam.pvalue.AsSingleton(converter_pcol),
14791518
passthrough_keys=Context.get_passthrough_keys(),
14801519
input_metadata=input_metadata))
1520+
14811521
else:
1522+
1523+
output_tensor_representations = (
1524+
deferred_schema | 'MaybeInferTensorRepresentations' >> beam.ParDo(
1525+
_MaybeInferTensorRepresentationsDoFn()))
14821526
output_data = (
14831527
output_batches | 'ConvertAndUnbatchToInstanceDicts' >> beam.FlatMap(
14841528
_convert_and_unbatch_to_instance_dicts,
14851529
schema=beam.pvalue.AsSingleton(deferred_schema),
14861530
passthrough_keys=Context.get_passthrough_keys()))
14871531

1532+
# Increment output data metrics.
1533+
_ = (
1534+
output_tensor_representations
1535+
| 'InstrumentTransformOutputTensors' >>
1536+
telemetry.TrackTensorRepresentations(
1537+
telemetry_util.AppendToNamespace(beam_common.METRICS_NAMESPACE,
1538+
['transform_output_tensors'])))
14881539
_clear_shared_state_after_barrier(self.pipeline, output_data)
14891540

14901541
return (output_data, output_metadata)

tensorflow_transform/beam/impl_test.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -3526,7 +3526,8 @@ def preprocessing_fn(inputs):
35263526
})
35273527
with tft_beam.Context(temp_dir=self.get_temp_dir()):
35283528
_ = ((input_data, metadata)
3529-
| 'AnalyzeDataset' >> tft_beam.AnalyzeDataset(preprocessing_fn))
3529+
| 'AnalyzeAndTransformDataset' >>
3530+
tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
35303531

35313532
metrics = pipeline.metrics
35323533
self.assertMetricsCounterEqual(metrics, 'tft_analyzer_vocabulary', 1)
@@ -3537,6 +3538,12 @@ def preprocessing_fn(inputs):
35373538
# We check that that call is not logged.
35383539
self.assertMetricsCounterEqual(metrics, 'tft_mapper_apply_vocabulary', 0)
35393540

3541+
for namespace in ('tfx.Transform.analyze_input_tensors',
3542+
'tfx.Transform.transform_input_tensors',
3543+
'tfx.Transform.transform_output_tensors'):
3544+
self.assertMetricsCounterEqual(
3545+
metrics, 'dense_tensor', 3, namespaces_list=[namespace])
3546+
35403547
def testNumBytesCounter(self):
35413548
self._SkipIfOutputRecordBatches()
35423549

tensorflow_transform/beam/tft_unit.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,14 @@ def assertMetricsCounterEqual(self, metrics, name, expected_count,
116116
metrics_filter)['counters']
117117
committed = sum([r.committed for r in metric])
118118
attempted = sum([r.attempted for r in metric])
119-
self.assertEqual(committed, attempted)
120-
self.assertEqual(committed, expected_count)
119+
self.assertEqual(
120+
committed,
121+
attempted,
122+
msg=f'Attempted counter {name} from namespace {namespaces_list}')
123+
self.assertEqual(
124+
committed,
125+
expected_count,
126+
msg=f'Expected counter {name} from namespace {namespaces_list}')
121127

122128
def assertAnalyzerOutputs(self,
123129
input_data,

tensorflow_transform/pickle_helper.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
_PROTO_CLASSES = [
3030
tf.compat.v1.ConfigProto,
3131
schema_pb2.Schema,
32+
schema_pb2.TensorRepresentation,
3233
statistics_pb2.DatasetFeatureStatistics,
3334
] + _ANNOTATION_CLASSES
3435

0 commit comments

Comments (0)