@@ -59,13 +59,30 @@ def expand(self, pcoll):
       raise NotImplementedError(self._spec.__class__)
 
 
-def _flatten_value_to_list(batch_values):
-  """Converts an N-D dense or sparse batch to a 1-D list."""
-  # Ravel for flattening and tolist so that we go to native Python types
-  # for more efficient followup processing.
-  #
-  batch_value, = batch_values
-  return batch_value.ravel().tolist()
+class _OrderElementsFn(beam.DoFn):
+  """Sort the vocabulary by descending frequency count."""
+
+  def __init__(self, store_frequency):
+    self._store_frequency = store_frequency
+
+    # Metrics.
+    self._vocab_size_distribution = beam.metrics.Metrics.distribution(
+        common.METRICS_NAMESPACE, 'vocabulary_size')
+
+  def process(self, element, counts_iter):
+    del element
+    counts = list(counts_iter)
+    self._vocab_size_distribution.update(len(counts))
+
+    if not counts:
+      counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
+
+    counts.sort(reverse=True)  # Largest first.
+    for count, entry in counts:
+      if self._store_frequency:
+        yield '{} {}'.format(count, entry)
+      else:
+        yield entry
 
 
 @with_input_types(List[np.ndarray])
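For context on the pattern this hunk introduces, here is a minimal, self-contained sketch (toy data; OrderElementsFn is a simplified stand-in with the metrics plumbing omitted, not the library's class) of a generator DoFn driven by a single-element 'Prepare' PCollection that consumes the full set of counts through an AsIter side input:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class OrderElementsFn(beam.DoFn):
  """Simplified stand-in for _OrderElementsFn (metrics omitted)."""

  def __init__(self, store_frequency):
    self._store_frequency = store_frequency

  def process(self, element, counts_iter):
    del element  # The single None element only triggers one invocation.
    counts = sorted(counts_iter, reverse=True)  # Largest count first.
    for count, entry in counts:
      yield '{} {}'.format(count, entry) if self._store_frequency else entry


with TestPipeline() as p:
  counts = p | 'Counts' >> beam.Create([(3, 'a'), (7, 'b'), (5, 'c')])
  lines = (
      p
      | 'Prepare' >> beam.Create([None])
      | 'OrderElements' >> beam.ParDo(OrderElementsFn(store_frequency=True),
                                      counts_iter=beam.pvalue.AsIter(counts)))
  assert_that(lines, equal_to(['7 b', '5 c', '3 a']))

Yielding line by line (rather than returning the whole vocabulary as one list, as the removed order_elements did) keeps each output record small.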
@@ -88,13 +105,21 @@ def expand(self, pcoll):
     # pairs in sorted order by decreasing counts (and by values for equal
     # counts).
 
+    def flatten_value_to_list(batch_values):
+      """Converts an N-D dense or sparse batch to a 1-D list."""
+      # Ravel for flattening and tolist so that we go to native Python types
+      # for more efficient followup processing.
+      #
+      batch_value, = batch_values
+      return batch_value.ravel().tolist()
+
     def is_problematic_string(kv):
       string, _ = kv  # Ignore counts.
       return string and '\n' not in string and '\r' not in string
 
     counts = (
         pcoll
-        | 'FlattenStrings' >> beam.FlatMap(_flatten_value_to_list)
+        | 'FlattenStrings' >> beam.FlatMap(flatten_value_to_list)
         | 'CountPerString' >> beam.combiners.Count.PerElement()
         | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
         | 'SwapStringsAndCounts' >> beam.KvSwap())
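As a rough, standalone illustration of what these stages do (toy ndarray batches, not part of the change): each input element is a one-item list holding a batch ndarray, FlattenStrings ravels it into native Python strings, CountPerString tallies them, problematic strings (empty or containing a newline) are dropped, and KvSwap flips (string, count) into (count, string) so later stages compare counts first:

import apache_beam as beam
import numpy as np
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


def flatten_value_to_list(batch_values):
  batch_value, = batch_values
  return batch_value.ravel().tolist()


def is_problematic_string(kv):
  string, _ = kv  # Ignore counts.
  return string and '\n' not in string and '\r' not in string


with TestPipeline() as p:
  counts = (
      p
      | beam.Create([[np.array([['a', 'b'], ['a', 'a']])],
                     [np.array([['b', '']])]])
      | 'FlattenStrings' >> beam.FlatMap(flatten_value_to_list)
      | 'CountPerString' >> beam.combiners.Count.PerElement()
      | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
      | 'SwapStringsAndCounts' >> beam.KvSwap())
  # The empty string is filtered out; 'a' appears three times, 'b' twice.
  assert_that(counts, equal_to([(3, 'a'), (2, 'b')]))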
@@ -105,51 +130,38 @@ def is_problematic_string(kv):
       counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
                  beam.Filter(lambda kv: kv[0] >= frequency_threshold))
 
-    if top_k is not None:
+    if top_k is None:
+      # Performance optimization to obviate reading from finely sharded files
+      # via AsIter in _OrderElementsFn below. By breaking fusion, we allow
+      # sharded files' sizes to be automatically computed (when possible), so
+      # we end up reading from fewer and larger files. This is not needed when
+      # top_k is provided since that already induces a single-sharded output
+      # (due to the CombineGlobally).
+      counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter
+    else:
       counts = (counts
                 | 'Top(%s)' % top_k
-                >> beam.transforms.combiners.Top.Largest(top_k)
+                # Using without_defaults() below since it obviates unnecessary
+                # materializations. This is worth doing because:
+                # a) Some vocabs could be really large and although they do
+                #    fit in memory they might go over per-record
+                #    materialization limits (TopCombineFn produces a single
+                #    record with the entire vocabulary as a list).
+                # b) More fusion leads to increased performance in general.
+                >> beam.CombineGlobally(
+                    beam.combiners.TopCombineFn(top_k)).without_defaults()
                 | 'FlattenList' >> beam.FlatMap(lambda lst: lst))
 
-    # Performance optimization to obviate reading from finely sharded files
-    # via AsIter. By breaking fusion, we allow sharded files' sizes to be
-    # automatically computed (when possible), so we end up reading from fewer
-    # and larger files.
-    counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter
-
-    # Using AsIter instead of AsList at the callsite below in order to reduce
-    # max memory usage (due to AsList caching).
-    def order_elements(ignored, counts_iter, store_frequency):
-      """Sort the vocabulary by descending frequency count."""
-      del ignored
-      counts = list(counts_iter)
-      if not counts:
-        counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
-      counts.sort(reverse=True)  # Largest first.
-
-      # Log vocabulary size to metrics. Note we can call
-      # beam.metrics.Metrics.distribution here because this function only gets
-      # called once, so there is no need to amortize the cost of calling the
-      # constructor by putting in a DoFn initializer.
-      vocab_size_distribution = beam.metrics.Metrics.distribution(
-          common.METRICS_NAMESPACE, 'vocabulary_size')
-      vocab_size_distribution.update(len(counts))
-
-      if store_frequency:
-        # Returns ['count1 element1', ... ]
-        return ['{} {}'.format(count, element) for count, element in counts]
-      else:
-        return [element for _, element in counts]
-
     vocabulary_file = os.path.join(self._temp_assets_dir,
                                    self._spec.vocab_filename)
     vocab_is_written = (
         pcoll.pipeline
         | 'Prepare' >> beam.Create([None])
-        | 'OrderElements' >> beam.FlatMap(
-            order_elements,
-            counts_iter=beam.pvalue.AsIter(counts),
-            store_frequency=self._spec.store_frequency)
+        | 'OrderElements' >> beam.ParDo(
+            _OrderElementsFn(self._spec.store_frequency),
+            # Using AsIter instead of AsList here in order to reduce max
+            # memory usage.
+            counts_iter=beam.pvalue.AsIter(counts))
         | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                                shard_name_template=''))
     # Return the vocabulary path.
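Finally, a small standalone sketch (toy data) of the Top replacement above: CombineGlobally(TopCombineFn(k)).without_defaults() still emits a single record holding the k largest (count, string) pairs, but produces no default record for an empty input:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


with TestPipeline() as p:
  top = (
      p
      | beam.Create([(5, 'a'), (9, 'b'), (2, 'c'), (7, 'd')])
      # One record: the list of the 2 largest elements, largest first.
      | 'Top(2)' >> beam.CombineGlobally(
          beam.combiners.TopCombineFn(2)).without_defaults()
      | 'FlattenList' >> beam.FlatMap(lambda lst: lst))
  assert_that(top, equal_to([(9, 'b'), (7, 'd')]))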