
Commit 7fcfe05

zoyahav authored and tf-transform-team committed
Project import generated by Copybara.
PiperOrigin-RevId: 204787703
1 parent 2fbaba6 commit 7fcfe05

8 files changed: +539 -71 lines changed

RELEASE.md (+16)

@@ -1,3 +1,19 @@
+# Current version (not yet released; still in development)
+
+## Major Features and Improvements
+* Performance improvements for vocabulary generation when using top_k.
+* Utility to deep-copy Beam `PCollection`s was added to avoid unnecessary
+  materialization.
+
+## Bug Fixes and Other Changes
+* Memory reduction during vocabulary generation.
+* Clarify documentation on return values from `tft.compute_and_apply_vocabulary`
+  and `tft.string_to_int`.
+
+## Breaking changes
+
+## Deprecations
+
 # Release 0.8.0

 ## Major Features and Improvements
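The deep-copy note above refers to a utility added elsewhere in this commit and not shown in the hunks below. As a rough, hypothetical sketch of the underlying idea only (re-apply cheap upstream transforms per consumer instead of fanning one PCollection out to several consumers, so the runner can fuse each branch rather than materialize a shared intermediate), with illustrative names that are not part of tf.Transform:

import apache_beam as beam

def tokens(lines, name):
  # Cheap, deterministic transform that is safe to re-apply per consumer.
  return lines | name >> beam.FlatMap(lambda line: line.split())

with beam.Pipeline() as p:
  lines = p | beam.Create(['a b a', 'b c'])
  # Each consumer gets its own copy of the tokenization, so the runner can
  # fuse it into each branch instead of materializing one shared result.
  per_token_counts = (tokens(lines, 'TokensForCounts')
                      | beam.combiners.Count.PerElement())
  total_tokens = (tokens(lines, 'TokensForTotal')
                  | beam.combiners.Count.Globally())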
File renamed without changes.

setup.py (+1 -1)

@@ -17,7 +17,7 @@
 from setuptools import setup

 # Tensorflow transform version.
-__version__ = '0.8.0'
+__version__ = '0.8.1dev'


 def _make_required_install_packages():

tensorflow_transform/analyzers.py (+34 -22)

@@ -289,20 +289,28 @@ def add_input(self, accumulator, batch_values):
             in zip(accumulator, reduced_values)]

   def merge_accumulators(self, accumulators):
-    # numpy's sum, min, max, etc functions operate on array-like objects, but
-    # not arbitrary iterables. Convert the provided accumulators into a list
-    return [
-        self._fn(list(sub_accumulators), axis=0)
-        for sub_accumulators in zip(*accumulators)]
+    non_empty_accumulators = [
+        accumulator for accumulator in accumulators if accumulator is not None
+    ]
+    if non_empty_accumulators:
+      return [
+          # numpy's sum, min, max, etc functions operate on array-like objects,
+          # but not arbitrary iterables. Convert the provided sub_accumulators
+          # into a list.
+          self._fn(list(sub_accumulators), axis=0)
+          for sub_accumulators in zip(*non_empty_accumulators)]
+    else:
+      return None

   def extract_output(self, accumulator):
     if accumulator is None:
       return None
-    # For each output, cast that output to the specified type. Note there will
-    # be one output for each input tensor to the analyzer.
-    return [sub_accumulator.astype(output_dtype)
-            for sub_accumulator, output_dtype
-            in zip(accumulator, self._output_dtypes)]
+    else:
+      # For each output, cast that output to the specified type. Note there
+      # will be one output for each input tensor to the analyzer.
+      return [sub_accumulator.astype(output_dtype)
+              for sub_accumulator, output_dtype
+              in zip(accumulator, self._output_dtypes)]

   def num_outputs(self):
     return len(self._output_dtypes)
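The change above lets merge_accumulators treat None as an "empty" accumulator and combine only the non-empty ones. A standalone sketch of the same pattern, using a plain numpy sum combiner with illustrative names (not the tf.Transform class itself):

import numpy as np

class NoneSafeSumCombiner(object):
  """Illustrative combiner where None means 'no inputs seen yet'."""

  def create_accumulator(self):
    return None

  def add_input(self, accumulator, batch):
    batch_sum = np.sum(batch, axis=0)
    return batch_sum if accumulator is None else accumulator + batch_sum

  def merge_accumulators(self, accumulators):
    # Drop empty accumulators first; merging only the non-empty ones mirrors
    # the change above and keeps np.sum away from None entries.
    non_empty = [a for a in accumulators if a is not None]
    return np.sum(non_empty, axis=0) if non_empty else None

  def extract_output(self, accumulator):
    return accumulator  # May be None if no data was seen.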
@@ -1040,18 +1048,22 @@ def add_input(self, accumulator, batch_values):

   def merge_accumulators(self, accumulators):
     """Sums values in each accumulator entry."""
-    # Convert `accumulators` to list (it may be an arbitrary iterator) so it can
-    # be iterated over multiple times.
-    accumulators = list(accumulators)
-    # Because each accumulator contains multiple arrays of different dimensions,
-    # the np.sum operation must be explicitly used across the entries within
-    # each accumulator. np.sum(list(accumulators)) does not work.
-    sum_product = np.sum(
-        [accumulator[0] for accumulator in accumulators], axis=0)
-    sum_vectors = np.sum(
-        [accumulator[1] for accumulator in accumulators], axis=0)
-    count = np.sum([accumulator[2] for accumulator in accumulators], axis=0)
-    return [sum_product, sum_vectors, count]
+    accumulators = [
+        accumulator for accumulator in accumulators if accumulator is not None
+    ]
+    if accumulators:
+      # Because each accumulator contains multiple arrays of different
+      # dimensions, the np.sum operation must be explicitly used across the
+      # entries within each accumulator. np.sum(list(accumulators)) does not
+      # work.
+      sum_product = np.sum(
+          [accumulator[0] for accumulator in accumulators], axis=0)
+      sum_vectors = np.sum(
+          [accumulator[1] for accumulator in accumulators], axis=0)
+      count = np.sum([accumulator[2] for accumulator in accumulators], axis=0)
+      return [sum_product, sum_vectors, count]
+    else:
+      return None

   def extract_output(self, accumulator):
     """Run covariance logic on sum_product, sum of input vectors, and count.

tensorflow_transform/beam/analyzer_impls.py (+56 -44)

@@ -59,13 +59,30 @@ def expand(self, pcoll):
     raise NotImplementedError(self._spec.__class__)


-def _flatten_value_to_list(batch_values):
-  """Converts an N-D dense or sparse batch to a 1-D list."""
-  # Ravel for flattening and tolist so that we go to native Python types
-  # for more efficient followup processing.
-  #
-  batch_value, = batch_values
-  return batch_value.ravel().tolist()
+class _OrderElementsFn(beam.DoFn):
+  """Sort the vocabulary by descending frequency count."""
+
+  def __init__(self, store_frequency):
+    self._store_frequency = store_frequency
+
+    # Metrics.
+    self._vocab_size_distribution = beam.metrics.Metrics.distribution(
+        common.METRICS_NAMESPACE, 'vocabulary_size')
+
+  def process(self, element, counts_iter):
+    del element
+    counts = list(counts_iter)
+    self._vocab_size_distribution.update(len(counts))
+
+    if not counts:
+      counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
+
+    counts.sort(reverse=True)  # Largest first.
+    for count, entry in counts:
+      if self._store_frequency:
+        yield '{} {}'.format(count, entry)
+      else:
+        yield entry


 @with_input_types(List[np.ndarray])
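_OrderElementsFn above receives the counts as a side input at the ParDo callsite (see the last hunk of this file). A minimal standalone example of that wiring, a DoFn with an iterable side input passed via beam.pvalue.AsIter, using made-up names:

import apache_beam as beam

class FormatWithTotal(beam.DoFn):
  """Emits 'value/total' where total comes from an iterable side input."""

  def process(self, element, all_values_iter):
    total = sum(all_values_iter)
    yield '{}/{}'.format(element, total)

with beam.Pipeline() as p:
  values = p | beam.Create([1, 2, 3])
  formatted = (
      values
      | beam.ParDo(
          FormatWithTotal(),
          # AsIter streams the side input rather than caching it as a list
          # (AsList), keeping peak memory lower for large side inputs.
          all_values_iter=beam.pvalue.AsIter(values))
      | beam.Map(print))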
@@ -88,13 +105,21 @@ def expand(self, pcoll):
     # pairs in sorted order by decreasing counts (and by values for equal
     # counts).

+    def flatten_value_to_list(batch_values):
+      """Converts an N-D dense or sparse batch to a 1-D list."""
+      # Ravel for flattening and tolist so that we go to native Python types
+      # for more efficient followup processing.
+      #
+      batch_value, = batch_values
+      return batch_value.ravel().tolist()
+
     def is_problematic_string(kv):
       string, _ = kv  # Ignore counts.
       return string and '\n' not in string and '\r' not in string

     counts = (
         pcoll
-        | 'FlattenStrings' >> beam.FlatMap(_flatten_value_to_list)
+        | 'FlattenStrings' >> beam.FlatMap(flatten_value_to_list)
        | 'CountPerString' >> beam.combiners.Count.PerElement()
        | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
        | 'SwapStringsAndCounts' >> beam.KvSwap())
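The nested flatten_value_to_list helper above flattens a batch with ravel() and converts it to native Python objects with tolist(); for example, with a made-up batch:

import numpy as np

batch_values = [np.array([[b'to', b'be'], [b'or', b'not']])]
batch_value, = batch_values           # Exactly one value per batch.
flat = batch_value.ravel().tolist()   # [b'to', b'be', b'or', b'not'] as Python bytes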
@@ -105,51 +130,38 @@ def is_problematic_string(kv):
     counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
                beam.Filter(lambda kv: kv[0] >= frequency_threshold))

-    if top_k is not None:
+    if top_k is None:
+      # Performance optimization to obviate reading from finely sharded files
+      # via AsIter in order_elements below. By breaking fusion, we allow sharded
+      # files' sizes to be automatically computed (when possible), so we end up
+      # reading from fewer and larger files. This is not needed when top_k is
+      # provided since that already induces a single-sharded output (due to the
+      # CombineGlobally).
+      counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter
+    else:
       counts = (counts
                 | 'Top(%s)' % top_k
-                >> beam.transforms.combiners.Top.Largest(top_k)
+                # Using without_defaults() below since it obviates unnecessary
+                # materializations. This is worth doing because:
+                # a) Some vocabs could be really large and although they do
+                #    fit in memory they might go over per-record
+                #    materialization limits (TopCombineFn is producing
+                #    single-record with the entire vocabulary as a list).
+                # b) More fusion leads to increased performance in general.
+                >> beam.CombineGlobally(
+                    beam.combiners.TopCombineFn(top_k)).without_defaults()
                 | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

-    # Performance optimization to obviate reading from finely sharded files
-    # via AsIter. By breaking fusion, we allow sharded files' sizes to be
-    # automatically computed (when possible), so we end up reading from fewer
-    # and larger files.
-    counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter
-
-    # Using AsIter instead of AsList at the callsite below in order to reduce
-    # max memory usage (due to AsList caching).
-    def order_elements(ignored, counts_iter, store_frequency):
-      """Sort the vocabulary by descending frequency count."""
-      del ignored
-      counts = list(counts_iter)
-      if not counts:
-        counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
-      counts.sort(reverse=True)  # Largest first.
-
-      # Log vocabulary size to metrics. Note we can call
-      # beam.metrics.Metrics.distribution here because this function only gets
-      # called once, so there is no need to amortize the cost of calling the
-      # constructor by putting in a DoFn initializer.
-      vocab_size_distribution = beam.metrics.Metrics.distribution(
-          common.METRICS_NAMESPACE, 'vocabulary_size')
-      vocab_size_distribution.update(len(counts))
-
-      if store_frequency:
-        # Returns ['count1 element1', ... ]
-        return ['{} {}'.format(count, element) for count, element in counts]
-      else:
-        return [element for _, element in counts]
-
     vocabulary_file = os.path.join(self._temp_assets_dir,
                                    self._spec.vocab_filename)
     vocab_is_written = (
         pcoll.pipeline
         | 'Prepare' >> beam.Create([None])
-        | 'OrderElements' >> beam.FlatMap(
-            order_elements,
-            counts_iter=beam.pvalue.AsIter(counts),
-            store_frequency=self._spec.store_frequency)
+        | 'OrderElements' >> beam.ParDo(
+            _OrderElementsFn(self._spec.store_frequency),
+            # Using AsIter instead of AsList at the callsite below in order to
+            # reduce max memory usage.
+            counts_iter=beam.pvalue.AsIter(counts))
         | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                                shard_name_template=''))
     # Return the vocabulary path.
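The comments in this hunk explain replacing Top.Largest with CombineGlobally(TopCombineFn(top_k)).without_defaults(). A self-contained sketch of that combination outside tf.Transform, with example data only:

import apache_beam as beam

with beam.Pipeline() as p:
  counts = p | beam.Create([(5, 'a'), (2, 'b'), (9, 'c'), (1, 'd')])
  top_two = (
      counts
      # without_defaults() skips injecting a default result for empty input,
      # which avoids the extra side-input materialization CombineGlobally
      # otherwise performs for the default path.
      | beam.CombineGlobally(beam.combiners.TopCombineFn(2)).without_defaults()
      # TopCombineFn emits one list with the largest elements; flatten it.
      | beam.FlatMap(lambda lst: lst)
      | beam.Map(print))  # (9, 'c') then (5, 'a')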
