Skip to content

Commit 3b6b81b

Browse files
tf-transform-team authored and elmer-garduno committed
Project import generated by Copybara.
PiperOrigin-RevId: 177753043
1 parent 6c83b47 commit 3b6b81b

File tree

8 files changed

+220
-346
lines changed

8 files changed

+220
-346
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ releasing a new version.
5757
|tensorflow-transform |tensorflow |apache-beam[gcp]|
5858
|--------------------------------------------------------------------------------|--------------|----------------|
5959
|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.2.0 |
60+
|[0.4.0](https://github.com/tensorflow/transform/blob/v0.4.0/RELEASE.md) |1.4 |2.2.0 |
6061
|[0.3.1](https://github.com/tensorflow/transform/blob/v0.3.1/RELEASE.md) |1.3 |2.1.1 |
6162
|[0.3.0](https://github.com/tensorflow/transform/blob/v0.3.0/RELEASE.md) |1.3 |2.1.1 |
6263
|[0.1.10](https://github.com/tensorflow/transform/blob/v0.1.10/RELEASE.md) |1.0 |2.0.0 |

RELEASE.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
# Current version (not yet released; still in development)
1+
# Release 0.4.0
22

33
## Major Features and Improvements
44
* Added a combine_analyzer() that supports a user-provided combiner, conforming to
55
beam.CombineFn(). This allows users to implement custom combiners
66
(e.g. median), to complement analyzers (like min, max) that are
77
prepackaged in TFT.
8-
* Quantiles Analyzer (`tft.quantiles`).
8+
* Quantiles Analyzer (`tft.quantiles`), with a corresponding `tft.bucketize`
9+
mapper.
910

1011
## Bug Fixes and Other Changes
1112
* Depends on `apache-beam[gcp]>=2.2,<3`.
@@ -33,9 +34,15 @@
3334
* Some functions now introduce a new name scope when they did not before so the
3435
names of tensors may change. This will only affect you if you directly lookup
3536
tensors by name in the graph produced by tf.Transform.
36-
* Various Analyzer Specs (_NumericCombineSpec, _UniquesSpec, _QuantilesSpec) are
37-
now private. Analyzers are accessible only via the top-level TFT functions (
38-
min, max, sum, size, mean, var, uniques, quantiles).
37+
* Various Analyzer Specs (\_NumericCombineSpec, \_UniquesSpec, \_QuantilesSpec)
38+
are now private. Analyzers are accessible only via the top-level TFT functions
39+
(min, max, sum, size, mean, var, uniques, quantiles).
40+
41+
## Upcoming deprecations
42+
* The `serving_input_fn`s in `tensorflow_transform/saved/input_fn_maker.py` will
43+
be removed in a future version and should not be used in new code;
44+
see the `examples` directory for details on how to migrate your code to define
45+
your own serving functions.
3946

4047
# Release 0.3.1
4148

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from setuptools import setup
1818

1919
# Tensorflow transform version.
20-
__version__ = '0.4.0dev'
20+
__version__ = '0.4.0'
2121

2222

2323
def _make_required_install_packages():

tensorflow_transform/analyzers.py

Lines changed: 98 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
import re
3232

33+
import numpy as np
3334
import tensorflow as tf
3435

3536

@@ -102,32 +103,102 @@ def output_is_asset(self, output_tensor):
102103
return self._output_is_asset_map[output_tensor]
103104

104105

105-
class _NumericCombineSpec(object):
106-
"""Operation to combine numeric values."""
106+
class CombinerSpec(object):
107+
"""Analyze using combiner function.
108+
109+
This object mirrors a beam.CombineFn, that will receive a beam PCollection
110+
representing the batched input tensors.
111+
"""
112+
113+
def create_accumulator(self):
114+
"""Return a fresh, empty accumulator.
115+
116+
Returns: An empty accumulator. This can be any Python value.
117+
"""
118+
raise NotImplementedError
119+
120+
def add_input(self, accumulator, element):
121+
"""Return result of folding element into accumulator.
122+
123+
Args:
124+
accumulator: the current accumulator
125+
element: the element to add, which will be an ndarray representing the
126+
value of the input for a batch.
127+
128+
Returns: An accumulator that includes the additional element.
129+
"""
130+
raise NotImplementedError
131+
132+
def merge_accumulators(self, accumulators):
133+
"""Merges several accumulators to a single accumulator value.
134+
135+
Args:
136+
accumulators: the accumulators to merge
137+
138+
Returns: The sole merged accumulator.
139+
"""
140+
raise NotImplementedError
141+
142+
def extract_output(self, accumulator):
143+
"""Return result of converting accumulator into the output value.
144+
145+
Args:
146+
accumulator: the final accumulator value. Should be a list of ndarrays.
147+
148+
Returns: An ndarray representing the result of this combiner.
149+
"""
150+
raise NotImplementedError
107151

108-
MIN = 'min'
109-
MAX = 'max'
110-
SUM = 'sum'
111152

112-
def __init__(self, dtype, combiner_type, reduce_instance_dims):
113-
self._dtype = dtype
114-
self._combiner_type = combiner_type
153+
def combine_analyzer(x, output_dtype, output_shape, combiner_spec, name):
154+
"""Applies the combiner over the whole dataset.
155+
156+
Args:
157+
x: An input `Tensor` or `SparseTensor`.
158+
output_dtype: The dtype of the output of the analyzer.
159+
output_shape: The shape of the output of the analyzer.
160+
combiner_spec: A subclass of CombinerSpec.
161+
name: Similar to a TF op name. Used to define a unique scope for this
162+
analyzer, which can be used for debugging info.
163+
164+
Returns:
165+
The combined values, which is a `Tensor` with type output_dtype and shape
166+
`output_shape`. These must be compatible with the combiner_spec.
167+
"""
168+
return Analyzer([x], [(output_dtype, output_shape, False)], combiner_spec,
169+
name).outputs[0]
170+
171+
172+
class _NumPyCombinerSpec(CombinerSpec):
173+
"""Combines the PCollection only on the 0th dimension using nparray."""
174+
175+
def __init__(self, fn, reduce_instance_dims):
176+
self._fn = fn
115177
self._reduce_instance_dims = reduce_instance_dims
116178

117-
@property
118-
def dtype(self):
119-
return self._dtype
179+
def create_accumulator(self):
180+
return None
120181

121-
@property
122-
def combiner_type(self):
123-
return self._combiner_type
182+
def add_input(self, accumulator, next_input):
183+
if self._reduce_instance_dims:
184+
batch = self._fn(next_input)
185+
else:
186+
batch = self._fn(next_input, axis=0)
187+
if accumulator is None:
188+
return batch
189+
else:
190+
return self._fn((accumulator, batch), axis=0)
124191

125-
@property
126-
def reduce_instance_dims(self):
127-
return self._reduce_instance_dims
192+
def merge_accumulators(self, accumulators):
193+
# numpy's sum, min, max, etc functions operate on array-like objects, but
194+
# not arbitrary iterables. Convert the provided accumulators into a list.
195+
return self._fn(list(accumulators), axis=0)
196+
197+
def extract_output(self, accumulator):
198+
return [accumulator]
128199

129200

130-
def _numeric_combine(x, combiner_type, reduce_instance_dims=True, name=None):
201+
def _numeric_combine(x, fn, reduce_instance_dims=True, name=None):
131202
"""Apply an analyzer with _NumPyCombinerSpec to given input."""
132203
if not isinstance(x, tf.Tensor):
133204
raise TypeError('Expected a Tensor, but got %r' % x)
@@ -143,10 +214,9 @@ def _numeric_combine(x, combiner_type, reduce_instance_dims=True, name=None):
143214
# If reducing over batch dimensions, with unknown shape, the result will
144215
# also have unknown shape.
145216
shape = None
146-
spec = _NumericCombineSpec(x.dtype, combiner_type, reduce_instance_dims)
147-
return Analyzer(
148-
[x], [(x.dtype, shape, False)], spec,
149-
name if name is not None else combiner_type).outputs[0]
217+
return combine_analyzer(
218+
x, x.dtype, shape, _NumPyCombinerSpec(fn, reduce_instance_dims),
219+
name if name is not None else fn.__name__)
150220

151221

152222
def min(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-builtin
@@ -162,8 +232,7 @@ def min(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-b
162232
Returns:
163233
A `Tensor`. Has the same type as `x`.
164234
"""
165-
return _numeric_combine(
166-
x, _NumericCombineSpec.MIN, reduce_instance_dims, name)
235+
return _numeric_combine(x, np.min, reduce_instance_dims, name)
167236

168237

169238
def max(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-builtin
@@ -179,8 +248,7 @@ def max(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-b
179248
Returns:
180249
A `Tensor`. Has the same type as `x`.
181250
"""
182-
return _numeric_combine(
183-
x, _NumericCombineSpec.MAX, reduce_instance_dims, name)
251+
return _numeric_combine(x, np.max, reduce_instance_dims, name)
184252

185253

186254
def sum(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-builtin
@@ -196,8 +264,7 @@ def sum(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-b
196264
Returns:
197265
A `Tensor`. Has the same type as `x`.
198266
"""
199-
return _numeric_combine(
200-
x, _NumericCombineSpec.SUM, reduce_instance_dims, name)
267+
return _numeric_combine(x, np.sum, reduce_instance_dims, name)
201268

202269

203270
def size(x, reduce_instance_dims=True, name=None):
@@ -271,18 +338,13 @@ def var(x, reduce_instance_dims=True, name=None):
271338
class _UniquesSpec(object):
272339
"""Operation to compute unique values."""
273340

274-
def __init__(self, dtype, top_k, frequency_threshold,
341+
def __init__(self, top_k, frequency_threshold,
275342
vocab_filename, store_frequency):
276-
self._dtype = dtype
277343
self._top_k = top_k
278344
self._frequency_threshold = frequency_threshold
279345
self._vocab_filename = vocab_filename
280346
self._store_frequency = store_frequency
281347

282-
@property
283-
def dtype(self):
284-
return self._dtype
285-
286348
@property
287349
def top_k(self):
288350
return self._top_k
@@ -400,8 +462,8 @@ def uniques(x, top_k=None, frequency_threshold=None,
400462
# Make the file name path safe.
401463
vocab_filename = sanitized_vocab_filename(vocab_filename, prefix=prefix)
402464

403-
spec = _UniquesSpec(tf.string, top_k, frequency_threshold,
404-
vocab_filename, store_frequency)
465+
spec = _UniquesSpec(top_k, frequency_threshold, vocab_filename,
466+
store_frequency)
405467
return Analyzer([x], [(tf.string, [], True)], spec, 'uniques').outputs[0]
406468

407469

@@ -469,50 +531,3 @@ def quantiles(x, num_buckets, epsilon, name=None):
469531
# Drop the first and last quantile boundaries, so that we end up with
470532
# num_buckets-1 boundaries, and hence num_buckets buckets.
471533
return quantile_boundaries[0:1, 1:-1]
472-
473-
474-
class _CombinerSpec(object):
475-
"""Analyze using combiner function.
476-
477-
Args:
478-
combiner: Object of a class that implements beam.CombineFn() interface.
479-
In addition, the combiner class must implement a @property method called
480-
output_dtype() that returns the tf.DType of the output of the combiner.
481-
"""
482-
483-
def __init__(self, combiner):
484-
self._combiner = combiner
485-
486-
@property
487-
def combiner(self):
488-
return self._combiner
489-
490-
@property
491-
def output_dtype(self):
492-
return self._combiner.output_dtype
493-
494-
495-
def combine_analyzer(x, combiner, name=None):
496-
"""Applies the combiner over the whole dataset.
497-
498-
Args:
499-
x: An input `Tensor` or `SparseTensor`.
500-
combiner: Object of a class that implements beam.CombineFn() interface.
501-
In addition, the combiner class must implement a @property method called
502-
output_dtype() that returns the type of the output of the combiner.
503-
name: (Optional) A name for this operation.
504-
505-
Returns:
506-
The combined values as a list, where each element in the list
507-
is of type combiner.output_dtype().
508-
"""
509-
510-
# The TF node name will be of the form:
511-
# original_scope/{combine_analyzer|name}/{class-name-of-combiner}
512-
with tf.name_scope(name, 'combine_analyzer'):
513-
spec = _CombinerSpec(combiner)
514-
return Analyzer(
515-
[x],
516-
[(spec.output_dtype, [1, None], False)],
517-
spec,
518-
type(combiner).__name__).outputs[0]

0 commit comments

Comments
 (0)