sdks/python/apache_beam/yaml/yaml_ml_test.py - beam - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import logging
 import tempfile
 import unittest

 import apache_beam as beam
 from apache_beam.testing.util import assert_that
 from apache_beam.testing.util import equal_to
 from apache_beam.yaml.yaml_transform import YamlTransform

 # Importing tft doubles as an availability check: the name is not referenced
 # elsewhere in the visible module (hence the unused-import suppression below).
 # When the import fails, SkipTest is raised at module import time instead of
 # letting the ImportError propagate.
 try:
   # pylint: disable=wrong-import-order, wrong-import-position, unused-import
   from apache_beam.ml.transforms import tft
 except ImportError:
   raise unittest.SkipTest('tensorflow_transform is not installed.')

 # Rows fed to the pipelines that write MLTransform artifacts.
 TRAIN_DATA = [
     beam.Row(num=num, text=text) for num, text in (
         (0, 'And God said, Let there be light,'),
         (2, 'And there was light'),
         (8, 'And God saw the light, that it was good'),
     )
 ]

 # Single row fed to the pipelines that read the previously written artifacts.
 TEST_DATA = [
     beam.Row(num=num, text=text) for num, text in (
         (6, 'And God divided the light from the darkness.'),
     )
 ]


 class MLTransformTest(unittest.TestCase):
   """Runs YAML ``MLTransform`` specs through ``YamlTransform`` pipelines.

   Every test creates its own pipeline options (cloudpickle pickler plus the
   ``ML`` experimental YAML feature) and stores artifacts in a temporary
   directory that is cleaned up when the ``with`` block exits.
   """
   def test_ml_transform(self):
     # Writes ScaleTo01 / ComputeAndApplyVocabulary artifacts from TRAIN_DATA,
     # then applies the stored artifacts to TEST_DATA in a second pipeline.
     options = beam.options.pipeline_options.PipelineOptions(
         pickle_library='cloudpickle', yaml_experimental_features=['ML'])
     with tempfile.TemporaryDirectory() as tempdir:
       write_spec = f'''
             type: MLTransform
             config:
               write_artifact_location: {tempdir}
               transforms:
                 - type: ScaleTo01
                   config:
                     columns: [num]
                 - type: ComputeAndApplyVocabulary
                   config:
                     columns: [text]
                     split_string_by_delimiter: ' ,.'
             '''
       read_spec = f'''
             type: MLTransform
             config:
               read_artifact_location: {tempdir}
             '''

       with beam.Pipeline(options=options) as pipeline:
         train_rows = pipeline | beam.Create(TRAIN_DATA)
         transformed = train_rows | YamlTransform(write_spec)
         # Open question kept from the original author: why is `num` an array
         # here rather than a scalar?
         scaled = transformed | beam.Map(lambda row: row.num[0])
         assert_that(scaled, equal_to([0, .25, 1]))
         # Union of the vocabulary ids seen across all training rows.
         vocab_ids = (
             transformed
             | beam.Map(lambda row: set(row.text))
             | beam.CombineGlobally(lambda id_sets: set.union(*id_sets)))
         assert_that(vocab_ids, equal_to([set(range(13))]), label='CheckVocab')

       with beam.Pipeline(options=options) as pipeline:
         test_rows = pipeline | beam.Create(TEST_DATA)
         transformed = test_rows | YamlTransform(read_spec)
         scaled = transformed | beam.Map(lambda row: row.num[0])
         assert_that(scaled, equal_to([.75]))
         distinct_id_counts = transformed | beam.Map(
             lambda row: len(set(row.text)))
         assert_that(distinct_id_counts, equal_to([5]), label='CheckVocab')

   def test_ml_transform_read_with_map_to_fields(self):
     # Same write step as above, but the read step is chained with a python
     # MapToFields that renames/derives fields from the MLTransform output.
     options = beam.options.pipeline_options.PipelineOptions(
         pickle_library='cloudpickle', yaml_experimental_features=['ML'])

     def verify_row(row):
       # Returns the scaled value so assert_that has something to compare.
       assert row.num_scaled == 0.75
       assert len(set(row.text_vocab)) == 5
       return row.num_scaled

     with tempfile.TemporaryDirectory() as tempdir:
       write_spec = f'''
             type: MLTransform
             config:
               write_artifact_location: {tempdir}
               transforms:
                 - type: ScaleTo01
                   config:
                     columns: [num]
                 - type: ComputeAndApplyVocabulary
                   config:
                     columns: [text]
                     split_string_by_delimiter: ' ,.'
             '''
       read_and_map_spec = f'''
             type: chain
             transforms:
               - type: MLTransform
                 config:
                   read_artifact_location: {tempdir}
               - type: MapToFields
                 config:
                   language: python
                   fields:
                     num_scaled: "num[0]"
                     text_vocab: text
             '''

       # Step 1: produce the artifacts; the transformed output is discarded.
       with beam.Pipeline(options=options) as pipeline:
         train_rows = pipeline | beam.Create(TRAIN_DATA)
         _ = train_rows | YamlTransform(write_spec)

       # Step 2: read the artifacts back and post-process with MapToFields.
       with beam.Pipeline(options=options) as pipeline:
         test_rows = pipeline | beam.Create(TEST_DATA)
         mapped = test_rows | YamlTransform(read_and_map_spec)
         assert_that(mapped | beam.Map(verify_row), equal_to([0.75]))

   def test_sentence_transformer_embedding(self):
     # Dict inputs: log_message is expected to come back as an embedding of
     # the length asserted below.
     embedding_dim = 384
     records = [
         {
             'id': 1, 'log_message': "Error in module A"
         },
         {
             'id': 2, 'log_message': "Warning in module B"
         },
         {
             'id': 3, 'log_message': "Info in module C"
         },
     ]
     options = beam.options.pipeline_options.PipelineOptions(
         pickle_library='cloudpickle', yaml_experimental_features=['ML'])
     with tempfile.TemporaryDirectory() as tempdir:
       embed_spec = f'''
             type: MLTransform
             config:
               write_artifact_location: {tempdir}
               transforms:
                 - type: SentenceTransformerEmbeddings
                   config:
                     model_name: all-MiniLM-L6-v2
                     columns: [log_message]
             '''
       with beam.Pipeline(options=options) as pipeline:
         embedded = pipeline | beam.Create(records) | YamlTransform(embed_spec)

         # Basic sanity check only: an embedding exists for every input and
         # has the expected number of dimensions.
         embedding_lengths = embedded | beam.Map(
             lambda record: len(record['log_message']))
         assert_that(
             embedding_lengths, equal_to([embedding_dim] * len(records)))

   def test_sentence_transformer_embedding_with_beam_rows(self):
     # Same check as the dict-based test, but with beam.Row inputs and
     # attribute access on the output.
     embedding_dim = 384
     rows = [
         beam.Row(id=1, log_message="Error in module A"),
         beam.Row(id=2, log_message="Warning in module B"),
         beam.Row(id=3, log_message="Info in module C"),
     ]
     options = beam.options.pipeline_options.PipelineOptions(
         pickle_library='cloudpickle', yaml_experimental_features=['ML'])
     with tempfile.TemporaryDirectory() as tempdir:
       embed_spec = f'''
             type: MLTransform
             config:
               write_artifact_location: {tempdir}
               transforms:
                 - type: SentenceTransformerEmbeddings
                   config:
                     model_name: all-MiniLM-L6-v2
                     columns: [log_message]
             '''
       with beam.Pipeline(options=options) as pipeline:
         embedded = pipeline | beam.Create(rows) | YamlTransform(embed_spec)

         # Basic sanity check only: an embedding exists for every input and
         # has the expected number of dimensions.
         embedding_lengths = embedded | beam.Map(
             lambda row: len(row.log_message))
         assert_that(embedding_lengths, equal_to([embedding_dim] * len(rows)))

   def test_ml_transform_outputs_schema(self):
     # Whole flow expressed in YAML (Create -> MLTransform -> MapToFields);
     # MapToFields addresses the MLTransform output fields by name.
     embedding_dim = 384
     options = beam.options.pipeline_options.PipelineOptions(
         pickle_library='cloudpickle', yaml_experimental_features=['ML'])

     def verify_row(row):
       # Returns the id so assert_that can check that every row came through.
       assert isinstance(row.id, int)
       assert isinstance(row.embedding, list)
       assert len(row.embedding) == embedding_dim
       return row.id

     with tempfile.TemporaryDirectory() as tempdir:
       chain_spec = f'''
             type: chain
             transforms:
               - type: Create
                 config:
                   elements:
                     - {{id: 1, log_message: "Error in module A"}}
                     - {{id: 2, log_message: "Warning in module B"}}
                     - {{id: 3, log_message: "Info in module C"}}
               - type: MLTransform
                 config:
                   write_artifact_location: {tempdir}
                   transforms:
                     - type: SentenceTransformerEmbeddings
                       config:
                         model_name: all-MiniLM-L6-v2
                         columns: [log_message]
               - type: MapToFields
                 config:
                   language: python
                   fields:
                     id: id
                     embedding: log_message
             '''
       with beam.Pipeline(options=options) as pipeline:
         mapped = pipeline | YamlTransform(chain_spec)
         assert_that(mapped | beam.Map(verify_row), equal_to([1, 2, 3]))


 if __name__ == '__main__':
   # Raise the root logger to INFO before handing control to unittest.
   root_logger = logging.getLogger()
   root_logger.setLevel(logging.INFO)
   unittest.main()
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import logging
	import tempfile
	import unittest

	import apache_beam as beam
	from apache_beam.testing.util import assert_that
	from apache_beam.testing.util import equal_to
	from apache_beam.yaml.yaml_transform import YamlTransform

	try:
	# pylint: disable=wrong-import-order, wrong-import-position, unused-import
	from apache_beam.ml.transforms import tft
	except ImportError:
	raise unittest.SkipTest('tensorflow_transform is not installed.')

	TRAIN_DATA = [
	beam.Row(num=0, text='And God said, Let there be light,'),
	beam.Row(num=2, text='And there was light'),
	beam.Row(num=8, text='And God saw the light, that it was good'),
	]

	TEST_DATA = [
	beam.Row(num=6, text='And God divided the light from the darkness.'),
	]


	class MLTransformTest(unittest.TestCase):
	def test_ml_transform(self):
	ml_opts = beam.options.pipeline_options.PipelineOptions(
	pickle_library='cloudpickle', yaml_experimental_features=['ML'])
	with tempfile.TemporaryDirectory() as tempdir:
	with beam.Pipeline(options=ml_opts) as p:
	elements = p \| beam.Create(TRAIN_DATA)
	result = elements \| YamlTransform(
	f'''
	type: MLTransform
	config:
	write_artifact_location: {tempdir}
	transforms:
	- type: ScaleTo01
	config:
	columns: [num]
	- type: ComputeAndApplyVocabulary
	config:
	columns: [text]
	split_string_by_delimiter: ' ,.'
	''')
	assert_that(
	# Why is this an array, not a scalar?
	result \| beam.Map(lambda x: x.num[0]),
	equal_to([0, .25, 1]))
	assert_that(
	result \| beam.Map(lambda x: set(x.text))
	\| beam.CombineGlobally(lambda xs: set.union(*xs)),
	equal_to([set(range(13))]),
	label='CheckVocab')

	with beam.Pipeline(options=ml_opts) as p:
	elements = p \| beam.Create(TEST_DATA)
	result = elements \| YamlTransform(
	f'''
	type: MLTransform
	config:
	read_artifact_location: {tempdir}
	''')
	assert_that(result \| beam.Map(lambda x: x.num[0]), equal_to([.75]))
	assert_that(
	result \| beam.Map(lambda x: len(set(x.text))),
	equal_to([5]),
	label='CheckVocab')

	def test_ml_transform_read_with_map_to_fields(self):
	ml_opts = beam.options.pipeline_options.PipelineOptions(
	pickle_library='cloudpickle', yaml_experimental_features=['ML'])
	with tempfile.TemporaryDirectory() as tempdir:
	# First, write the artifacts.
	with beam.Pipeline(options=ml_opts) as p:
	elements = p \| beam.Create(TRAIN_DATA)
	_ = elements \| YamlTransform(
	f'''
	type: MLTransform
	config:
	write_artifact_location: {tempdir}
	transforms:
	- type: ScaleTo01
	config:
	columns: [num]
	- type: ComputeAndApplyVocabulary
	config:
	columns: [text]
	split_string_by_delimiter: ' ,.'
	''')

	# Now, read the artifacts and use MapToFields.
	with beam.Pipeline(options=ml_opts) as p:
	elements = p \| beam.Create(TEST_DATA)
	result = elements \| YamlTransform(
	f'''
	type: chain
	transforms:
	- type: MLTransform
	config:
	read_artifact_location: {tempdir}
	- type: MapToFields
	config:
	language: python
	fields:
	num_scaled: "num[0]"
	text_vocab: text
	''')

	def check_row(row):
	assert row.num_scaled == 0.75
	assert len(set(row.text_vocab)) == 5
	return row.num_scaled

	assert_that(result \| beam.Map(check_row), equal_to([0.75]))

	def test_sentence_transformer_embedding(self):
	SENTENCE_EMBEDDING_DIMENSION = 384
	DATA = [{
	'id': 1, 'log_message': "Error in module A"
	}, {
	'id': 2, 'log_message': "Warning in module B"
	}, {
	'id': 3, 'log_message': "Info in module C"
	}]
	ml_opts = beam.options.pipeline_options.PipelineOptions(
	pickle_library='cloudpickle', yaml_experimental_features=['ML'])
	with tempfile.TemporaryDirectory() as tempdir:
	with beam.Pipeline(options=ml_opts) as p:
	elements = p \| beam.Create(DATA)
	result = elements \| YamlTransform(
	f'''
	type: MLTransform
	config:
	write_artifact_location: {tempdir}
	transforms:
	- type: SentenceTransformerEmbeddings
	config:
	model_name: all-MiniLM-L6-v2
	columns: [log_message]
	''')

	# Perform a basic check to ensure that embeddings are generated
	# and that the dimension of those embeddings is correct.
	actual_output = result \| beam.Map(lambda x: len(x['log_message']))
	assert_that(
	actual_output, equal_to([SENTENCE_EMBEDDING_DIMENSION] * len(DATA)))

	def test_sentence_transformer_embedding_with_beam_rows(self):
	SENTENCE_EMBEDDING_DIMENSION = 384
	DATA = [
	beam.Row(id=1, log_message="Error in module A"),
	beam.Row(id=2, log_message="Warning in module B"),
	beam.Row(id=3, log_message="Info in module C"),
	]
	ml_opts = beam.options.pipeline_options.PipelineOptions(
	pickle_library='cloudpickle', yaml_experimental_features=['ML'])
	with tempfile.TemporaryDirectory() as tempdir:
	with beam.Pipeline(options=ml_opts) as p:
	elements = p \| beam.Create(DATA)
	result = elements \| YamlTransform(
	f'''
	type: MLTransform
	config:
	write_artifact_location: {tempdir}
	transforms:
	- type: SentenceTransformerEmbeddings
	config:
	model_name: all-MiniLM-L6-v2
	columns: [log_message]
	''')

	# Perform a basic check to ensure that embeddings are generated
	# and that the dimension of those embeddings is correct.
	actual_output = result \| beam.Map(lambda x: len(x.log_message))
	assert_that(
	actual_output, equal_to([SENTENCE_EMBEDDING_DIMENSION] * len(DATA)))

	def test_ml_transform_outputs_schema(self):
	SENTENCE_EMBEDDING_DIMENSION = 384
	ml_opts = beam.options.pipeline_options.PipelineOptions(
	pickle_library='cloudpickle', yaml_experimental_features=['ML'])
	with tempfile.TemporaryDirectory() as tempdir:
	with beam.Pipeline(options=ml_opts) as p:
	result = p \| YamlTransform(
	f'''
	type: chain
	transforms:
	- type: Create
	config:
	elements:
	- {{id: 1, log_message: "Error in module A"}}
	- {{id: 2, log_message: "Warning in module B"}}
	- {{id: 3, log_message: "Info in module C"}}
	- type: MLTransform
	config:
	write_artifact_location: {tempdir}
	transforms:
	- type: SentenceTransformerEmbeddings
	config:
	model_name: all-MiniLM-L6-v2
	columns: [log_message]
	- type: MapToFields
	config:
	language: python
	fields:
	id: id
	embedding: log_message
	''')

	def check_row(row):
	assert isinstance(row.id, int)
	assert isinstance(row.embedding, list)
	assert len(row.embedding) == SENTENCE_EMBEDDING_DIMENSION
	return row.id

	assert_that(result \| beam.Map(check_row), equal_to([1, 2, 3]))


	if __name__ == '__main__':
	logging.getLogger().setLevel(logging.INFO)
	unittest.main()