| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| import logging |
| import tempfile |
| import unittest |
| |
| import apache_beam as beam |
| from apache_beam.testing.util import assert_that |
| from apache_beam.testing.util import equal_to |
| from apache_beam.yaml.yaml_transform import YamlTransform |
| |
| try: |
| # pylint: disable=wrong-import-order, wrong-import-position, unused-import |
| from apache_beam.ml.transforms import tft |
| except ImportError: |
| raise unittest.SkipTest('tensorflow_transform is not installed.') |
| |
# Small corpus used to fit the MLTransform artifacts (the scaling range for
# `num` and the vocabulary for `text`) via write_artifact_location.
TRAIN_DATA = [
    beam.Row(num=0, text='And God said, Let there be light,'),
    beam.Row(num=2, text='And there was light'),
    beam.Row(num=8, text='And God saw the light, that it was good'),
]

# Held-out row transformed with the artifacts written from TRAIN_DATA
# (read_artifact_location); num=6 scales to 0.75 within TRAIN_DATA's [0, 8].
TEST_DATA = [
    beam.Row(num=6, text='And God divided the light from the darkness.'),
]
| |
| |
class MLTransformTest(unittest.TestCase):
  """Integration tests for the YAML ``MLTransform`` provider.

  Each test constructs a pipeline from an inline YAML snippet via
  ``YamlTransform`` and checks the transformed output with ``assert_that``.
  All pipelines run with cloudpickle pickling and the 'ML' experimental
  YAML feature enabled.

  NOTE(review): leading indentation in this file was mangled by extraction;
  the YAML nesting inside the f-strings below was reconstructed and should
  be confirmed against the original source.
  """
  def test_ml_transform(self):
    """Round-trips TFT artifacts: fit on TRAIN_DATA, apply to TEST_DATA."""
    ml_opts = beam.options.pipeline_options.PipelineOptions(
        pickle_library='cloudpickle', yaml_experimental_features=['ML'])
    with tempfile.TemporaryDirectory() as tempdir:
      # Fit-and-transform pass: computes the ScaleTo01 range and the
      # vocabulary, writing both as artifacts under tempdir.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TRAIN_DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
                - type: ScaleTo01
                  config:
                    columns: [num]
                - type: ComputeAndApplyVocabulary
                  config:
                    columns: [text]
                    split_string_by_delimiter: ' ,.'
            ''')
        # num values 0, 2, 8 scale to 0, .25, 1.
        assert_that(
            # Why is this an array, not a scalar?
            result | beam.Map(lambda x: x.num[0]),
            equal_to([0, .25, 1]))
        # The union of all vocabulary indices over TRAIN_DATA should cover
        # every id 0..12 (13 distinct tokens after delimiter splitting).
        assert_that(
            result | beam.Map(lambda x: set(x.text))
            | beam.CombineGlobally(lambda xs: set.union(*xs)),
            equal_to([set(range(13))]),
            label='CheckVocab')

      # Transform-only pass: re-uses the artifacts written above to
      # transform the unseen TEST_DATA row.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TEST_DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              read_artifact_location: {tempdir}
            ''')
        # num=6 within the fitted range [0, 8] scales to 0.75.
        assert_that(result | beam.Map(lambda x: x.num[0]), equal_to([.75]))
        # The test sentence maps to 5 distinct vocabulary ids.
        assert_that(
            result | beam.Map(lambda x: len(set(x.text))),
            equal_to([5]),
            label='CheckVocab')

  def test_ml_transform_read_with_map_to_fields(self):
    """Reads previously-written artifacts, then reshapes with MapToFields."""
    ml_opts = beam.options.pipeline_options.PipelineOptions(
        pickle_library='cloudpickle', yaml_experimental_features=['ML'])
    with tempfile.TemporaryDirectory() as tempdir:
      # First, write the artifacts.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TRAIN_DATA)
        _ = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
                - type: ScaleTo01
                  config:
                    columns: [num]
                - type: ComputeAndApplyVocabulary
                  config:
                    columns: [text]
                    split_string_by_delimiter: ' ,.'
            ''')

      # Now, read the artifacts and use MapToFields.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TEST_DATA)
        result = elements | YamlTransform(
            f'''
            type: chain
            transforms:
              - type: MLTransform
                config:
                  read_artifact_location: {tempdir}
              - type: MapToFields
                config:
                  language: python
                  fields:
                    num_scaled: "num[0]"
                    text_vocab: text
            ''')

        def check_row(row):
          # Same expectations as test_ml_transform's read pass, but the
          # fields have been renamed/projected by MapToFields.
          assert row.num_scaled == 0.75
          assert len(set(row.text_vocab)) == 5
          return row.num_scaled

        assert_that(result | beam.Map(check_row), equal_to([0.75]))

  def test_sentence_transformer_embedding(self):
    """Embeds dict-shaped elements with SentenceTransformerEmbeddings."""
    # all-MiniLM-L6-v2 produces 384-dimensional embeddings.
    SENTENCE_EMBEDDING_DIMENSION = 384
    DATA = [{
        'id': 1, 'log_message': "Error in module A"
    }, {
        'id': 2, 'log_message': "Warning in module B"
    }, {
        'id': 3, 'log_message': "Info in module C"
    }]
    ml_opts = beam.options.pipeline_options.PipelineOptions(
        pickle_library='cloudpickle', yaml_experimental_features=['ML'])
    with tempfile.TemporaryDirectory() as tempdir:
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
                - type: SentenceTransformerEmbeddings
                  config:
                    model_name: all-MiniLM-L6-v2
                    columns: [log_message]
            ''')

        # Perform a basic check to ensure that embeddings are generated
        # and that the dimension of those embeddings is correct.
        actual_output = result | beam.Map(lambda x: len(x['log_message']))
        assert_that(
            actual_output, equal_to([SENTENCE_EMBEDDING_DIMENSION] * len(DATA)))

  def test_sentence_transformer_embedding_with_beam_rows(self):
    """Same as test_sentence_transformer_embedding, but with beam.Row input."""
    # all-MiniLM-L6-v2 produces 384-dimensional embeddings.
    SENTENCE_EMBEDDING_DIMENSION = 384
    DATA = [
        beam.Row(id=1, log_message="Error in module A"),
        beam.Row(id=2, log_message="Warning in module B"),
        beam.Row(id=3, log_message="Info in module C"),
    ]
    ml_opts = beam.options.pipeline_options.PipelineOptions(
        pickle_library='cloudpickle', yaml_experimental_features=['ML'])
    with tempfile.TemporaryDirectory() as tempdir:
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
                - type: SentenceTransformerEmbeddings
                  config:
                    model_name: all-MiniLM-L6-v2
                    columns: [log_message]
            ''')

        # Perform a basic check to ensure that embeddings are generated
        # and that the dimension of those embeddings is correct.
        actual_output = result | beam.Map(lambda x: len(x.log_message))
        assert_that(
            actual_output, equal_to([SENTENCE_EMBEDDING_DIMENSION] * len(DATA)))

  def test_ml_transform_outputs_schema(self):
    """Checks that MLTransform output is schema'd (usable by MapToFields)."""
    # all-MiniLM-L6-v2 produces 384-dimensional embeddings.
    SENTENCE_EMBEDDING_DIMENSION = 384
    ml_opts = beam.options.pipeline_options.PipelineOptions(
        pickle_library='cloudpickle', yaml_experimental_features=['ML'])
    with tempfile.TemporaryDirectory() as tempdir:
      with beam.Pipeline(options=ml_opts) as p:
        # Doubled braces ({{ }}) escape literal YAML mappings in the f-string.
        result = p | YamlTransform(
            f'''
            type: chain
            transforms:
              - type: Create
                config:
                  elements:
                    - {{id: 1, log_message: "Error in module A"}}
                    - {{id: 2, log_message: "Warning in module B"}}
                    - {{id: 3, log_message: "Info in module C"}}
              - type: MLTransform
                config:
                  write_artifact_location: {tempdir}
                  transforms:
                    - type: SentenceTransformerEmbeddings
                      config:
                        model_name: all-MiniLM-L6-v2
                        columns: [log_message]
              - type: MapToFields
                config:
                  language: python
                  fields:
                    id: id
                    embedding: log_message
            ''')

        def check_row(row):
          # id passes through untouched; log_message has been replaced by
          # its embedding vector of the expected dimension.
          assert isinstance(row.id, int)
          assert isinstance(row.embedding, list)
          assert len(row.embedding) == SENTENCE_EMBEDDING_DIMENSION
          return row.id

        assert_that(result | beam.Map(check_row), equal_to([1, 2, 3]))
| |
if __name__ == '__main__':
  # Surface INFO-level pipeline logs when the tests are run directly.
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()