blob: c7d30e1e7e76dc373786ab64cb1ef34bbcd4b8a0 [file] [log] [blame]
"""
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pyarrow as pa
from pypaimon import PaimonVirtualFileSystem, Schema
from pypaimon.table.object import ObjectTable
from pypaimon.tests.rest.rest_base_test import RESTBaseTest
class RESTObjectTableTest(RESTBaseTest):
    """Integration tests for ObjectTable behavior against a REST catalog.

    Files are staged into each table's storage location through the Paimon
    virtual filesystem, then read back through the table's read builder.
    """

    def setUp(self):
        """Start the REST test fixture and build a pvfs client from its options."""
        super().setUp()
        option_keys = ('uri', 'warehouse', 'dlf.region', 'token.provider', 'token')
        self.pvfs = PaimonVirtualFileSystem(
            {key: self.options[key] for key in option_keys}
        )

    def _create_object_table(self, table_name, extra_options=None):
        """Drop (if present) and recreate an object table, returning it.

        ObjectTable has a fixed schema and does not support custom fields;
        only options (including type=object-table) are supplied.
        """
        merged_options = {"type": "object-table", **(extra_options or {})}
        self.rest_catalog.drop_table(table_name, True)
        self.rest_catalog.create_table(table_name, Schema(options=merged_options), False)
        return self.rest_catalog.get_table(table_name)

    def _pvfs_table_path(self, table_name, sub_path=None):
        """Return the pvfs:// URI for a table, optionally extended by sub_path."""
        base = "pvfs://{}/{}".format(
            self.options['warehouse'], table_name.replace('.', '/')
        )
        return "{}/{}".format(base, sub_path) if sub_path else base

    def _write_file_via_pvfs(self, table_name, filename, content):
        """Write `content` to `filename` inside the table's location via pvfs."""
        target = self._pvfs_table_path(table_name, filename)
        # Create the parent directory first when the filename is nested.
        parent, separator, _ = filename.rpartition('/')
        if separator:
            self.pvfs.makedirs(
                self._pvfs_table_path(table_name, parent), exist_ok=True
            )
        with self.pvfs.open(target, 'wb') as handle:
            handle.write(content)

    def test_get_object_table(self):
        table_name = "default.object_table_basic"
        created = self._create_object_table(table_name)
        self.assertIsInstance(created, ObjectTable)
        self.assertEqual(created.name(), "object_table_basic")
        self.assertEqual(created.full_name(), table_name)
        self.assertEqual(created.partition_keys, [])
        self.assertEqual(created.primary_keys, [])
        self.assertEqual(created.options().get("type"), "object-table")

    def test_object_table_read_files(self):
        table_name = "default.object_table_read"
        table = self._create_object_table(table_name)
        # Stage fixture files into the table's location via pvfs.
        fixtures = {
            "file_a.txt": b"hello world",
            "file_b.dat": b"some binary data here",
        }
        for name, payload in fixtures.items():
            self._write_file_via_pvfs(table_name, name, payload)
        # Plan and read the object table.
        builder = table.new_read_builder()
        splits = builder.new_scan().plan().splits()
        self.assertEqual(len(splits), 1)
        arrow_table = builder.new_read().to_arrow(splits)
        self.assertIsInstance(arrow_table, pa.Table)
        # The listing includes schema/snapshot/manifest files too; only check
        # that our fixtures appear and that their reported sizes match.
        listed_names = arrow_table.column("name").to_pylist()
        for name in fixtures:
            self.assertIn(name, listed_names)
        listed_lengths = arrow_table.column("length").to_pylist()
        size_by_name = dict(zip(listed_names, listed_lengths))
        for name, payload in fixtures.items():
            self.assertEqual(size_by_name[name], len(payload))
        # The fixed object-table schema exposes these columns.
        for expected_column in ("path", "name", "length", "mtime", "atime", "owner"):
            self.assertIn(expected_column, arrow_table.column_names)

    def test_object_table_read_with_subdirectories(self):
        table_name = "default.object_table_subdir"
        table = self._create_object_table(table_name)
        # One file at the root and one inside a nested subdirectory.
        self._write_file_via_pvfs(table_name, "root_file.txt", b"root content")
        self._write_file_via_pvfs(table_name, "subdir/nested_file.txt", b"nested content")
        builder = table.new_read_builder()
        arrow_table = builder.new_read().to_arrow(
            builder.new_scan().plan().splits()
        )
        listed_names = arrow_table.column("name").to_pylist()
        self.assertIn("root_file.txt", listed_names)
        self.assertIn("nested_file.txt", listed_names)
        # The relative path of the nested file must carry its subdirectory.
        listed_paths = arrow_table.column("path").to_pylist()
        nested = [p for p in listed_paths if "nested_file.txt" in p]
        self.assertTrue(len(nested) > 0)
        self.assertIn("subdir/", nested[0])

    def test_object_table_with_projection(self):
        table_name = "default.object_table_projection"
        table = self._create_object_table(table_name)
        self._write_file_via_pvfs(table_name, "proj_test.txt", b"test data")
        # Project down to two columns and confirm the rest are absent.
        builder = table.new_read_builder()
        builder.with_projection(["name", "length"])
        arrow_table = builder.new_read().to_arrow(
            builder.new_scan().plan().splits()
        )
        self.assertEqual(arrow_table.column_names, ["name", "length"])
        for dropped_column in ("path", "mtime", "atime", "owner"):
            self.assertNotIn(dropped_column, arrow_table.column_names)
        # The projected length must still reflect the actual file size.
        listed_names = arrow_table.column("name").to_pylist()
        self.assertIn("proj_test.txt", listed_names)
        position = listed_names.index("proj_test.txt")
        self.assertEqual(
            arrow_table.column("length")[position].as_py(), len(b"test data")
        )

    def test_object_table_with_limit(self):
        table_name = "default.object_table_limit"
        table = self._create_object_table(table_name)
        for index in range(5):
            self._write_file_via_pvfs(
                table_name,
                "file_{}.txt".format(index),
                "content {}".format(index).encode(),
            )
        builder = table.new_read_builder()
        builder.with_limit(2)
        arrow_table = builder.new_read().to_arrow(
            builder.new_scan().plan().splits()
        )
        self.assertEqual(arrow_table.num_rows, 2)

    def test_object_table_options_and_copy(self):
        table_name = "default.object_table_options"
        original = self._create_object_table(
            table_name, extra_options={"custom.key": "custom_value"}
        )
        self.assertEqual(original.options().get("custom.key"), "custom_value")
        self.assertEqual(original.options().get("type"), "object-table")
        # copy() with dynamic options yields a new ObjectTable with merged options.
        with_extra = original.copy({"new.key": "new_value"})
        self.assertIsInstance(with_extra, ObjectTable)
        self.assertEqual(with_extra.options().get("custom.key"), "custom_value")
        self.assertEqual(with_extra.options().get("new.key"), "new_value")
        # The source table must be left untouched.
        self.assertIsNone(original.options().get("new.key"))
        # copy() may also override an existing option without mutating the source.
        with_override = original.copy({"custom.key": "overridden"})
        self.assertEqual(with_override.options().get("custom.key"), "overridden")
        self.assertEqual(original.options().get("custom.key"), "custom_value")

    def test_object_table_unsupported_write(self):
        table_name = "default.object_table_no_write"
        table = self._create_object_table(table_name)
        # Object tables are read-only: both write-builder factories must raise.
        for builder_factory in (
            table.new_batch_write_builder,
            table.new_stream_write_builder,
        ):
            with self.assertRaises(NotImplementedError):
                builder_factory()

    def test_object_table_unsupported_drop_partitions(self):
        table_name = "default.object_table_no_drop_partitions"
        self._create_object_table(table_name)
        with self.assertRaisesRegex(
            ValueError,
            "drop_partitions is not supported for table type 'ObjectTable'",
        ):
            self.rest_catalog.drop_partitions(
                table_name,
                [{"dt": "20250101"}],
            )

    def test_object_table_to_pandas(self):
        table_name = "default.object_table_pandas"
        table = self._create_object_table(table_name)
        self._write_file_via_pvfs(table_name, "pandas_test.txt", b"pandas data")
        builder = table.new_read_builder()
        frame = builder.new_read().to_pandas(
            builder.new_scan().plan().splits()
        )
        self.assertIn("name", frame.columns)
        self.assertIn("pandas_test.txt", frame["name"].values)