| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| # --------------------------------------------------------------------- |
| # Implement Internal ExecPlan bindings |
| |
| # cython: profile=False |
| # distutils: language = c++ |
| # cython: language_level = 3 |
| |
| from cython.operator cimport dereference as deref, preincrement as inc |
| |
| from pyarrow.includes.common cimport * |
| from pyarrow.includes.libarrow cimport * |
| from pyarrow.includes.libarrow_dataset cimport * |
| from pyarrow.lib cimport (Table, check_status, pyarrow_unwrap_table, pyarrow_wrap_table) |
| from pyarrow.lib import tobytes |
| from pyarrow._compute cimport Expression, _true |
| from pyarrow._dataset cimport Dataset |
| from pyarrow._dataset import InMemoryDataset |
| |
# Module-level side effect: enable Dataset support in ExecPlan (presumably
# registers the "scan" node factory used below — confirm in libarrow_dataset).
Initialize()
| |
| |
cdef execplan(inputs, output_type, vector[CDeclaration] plan, c_bool use_threads=True):
    """
    Internal function to create an ExecPlan and run it to completion.

    A source node is created for each entry in ``inputs`` (a "table_source"
    node for Tables, a "scan" node for Datasets) and attached as input of the
    first declaration in ``plan``.  A "sink" node is appended at the end and
    the plan is executed synchronously, collecting all batches into a Table.

    Parameters
    ----------
    inputs : list of Table or Dataset
        The sources from which the ExecPlan should fetch data.
        In most cases this is only one, unless the first node of the
        plan is able to get data from multiple different sources.
    output_type : Table or InMemoryDataset
        In which format the output should be provided.
    plan : vector[CDeclaration]
        The nodes of the plan that should be applied to the sources
        to produce the output.
    use_threads : bool, default True
        Whether to use multithreading or not.

    Returns
    -------
    Table or InMemoryDataset
        The materialized result of the plan, in the requested format.

    Raises
    ------
    TypeError
        If an input is neither a Table nor a Dataset, or if ``output_type``
        is not Table or InMemoryDataset.
    """
    cdef:
        CExecutor *c_executor
        shared_ptr[CExecContext] c_exec_context
        shared_ptr[CExecPlan] c_exec_plan
        vector[CDeclaration] c_decls
        vector[CExecNode*] _empty
        vector[CExecNode*] c_final_node_vec
        CExecNode *c_node
        CTable* c_table
        shared_ptr[CTable] c_in_table
        shared_ptr[CTable] c_out_table
        shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
        shared_ptr[CScanNodeOptions] c_scanopts
        shared_ptr[CExecNodeOptions] c_input_node_opts
        shared_ptr[CSinkNodeOptions] c_sinkopts
        shared_ptr[CAsyncExecBatchGenerator] c_async_exec_batch_gen
        shared_ptr[CRecordBatchReader] c_recordbatchreader
        vector[CDeclaration].iterator plan_iter
        vector[CDeclaration.Input] no_c_inputs
        CStatus c_plan_status

    # A NULL executor makes the plan run serially on the calling thread.
    if use_threads:
        c_executor = GetCpuThreadPool()
    else:
        c_executor = NULL

    c_exec_context = make_shared[CExecContext](
        c_default_memory_pool(), c_executor)
    c_exec_plan = GetResultValue(CExecPlan.Make(c_exec_context.get()))

    plan_iter = plan.begin()

    # Create source nodes for each input
    for ipt in inputs:
        if isinstance(ipt, Table):
            node_factory = "table_source"
            c_in_table = pyarrow_unwrap_table(ipt)
            # 1 << 20 caps the rows per emitted batch — presumably the
            # TableSourceNodeOptions max_batch_size; confirm against the
            # C++ declaration.
            c_tablesourceopts = make_shared[CTableSourceNodeOptions](
                c_in_table, 1 << 20)
            c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions](
                c_tablesourceopts)
        elif isinstance(ipt, Dataset):
            node_factory = "scan"
            c_in_dataset = (<Dataset>ipt).unwrap()
            c_scanopts = make_shared[CScanNodeOptions](
                c_in_dataset, make_shared[CScanOptions]())
            # Propagate the threading choice into the dataset scanner too.
            deref(deref(c_scanopts).scan_options).use_threads = use_threads
            c_input_node_opts = static_pointer_cast[CExecNodeOptions, CScanNodeOptions](
                c_scanopts)
        else:
            raise TypeError("Unsupported type")

        if plan_iter != plan.end():
            # Flag the source as the input of the first plan node.
            deref(plan_iter).inputs.push_back(CDeclaration.Input(
                CDeclaration(tobytes(node_factory),
                             no_c_inputs, c_input_node_opts)
            ))
        else:
            # Empty plan, make the source the first plan node.
            c_decls.push_back(
                CDeclaration(tobytes(node_factory),
                             no_c_inputs, c_input_node_opts)
            )

    # Append the caller-provided declarations after the source node(s).
    while plan_iter != plan.end():
        c_decls.push_back(deref(plan_iter))
        inc(plan_iter)

    # Add all CDeclarations to the plan; c_node ends up pointing at the
    # last node of the sequence, whose output_schema() is used below.
    c_node = GetResultValue(
        CDeclaration.Sequence(c_decls).AddToPlan(&deref(c_exec_plan))
    )
    c_final_node_vec.push_back(c_node)

    # Create the output node: a sink that pushes batches into an
    # asynchronous generator which we consume synchronously below.
    c_async_exec_batch_gen = make_shared[CAsyncExecBatchGenerator]()
    c_sinkopts = make_shared[CSinkNodeOptions](c_async_exec_batch_gen.get())
    GetResultValue(
        MakeExecNode(tobytes("sink"), &deref(c_exec_plan),
                     c_final_node_vec, deref(c_sinkopts))
    )

    # Convert the asyncgenerator to a sync batch reader
    c_recordbatchreader = MakeGeneratorReader(c_node.output_schema(),
                                              deref(c_async_exec_batch_gen),
                                              deref(c_exec_context).memory_pool())

    # Start execution of the ExecPlan
    # NOTE(review): the Status returned by Validate()/StartProducing() is
    # discarded here — errors appear to surface only via finished().status()
    # below; confirm this is intended.
    deref(c_exec_plan).Validate()
    deref(c_exec_plan).StartProducing()

    # Convert output to the expected one.  Draining the reader here is what
    # actually pulls all batches through the plan.
    c_out_table = GetResultValue(
        CTable.FromRecordBatchReader(c_recordbatchreader.get()))
    if output_type == Table:
        output = pyarrow_wrap_table(c_out_table)
    elif output_type == InMemoryDataset:
        output = InMemoryDataset(pyarrow_wrap_table(c_out_table))
    else:
        raise TypeError("Unsupported output type")

    # Wait for the plan to finish and propagate any execution error
    # (check_status raises the mapped Python exception on failure).
    with nogil:
        c_plan_status = deref(c_exec_plan).finished().status()
    check_status(c_plan_status)

    return output
| |
| |
def _perform_join(join_type, left_operand not None, left_keys,
                  right_operand not None, right_keys,
                  left_suffix=None, right_suffix=None,
                  use_threads=True, coalesce_keys=False,
                  output_type=Table):
    """
    Perform join of two tables or datasets.

    The result will be an output table with the result of the join operation

    Parameters
    ----------
    join_type : str
        One of supported join types.
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_keys : str or list[str]
        The left key (or keys) on which the join operation should be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_keys : str or list[str]
        The right key (or keys) on which the join operation should be performed.
    left_suffix : str, default None
        Which suffix to add to the left column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    right_suffix : str, default None
        Which suffix to add to the right column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    use_threads : bool, default True
        Whether to use multithreading or not.
    coalesce_keys : bool, default False
        If the duplicated keys should be omitted from one of the sides
        in the join result.
    output_type : Table or InMemoryDataset
        The output type for the exec plan result.

    Returns
    -------
    result_table : Table or InMemoryDataset
        The joined data, in the format requested via ``output_type``.

    Raises
    ------
    TypeError
        If an operand is neither a Table nor a Dataset.
    ValueError
        If ``join_type`` is not one of the supported join types.
    """
    cdef:
        vector[CFieldRef] c_left_keys
        vector[CFieldRef] c_right_keys
        vector[CFieldRef] c_left_columns
        vector[CFieldRef] c_right_columns
        vector[CDeclaration] c_decl_plan
        vector[CExpression] c_projections
        vector[c_string] c_projected_col_names
        CJoinType c_join_type

    # Prepare left and right table keys for the C++ join options, also
    # remembering each key's position so that full-outer coalescing below
    # can pair a left key with its corresponding right key.
    left_keys_order = {}
    if isinstance(left_keys, str):
        left_keys = [left_keys]
    for idx, key in enumerate(left_keys):
        left_keys_order[key] = idx
        c_left_keys.push_back(CFieldRef(<c_string>tobytes(key)))

    right_keys_order = {}
    if isinstance(right_keys, str):
        right_keys = [right_keys]
    for idx, key in enumerate(right_keys):
        right_keys_order[key] = idx
        c_right_keys.push_back(CFieldRef(<c_string>tobytes(key)))

    # By default expose all columns on both left and right table
    if isinstance(left_operand, Table):
        left_columns = left_operand.column_names
    elif isinstance(left_operand, Dataset):
        left_columns = left_operand.schema.names
    else:
        raise TypeError("Unsupported left join member type")

    if isinstance(right_operand, Table):
        right_columns = right_operand.column_names
    elif isinstance(right_operand, Dataset):
        right_columns = right_operand.schema.names
    else:
        raise TypeError("Unsupported right join member type")

    # Pick the join type.  Semi/anti joins only output one side's columns;
    # inner and one-sided outer joins drop the other side's key columns.
    # NOTE(review): the set differences below discard the original column
    # order (iteration order of a set) — confirm this is intended.
    if join_type == "left semi":
        c_join_type = CJoinType_LEFT_SEMI
        right_columns = []
    elif join_type == "right semi":
        c_join_type = CJoinType_RIGHT_SEMI
        left_columns = []
    elif join_type == "left anti":
        c_join_type = CJoinType_LEFT_ANTI
        right_columns = []
    elif join_type == "right anti":
        c_join_type = CJoinType_RIGHT_ANTI
        left_columns = []
    elif join_type == "inner":
        c_join_type = CJoinType_INNER
        right_columns = set(right_columns) - set(right_keys)
    elif join_type == "left outer":
        c_join_type = CJoinType_LEFT_OUTER
        right_columns = set(right_columns) - set(right_keys)
    elif join_type == "right outer":
        c_join_type = CJoinType_RIGHT_OUTER
        left_columns = set(left_columns) - set(left_keys)
    elif join_type == "full outer":
        c_join_type = CJoinType_FULL_OUTER
    else:
        raise ValueError("Unsupported join type")

    # Turn the columns to vectors of FieldRefs
    # and set aside indices of keys.
    left_column_keys_indices = {}
    for idx, colname in enumerate(left_columns):
        c_left_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
        if colname in left_keys:
            left_column_keys_indices[colname] = idx
    right_column_keys_indices = {}
    for idx, colname in enumerate(right_columns):
        c_right_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
        if colname in right_keys:
            right_column_keys_indices[colname] = idx

    # Add the join node to the execplan
    if coalesce_keys:
        # Explicit output columns are passed so key handling can be
        # customized by the projection added below for full outer joins.
        c_decl_plan.push_back(
            CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
                c_join_type, c_left_keys, c_right_keys,
                c_left_columns, c_right_columns,
                _true,
                <c_string>tobytes(left_suffix or ""),
                <c_string>tobytes(right_suffix or "")
            ))
        )
        if join_type == "full outer":
            # In case of full outer joins, the join operation will output all columns
            # so that we can coalesce the keys and exclude duplicates in a subsequent projection.
            left_columns_set = set(left_columns)
            right_columns_set = set(right_columns)
            # Where the right table columns start.
            right_operand_index = len(left_columns)
            for idx, col in enumerate(left_columns + right_columns):
                if idx < len(left_columns) and col in left_column_keys_indices:
                    # Include keys only once and coalesce left+right table keys.
                    c_projected_col_names.push_back(tobytes(col))
                    # Get the index of the right key that is being paired
                    # with this left key. We do so by retrieving the name
                    # of the right key that is in the same position in the provided keys
                    # and then looking up the index for that name in the right table.
                    right_key_index = right_column_keys_indices[right_keys[left_keys_order[col]]]
                    c_projections.push_back(Expression.unwrap(
                        Expression._call("coalesce", [
                            Expression._field(idx), Expression._field(
                                right_operand_index+right_key_index)
                        ])
                    ))
                elif idx >= right_operand_index and col in right_column_keys_indices:
                    # Do not include right table keys. As they would lead to duplicated keys.
                    continue
                else:
                    # For all the other columns include them as they are.
                    # Just recompute the suffixes that the join produced as the projection
                    # would lose them otherwise.
                    if left_suffix and idx < right_operand_index and col in right_columns_set:
                        col += left_suffix
                    if right_suffix and idx >= right_operand_index and col in left_columns_set:
                        col += right_suffix
                    c_projected_col_names.push_back(tobytes(col))
                    c_projections.push_back(
                        Expression.unwrap(Expression._field(idx)))
            c_decl_plan.push_back(
                CDeclaration(tobytes("project"), CProjectNodeOptions(
                    c_projections, c_projected_col_names))
            )
    else:
        # Without coalescing, let the join node use its default output
        # columns (this CHashJoinNodeOptions overload omits them).
        c_decl_plan.push_back(
            CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
                c_join_type, c_left_keys, c_right_keys,
                _true,
                <c_string>tobytes(left_suffix or ""),
                <c_string>tobytes(right_suffix or "")
            ))
        )

    # Run the join (and optional projection) over both operands.
    result_table = execplan([left_operand, right_operand],
                            plan=c_decl_plan,
                            output_type=output_type,
                            use_threads=use_threads)

    return result_table