/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
syntax = 'proto3';
package spark.connect;
import "google/protobuf/any.proto";
import "spark/connect/expressions.proto";
import "spark/connect/types.proto";
import "spark/connect/catalog.proto";
option java_multiple_files = true;
option java_package = "org.apache.spark.connect.proto";
option go_package = "internal/generated";
// The main [[Relation]] type. Fundamentally, a relation is a typed container
// that has exactly one explicit relation type set.
//
// When adding new relation types, they have to be registered here.
message Relation {
RelationCommon common = 1;
oneof rel_type {
Read read = 2;
Project project = 3;
Filter filter = 4;
Join join = 5;
SetOperation set_op = 6;
Sort sort = 7;
Limit limit = 8;
Aggregate aggregate = 9;
SQL sql = 10;
LocalRelation local_relation = 11;
Sample sample = 12;
Offset offset = 13;
Deduplicate deduplicate = 14;
Range range = 15;
SubqueryAlias subquery_alias = 16;
Repartition repartition = 17;
ToDF to_df = 18;
WithColumnsRenamed with_columns_renamed = 19;
ShowString show_string = 20;
Drop drop = 21;
Tail tail = 22;
WithColumns with_columns = 23;
Hint hint = 24;
Unpivot unpivot = 25;
ToSchema to_schema = 26;
RepartitionByExpression repartition_by_expression = 27;
MapPartitions map_partitions = 28;
CollectMetrics collect_metrics = 29;
Parse parse = 30;
GroupMap group_map = 31;
CoGroupMap co_group_map = 32;
WithWatermark with_watermark = 33;
ApplyInPandasWithState apply_in_pandas_with_state = 34;
HtmlString html_string = 35;
CachedLocalRelation cached_local_relation = 36;
CachedRemoteRelation cached_remote_relation = 37;
CommonInlineUserDefinedTableFunction common_inline_user_defined_table_function = 38;
AsOfJoin as_of_join = 39;
CommonInlineUserDefinedDataSource common_inline_user_defined_data_source = 40;
WithRelations with_relations = 41;
// NA functions
NAFill fill_na = 90;
NADrop drop_na = 91;
NAReplace replace = 92;
// stat functions
StatSummary summary = 100;
StatCrosstab crosstab = 101;
StatDescribe describe = 102;
StatCov cov = 103;
StatCorr corr = 104;
StatApproxQuantile approx_quantile = 105;
StatFreqItems freq_items = 106;
StatSampleBy sample_by = 107;
// Catalog API (experimental / unstable)
Catalog catalog = 200;
// This field is used to mark extensions to the protocol. When plugins generate arbitrary
// relations, they can add them here. During planning, the correct resolution is performed.
google.protobuf.Any extension = 998;
Unknown unknown = 999;
}
}
// Used for testing purposes only.
message Unknown {}
// Common metadata of all relations.
message RelationCommon {
// (Required) Shared relation metadata.
string source_info = 1;
// (Optional) A per-client globally unique id for a given connect plan.
optional int64 plan_id = 2;
}
// Relation that uses a SQL query to generate the output.
message SQL {
// (Required) The SQL query.
string query = 1;
// (Optional) A map of parameter names to literal expressions.
map<string, Expression.Literal> args = 2 [deprecated=true];
// (Optional) A sequence of literal expressions for positional parameters in the SQL query text.
repeated Expression.Literal pos_args = 3 [deprecated=true];
// (Optional) A map of parameter names to expressions.
// It cannot coexist with `pos_arguments`.
map<string, Expression> named_arguments = 4;
// (Optional) A sequence of expressions for positional parameters in the SQL query text.
// It cannot coexist with `named_arguments`.
repeated Expression pos_arguments = 5;
}
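// Illustrative sketch (protobuf text format, not part of the schema): a [[SQL]]
// relation binding the named parameter `:min_id` to a literal. The `literal` and
// `long` field names are assumed from spark/connect/expressions.proto.
//
//   sql {
//     query: "SELECT * FROM range(10) WHERE id > :min_id"
//     named_arguments {
//       key: "min_id"
//       value { literal { long: 5 } }
//     }
//   }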
// Relation of type [[WithRelations]].
//
// This relation contains a root plan, and one or more references that are used by the root plan.
// There are two ways of referencing a relation, by name (through a subquery alias), or by plan_id
// (using RelationCommon.plan_id).
//
// This relation can be used to implement CTEs, describe DAGs, or to reduce tree depth.
message WithRelations {
// (Required) Plan at the root of the query tree. This plan is expected to contain one or more
// references. Those references get expanded later on by the engine.
Relation root = 1;
// (Required) Plans referenced by the root plan. Relations in this list are also allowed to
// contain references to other relations in this list, as long as they do not form cycles.
repeated Relation references = 2;
}
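// Illustrative sketch (protobuf text format, not part of the schema): a CTE-like
// plan whose root reads the name "t", which is provided by a referenced
// [[SubqueryAlias]]:
//
//   with_relations {
//     root { read { named_table { unparsed_identifier: "t" } } }
//     references {
//       subquery_alias {
//         alias: "t"
//         input { range { end: 10 step: 1 } }
//       }
//     }
//   }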
// Relation that reads from a file / table or other data source. Does not have additional
// inputs.
message Read {
oneof read_type {
NamedTable named_table = 1;
DataSource data_source = 2;
}
// (Optional) Indicates if this is a streaming read.
bool is_streaming = 3;
message NamedTable {
// (Required) Unparsed identifier for the table.
string unparsed_identifier = 1;
// Options for the named table. The map key is case insensitive.
map<string, string> options = 2;
}
message DataSource {
// (Optional) Supported formats include: parquet, orc, text, json, csv, avro.
//
// If not set, the value from SQL conf 'spark.sql.sources.default' will be used.
optional string format = 1;
// (Optional) If not set, Spark will infer the schema.
//
// This schema string should be either DDL-formatted or JSON-formatted.
optional string schema = 2;
// Options for the data source. The content of this map varies based on the
// data source format. These options may be empty for a valid data source format.
// The map key is case insensitive.
map<string, string> options = 3;
// (Optional) A list of paths for file-system backed data sources.
repeated string paths = 4;
// (Optional) Condition in the where clause for each partition.
//
// This is only supported by the JDBC data source.
repeated string predicates = 5;
}
}
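// Illustrative sketch (protobuf text format, not part of the schema): a batch
// CSV read with an explicit DDL schema, a parser option, and two file paths:
//
//   read {
//     data_source {
//       format: "csv"
//       schema: "id INT, name STRING"
//       options { key: "header" value: "true" }
//       paths: "/data/part-0.csv"
//       paths: "/data/part-1.csv"
//     }
//   }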
// Projection of a bag of expressions for a given input relation.
//
// The projected expressions can be arbitrary expressions.
message Project {
// (Optional) Input relation is optional for Project.
//
// For example, `SELECT ABS(-1)` is a valid plan without an input plan.
Relation input = 1;
// (Required) A Project requires at least one expression.
repeated Expression expressions = 3;
}
// Relation that applies a boolean expression `condition` on each row of `input` to produce
// the output result.
message Filter {
// (Required) Input relation for a Filter.
Relation input = 1;
// (Required) A Filter must have a condition expression.
Expression condition = 2;
}
// Relation of type [[Join]].
//
// `left` and `right` must be present.
message Join {
// (Required) Left input relation for a Join.
Relation left = 1;
// (Required) Right input relation for a Join.
Relation right = 2;
// (Optional) The join condition. Could be unset when `using_columns` is utilized.
//
// This field does not co-exist with using_columns.
Expression join_condition = 3;
// (Required) The join type.
JoinType join_type = 4;
// (Optional) `using_columns` provides a list of columns that should be present on both sides of
// the join inputs that this Join will join on. For example, A JOIN B USING col_name is
// equivalent to A JOIN B ON A.col_name = B.col_name.
//
// This field does not co-exist with join_condition.
repeated string using_columns = 5;
enum JoinType {
JOIN_TYPE_UNSPECIFIED = 0;
JOIN_TYPE_INNER = 1;
JOIN_TYPE_FULL_OUTER = 2;
JOIN_TYPE_LEFT_OUTER = 3;
JOIN_TYPE_RIGHT_OUTER = 4;
JOIN_TYPE_LEFT_ANTI = 5;
JOIN_TYPE_LEFT_SEMI = 6;
JOIN_TYPE_CROSS = 7;
}
// (Optional) Only used by joinWith. Set the left and right join data types.
optional JoinDataType join_data_type = 6;
message JoinDataType {
// If the left data type is a struct.
bool is_left_struct = 1;
// If the right data type is a struct.
bool is_right_struct = 2;
}
}
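// Illustrative sketch (protobuf text format, not part of the schema):
// `A JOIN B USING (col_name)` expressed with `using_columns` instead of an
// explicit join condition:
//
//   join {
//     left { read { named_table { unparsed_identifier: "A" } } }
//     right { read { named_table { unparsed_identifier: "B" } } }
//     join_type: JOIN_TYPE_INNER
//     using_columns: "col_name"
//   }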
// Relation of type [[SetOperation]]
message SetOperation {
// (Required) Left input relation for a Set operation.
Relation left_input = 1;
// (Required) Right input relation for a Set operation.
Relation right_input = 2;
// (Required) The Set operation type.
SetOpType set_op_type = 3;
// (Optional) Whether to keep all rows, including duplicates.
//
// True to preserve all results.
// False to remove duplicate rows.
optional bool is_all = 4;
// (Optional) Whether to perform the Set operation based on name resolution.
//
// Only UNION supports this option.
optional bool by_name = 5;
// (Optional) Whether to perform the Set operation while allowing missing columns.
//
// Only UNION supports this option.
optional bool allow_missing_columns = 6;
enum SetOpType {
SET_OP_TYPE_UNSPECIFIED = 0;
SET_OP_TYPE_INTERSECT = 1;
SET_OP_TYPE_UNION = 2;
SET_OP_TYPE_EXCEPT = 3;
}
}
// Relation of type [[Limit]] that is used to `limit` rows from the input relation.
message Limit {
// (Required) Input relation for a Limit.
Relation input = 1;
// (Required) The limit.
int32 limit = 2;
}
// Relation of type [[Offset]] that is used to read rows starting from the `offset` on
// the input relation.
message Offset {
// (Required) Input relation for an Offset.
Relation input = 1;
// (Required) The offset.
int32 offset = 2;
}
// Relation of type [[Tail]] that is used to fetch the last `limit` rows of the input relation.
message Tail {
// (Required) Input relation for a Tail.
Relation input = 1;
// (Required) The limit.
int32 limit = 2;
}
// Relation of type [[Aggregate]].
message Aggregate {
// (Required) Input relation for a RelationalGroupedDataset.
Relation input = 1;
// (Required) How the RelationalGroupedDataset was built.
GroupType group_type = 2;
// (Required) Expressions for grouping keys
repeated Expression grouping_expressions = 3;
// (Required) List of values that will be translated to columns in the output DataFrame.
repeated Expression aggregate_expressions = 4;
// (Optional) Pivots a column of the current `DataFrame` and performs the specified aggregation.
Pivot pivot = 5;
// (Optional) The grouping sets. Only used when group_type is GROUP_TYPE_GROUPING_SETS.
repeated GroupingSets grouping_sets = 6;
enum GroupType {
GROUP_TYPE_UNSPECIFIED = 0;
GROUP_TYPE_GROUPBY = 1;
GROUP_TYPE_ROLLUP = 2;
GROUP_TYPE_CUBE = 3;
GROUP_TYPE_PIVOT = 4;
GROUP_TYPE_GROUPING_SETS = 5;
}
message Pivot {
// (Required) The column to pivot
Expression col = 1;
// (Optional) List of values that will be translated to columns in the output DataFrame.
//
// Note that if it is empty, the server side will immediately trigger a job to collect
// the distinct values of the column.
repeated Expression.Literal values = 2;
}
message GroupingSets {
// (Required) Individual grouping set
repeated Expression grouping_set = 1;
}
}
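// Illustrative sketch (protobuf text format, not part of the schema) of a plain
// GROUP BY, roughly `SELECT region, sum(amount) FROM sales GROUP BY region`. The
// `unresolved_attribute` and `unresolved_function` field names are assumed from
// spark/connect/expressions.proto.
//
//   aggregate {
//     input { read { named_table { unparsed_identifier: "sales" } } }
//     group_type: GROUP_TYPE_GROUPBY
//     grouping_expressions {
//       unresolved_attribute { unparsed_identifier: "region" }
//     }
//     aggregate_expressions {
//       unresolved_function {
//         function_name: "sum"
//         arguments { unresolved_attribute { unparsed_identifier: "amount" } }
//       }
//     }
//   }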
// Relation of type [[Sort]].
message Sort {
// (Required) Input relation for a Sort.
Relation input = 1;
// (Required) The ordering expressions
repeated Expression.SortOrder order = 2;
// (Optional) Whether this is a global sort.
optional bool is_global = 3;
}
// Drop specified columns.
message Drop {
// (Required) The input relation.
Relation input = 1;
// (Optional) columns to drop.
repeated Expression columns = 2;
// (Optional) names of columns to drop.
repeated string column_names = 3;
}
// Relation of type [[Deduplicate]] that removes duplicate rows, considering either only
// a subset of the columns or all of the columns.
message Deduplicate {
// (Required) Input relation for a Deduplicate.
Relation input = 1;
// (Optional) Deduplicate based on a list of column names.
//
// This field cannot be used together with `all_columns_as_keys`.
repeated string column_names = 2;
// (Optional) Deduplicate based on all the columns of the input relation.
//
// This field cannot be used together with `column_names`.
optional bool all_columns_as_keys = 3;
// (Optional) Deduplicate within the time range of watermark.
optional bool within_watermark = 4;
}
// A relation that does not need to be qualified by name.
message LocalRelation {
// (Optional) Local collection data serialized into Arrow IPC streaming format which contains
// the schema of the data.
optional bytes data = 1;
// (Optional) The schema of local data.
// It should be either a DDL-formatted type string or a JSON string.
//
// The server side will update the column names and data types according to this schema.
// If the 'data' is not provided, then this schema will be required.
optional string schema = 2;
}
// A local relation that has been cached already.
message CachedLocalRelation {
// `userId` and `sessionId` fields are deleted since the server must always use the active
// session/user rather than arbitrary values provided by the client. It is never valid to access
// a local relation from a different session/user.
reserved 1, 2;
reserved "userId", "sessionId";
// (Required) A SHA-256 hash of the serialized local relation in proto, see LocalRelation.
string hash = 3;
}
// Represents a remote relation that has been cached on server.
message CachedRemoteRelation {
// (Required) ID of the remote relation (assigned by the service).
string relation_id = 1;
}
// Relation of type [[Sample]] that samples a fraction of the dataset.
message Sample {
// (Required) Input relation for a Sample.
Relation input = 1;
// (Required) lower bound.
double lower_bound = 2;
// (Required) upper bound.
double upper_bound = 3;
// (Optional) Whether to sample with replacement.
optional bool with_replacement = 4;
// (Required) The random seed.
// This field is required to avoid generating mutable DataFrames (see SPARK-48184 for details);
// however, it is kept 'optional' here for backward compatibility.
optional int64 seed = 5;
// (Required) Explicitly sort the underlying plan to make the ordering deterministic, or cache it.
// This flag is true when invoking `dataframe.randomSplit`, which randomly splits the DataFrame
// with the provided weights. Otherwise, it is false.
bool deterministic_order = 6;
}
// Relation of type [[Range]] that generates a sequence of integers.
message Range {
// (Optional) Default value = 0
optional int64 start = 1;
// (Required)
int64 end = 2;
// (Required)
int64 step = 3;
// (Optional) Default value is determined 1) by the SQL conf "spark.sql.leafNodeDefaultParallelism"
// if it is set, or 2) by the Spark default parallelism.
optional int32 num_partitions = 4;
}
// Relation alias.
message SubqueryAlias {
// (Required) The input relation of SubqueryAlias.
Relation input = 1;
// (Required) The alias.
string alias = 2;
// (Optional) Qualifier of the alias.
repeated string qualifier = 3;
}
// Relation repartition.
message Repartition {
// (Required) The input relation of Repartition.
Relation input = 1;
// (Required) Must be positive.
int32 num_partitions = 2;
// (Optional) Default value is false.
optional bool shuffle = 3;
}
// Compose the string representing rows for output.
// It will invoke 'Dataset.showString' to compute the results.
message ShowString {
// (Required) The input relation.
Relation input = 1;
// (Required) Number of rows to show.
int32 num_rows = 2;
// (Required) If set to more than 0, truncates strings to
// `truncate` characters and all cells will be aligned right.
int32 truncate = 3;
// (Required) If set to true, prints output rows vertically (one line per column value).
bool vertical = 4;
}
// Compose the string representing rows for output.
// It will invoke 'Dataset.htmlString' to compute the results.
message HtmlString {
// (Required) The input relation.
Relation input = 1;
// (Required) Number of rows to show.
int32 num_rows = 2;
// (Required) If set to more than 0, truncates strings to
// `truncate` characters and all cells will be aligned right.
int32 truncate = 3;
}
// Computes specified statistics for numeric and string columns.
// It will invoke 'Dataset.summary' (same as 'StatFunctions.summary')
// to compute the results.
message StatSummary {
// (Required) The input relation.
Relation input = 1;
// (Optional) Statistics to be computed.
//
// Available statistics are:
// count
// mean
// stddev
// min
// max
// arbitrary approximate percentiles specified as a percentage (e.g. 75%)
// count_distinct
// approx_count_distinct
//
// If no statistics are given, this function computes 'count', 'mean', 'stddev', 'min',
// 'approximate quartiles' (percentiles at 25%, 50%, and 75%), and 'max'.
repeated string statistics = 2;
}
// Computes basic statistics for numeric and string columns, including count, mean, stddev, min,
// and max. If no columns are given, this function computes statistics for all numerical or
// string columns.
message StatDescribe {
// (Required) The input relation.
Relation input = 1;
// (Optional) Columns to compute statistics on.
repeated string cols = 2;
}
// Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
// It will invoke 'Dataset.stat.crosstab' (same as 'StatFunctions.crossTabulate')
// to compute the results.
message StatCrosstab {
// (Required) The input relation.
Relation input = 1;
// (Required) The name of the first column.
//
// Distinct items will become the first item of each row.
string col1 = 2;
// (Required) The name of the second column.
//
// Distinct items will become the column names of the DataFrame.
string col2 = 3;
}
// Calculate the sample covariance of two numerical columns of a DataFrame.
// It will invoke 'Dataset.stat.cov' (same as 'StatFunctions.calculateCov') to compute the results.
message StatCov {
// (Required) The input relation.
Relation input = 1;
// (Required) The name of the first column.
string col1 = 2;
// (Required) The name of the second column.
string col2 = 3;
}
// Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
// Correlation Coefficient. It will invoke 'Dataset.stat.corr' (same as
// 'StatFunctions.pearsonCorrelation') to compute the results.
message StatCorr {
// (Required) The input relation.
Relation input = 1;
// (Required) The name of the first column.
string col1 = 2;
// (Required) The name of the second column.
string col2 = 3;
// (Optional) Default value is 'pearson'.
//
// Currently only supports the Pearson Correlation Coefficient.
optional string method = 4;
}
// Calculates the approximate quantiles of numerical columns of a DataFrame.
// It will invoke 'Dataset.stat.approxQuantile' (same as 'StatFunctions.approxQuantile')
// to compute the results.
message StatApproxQuantile {
// (Required) The input relation.
Relation input = 1;
// (Required) The names of the numerical columns.
repeated string cols = 2;
// (Required) A list of quantile probabilities.
//
// Each number must belong to [0, 1].
// For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
repeated double probabilities = 3;
// (Required) The relative target precision to achieve (greater than or equal to 0).
//
// If set to zero, the exact quantiles are computed, which could be very expensive.
// Note that values greater than 1 are accepted but give the same result as 1.
double relative_error = 4;
}
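// Illustrative sketch (protobuf text format, not part of the schema): the median
// and the 90th percentile of column "age", computed with 1% relative error:
//
//   approx_quantile {
//     input { read { named_table { unparsed_identifier: "people" } } }
//     cols: "age"
//     probabilities: 0.5
//     probabilities: 0.9
//     relative_error: 0.01
//   }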
// Finds frequent items for columns, possibly with false positives.
// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems')
// to compute the results.
message StatFreqItems {
// (Required) The input relation.
Relation input = 1;
// (Required) The names of the columns to search frequent items in.
repeated string cols = 2;
// (Optional) The minimum frequency for an item to be considered `frequent`.
// Should be greater than 1e-4.
optional double support = 3;
}
// Returns a stratified sample without replacement based on the fraction
// given on each stratum.
// It will invoke 'Dataset.stat.sampleBy' to compute the results.
message StatSampleBy {
// (Required) The input relation.
Relation input = 1;
// (Required) The column that defines strata.
Expression col = 2;
// (Required) Sampling fraction for each stratum.
//
// If a stratum is not specified, we treat its fraction as zero.
repeated Fraction fractions = 3;
// (Required) The random seed.
// This field is required to avoid generating mutable DataFrames (see SPARK-48184 for details);
// however, it is kept 'optional' here for backward compatibility.
optional int64 seed = 5;
message Fraction {
// (Required) The stratum.
Expression.Literal stratum = 1;
// (Required) The fraction value. Must be in [0, 1].
double fraction = 2;
}
}
// Replaces null values.
// It will invoke 'Dataset.na.fill' (same as 'DataFrameNaFunctions.fill') to compute the results.
// The following 3 parameter combinations are supported:
// 1. 'values' contains exactly 1 item and 'cols' is empty:
// replaces null values in all type-compatible columns.
// 2. 'values' contains exactly 1 item and 'cols' is not empty:
// replaces null values in the specified columns.
// 3. 'values' contains more than 1 item, in which case 'cols' is required to have the same length:
// replaces each specified column with the corresponding value.
// An illustrative example follows this message.
message NAFill {
// (Required) The input relation.
Relation input = 1;
// (Optional) List of column names to consider.
repeated string cols = 2;
// (Required) Values to replace null values with.
//
// Should contain at least 1 item.
// Only 4 data types are supported now: bool, long, double, string
repeated Expression.Literal values = 3;
}
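// Illustrative sketch (protobuf text format, not part of the schema) of
// combination 2 above, replacing nulls in columns "a" and "b" with the long
// value 0. The Literal `long` field name is assumed from
// spark/connect/expressions.proto.
//
//   fill_na {
//     input { read { named_table { unparsed_identifier: "t" } } }
//     cols: "a"
//     cols: "b"
//     values { long: 0 }
//   }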
// Drop rows containing null values.
// It will invoke 'Dataset.na.drop' (same as 'DataFrameNaFunctions.drop') to compute the results.
message NADrop {
// (Required) The input relation.
Relation input = 1;
// (Optional) List of column names to consider.
//
// When it is empty, all the columns in the input relation will be considered.
repeated string cols = 2;
// (Optional) The minimum number of non-null and non-NaN values required to keep.
//
// When not set, it is equivalent to the number of considered columns, which means
// a row will be kept only if all columns are non-null.
//
// 'how' options ('all', 'any') can be easily converted to this field:
// - 'all' -> set 'min_non_nulls' 1;
// - 'any' -> keep 'min_non_nulls' unset;
optional int32 min_non_nulls = 3;
}
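// Illustrative sketch (protobuf text format, not part of the schema): the
// equivalent of how='all' over columns "a", "b", "c", i.e. a row is dropped only
// when all three are null:
//
//   drop_na {
//     input { read { named_table { unparsed_identifier: "t" } } }
//     cols: "a"
//     cols: "b"
//     cols: "c"
//     min_non_nulls: 1
//   }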
// Replaces old values with the corresponding values.
// It will invoke 'Dataset.na.replace' (same as 'DataFrameNaFunctions.replace')
// to compute the results.
message NAReplace {
// (Required) The input relation.
Relation input = 1;
// (Optional) List of column names to consider.
//
// When it is empty, all the type-compatible columns in the input relation will be considered.
repeated string cols = 2;
// (Optional) The value replacement mapping.
repeated Replacement replacements = 3;
message Replacement {
// (Required) The old value.
//
// Only 4 data types are supported now: null, bool, double, string.
Expression.Literal old_value = 1;
// (Required) The new value.
//
// Should be of the same data type with the old value.
Expression.Literal new_value = 2;
}
}
// Rename the columns of the input relation with a list of new names of the same length.
message ToDF {
// (Required) The input relation of ToDF.
Relation input = 1;
// (Required)
//
// The number of columns of the input relation must be equal to the length
// of this field. If this is not true, an error will be returned.
repeated string column_names = 2;
}
// Rename columns on the input relation by a map with name to name mapping.
message WithColumnsRenamed {
// (Required) The input relation.
Relation input = 1;
// (Optional)
//
// Rename column names of the input relation from A to B, where A is the map key
// and B is the map value. This is a no-op if the schema doesn't contain any A. Not
// all column names of the input relation need to be present as keys. Duplicate
// values B are not allowed.
map<string, string> rename_columns_map = 2 [deprecated=true];
repeated Rename renames = 3;
message Rename {
// (Required) The existing column name.
string col_name = 1;
// (Required) The new column name.
string new_col_name = 2;
}
}
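// Illustrative sketch (protobuf text format, not part of the schema): renaming
// "colA" to "colB" via the non-deprecated `renames` field:
//
//   with_columns_renamed {
//     input { read { named_table { unparsed_identifier: "t" } } }
//     renames { col_name: "colA" new_col_name: "colB" }
//   }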
// Add new columns or replace the existing columns that have the same names.
message WithColumns {
// (Required) The input relation.
Relation input = 1;
// (Required)
//
// Given a column name, apply the corresponding expression on the column. If the column
// name exists in the input relation, then replace the column. If the column name
// does not exist in the input relation, then add it as a new column.
//
// Only one name part is expected from each Expression.Alias.
//
// An exception is thrown when duplicated names are present in the mapping.
repeated Expression.Alias aliases = 2;
}
message WithWatermark {
// (Required) The input relation
Relation input = 1;
// (Required) Name of the column containing event time.
string event_time = 2;
// (Required)
string delay_threshold = 3;
}
// Specify a hint over a relation. Hint should have a name and optional parameters.
message Hint {
// (Required) The input relation.
Relation input = 1;
// (Required) Hint name.
//
// Supported Join hints include BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL.
//
// Supported partitioning hints include COALESCE, REPARTITION, REPARTITION_BY_RANGE.
string name = 2;
// (Optional) Hint parameters.
repeated Expression parameters = 3;
}
// Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set.
message Unpivot {
// (Required) The input relation.
Relation input = 1;
// (Required) Id columns.
repeated Expression ids = 2;
// (Optional) Value columns to unpivot.
optional Values values = 3;
// (Required) Name of the variable column.
string variable_column_name = 4;
// (Required) Name of the value column.
string value_column_name = 5;
message Values {
repeated Expression values = 1;
}
}
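// Illustrative sketch (protobuf text format, not part of the schema): unpivot
// value columns "q1" and "q2" into ("quarter", "revenue") pairs, keeping "id" as
// the identifier column. The `unresolved_attribute` field name is assumed from
// spark/connect/expressions.proto.
//
//   unpivot {
//     input { read { named_table { unparsed_identifier: "t" } } }
//     ids { unresolved_attribute { unparsed_identifier: "id" } }
//     values {
//       values { unresolved_attribute { unparsed_identifier: "q1" } }
//       values { unresolved_attribute { unparsed_identifier: "q2" } }
//     }
//     variable_column_name: "quarter"
//     value_column_name: "revenue"
//   }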
message ToSchema {
// (Required) The input relation.
Relation input = 1;
// (Required) The user provided schema.
//
// The server side will update the DataFrame with this schema.
DataType schema = 2;
}
message RepartitionByExpression {
// (Required) The input relation.
Relation input = 1;
// (Required) The partitioning expressions.
repeated Expression partition_exprs = 2;
// (Optional) number of partitions, must be positive.
optional int32 num_partitions = 3;
}
message MapPartitions {
// (Required) Input relation for a mapPartitions-equivalent API: mapInPandas, mapInArrow.
Relation input = 1;
// (Required) Input user-defined function.
CommonInlineUserDefinedFunction func = 2;
// (Optional) Whether to use barrier mode execution or not.
optional bool is_barrier = 3;
// (Optional) ResourceProfile id used for the stage level scheduling.
optional int32 profile_id = 4;
}
message GroupMap {
// (Required) Input relation for Group Map API: apply, applyInPandas.
Relation input = 1;
// (Required) Expressions for grouping keys.
repeated Expression grouping_expressions = 2;
// (Required) Input user-defined function.
CommonInlineUserDefinedFunction func = 3;
// (Optional) Expressions for sorting. Only used by Scala Sorted Group Map API.
repeated Expression sorting_expressions = 4;
// The fields below are only used by (Flat)MapGroupsWithState.
// (Optional) Input relation for initial State.
Relation initial_input = 5;
// (Optional) Expressions for grouping keys of the initial state input relation.
repeated Expression initial_grouping_expressions = 6;
// (Optional) True if MapGroupsWithState, false if FlatMapGroupsWithState.
optional bool is_map_groups_with_state = 7;
// (Optional) The output mode of the function.
optional string output_mode = 8;
// (Optional) Timeout configuration for groups that do not receive data for a while.
optional string timeout_conf = 9;
}
message CoGroupMap {
// (Required) One input relation for CoGroup Map API - applyInPandas.
Relation input = 1;
// Expressions for grouping keys of the first input relation.
repeated Expression input_grouping_expressions = 2;
// (Required) The other input relation.
Relation other = 3;
// Expressions for grouping keys of the other input relation.
repeated Expression other_grouping_expressions = 4;
// (Required) Input user-defined function.
CommonInlineUserDefinedFunction func = 5;
// (Optional) Expressions for sorting. Only used by Scala Sorted CoGroup Map API.
repeated Expression input_sorting_expressions = 6;
// (Optional) Expressions for sorting. Only used by Scala Sorted CoGroup Map API.
repeated Expression other_sorting_expressions = 7;
}
message ApplyInPandasWithState {
// (Required) Input relation for applyInPandasWithState.
Relation input = 1;
// (Required) Expressions for grouping keys.
repeated Expression grouping_expressions = 2;
// (Required) Input user-defined function.
CommonInlineUserDefinedFunction func = 3;
// (Required) Schema for the output DataFrame.
string output_schema = 4;
// (Required) Schema for the state.
string state_schema = 5;
// (Required) The output mode of the function.
string output_mode = 6;
// (Required) Timeout configuration for groups that do not receive data for a while.
string timeout_conf = 7;
}
message CommonInlineUserDefinedTableFunction {
// (Required) Name of the user-defined table function.
string function_name = 1;
// (Optional) Whether the user-defined table function is deterministic.
bool deterministic = 2;
// (Optional) Function input arguments. Empty arguments are allowed.
repeated Expression arguments = 3;
// (Required) Type of the user-defined table function.
oneof function {
PythonUDTF python_udtf = 4;
}
}
message PythonUDTF {
// (Optional) Return type of the Python UDTF.
optional DataType return_type = 1;
// (Required) EvalType of the Python UDTF.
int32 eval_type = 2;
// (Required) The encoded commands of the Python UDTF.
bytes command = 3;
// (Required) Python version being used in the client.
string python_ver = 4;
}
message CommonInlineUserDefinedDataSource {
// (Required) Name of the data source.
string name = 1;
// (Required) The data source type.
oneof data_source {
PythonDataSource python_data_source = 2;
}
}
message PythonDataSource {
// (Required) The encoded commands of the Python data source.
bytes command = 1;
// (Required) Python version being used in the client.
string python_ver = 2;
}
// Collect arbitrary (named) metrics from a dataset.
message CollectMetrics {
// (Required) The input relation.
Relation input = 1;
// (Required) Name of the metrics.
string name = 2;
// (Required) The metric sequence.
repeated Expression metrics = 3;
}
message Parse {
// (Required) Input relation to Parse. The input is expected to have a single text column.
Relation input = 1;
// (Required) The expected format of the text.
ParseFormat format = 2;
// (Optional) DataType representing the schema. If not set, Spark will infer the schema.
optional DataType schema = 3;
// Options for the csv/json parser. The map key is case insensitive.
map<string, string> options = 4;
enum ParseFormat {
PARSE_FORMAT_UNSPECIFIED = 0;
PARSE_FORMAT_CSV = 1;
PARSE_FORMAT_JSON = 2;
}
}
// Relation of type [[AsOfJoin]].
//
// `left` and `right` must be present.
message AsOfJoin {
// (Required) Left input relation for a Join.
Relation left = 1;
// (Required) Right input relation for a Join.
Relation right = 2;
// (Required) Field to join on in left DataFrame
Expression left_as_of = 3;
// (Required) Field to join on in right DataFrame
Expression right_as_of = 4;
// (Optional) The join condition. Could be unset when `using_columns` is utilized.
//
// This field does not co-exist with using_columns.
Expression join_expr = 5;
// (Optional) `using_columns` provides a list of columns that should be present on both sides of
// the join inputs that this Join will join on. For example, A JOIN B USING col_name is
// equivalent to A JOIN B ON A.col_name = B.col_name.
//
// This field does not co-exist with join_expr.
repeated string using_columns = 6;
// (Required) The join type.
string join_type = 7;
// (Optional) The asof tolerance within this range.
Expression tolerance = 8;
// (Required) Whether to allow matching with the same value or not.
bool allow_exact_matches = 9;
// (Required) Whether to search for prior, subsequent, or closest matches.
string direction = 10;
}