blob: 513f5b98c5987f89ef8459bc10198d3984a4c3a7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.indexing.overlord;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.PartialShardSpec;
import org.joda.time.Interval;
import javax.annotation.Nullable;
import javax.validation.constraints.NotNull;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
/**
*/
public interface IndexerMetadataStorageCoordinator
{
/**
* Retrieve all published segments which may include any data in the interval and are marked as used from the
* metadata store.
*
* The order of segments within the returned collection is unspecified, but each segment is guaranteed to appear in
* the collection only once.
*
* @param dataSource The data source to query
* @param interval The interval for which all applicable and used segmented are requested.
* @param visibility Whether only visible or visible as well as overshadowed segments should be returned. The
* visibility is considered within the specified interval: that is, a segment which is visible
* outside of the specified interval, but overshadowed within the specified interval will not be
* returned if {@link Segments#ONLY_VISIBLE} is passed. See more precise description in the doc for
* {@link Segments}.
* @return The DataSegments which include data in the requested interval. These segments may contain data outside the
* requested interval.
*
* @implNote This method doesn't return a {@link Set} because there may be an expectation that {@code Set.contains()}
* is O(1) operation, while it's not the case for the returned collection unless it copies all segments into a new
* {@link java.util.HashSet} or {@link com.google.common.collect.ImmutableSet} which may in turn be unnecessary in
* other use cases. So clients should perform such copy themselves if they need {@link Set} semantics.
*/
default Collection<DataSegment> retrieveUsedSegmentsForInterval(
String dataSource,
Interval interval,
Segments visibility
)
{
return retrieveUsedSegmentsForIntervals(dataSource, Collections.singletonList(interval), visibility);
}
/**
* Retrieve all published used segments in the data source from the metadata store.
*
* @param dataSource The data source to query
*
* @return all segments belonging to the given data source
* @see #retrieveUsedSegmentsForInterval(String, Interval, Segments) similar to this method but also accepts data
* interval.
*/
Collection<DataSegment> retrieveAllUsedSegments(String dataSource, Segments visibility);
/**
* Retrieve all published segments which are marked as used and the created_date of these segments belonging to the
* given data source from the metadata store.
*
* Unlike other similar methods in this interface, this method doesn't accept a {@link Segments} "visibility"
* parameter. The returned collection may include overshadowed segments and their created_dates, as if {@link
* Segments#INCLUDING_OVERSHADOWED} was passed. It's the responsibility of the caller to filter out overshadowed ones
* if needed.
*
* @param dataSource The data source to query
*
* @return The DataSegments and the related created_date of segments
*/
Collection<Pair<DataSegment, String>> retrieveUsedSegmentsAndCreatedDates(String dataSource);
/**
* Retrieve all published segments which may include any data in the given intervals and are marked as used from the
* metadata store.
*
* The order of segments within the returned collection is unspecified, but each segment is guaranteed to appear in
* the collection only once.
*
* @param dataSource The data source to query
* @param intervals The intervals for which all applicable and used segments are requested.
* @param visibility Whether only visible or visible as well as overshadowed segments should be returned. The
* visibility is considered within the specified intervals: that is, a segment which is visible
* outside of the specified intervals, but overshadowed on the specified intervals will not be
* returned if {@link Segments#ONLY_VISIBLE} is passed. See more precise description in the doc for
* {@link Segments}.
* @return The DataSegments which include data in the requested intervals. These segments may contain data outside the
* requested intervals.
*
* @implNote This method doesn't return a {@link Set} because there may be an expectation that {@code Set.contains()}
* is O(1) operation, while it's not the case for the returned collection unless it copies all segments into a new
* {@link java.util.HashSet} or {@link com.google.common.collect.ImmutableSet} which may in turn be unnecessary in
* other use cases. So clients should perform such copy themselves if they need {@link Set} semantics.
*/
Collection<DataSegment> retrieveUsedSegmentsForIntervals(
String dataSource,
List<Interval> intervals,
Segments visibility
);
/**
* Retrieve all published segments which include ONLY data within the given interval and are marked as unused from the
* metadata store.
*
* @param dataSource The data source the segments belong to
* @param interval Filter the data segments to ones that include data in this interval exclusively.
*
* @return DataSegments which include ONLY data within the requested interval and are marked as unused. Segments NOT
* returned here may include data in the interval
*/
List<DataSegment> retrieveUnusedSegmentsForInterval(String dataSource, Interval interval);
/**
* Attempts to insert a set of segments to the metadata storage. Returns the set of segments actually added (segments
* with identifiers already in the metadata storage will not be added).
*
* @param segments set of segments to add
*
* @return set of segments actually added
*/
Set<DataSegment> announceHistoricalSegments(Set<DataSegment> segments) throws IOException;
/**
* Allocate a new pending segment in the pending segments table. This segment identifier will never be given out
* again, <em>unless</em> another call is made with the same dataSource, sequenceName, and previousSegmentId.
* <p/>
* The sequenceName and previousSegmentId parameters are meant to make it easy for two independent ingestion tasks
* to produce the same series of segments.
* <p/>
* Note that a segment sequence may include segments with a variety of different intervals and versions.
*
* @param dataSource dataSource for which to allocate a segment
* @param sequenceName name of the group of ingestion tasks producing a segment series
* @param previousSegmentId previous segment in the series; may be null or empty, meaning this is the first
* segment
* @param interval interval for which to allocate a segment
* @param partialShardSpec partialShardSpec containing all necessary information to create a shardSpec for the
* new segmentId
* @param maxVersion use this version if we have no better version to use. The returned segment
* identifier may have a version lower than this one, but will not have one higher.
* @param skipSegmentLineageCheck if true, perform lineage validation using previousSegmentId for this sequence.
* Should be set to false if replica tasks would index events in same order
*
* @return the pending segment identifier, or null if it was impossible to allocate a new segment
*/
SegmentIdWithShardSpec allocatePendingSegment(
String dataSource,
String sequenceName,
@Nullable String previousSegmentId,
Interval interval,
PartialShardSpec partialShardSpec,
String maxVersion,
boolean skipSegmentLineageCheck
);
/**
* Delete pending segments created in the given interval belonging to the given data source from the pending segments
* table. The {@code created_date} field of the pending segments table is checked to find segments to be deleted.
*
* Note that the semantic of the interval (for `created_date`s) is different from the semantic of the interval
* parameters in some other methods in this class, such as {@link #retrieveUsedSegmentsForInterval} (where the
* interval is about the time column value in rows belonging to the segment).
*
* @param dataSource dataSource
* @param deleteInterval interval to check the {@code created_date} of pendingSegments
*
* @return number of deleted pending segments
*/
int deletePendingSegmentsCreatedInInterval(String dataSource, Interval deleteInterval);
/**
* Delete all pending segments belonging to the given data source from the pending segments table.
*
* @return number of deleted pending segments
* @see #deletePendingSegmentsCreatedInInterval(String, Interval) similar to this method but also accepts interval for
* segments' `created_date`s
*/
int deletePendingSegments(String dataSource);
/**
* Attempts to insert a set of segments to the metadata storage. Returns the set of segments actually added (segments
* with identifiers already in the metadata storage will not be added).
* <p/>
* If startMetadata and endMetadata are set, this insertion will be atomic with a compare-and-swap on dataSource
* commit metadata.
*
* If segmentsToDrop is not null and not empty, this insertion will be atomic with a insert-and-drop on inserting
* {@param segments} and dropping {@param segmentsToDrop}
*
* @param segments set of segments to add, must all be from the same dataSource
* @param segmentsToDrop set of segments to drop, must all be from the same dataSource
* @param startMetadata dataSource metadata pre-insert must match this startMetadata according to
* {@link DataSourceMetadata#matches(DataSourceMetadata)}. If null, this insert will
* not involve a metadata transaction
* @param endMetadata dataSource metadata post-insert will have this endMetadata merged in with
* {@link DataSourceMetadata#plus(DataSourceMetadata)}. If null, this insert will not
* involve a metadata transaction
*
* @return segment publish result indicating transaction success or failure, and set of segments actually published.
* This method must only return a failure code if it is sure that the transaction did not happen. If it is not sure,
* it must throw an exception instead.
*
* @throws IllegalArgumentException if startMetadata and endMetadata are not either both null or both non-null
* @throws RuntimeException if the state of metadata storage after this call is unknown
*/
SegmentPublishResult announceHistoricalSegments(
Set<DataSegment> segments,
Set<DataSegment> segmentsToDrop,
@Nullable DataSourceMetadata startMetadata,
@Nullable DataSourceMetadata endMetadata
) throws IOException;
/**
* Retrieves data source's metadata from the metadata store. Returns null if there is no metadata.
*/
@Nullable DataSourceMetadata retrieveDataSourceMetadata(String dataSource);
/**
* Removes entry for 'dataSource' from the dataSource metadata table.
*
* @param dataSource identifier
*
* @return true if the entry was deleted, false otherwise
*/
boolean deleteDataSourceMetadata(String dataSource);
/**
* Resets dataSourceMetadata entry for 'dataSource' to the one supplied.
*
* @param dataSource identifier
* @param dataSourceMetadata value to set
*
* @return true if the entry was reset, false otherwise
*/
boolean resetDataSourceMetadata(String dataSource, DataSourceMetadata dataSourceMetadata) throws IOException;
/**
* Insert dataSourceMetadata entry for 'dataSource'.
*
* @param dataSource identifier
* @param dataSourceMetadata value to set
*
* @return true if the entry was inserted, false otherwise
*/
boolean insertDataSourceMetadata(String dataSource, DataSourceMetadata dataSourceMetadata);
/**
* Remove datasource metadata created before the given timestamp and not in given excludeDatasources set.
*
* @param timestamp timestamp in milliseconds
* @param excludeDatasources set of datasource names to exclude from removal
* @return number of datasource metadata removed
*/
int removeDataSourceMetadataOlderThan(long timestamp, @NotNull Set<String> excludeDatasources);
/**
* Similar to {@link #announceHistoricalSegments(Set)}, but meant for streaming ingestion tasks for handling
* the case where the task ingested no records and created no segments, but still needs to update the metadata
* with the progress that the task made.
*
* The metadata should undergo the same validation checks as performed by {@link #announceHistoricalSegments}.
*
*
* @param dataSource the datasource
* @param startMetadata dataSource metadata pre-insert must match this startMetadata according to
* {@link DataSourceMetadata#matches(DataSourceMetadata)}.
* @param endMetadata dataSource metadata post-insert will have this endMetadata merged in with
* {@link DataSourceMetadata#plus(DataSourceMetadata)}.
*
* @return segment publish result indicating transaction success or failure.
* This method must only return a failure code if it is sure that the transaction did not happen. If it is not sure,
* it must throw an exception instead.
*
* @throws IllegalArgumentException if either startMetadata and endMetadata are null
* @throws RuntimeException if the state of metadata storage after this call is unknown
*/
SegmentPublishResult commitMetadataOnly(
String dataSource,
DataSourceMetadata startMetadata,
DataSourceMetadata endMetadata
);
void updateSegmentMetadata(Set<DataSegment> segments);
void deleteSegments(Set<DataSegment> segments);
}