/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalog
import scala.collection.JavaConverters._
import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset}
import org.apache.spark.sql.types.StructType
import org.apache.spark.storage.StorageLevel
/**
* Catalog interface for Spark. To access this, use `SparkSession.catalog`.
*
* @since 3.5.0
*/
abstract class Catalog {
/**
* Returns the current database (namespace) in this session.
*
* @since 3.5.0
*/
def currentDatabase: String
/**
* Sets the current database (namespace) in this session.
*
* @since 3.5.0
*/
def setCurrentDatabase(dbName: String): Unit
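// A minimal usage sketch (illustrative only, not part of this API): assuming an active
// `SparkSession` bound to a value named `spark`, the current namespace can be switched and
// read back through the catalog:
//
//   spark.catalog.setCurrentDatabase("sales")
//   assert(spark.catalog.currentDatabase == "sales")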
/**
* Returns a list of databases (namespaces) available within the current catalog.
*
* @since 3.5.0
*/
def listDatabases(): Dataset[Database]
/**
* Returns a list of databases (namespaces) whose name matches the specified pattern and which
* are available within the current catalog.
*
* @since 3.5.0
*/
def listDatabases(pattern: String): Dataset[Database]
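// For example (a sketch, assuming `spark: SparkSession`; the pattern is assumed to follow the
// SHOW DATABASES wildcard style):
//
//   spark.catalog.listDatabases().show()          // all namespaces in the current catalog
//   spark.catalog.listDatabases("sales*").show()  // only namespaces matching the pattern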
/**
* Returns a list of tables/views in the current database (namespace). This includes all
* temporary views.
*
* @since 3.5.0
*/
def listTables(): Dataset[Table]
/**
* Returns a list of tables/views in the specified database (namespace) (the name can be
* qualified with catalog). This includes all temporary views.
*
* @since 3.5.0
*/
@throws[AnalysisException]("database does not exist")
def listTables(dbName: String): Dataset[Table]
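// A brief sketch (assuming `spark: SparkSession`; `my_db` is a hypothetical namespace):
//
//   spark.catalog.listTables().show()         // tables/views in the current namespace, plus temp views
//   spark.catalog.listTables("my_db").show()  // tables/views in `my_db`, plus temp views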
/**
* Returns a list of functions registered in the current database (namespace). This includes all
* temporary functions.
*
* @since 3.5.0
*/
def listFunctions(): Dataset[Function]
/**
* Returns a list of functions registered in the specified database (namespace) (the name can be
* qualified with catalog). This includes all built-in and temporary functions.
*
* @since 3.5.0
*/
@throws[AnalysisException]("database does not exist")
def listFunctions(dbName: String): Dataset[Function]
/**
* Returns a list of columns for the given table/view or temporary view.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. It follows the same
* resolution rule as SQL: search for temp views first, then table/views in the current
* database (namespace).
* @since 3.5.0
*/
@throws[AnalysisException]("table does not exist")
def listColumns(tableName: String): Dataset[Column]
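// For instance (a sketch; `people` is a hypothetical table and `spark` an active SparkSession):
//
//   spark.range(3).selectExpr("id", "id * 2 AS doubled").write.saveAsTable("people")
//   spark.catalog.listColumns("people").show()  // name, dataType, nullable, partition/bucket flags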
/**
* Returns a list of columns for the given table/view in the specified database under the Hive
* Metastore.
*
* To list columns for table/view in other catalogs, please use `listColumns(tableName)` with
* qualified table/view name instead.
*
* @param dbName
* is an unqualified name that designates a database.
* @param tableName
* is an unqualified name that designates a table/view.
* @since 3.5.0
*/
@throws[AnalysisException]("database or table does not exist")
def listColumns(dbName: String, tableName: String): Dataset[Column]
/**
* Get the database (namespace) with the specified name (can be qualified with catalog). This
* throws an AnalysisException when the database (namespace) cannot be found.
*
* @since 3.5.0
*/
@throws[AnalysisException]("database does not exist")
def getDatabase(dbName: String): Database
/**
* Get the table or view with the specified name. The result can be a temporary view or a
* table/view. This throws an AnalysisException when no Table can be found.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. It follows the same
* resolution rule as SQL: search for temp views first, then table/views in the current
* database (namespace).
* @since 3.5.0
*/
@throws[AnalysisException]("table does not exist")
def getTable(tableName: String): Table
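// As an illustration (a sketch, assuming `spark: SparkSession`; `sales` and `sales.orders` are
// hypothetical, and both calls throw AnalysisException if the object is missing):
//
//   val db = spark.catalog.getDatabase("sales")
//   val tbl = spark.catalog.getTable("sales.orders")  // qualified names resolve across namespaces
//   println(s"${db.name} -> ${tbl.name} (${tbl.tableType})")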
/**
* Get the table or view with the specified name in the specified database under the Hive
* Metastore. This throws an AnalysisException when no Table can be found.
*
* To get table/view in other catalogs, please use `getTable(tableName)` with qualified
* table/view name instead.
*
* @since 3.5.0
*/
@throws[AnalysisException]("database or table does not exist")
def getTable(dbName: String, tableName: String): Table
/**
* Get the function with the specified name. The result can be a temporary function or a
* persistent function. This throws an AnalysisException when the function cannot be found.
*
* @param functionName
* is either a qualified or unqualified name that designates a function. It follows the same
* resolution rule as SQL: search for built-in/temp functions first, then functions in the
* current database (namespace).
* @since 3.5.0
*/
@throws[AnalysisException]("function does not exist")
def getFunction(functionName: String): Function
/**
* Get the function with the specified name in the specified database under the Hive Metastore.
* This throws an AnalysisException when the function cannot be found.
*
* To get functions in other catalogs, please use `getFunction(functionName)` with qualified
* function name instead.
*
* @param dbName
* is an unqualified name that designates a database.
* @param functionName
* is an unqualified name that designates a function in the specified database
* @since 3.5.0
*/
@throws[AnalysisException]("database or function does not exist")
def getFunction(dbName: String, functionName: String): Function
/**
* Check if the database (namespace) with the specified name exists (the name can be qualified
* with catalog).
*
* @since 3.5.0
*/
def databaseExists(dbName: String): Boolean
/**
* Check if the table or view with the specified name exists. This can either be a temporary
* view or a table/view.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. It follows the same
* resolution rule as SQL: search for temp views first, then table/views in the current
* database (namespace).
* @since 3.5.0
*/
def tableExists(tableName: String): Boolean
/**
* Check if the table or view with the specified name exists in the specified database under the
* Hive Metastore.
*
* To check existence of table/view in other catalogs, please use `tableExists(tableName)` with
* qualified table/view name instead.
*
* @param dbName
* is an unqualified name that designates a database.
* @param tableName
* is an unqualified name that designates a table.
* @since 3.5.0
*/
def tableExists(dbName: String, tableName: String): Boolean
/**
* Check if the function with the specified name exists. This can either be a temporary function
* or a persistent function.
*
* @param functionName
* is either a qualified or unqualified name that designates a function. It follows the same
* resolution rule as SQL: search for built-in/temp functions first, then functions in the
* current database (namespace).
* @since 3.5.0
*/
def functionExists(functionName: String): Boolean
/**
* Check if the function with the specified name exists in the specified database under the Hive
* Metastore.
*
* To check existence of functions in other catalogs, please use `functionExists(functionName)`
* with qualified function name instead.
*
* @param dbName
* is an unqualified name that designates a database.
* @param functionName
* is an unqualified name that designates a function.
* @since 3.5.0
*/
def functionExists(dbName: String, functionName: String): Boolean
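// A sketch of the existence checks (assuming `spark: SparkSession`; object names are
// hypothetical, and none of these calls throw when the object is absent):
//
//   if (!spark.catalog.databaseExists("staging")) spark.sql("CREATE DATABASE staging")
//   val haveEvents = spark.catalog.tableExists("staging", "events")
//   val haveUpper = spark.catalog.functionExists("upper")  // built-in functions count as existing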
/**
* Creates a table from the given path and returns the corresponding DataFrame. It will use the
* default data source configured by spark.sql.sources.default.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(tableName: String, path: String): DataFrame = {
createTable(tableName, path)
}
/**
* Creates a table from the given path and returns the corresponding DataFrame. It will use the
* default data source configured by spark.sql.sources.default.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(tableName: String, path: String): DataFrame
/**
* Creates a table from the given path based on a data source and returns the corresponding
* DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(tableName: String, path: String, source: String): DataFrame = {
createTable(tableName, path, source)
}
/**
* Creates a table from the given path based on a data source and returns the corresponding
* DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(tableName: String, path: String, source: String): DataFrame
/**
* Creates a table based on the dataset in a data source and a set of options. Then, returns the
* corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(
tableName: String,
source: String,
options: java.util.Map[String, String]): DataFrame = {
createTable(tableName, source, options)
}
/**
* Creates a table based on the dataset in a data source and a set of options. Then, returns the
* corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
options: java.util.Map[String, String]): DataFrame = {
createTable(tableName, source, options.asScala.toMap)
}
/**
* (Scala-specific) Creates a table based on the dataset in a data source and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(
tableName: String,
source: String,
options: Map[String, String]): DataFrame = {
createTable(tableName, source, options)
}
/**
* (Scala-specific) Creates a table based on the dataset in a data source and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(tableName: String, source: String, options: Map[String, String]): DataFrame
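// A usage sketch (assuming `spark: SparkSession`; the table name and path are hypothetical,
// and supplying a "path" option makes the table external, backed by that location):
//
//   spark.catalog.createTable(
//     "events_ext",
//     source = "parquet",
//     options = Map("path" -> "/tmp/events"))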
/**
* Creates a table based on the dataset in a data source, a schema and a set of options. Then,
* returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(
tableName: String,
source: String,
schema: StructType,
options: java.util.Map[String, String]): DataFrame = {
createTable(tableName, source, schema, options)
}
/**
* Creates a table based on the dataset in a data source, a description and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
description: String,
options: java.util.Map[String, String]): DataFrame = {
createTable(
tableName,
source = source,
description = description,
options = options.asScala.toMap)
}
/**
* (Scala-specific) Creates a table based on the dataset in a data source, a description and a
* set of options. Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
description: String,
options: Map[String, String]): DataFrame
/**
* Creates a table based on the dataset in a data source, a schema and a set of options. Then,
* returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
schema: StructType,
options: java.util.Map[String, String]): DataFrame = {
createTable(tableName, source, schema, options.asScala.toMap)
}
/**
* (Scala-specific) Creates a table based on the dataset in a data source, a schema and a set of
* options. Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(
tableName: String,
source: String,
schema: StructType,
options: Map[String, String]): DataFrame = {
createTable(tableName, source, schema, options)
}
/**
* (Scala-specific) Creates a table based on the dataset in a data source, a schema and a set of
* options. Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
schema: StructType,
options: Map[String, String]): DataFrame
/**
* Creates a table based on the dataset in a data source, a schema, a description and a set of
* options. Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
schema: StructType,
description: String,
options: java.util.Map[String, String]): DataFrame = {
createTable(
tableName,
source = source,
schema = schema,
description = description,
options = options.asScala.toMap)
}
/**
* (Scala-specific) Creates a table based on the dataset in a data source, a schema, a
* description and a set of options. Then, returns the corresponding DataFrame.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def createTable(
tableName: String,
source: String,
schema: StructType,
description: String,
options: Map[String, String]): DataFrame
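// A sketch of the schema-based variant (assuming `spark: SparkSession`; the table name, schema,
// description and path are all hypothetical):
//
//   import org.apache.spark.sql.types.{LongType, StringType}
//   spark.catalog.createTable(
//     "users_csv",
//     source = "csv",
//     schema = new StructType().add("id", LongType).add("name", StringType),
//     description = "users loaded from CSV",
//     options = Map("path" -> "/tmp/users", "header" -> "true"))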
/**
* Drops the local temporary view with the given view name in the catalog. If the view has been
* cached before, then it will also be uncached.
*
* Local temporary view is session-scoped. Its lifetime is the lifetime of the session that
* created it, i.e. it will be automatically dropped when the session terminates. It's not tied
* to any databases, i.e. we can't use `db1.view1` to reference a local temporary view.
*
* Note that the return type of this method was Unit in Spark 2.0, but was changed to Boolean
* in Spark 2.1.
*
* @param viewName
* the name of the temporary view to be dropped.
* @return
* true if the view is dropped successfully, false otherwise.
* @since 3.5.0
*/
def dropTempView(viewName: String): Boolean
/**
* Drops the global temporary view with the given view name in the catalog. If the view has been
* cached before, then it will also be uncached.
*
* Global temporary view is cross-session. Its lifetime is the lifetime of the Spark
* application, i.e. it will be automatically dropped when the application terminates. It's
* tied to a system preserved database `global_temp`, and we must use the qualified name to
* refer to a global temp view, e.g. `SELECT * FROM global_temp.view1`.
*
* @param viewName
* the unqualified name of the temporary view to be dropped.
* @return
* true if the view is dropped successfully, false otherwise.
* @since 3.5.0
*/
def dropGlobalTempView(viewName: String): Boolean
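// A sketch covering both kinds of temporary views (assuming `spark: SparkSession`):
//
//   spark.range(5).createOrReplaceTempView("v1")         // session-scoped
//   spark.range(5).createOrReplaceGlobalTempView("gv1")  // application-scoped, lives in `global_temp`
//   spark.catalog.dropTempView("v1")                     // returns true if the view existed
//   spark.catalog.dropGlobalTempView("gv1")              // takes the unqualified name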
/**
* Recovers all the partitions in the directory of a table and updates the catalog. Only works
* with a partitioned table, not a view.
*
* @param tableName
* is either a qualified or unqualified name that designates a table. If no database
* identifier is provided, it refers to a table in the current database.
* @since 3.5.0
*/
def recoverPartitions(tableName: String): Unit
/**
* Returns true if the table is currently cached in-memory.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. If no database
* identifier is provided, it refers to a temporary view or a table/view in the current
* database.
* @since 3.5.0
*/
def isCached(tableName: String): Boolean
/**
* Caches the specified table in-memory.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. If no database
* identifier is provided, it refers to a temporary view or a table/view in the current
* database.
* @since 3.5.0
*/
def cacheTable(tableName: String): Unit
/**
* Caches the specified table with the given storage level.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. If no database
* identifier is provided, it refers to a temporary view or a table/view in the current
* database.
* @param storageLevel
* storage level to cache table.
* @since 3.5.0
*/
def cacheTable(tableName: String, storageLevel: StorageLevel): Unit
/**
* Removes the specified table from the in-memory cache.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. If no database
* identifier is provided, it refers to a temporary view or a table/view in the current
* database.
* @since 3.5.0
*/
def uncacheTable(tableName: String): Unit
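// A caching sketch (assuming `spark: SparkSession` and a hypothetical table `events`):
//
//   spark.catalog.cacheTable("events", StorageLevel.MEMORY_AND_DISK)
//   assert(spark.catalog.isCached("events"))
//   spark.catalog.uncacheTable("events")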
/**
* Removes all cached tables from the in-memory cache.
*
* @since 3.5.0
*/
def clearCache(): Unit
/**
* Invalidates and refreshes all the cached data and metadata of the given table. For
* performance reasons, Spark SQL or the external data source library it uses might cache
* certain metadata about a table, such as the location of blocks. When those change outside of
* Spark SQL, users should call this function to invalidate the cache.
*
* If this table is cached as an InMemoryRelation, drop the original cached version and make the
* new version cached lazily.
*
* @param tableName
* is either a qualified or unqualified name that designates a table/view. If no database
* identifier is provided, it refers to a temporary view or a table/view in the current
* database.
* @since 3.5.0
*/
def refreshTable(tableName: String): Unit
/**
* Invalidates and refreshes all the cached data (and the associated metadata) for any `Dataset`
* that contains the given data source path. Path matching is by prefix, i.e. "/" would
* invalidate everything that is cached.
*
* @since 3.5.0
*/
def refreshByPath(path: String): Unit
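// A refresh sketch (assuming `spark: SparkSession`; the table and path are hypothetical):
//
//   spark.catalog.refreshTable("events")         // re-reads metadata; re-caches lazily if cached
//   spark.catalog.refreshByPath("/data/events")  // invalidates cached Datasets under this prefix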
/**
* Returns the current catalog in this session.
*
* @since 3.5.0
*/
def currentCatalog(): String
/**
* Sets the current catalog in this session.
*
* @since 3.5.0
*/
def setCurrentCatalog(catalogName: String): Unit
/**
* Returns a list of catalogs available in this session.
*
* @since 3.5.0
*/
def listCatalogs(): Dataset[CatalogMetadata]
/**
* Returns a list of catalogs whose name matches the specified pattern and which are available
* in this session.
*
* @since 3.5.0
*/
def listCatalogs(pattern: String): Dataset[CatalogMetadata]
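// A catalog-switching sketch (assuming `spark: SparkSession` and that a catalog named
// "my_catalog" has been registered, e.g. via a `spark.sql.catalog.my_catalog` configuration):
//
//   spark.catalog.listCatalogs().show()
//   spark.catalog.setCurrentCatalog("my_catalog")
//   assert(spark.catalog.currentCatalog() == "my_catalog")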
}