blob: 34461c87aa24166ea4698e630c8274a43b1b7dc4 [file] [log] [blame]
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
gpcatalog.py
Contains two classes representing catalog metadata:
Catalog - a container class for CatalogTables
CatalogTable - metadata about a single tables
"""
# ============================================================================
import os
import json
from gppylib import gplog
from gppylib.gpversion import GpVersion
logger = gplog.get_default_logger()
class GPCatalogException(Exception):
pass
# Hard coded since "master only" is not defined in the catalog
MASTER_ONLY_TABLES = [
'gp_configuration',
'gp_configuration_history',
'gp_distribution_policy',
'gp_db_interfaces',
'gp_interfaces',
'gp_master_mirroring',
'gp_fault_strategy',
'gp_san_configuration',
'gp_segment_configuration',
'gp_verification_history',
'pg_description',
'pg_listener', # ???
'pg_partition',
'pg_partition_rule',
'pg_shdescription',
'pg_stat_last_operation',
'pg_stat_last_shoperation',
'pg_statistic',
'pg_filespace_entry',
'pg_partition_encoding',
'pg_auth_time_constraint',
]
# Hard coded since "persistent" is not defined in the catalog
PERSISTENT_TABLES = [
'gp_global_sequence',
'gp_persistent_database_node',
'gp_persistent_filespace_node',
'gp_persistent_relation_node',
'gp_persistent_tablespace_node',
'gp_relation_node',
]
# Hard coded tables that have different values on every segment
SEGMENT_LOCAL_TABLES = [
'pg_depend', # (not if we fix oid inconsistencies)
'pg_shdepend', # (not if we fix oid inconsistencies)
'gp_fastsequence', # AO segment row id allocations
'pg_statistic',
]
# ============================================================================
class GPCatalog():
"""
Catalog is a container class that contains dictionary of CatalogTable
objects.
It provides the CatalogTables with a context that they can use to
refer to other CatalogTables (e.g. describe foreign keys) and it
provides calling code with a simple wrapper for what a known catalog
layout looks like.
It supports multiple source versions of the database. It issues a
warning if there are catalog tables defined in the database that
it is unaware of, usually indicating that it is operating against
an unknown version.
"""
# --------------------------------------------------------------------
# Public API functions:
# - Catalog() - Create a Catalog object
# - getCatalogTable() - Returns a single CatalogTable
# - getCatalogTables() - Returns a list of CatalogTable
# - getCatalogVersion() - Returns a GpVersion
# --------------------------------------------------------------------
def getCatalogTable(self, tablename):
"""
getCatalogTable(tablename) => Returns the specified CatalogTable
Raises: CatalogException when the table does not exist
"""
if tablename not in self._tables:
raise GPCatalogException("No such catalog table: %s" % str(tablename))
return self._tables[tablename]
def getCatalogTables(self):
"""
getCatalogTables() => Returns a list of CatalogTable
"""
return self._tables.values()
def getCatalogVersion(self):
"""
getCatalogVersion() => Returns the GpVersion object
"""
return self._version
# --------------------------------------------------------------------
# Private implementation functions:
# --------------------------------------------------------------------
def __init__(self, dbConnection):
"""
Catalog() constructor
1) Uses the supplied database connection to get a list of catalog tables
2) iterate through the list building up CatalogTable objects
3) Mark "master only" tables manually
4) Mark a couple primary keys manually
5) Mark foreign keys manually
6) Mark known catalog differences manually
7) Validate and return the Catalog object
"""
self._dbConnection = dbConnection
self._tables = {}
self._version = None
self._tidycat = {} # tidycat definitions from JSON file
version_query = """
SELECT version()
"""
catalog_query = """
SELECT relname, relisshared FROM pg_class
WHERE relnamespace=11 and relkind = 'r'
"""
# Read the catalog version from the database
try:
curs = self._query(version_query)
except Exception, e:
raise GPCatalogException("Error reading database version: " + str(e))
self._version = GpVersion(curs.getresult()[0][0])
# Read the list of catalog tables from the database
try:
curs = self._query(catalog_query)
except Exception, e:
raise GPCatalogException("Error reading catalog: " + str(e))
# Construct our internal representation of the catalog
for [relname, relisshared] in curs.getresult():
self._tables[relname] = GPCatalogTable(self, relname)
# Note: stupid API returns t/f for boolean value
self._tables[relname]._setShared(relisshared is 't')
# The tidycat.pl utility has been used to generate a json file
# describing aspects of the catalog that we can not currently
# interrogate from the catalog itself. This includes things
# like which tables are master only vs segment local and what
# the foreign key relationships are.
self._getJson()
# Which tables are "master only" is not derivable from the catalog
# so we have to set this manually.
self._markMasterOnlyTables()
# We derived primary keys for most of the catalogs based on un
# unique indexes, but we have to manually set a few stranglers
self._setPrimaryKeys()
# Foreign key relationships of the catalog tables are not actually
# defined in the catalog, so must be obtained from tidycat
self._setForeignKeys()
# Most catalog tables are now ready to go, but some columns can
# not be compared directly between segments, we need to indicate
# these exceptions manually.
self._setKnownDifferences()
# Finally validate that everything looks right, this will issue
# warnings if there are any regular catalog tables that do not
# have primary keys set.
self._validate()
def _query(self, qry):
"""
Simple wrapper around querying the database connection
"""
return self._dbConnection.query(qry)
def _markMasterOnlyTables(self):
"""
We mark three types of catalog tables as "master only"
- True "master only" tables
- Tables we know to have different contents on master/segment
- Persistent Tables
While the later two are not technically "master only" they have
the property that we cannot validate cross segment consistency,
which makes them the same for our current purposes.
We may want to eventually move these other types of tables into
a different classification.
"""
for name in MASTER_ONLY_TABLES:
if name in self._tables:
self._tables[name]._setMasterOnly()
for name in SEGMENT_LOCAL_TABLES:
if name in self._tables:
self._tables[name]._setMasterOnly()
for name in PERSISTENT_TABLES:
if name in self._tables:
self._tables[name]._setMasterOnly()
def _setPrimaryKeys(self):
"""
Most of the catalog primary keys are set automatically in
CatalogTable by looking at unique indexes over the catalogs.
However there are a couple of catalog tables that do not have
unique indexes that we still want to perform cross segment
consistency on, for them we have to manually set a primary key
"""
self._tables['gp_version_at_initdb']._setPrimaryKey(
"schemaversion productversion")
self._tables['pg_constraint']._setPrimaryKey(
"conname connamespace conrelid contypid")
if self._version >= "4.0":
self._tables['pg_resqueuecapability']._setPrimaryKey(
"resqueueid restypid")
def _getJson(self):
"""
Read the json file generated by tidycat which contains, among other
things, the primary key/foreign key relationships for the catalog
tables. Build the fkeys for each table and validate them against
the catalog.
"""
indir = os.path.dirname(__file__)
jname = str(self._version.getVersionRelease()) + ".json"
try:
# json doc in data subdirectory of pylib module
infil = open(os.path.join(indir, "data", jname), "r")
d = json.load(infil)
# remove the tidycat comment
if "__comment" in d:
del d["__comment"]
if "__info" in d:
del d["__info"]
infil.close()
self._tidycat = d
except Exception, e:
# older versions of product will not have tidycat defs --
# need to handle this case
logger.warn("GPCatalogTable: "+ str(e))
def _setForeignKeys(self):
"""
Setup the foreign key relationships amongst the catalogs. We
drive this based on the tidycat generate json file since this
information is not derivable from the catalog.
"""
try:
for tname, tdef in self._tidycat.iteritems():
if "foreign_keys" not in tdef:
continue
for fkdef in tdef["foreign_keys"]:
fk2 = GPCatalogTableForeignKey(tname,
fkdef[0],
fkdef[1],
fkdef[2])
self._tables[tname]._addForeignKey(fk2)
except Exception, e:
# older versions of product will not have tidycat defs --
# need to handle this case
logger.warn("GPCatalogTable: "+ str(e))
def _setKnownDifferences(self):
"""
Some catalogs have columns that, for one reason or another, we
need to mark as being different between the segments and the master.
These fall into two catagories:
- Bugs (marked with the appropriate jiras)
- A small number of "special" columns
"""
# -------------
# Special cases
# -------------
# pg_class:
# - relfilenode should generally be consistent, but may not be (jira?)
# - relpages/reltuples/relfrozenxid are all vacumm/analyze related
# - relhasindex/relhaspkey are only cleared when vacuum completes
# - relowner has its own checks:
# => may want to separate out "owner" columns like acl and oid
self._tables['pg_class']._setKnownDifferences(
"relfilenode relpages reltuples relhasindex relhaspkey relowner relfrozenxid")
# pg_type: typowner has its own checks:
# => may want to separate out "owner" columns like acl and oid
self._tables['pg_type']._setKnownDifferences("typowner")
# pg_database: datfrozenxid = vacuum related
self._tables['pg_database']._setKnownDifferences("datfrozenxid")
# -------------
# Issues still present in the product
# -------------
# MPP-11289 : inconsistent OIDS for table "default values"
self._tables['pg_attrdef']._setKnownDifferences("oid")
# MPP-11284 : inconsistent OIDS for constraints
self._tables['pg_constraint']._setKnownDifferences("oid")
# MPP-11282: Inconsistent oids for language callback functions
# MPP-12015: Inconsistent oids for operator communtator/negator functions
self._tables['pg_proc']._setKnownDifferences("oid prolang")
# MPP-11282: pg_language oids and callback functions
self._tables['pg_language']._setKnownDifferences("oid lanplcallfoid lanvalidator")
# MPP-12015: Inconsistent oids for operator communtator/negator functions
# MPP-12015: Inconsistent oids for operator sort/cmp operators
self._tables['pg_operator']._setKnownDifferences(
"oid oprcom oprnegate oprlsortop oprrsortop oprltcmpop oprgtcmpop")
self._tables['pg_amop']._setKnownDifferences("amopopr")
self._tables['pg_aggregate']._setKnownDifferences("aggsortop")
# MPP-11281 : Inconsistent oids for views
self._tables['pg_rewrite']._setKnownDifferences("oid ev_action")
# MPP-11285 : Inconsistent oids for triggers
self._tables['pg_trigger']._setKnownDifferences("oid")
# MPP-11575 : Inconsistent handling of indpred for partial indexes
self._tables['pg_index']._setKnownDifferences("indpred")
# -------------
# Historical Issues that have been fixed
#
# These issues are patched during upgrade, but we must exclude them
# from checks when we are checking older versions of the catalog.
# -------------
if self._version < '4.1':
# MPP-12057 : roleresqueue is only set on the master:
self._tables['pg_authid']._setKnownDifferences("rolresqueue")
# MPP-10286 : inconsistent oids for temporary schemas
self._tables['pg_namespace']._setKnownDifferences("oid")
# MPP-11858: pg_resqueue/pg_resqueuecapability oid inconsistencies
self._tables['pg_resqueue']._setKnownDifferences("oid")
# pg_resqueue is master only in 4.0
self._tables['pg_resqueue']._setMasterOnly()
def _validate(self):
"""
Check that all tables defined in the catalog have either been marked
as "master only" or have a primary key
"""
for relname in sorted(self._tables):
if self._tables[relname].isMasterOnly():
continue
if self._tables[relname].getPrimaryKey() == []:
logger.warn("GPCatalogTable: unable to derive primary key for %s"
% str(relname))
# ============================================================================
class GPCatalogTable():
# --------------------------------------------------------------------
# Public API functions:
#
# Accessor functions
# - getTableName() - Returns the table name (string)
# - tableHasOids() - Returns if the table has oids (boolean)
# - isMasterOnly() - Returns if the table is "master only" (boolean)
# - isShared() - Returns if the table is shared (boolean)
# - getTableAcl() - Returns name of the acl column (string|None)
# - getPrimaryKey() - Returns the primary key (list)
# - getForeignKeys() - Returns a list of foreign keys (list)
# - getTableColumns() - Returns a list of table columns (list)
#
# --------------------------------------------------------------------
def getTableName(self):
return self._name
def tableHasOids(self):
return self._has_oid
def tableHasConsistentOids(self):
return (self._has_oid and 'oid' not in self._excluding)
def isMasterOnly(self):
return self._master
def isShared(self):
return self._isshared
def getTableAcl(self):
return self._acl
def getPrimaryKey(self):
return self._pkey
def getForeignKeys(self):
return self._fkey
def getTableColtypes(self):
return self._coltypes
def getTableColumns(self, with_oid=True, with_acl=True, excluding=None):
'''
Returns the list of columns this catalog table contains.
Optionally excluding:
- oid columns
- acl columns
- user specified list of excluded columns
By default excludes the "known differences" columns, to include them
pass [] as the excluding list.
'''
if excluding == None:
excluding = self._excluding
else:
excluding = set(excluding)
# Return all columns that are not excluded
return [
x for x in self._columns
if ((with_oid or x != 'oid') and
(with_acl or x != self._acl) and
(x not in excluding))
]
# --------------------------------------------------------------------
# Private Implementation functions
# --------------------------------------------------------------------
def __init__(self, parent, name, pkey=None):
"""
Create a new GPCatalogTable object
Uses the supplied database connection to identify:
- What are the columns in the table?
- Does the catalog table have an oid column?
- Does the catalog table have an acl column?
"""
assert(name != None)
# Split string input
if isinstance(pkey, str):
pkey = pkey.split()
self._parent = parent
self._name = name
self._master = False
self._isshared = False
self._pkey = list(pkey or [])
self._fkey = [] # foreign key
self._excluding = set()
self._columns = [] # initial value
self._coltypes = {}
self._acl = None # initial value
self._has_oid = False # initial value
# Query the database to lookup the catalog's definition
qry = """
select a.attname, a.atttypid, t.typname
from pg_attribute a
left outer join pg_type t on (a.atttypid = t.oid)
where attrelid = 'pg_catalog.%s'::regclass and
(attnum > 0 or attname='oid')
order by attnum
""" % name
try:
cur = parent._query(qry)
except:
# The cast to regclass will fail if the catalog table doesn't
# exist.
raise GPCatalogException("Catalog table %s does not exist" % name)
if cur.ntuples() == 0:
raise GPCatalogException("Catalog table %s does not exist" % name)
for row in cur.getresult():
(attname, atttype, typname) = row
# Mark if the catalog has an oid column
if attname == 'oid':
self._has_oid = True
# Detect the presence of an ACL column
if atttype == 1034:
self._acl = attname
# Add to the list of columns
self._columns.append(attname)
# Add to the coltypes dictionary
self._coltypes[attname] = typname
# If a primary key was not specified try to locate a unique index
# If a table has mutiple matching indexes, we'll pick the first index
# order by indkey to avoid the issue of MPP-16663.
if self._pkey == []:
qry = """
SELECT attname FROM (
SELECT unnest(indkey) as keynum FROM (
SELECT indkey
FROM pg_index
WHERE indisunique and not (indkey @> '-2'::int2vector) and
indrelid = 'pg_catalog.{catname}'::regclass
ORDER BY indkey LIMIT 1
) index_keys
) unnested_index_keys
JOIN pg_attribute ON (attnum = keynum)
WHERE attrelid = 'pg_catalog.{catname}'::regclass
""".format(catname=name)
cur = parent._query(qry)
self._pkey = [row[0] for row in cur.getresult()]
# Primary key must be in the column list
for k in self._pkey:
if k not in self._columns:
raise GPCatalogException("%s.%s does not exist" % (name, k))
def __str__(self):
return self._name
def __repr__(self):
return "GPCatalogTable: %s; pkey: %s; oids: %s; acl: %s" % (
str(self._name), str(self._pkey), str(self._has_oid), str(self._acl),
)
def __cmp__(self, other):
return cmp(other, self._name)
def _setMasterOnly(self, value=True):
self._master = value
def _setShared(self, value):
self._isshared = value
def _setPrimaryKey(self, pkey=None):
# Split string input
if isinstance(pkey, str):
pkey = pkey.split()
# Check that the specified keys are real columns
pkey = list(pkey or [])
for k in pkey:
if k not in self._columns:
raise Exception("%s.%s does not exist" % (self._name, k))
self._pkey = pkey
def _addForeignKey(self, fkey):
# Check that the specified keys are real columns
for k in fkey.getColumns():
if k not in self._columns:
raise Exception("%s.%s does not exist" % (self._name, k))
self._fkey.append(fkey)
def _setKnownDifferences(self, diffs):
# Split string input
if isinstance(diffs, str):
diffs = diffs.split()
self._excluding = set(diffs or [])
# ============================================================================
class GPCatalogTableForeignKey():
"""
GPCatalogTableForeignKey is a container for a single instance of a
postgres catalog primary key/foreign key relationship. The
foreign key is a set of columns for with a table, associated with
a set of primary key columns on a primary key table.
Note that tables can self-join, so it is possible to have the
primary and foreign key tables be one and the same.
This class constructs the key, but does not validate it against
the catalog.
"""
# --------------------------------------------------------------------
# Public API functions:
#
# Accessor functions
# - getTableName() - Returns name of table with fkeys
# - getPkeyTableName() - Returns name of the pkey table for the fkeys
# - getColumns() - Returns a list of [foreign] key columns (list)
# - getPKey() - Returns a list of primary key columns (list)
#
# --------------------------------------------------------------------
def getTableName(self):
return self._tname
def getPkeyTableName(self):
return self._pktablename
def getColumns(self):
return self._columns
def getPKey(self):
return self._pkey
# --------------------------------------------------------------------
# Private Implementation functions
# --------------------------------------------------------------------
def __init__(self, tname, cols, pktablename, pkey):
"""
Create a new GPCatalogTableForeignKey object
"""
assert(tname != None)
assert(pktablename != None)
# Split string input
if isinstance(pkey, str):
pkey = pkey.split()
self._tname = tname
self._pktablename = pktablename
self._pkey = list(pkey or [])
self._columns = cols
def __str__(self):
return "%s: %s" % (self._tname, str(self._columns))
def __repr__(self):
return "GPCatalogTableForeignKey: %s; col: %s; " % (
str(self._tname), str(self._columns)
)