blob: 0ce46426c581b236e1630e4d1edeee580c293d69 [file] [log] [blame]
/*-------------------------------------------------------------------------
*
* aomd.c
* This code manages append only relations that reside on magnetic disk.
* It serves the same general purpose as smgr/md.c however we introduce
* AO specific file access functions mainly because we would like to bypass
* md.c's and bgwriter's fsyncing. AO relations also use a non constant
* block number to file segment mapping unlike heap relations.
*
* As of now we still let md.c create and unlink AO relations for us. This
* may need to change if inconsistencies arise.
*
* Portions Copyright (c) 2008, Greenplum Inc.
* Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates.
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/appendonly/aomd.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/aomd.h"
#include "access/aocssegfiles.h"
#include "access/appendonlytid.h"
#include "access/appendonlywriter.h"
#include "access/appendonly_compaction.h"
#include "access/table.h"
#include "catalog/catalog.h"
#include "catalog/pg_appendonly.h"
#include "catalog/pg_attribute_encoding.h"
#include "cdb/cdbappendonlystorage.h"
#include "cdb/cdbappendonlyxlog.h"
#include "crypto/bufenc.h"
#include "commands/progress.h"
#include "common/relpath.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/sync.h"
#include "utils/faultinjector.h"
#include "utils/guc.h"
/*
 * Number of extra bytes allocated past the base relation path when a segment
 * file path is built in place (see the palloc calls that add this constant).
 * The suffix is written with ".%u": one '.', up to 10 decimal digits, and
 * the terminating NUL.
 */
#define SEGNO_SUFFIX_LENGTH 12
/*
 * File-local helpers.  The *_perFile functions are handed to
 * ao_foreach_extent_file() as per-segment callbacks; each receives the
 * segment number and an opaque context pointer.
 */
static void mdunlink_ao_base_relfile(void *ctx);
static bool mdunlink_ao_perFile(const int segno, void *ctx);
static bool copy_append_only_data_perFile(const int segno, void *ctx);
static bool truncate_ao_perFile(const int segno, void *ctx);
static uint64 ao_segfile_get_physical_size(Relation aorel, int segno, FileNumber filenum);
/*
 * Return a buffer length sufficient for a segment file path of the given
 * Append Only relation.
 *
 * The result is the length of the relation's MAIN fork path plus 8 extra
 * bytes, which leaves room for the optional '.' + 6-digit segment file
 * number suffix and the terminating NUL.
 */
int
AOSegmentFilePathNameLen(Relation rel)
{
	char	   *mainForkPath = relpathbackend(rel->rd_locator, rel->rd_backend, MAIN_FORKNUM);
	int			needed = strlen(mainForkPath) + 8;	/* generous */

	/* only the length was needed; release the path string */
	pfree(mainForkPath);

	return needed;
}
/*
 * Build the file name of one Append Only segment file.
 *
 * basepath     - path of the relation's base file.
 * segno        - logical segment number; asserted to lie in
 *                [0, AOTupleId_MaxSegmentFileNum].
 * filenum      - InvalidFileNumber for row-oriented tables, otherwise the
 *                column's file number.
 * fileSegNo    - out: the physical segment file number that was computed.
 * filepathname - out: resulting path.  The caller must supply a buffer that
 *                is large enough; no bounds checking is done here.
 *
 * Physical segment file number 0 has no suffix and maps to basepath itself;
 * every other number is appended as ".<number>".
 */
void
FormatAOSegmentFileName(char *basepath,
						int segno,
						int filenum,
						int32 *fileSegNo,
						char *filepathname)
{
	int			physSegNo;

	Assert(segno >= 0);
	Assert(segno <= AOTupleId_MaxSegmentFileNum);

	/*
	 * Row-oriented tables use the logical segment number directly.
	 * Column-oriented tables give each file number its own range of
	 * AOTupleId_MultiplierSegmentFileNum physical numbers.
	 */
	physSegNo = (filenum == InvalidFileNumber)
		? segno
		: ((filenum - 1) * AOTupleId_MultiplierSegmentFileNum) + segno;

	*fileSegNo = physSegNo;

	if (physSegNo <= 0)
	{
		/* segment 0 is the base file itself */
		strcpy(filepathname, basepath);
		return;
	}

	sprintf(filepathname, "%s.%u", basepath, physSegNo);
}
/*
 * Build the segment file name for a relation.
 *
 * Looks up the relation's MAIN fork path and delegates the formatting to
 * FormatAOSegmentFileName().  On return *fileSegNo holds the physical
 * segment file number and filepathname holds the path.
 *
 * The caller must provide a filepathname buffer of sufficient size.
 */
void
MakeAOSegmentFileName(Relation rel,
					  int segno,
					  int filenum,
					  int32 *fileSegNo,
					  char *filepathname)
{
	char	   *relBasePath = relpathbackend(rel->rd_locator, rel->rd_backend, MAIN_FORKNUM);

	FormatAOSegmentFileName(relBasePath, segno, filenum, fileSegNo, filepathname);

	/* the formatted name has been copied out; the base path is done with */
	pfree(relBasePath);
}
/*
 * Open an Append Only relation file segment
 *
 * The file is opened read/write through the relation's AO storage manager
 * callbacks (smgr_AORelOpenSegFile), so the matching smgr_ao File routines
 * can be used to read, write, close, etc, the file.
 *
 * rel          - relation owning the segment file.
 * filepathname - path of the segment file to open.
 * logicalEof   - logical end-of-file the caller expects for this segment.
 *
 * Returns the opened File.  If the file does not exist (ENOENT) and
 * logicalEof is 0, returns -1 instead of failing, since there is no data the
 * file would have had to hold.  Any other open failure raises an ERROR.
 */
File
OpenAOSegmentFile(Relation rel,
				  char *filepathname,
				  int64 logicalEof)
{
	int			fileFlags = O_RDWR | PG_BINARY;
	File		fd;

	/* start from a clean errno so the ENOENT test below is trustworthy */
	errno = 0;
	fd = RelationGetSmgr(rel)->smgr_ao->smgr_AORelOpenSegFile(RelationGetRelid(rel), filepathname, fileFlags);
	if (fd < 0)
	{
		if (logicalEof == 0 && errno == ENOENT)
			return -1;

		/*
		 * int64 is not necessarily "long"; print it portably as long long
		 * rather than with %ld.
		 */
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open Append-Only segment file \"%s\": %m",
						filepathname),
				 errdetail("logicalEof for open operation: %lld",
						   (long long) logicalEof)));
	}

	return fd;
}
/*
 * Close an Append Only relation file segment.
 *
 * fd  - file to close; asserted to be greater than 0.
 * rel - relation whose storage manager supplies the AO smgr_FileClose
 *       routine used to close the file.
 */
void
CloseAOSegmentFile(File fd, Relation rel)
{
Assert(fd > 0);
RelationGetSmgr(rel)->smgr_ao->smgr_FileClose(fd);
}
/*
 * Truncate all bytes from offset to end of file.
 *
 * fd          - open segment file; asserted to be greater than 0.
 * rel         - relation owning the segment file.
 * segFileNum  - segment file number recorded in the truncate WAL record.
 * offset      - new file length; asserted to be >= 0.
 * vacrelstats - optional (may be NULL).  When given, the number of bytes
 *               removed is accumulated in nbytes_truncated and vacuum
 *               progress is updated.
 *
 * Raises an ERROR if the file is currently shorter than offset or if the
 * truncate call fails.  After a successful truncate, a WAL record is written
 * when the relation needs WAL, and file_truncate_hook is invoked if set.
 */
void
TruncateAOSegmentFile(File fd, Relation rel, int32 segFileNum, int64 offset, AOVacuumRelStats *vacrelstats)
{
	char	   *relname = RelationGetRelationName(rel);
	int64		filesize_before;

	Assert(fd > 0);
	Assert(offset >= 0);

	filesize_before = RelationGetSmgr(rel)->smgr_ao->smgr_FileSize(fd);
	if (filesize_before < offset)
		ereport(ERROR,
				(errmsg("\"%s\": file size smaller than logical eof: %m",
						relname)));

	/*
	 * Call the 'fd' module with a 64-bit length since AO segment files
	 * can be multi-gigabyte to the terabytes...
	 *
	 * Go through RelationGetSmgr() rather than dereferencing rel->rd_smgr
	 * directly, matching the size lookup above and not relying on the
	 * cached pointer being set.
	 */
	if (RelationGetSmgr(rel)->smgr_ao->smgr_FileTruncate(fd, offset, WAIT_EVENT_DATA_FILE_TRUNCATE) != 0)
		ereport(ERROR,
				(errmsg("\"%s\": failed to truncate data after eof: %m",
						relname)));

	if (vacrelstats)
	{
		/* report heap-equivalent blocks vacuumed */
		vacrelstats->nbytes_truncated += filesize_before - offset;
		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED,
									 RelationGuessNumberOfBlocksFromSize(vacrelstats->nbytes_truncated));
	}

	if (XLogIsNeeded() && RelationNeedsWAL(rel))
		xlog_ao_truncate(rel->rd_locator, segFileNum, offset);

	SIMPLE_FAULT_INJECTOR("appendonly_after_truncate_segment_file");

	if (file_truncate_hook)
	{
		RelFileLocatorBackend rnode;

		rnode.locator = rel->rd_locator;
		rnode.backend = rel->rd_backend;
		(*file_truncate_hook)(rnode);
	}
}
/*
 * Context shared between mdunlink_ao() and its per-file helpers.
 */
struct mdunlink_ao_callback_ctx
{
RelFileLocator rnode; /* identifies the relation in the sync (forget/unlink) requests */
/* buffer holding the base path, with spare room for a segment suffix */
char *segPath;
/* position inside segPath just past the base path; the suffix or a NUL is written here */
char *segpathSuffixPosition;
/* true: the base file is unlinked right away; false: it is truncated and an unlink request is registered */
bool isRedo;
};
/*
 * Context shared between ao_truncate_one_rel() and truncate_ao_perFile().
 */
struct truncate_ao_callback_ctx
{
/* buffer holding the base path, with spare room for a segment suffix */
char *segPath;
/* position inside segPath just past the base path; the suffix or a NUL is written here */
char *segpathSuffixPosition;
/* relation whose segment files are being truncated */
Relation rel;
};
/*
 * Remove the on-disk files of an Append Only relation fork.
 *
 * rnode      - relation (locator + backend) whose files are removed.
 * forkNumber - fork to remove.  Only INIT_FORKNUM and MAIN_FORKNUM are
 *              acted upon; any other fork is ignored.
 * isRedo     - passed on to the base-file handling, which unlinks the base
 *              file immediately when true and truncates it otherwise.
 *
 * Failures to remove a file are reported as WARNINGs, not ERRORs.
 */
void
mdunlink_ao(RelFileLocatorBackend rnode, ForkNumber forkNumber, bool isRedo)
{
	const char *path = relpath(rnode, forkNumber);

	/*
	 * Unlogged AO tables have INIT_FORK, in addition to MAIN_FORK.  It is
	 * created once, regardless of the number of segment files (or the number
	 * of columns for column-oriented tables).  Sync requests for INIT_FORKs
	 * are not remembered, so they need not be forgotten.
	 */
	if (forkNumber == INIT_FORKNUM)
	{
		/* 'path' already names the init fork; no need to build it again */
		if (unlink(path) < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m", path)));
	}
	/* This storage manager is not concerned with forks other than MAIN_FORK */
	else if (forkNumber == MAIN_FORKNUM)
	{
		int			pathSize = strlen(path);
		char	   *segPath = (char *) palloc(pathSize + SEGNO_SUFFIX_LENGTH);
		char	   *segPathSuffixPosition = segPath + pathSize;
		struct mdunlink_ao_callback_ctx unlinkFiles;

		unlinkFiles.isRedo = isRedo;
		unlinkFiles.rnode = rnode.locator;

		/*
		 * Copy the base path without its terminator; the helpers below write
		 * either a NUL or a ".N" suffix at segPathSuffixPosition before the
		 * buffer is used as a string.
		 */
		memcpy(segPath, path, pathSize);

		unlinkFiles.segPath = segPath;
		unlinkFiles.segpathSuffixPosition = segPathSuffixPosition;

		/* base file first, then every extent file */
		mdunlink_ao_base_relfile(&unlinkFiles);
		ao_foreach_extent_file(mdunlink_ao_perFile, &unlinkFiles);

		pfree(segPath);
	}

	pfree((void *) path);
}
/*
 * Delete or truncate segfile 0.  Note: There is no <relfilenode>.0 file.  The
 * segfile 0 is the same as base relfilenode for row-oriented AO.  For
 * column-oriented AO, the segno 0 for the first column corresponds to base
 * relfilenode.  See also: ao_foreach_extent_file.
 *
 * ctx points to a struct mdunlink_ao_callback_ctx.  In redo the pending sync
 * requests for the file are forgotten and the file is unlinked.  Otherwise
 * an unlink request is registered and the file is truncated to zero length.
 * Failures other than ENOENT are reported as WARNINGs.
 */
static void
mdunlink_ao_base_relfile(void *ctx)
{
	struct mdunlink_ao_callback_ctx *cb = (struct mdunlink_ao_callback_ctx *) ctx;
	const char *basefile = cb->segPath;
	FileTag		tag;
	int			fd;
	int			rc;

	/* cut the path back to the bare base file name */
	*cb->segpathSuffixPosition = '\0';

	/* both paths below address segment 0 of the MAIN fork */
	INIT_FILETAG(tag, cb->rnode, MAIN_FORKNUM, 0,
				 SYNC_HANDLER_AO);

	if (cb->isRedo)
	{
		/* forget any pending sync requests, then remove the file */
		RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);

		/* a missing file (ENOENT) is not worth a warning */
		if (unlink(basefile) != 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							basefile)));
		return;
	}

	/* register request to unlink the file later; truncate it for now */
	RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );

	fd = OpenTransientFile(basefile, O_RDWR | PG_BINARY);
	if (fd < 0)
		rc = -1;
	else
	{
		int			truncate_errno;

		rc = ftruncate(fd, 0);

		/* keep ftruncate's errno across the close for the report below */
		truncate_errno = errno;
		CloseTransientFile(fd);
		errno = truncate_errno;
	}

	if (rc < 0 && errno != ENOENT)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not truncate file \"%s\": %m", basefile)));
	}
}
/*
 * Per-segment callback for mdunlink_ao(): remove one extent file.
 *
 * segno - segment file number; asserted to be greater than 0.
 * ctx   - points to a struct mdunlink_ao_callback_ctx.
 *
 * Returns false only when the file does not exist (ENOENT).  Returns true
 * when the file was removed, and also when removal failed for some other
 * reason (which is reported as a WARNING).
 */
static bool
mdunlink_ao_perFile(const int segno, void *ctx)
{
	const struct mdunlink_ao_callback_ctx *cb = ctx;
	FileTag		tag;

	Assert (segno > 0);

	/* turn the shared buffer into "<base>.<segno>" */
	sprintf(cb->segpathSuffixPosition, ".%u", segno);

	/* first, forget any pending sync requests for this segment */
	INIT_FILETAG(tag, cb->rnode, MAIN_FORKNUM, segno,
				 SYNC_HANDLER_AO);
	RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);

	/* next unlink the file */
	if (unlink(cb->segPath) == 0)
		return true;

	/* ENOENT is expected after the end of the extensions */
	if (errno == ENOENT)
		return false;

	ereport(WARNING,
			(errcode_for_file_access(),
			 errmsg("could not remove file \"%s\": %m", cb->segPath)));
	return true;
}
/*
 * Copy one Append Only segment file, block by block.
 *
 * srcsegpath - path of the file to read.
 * dstsegpath - path of the file to write.
 * dst        - destination relation locator, used for WAL records.
 * srcSMGR    - storage manager whose AO callbacks access the source file.
 * dstSMGR    - storage manager whose AO callbacks access the destination.
 * segfilenum - segment file number; 0 means the base file, which must
 *              already exist at the destination.
 * use_wal    - when true, every copied chunk is also WAL-logged.
 *
 * The destination file is fsync'ed before returning.  Any failure raises an
 * ERROR.
 */
static void
copy_file(char *srcsegpath, char *dstsegpath,
		  RelFileLocator dst, SMgrRelation srcSMGR, SMgrRelation dstSMGR,
		  int segfilenum, bool use_wal)
{
	File		srcFile;
	File		dstFile;
	int64		left;
	off_t		offset;
	char	   *buffer = palloc(BLCKSZ);
	int			dstflags;

	srcFile = srcSMGR->smgr_ao->smgr_AORelOpenSegFile(InvalidOid, srcsegpath, O_RDONLY | PG_BINARY);
	if (srcFile < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 (errmsg("could not open file %s: %m", srcsegpath))));

	/*
	 * .0 relfilenode is expected to exist before calling this
	 * function. Caller calls RelationCreateStorage() which creates the base
	 * file for the relation. Hence it is opened without O_CREAT.
	 *
	 * Every other segment file is created here and must not exist yet.
	 * O_EXCL is requested only together with O_CREAT, because POSIX leaves
	 * the effect of O_EXCL without O_CREAT undefined.
	 */
	dstflags = O_WRONLY | PG_BINARY;
	if (segfilenum)
		dstflags |= O_CREAT | O_EXCL;

	dstFile = dstSMGR->smgr_ao->smgr_AORelOpenSegFile(InvalidOid, dstsegpath, dstflags);
	if (dstFile < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 (errmsg("could not create destination file %s: %m", dstsegpath))));

	/* number of bytes still to copy; negative means the size lookup failed */
	left = srcSMGR->smgr_ao->smgr_FileDiskSize(srcFile);
	if (left < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 (errmsg("could not seek to end of file %s: %m", srcsegpath))));

	offset = 0;
	while(left > 0)
	{
		int			len;

		CHECK_FOR_INTERRUPTS();

		/* copy at most one block per iteration */
		len = Min(left, BLCKSZ);
		if (srcSMGR->smgr_ao->smgr_FileRead(srcFile, buffer, len, offset, WAIT_EVENT_DATA_FILE_READ) != len)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read %d bytes from file \"%s\": %m",
							len, srcsegpath)));

		if (dstSMGR->smgr_ao->smgr_FileWrite(dstFile, buffer, len, offset, WAIT_EVENT_DATA_FILE_WRITE) != len)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write %d bytes to file \"%s\": %m",
							len, dstsegpath)));

		if (use_wal)
			xlog_ao_insert(dst, segfilenum, offset, buffer, len);

		offset += len;
		left -= len;
	}

	if (dstSMGR->smgr_ao->smgr_FileSync(dstFile, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						dstsegpath)));

	srcSMGR->smgr_ao->smgr_FileClose(srcFile);
	dstSMGR->smgr_ao->smgr_FileClose(dstFile);
	pfree(buffer);
}
/*
 * Context shared between copy_append_only_data() and
 * copy_append_only_data_perFile().
 */
struct copy_append_only_data_callback_ctx {
char *srcPath; /* base path of the source relation's MAIN fork */
char *dstPath; /* base path of the destination relation's MAIN fork */
SMgrRelation srcSMGR; /* storage manager used to read source files */
SMgrRelation dstSMGR; /* storage manager used to write destination files */
RelFileLocator src; /* source locator; stored here but not read by the per-file callback */
RelFileLocator dst; /* destination locator, passed to copy_file() */
bool useWal; /* whether the copied data is WAL-logged */
};
/*
 * Like copy_relation_data(), but for AO tables.
 *
 * Copies the base file of the MAIN fork and then every extent file that
 * ao_foreach_extent_file() visits, from relation 'src' to relation 'dst'.
 *
 * src, dst         - source and destination relation locators.
 * srcSMGR, dstSMGR - storage managers used to access the respective files.
 * backendid        - backend id used to build both relation paths.
 * relpersistence   - persistence of the relation; decides WAL logging.
 *
 * file_extend_hook, if set, is invoked for the destination afterwards.
 */
void
copy_append_only_data(RelFileLocator src, RelFileLocator dst,
					  SMgrRelation srcSMGR, SMgrRelation dstSMGR,
					  BackendId backendid, char relpersistence)
{
	char	   *srcPath;
	char	   *dstPath;
	bool		useWal;
	struct copy_append_only_data_callback_ctx copyFiles = { 0 };

	/*
	 * We need to log the copied data in WAL iff WAL archiving/streaming is
	 * enabled AND it's a permanent relation.
	 */
	useWal = XLogIsNeeded() && relpersistence == RELPERSISTENCE_PERMANENT;

	srcPath = relpathbackend(src, backendid, MAIN_FORKNUM);
	dstPath = relpathbackend(dst, backendid, MAIN_FORKNUM);

	/* the base file (segment 0) is not visited by the extent-file iterator */
	copy_file(srcPath, dstPath, dst, srcSMGR, dstSMGR, 0, useWal);

	copyFiles.srcPath = srcPath;
	copyFiles.dstPath = dstPath;
	copyFiles.srcSMGR = srcSMGR;
	copyFiles.dstSMGR = dstSMGR;
	copyFiles.src = src;
	copyFiles.dst = dst;
	copyFiles.useWal = useWal;

	ao_foreach_extent_file(copy_append_only_data_perFile, &copyFiles);

	/* the callback context is no longer used; release the path strings */
	pfree(srcPath);
	pfree(dstPath);

	if (file_extend_hook)
	{
		RelFileLocatorBackend rnode;

		rnode.locator = dst;
		rnode.backend = backendid;
		(*file_extend_hook)(rnode);
	}
}
/*
 * Per-segment callback for copy_append_only_data(): copy one extent file.
 *
 * segno - segment file number appended to both base paths.
 * ctx   - points to a struct copy_append_only_data_callback_ctx.
 *
 * Returns true after copying the file and false when the source file does
 * not exist.  Any other failure to access the source raises an ERROR.
 */
static bool
copy_append_only_data_perFile(const int segno, void *ctx)
{
	const struct copy_append_only_data_callback_ctx *cb = ctx;
	char		fromPath[MAXPGPATH + 12];
	char		toPath[MAXPGPATH + 12];

	sprintf(fromPath, "%s.%u", cb->srcPath, segno);

	if (access(fromPath, F_OK) == 0)
	{
		sprintf(toPath, "%s.%u", cb->dstPath, segno);
		copy_file(fromPath, toPath, cb->dst, cb->srcSMGR, cb->dstSMGR, segno, cb->useWal);
		return true;
	}

	/* ENOENT is expected after the end of the extensions */
	if (errno != ENOENT)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("access failed for file \"%s\": %m", fromPath)));

	return false;
}
/*
 * ao_truncate_one_rel
 *
 * Truncate every segment file of the given AO relation to zero length,
 * thereby deleting all data within it.
 */
void
ao_truncate_one_rel(Relation rel)
{
	struct truncate_ao_callback_ctx cbctx = { 0 };
	char	   *relBasePath;
	char	   *pathBuf;
	int			baseLen;

	/* Get base path for this relation file */
	relBasePath = relpathbackend(rel->rd_locator, rel->rd_backend, MAIN_FORKNUM);
	baseLen = strlen(relBasePath);

	/*
	 * Working buffer: the base path followed by room for a segment suffix.
	 * Only the base path characters are copied here; the callback writes the
	 * suffix or the terminator at the suffix position before each use.
	 */
	pathBuf = (char *) palloc(baseLen + SEGNO_SUFFIX_LENGTH);
	memcpy(pathBuf, relBasePath, baseLen);

	cbctx.rel = rel;
	cbctx.segPath = pathBuf;
	cbctx.segpathSuffixPosition = pathBuf + baseLen;

	/*
	 * Truncate the actual files.
	 *
	 * Segfile 0 is handled explicitly first because
	 * ao_foreach_extent_file() doesn't invoke the callback for it.
	 */
	truncate_ao_perFile(0, &cbctx);
	ao_foreach_extent_file(truncate_ao_perFile, &cbctx);

	pfree(pathBuf);
	pfree(relBasePath);
}
/*
 * Truncate one specific segment file of an AO relation to zero length.
 *
 * segno - segment file number; 0 (or less) addresses the base file, which
 *         carries no suffix.
 * ctx   - points to a struct truncate_ao_callback_ctx.
 *
 * Returns true when the file was opened and truncated, false when
 * OpenAOSegmentFile() reported no file to open.
 */
static bool
truncate_ao_perFile(const int segno, void *ctx)
{
	const struct truncate_ao_callback_ctx *cb = ctx;
	Relation	rel = cb->rel;
	File		segfile;

	/* point the shared buffer at the requested segment file */
	if (segno <= 0)
		*cb->segpathSuffixPosition = '\0';
	else
		sprintf(cb->segpathSuffixPosition, ".%u", segno);

	segfile = OpenAOSegmentFile(rel, cb->segPath, 0);

	/*
	 * We traverse possible segment files of AO/AOCS tables and call
	 * truncate_ao_perFile to truncate them.  It is ok that some files do not
	 * exist.
	 */
	if (segfile < 0)
		return false;

	TruncateAOSegmentFile(segfile, rel, segno, 0, NULL);
	CloseAOSegmentFile(segfile, rel);

	return true;
}
/*
 * Returns the total of segment files' on-disk size for an AO/AOCO relation.
 * This is only used by AO vacuum progress reporting.
 *
 * The relation's segment catalog is scanned under a catalog snapshot.  For
 * a row-oriented table each catalog row contributes one segment file; for a
 * column-oriented table it contributes one file per vpinfo entry.
 */
uint64
ao_rel_get_physical_size(Relation aorel)
{
	Snapshot	metaSnapshot = RegisterSnapshot(GetCatalogSnapshot(InvalidOid));
	uint64		sum = 0;
	Oid			segrelid;
	Relation	segrel;
	TupleDesc	segdesc;
	SysScanDesc scan;
	HeapTuple	tup;

	Assert(RelationStorageIsAO(aorel));

	GetAppendOnlyEntryAuxOids(aorel,
							  &segrelid, NULL, NULL, NULL, NULL);

	segrel = heap_open(segrelid, AccessShareLock);
	segdesc = RelationGetDescr(segrel);
	scan = systable_beginscan(segrel, InvalidOid, false, metaSnapshot, 0, NULL);

	for (tup = systable_getnext(scan); tup != NULL; tup = systable_getnext(scan))
	{
		int			segno;
		bool		isNull;
		Datum		vpDatum;
		AOCSVPInfo *vpinfo;
		int			col;

		if (RelationStorageIsAoRows(aorel))
		{
			/* row-oriented: a single file per segment */
			segno = DatumGetInt32(fastgetattr(tup,
											  Anum_pg_aoseg_segno,
											  segdesc, &isNull));
			sum += ao_segfile_get_physical_size(aorel, segno, InvalidFileNumber);
			continue;
		}

		/* column-oriented: one file per column listed in vpinfo */
		Assert(RelationStorageIsAoCols(aorel));

		segno = DatumGetInt32(fastgetattr(tup,
										  Anum_pg_aocs_segno,
										  segdesc, &isNull));
		vpDatum = fastgetattr(tup,
							  Anum_pg_aocs_vpinfo,
							  segdesc, &isNull);
		vpinfo = (AOCSVPInfo *) PG_DETOAST_DATUM(vpDatum);

		for (col = 0; col < vpinfo->nEntry; ++col)
		{
			/* attribute numbers are 1-based */
			FileNumber	filenum = GetFilenumForAttribute(RelationGetRelid(aorel), col + 1);

			sum += ao_segfile_get_physical_size(aorel, segno, filenum);
		}

		/* free the detoasted copy, if detoasting made one */
		if (DatumGetPointer(vpDatum) != (Pointer) vpinfo)
			pfree(vpinfo);
	}

	systable_endscan(scan);
	heap_close(segrel, AccessShareLock);
	UnregisterSnapshot(metaSnapshot);

	return sum;
}
/*
 * Return the on-disk size of one segment file of an AO/AOCO relation.
 *
 * aorel   - the append-optimized relation.
 * segno   - logical segment number.
 * filenum - InvalidFileNumber for row-oriented tables, otherwise the
 *           column's file number.
 *
 * Returns 0 when the file cannot be opened or its size cannot be
 * determined.  The file is closed again before returning.
 */
static uint64
ao_segfile_get_physical_size(Relation aorel, int segno, FileNumber filenum)
{
	const char *relname;
	File		fd;
	int32		fileSegNo;
	char		filenamepath[MAXPGPATH];
	uint64		physical_size = 0;

	relname = RelationGetRelationName(aorel);

	MakeAOSegmentFileName(aorel, segno, filenum, &fileSegNo, filenamepath);

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Opening append-optimized relation \"%s\", relation id %u, relfilenode %u filenum #%d, logical segment #%d (physical segment file #%d)",
		   relname,
		   aorel->rd_id,
		   aorel->rd_locator.relNumber,
		   filenum,
		   segno,
		   fileSegNo);

	fd = PathNameOpenFile(filenamepath, O_RDONLY | PG_BINARY);
	if (fd >= 0)
	{
		int64		disk_size = FileDiskSize(fd);

		/*
		 * A negative result signals failure (copy_file() treats it the same
		 * way); don't let it wrap into a huge unsigned size.
		 */
		if (disk_size > 0)
			physical_size = (uint64) disk_size;

		/* release the file opened above, it is not needed any further */
		FileClose(fd);
	}
	else
		elogif(Debug_appendonly_print_compaction, LOG,
			   "No gp_relation_node entry for append-optimized relation \"%s\", relation id %u, relfilenode %u filenum #%d, logical segment #%d (physical segment file #%d)",
			   relname,
			   aorel->rd_id,
			   aorel->rd_locator.relNumber,
			   filenum,
			   segno,
			   fileSegNo);

	return physical_size;
}