| /*------------------------------------------------------------------------- |
| * |
| * md.c |
| * This code manages relations that reside on magnetic disk. |
| * |
| * Or at least, that was what the Berkeley folk had in mind when they named |
| * this file. In reality, what this code provides is an interface from |
| * the smgr API to Unix-like filesystem APIs, so it will work with any type |
| * of device for which the operating system provides filesystem support. |
| * It doesn't matter whether the bits are on spinning rust or some other |
| * storage technology. |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/storage/smgr/md.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <unistd.h> |
| #include <fcntl.h> |
| #include <sys/file.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| |
| #include "access/aomd.h" |
| #include "access/htup_details.h" |
| #include "access/xlog.h" |
| #include "access/xlogutils.h" |
| #include "commands/tablespace.h" |
| #include "miscadmin.h" |
| #include "pg_trace.h" |
| #include "pgstat.h" |
| #include "postmaster/bgwriter.h" |
| #include "storage/bufmgr.h" |
| #include "storage/fd.h" |
| #include "storage/md.h" |
| #include "storage/relfilenode.h" |
| #include "storage/smgr.h" |
| #include "storage/sync.h" |
| #include "utils/hsearch.h" |
| #include "utils/memutils.h" |
| |
| #include "catalog/catalog.h" |
| #include "catalog/pg_tablespace.h" |
| #include "utils/faultinjector.h" |
| |
| /* |
| * The magnetic disk storage manager keeps track of open file |
| * descriptors in its own descriptor pool. This is done to make it |
| * easier to support relations that are larger than the operating |
| * system's file size limit (often 2GBytes). In order to do that, |
| * we break relations up into "segment" files that are each shorter than |
| * the OS file size limit. The segment size is set by the RELSEG_SIZE |
| * configuration constant in pg_config.h. |
| * |
| * On disk, a relation must consist of consecutively numbered segment |
| * files in the pattern |
| * -- Zero or more full segments of exactly RELSEG_SIZE blocks each |
| * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks |
| * -- Optionally, any number of inactive segments of size 0 blocks. |
| * The full and partial segments are collectively the "active" segments. |
| * Inactive segments are those that once contained data but are currently |
| * not needed because of an mdtruncate() operation. The reason for leaving |
| * them present at size zero, rather than unlinking them, is that other |
| * backends and/or the checkpointer might be holding open file references to |
| * such segments. If the relation expands again after mdtruncate(), such |
| * that a deactivated segment becomes active again, it is important that |
| * such file references still be valid --- else data might get written |
| * out to an unlinked old copy of a segment file that will eventually |
| * disappear. |
| * |
| * File descriptors are stored in the per-fork md_seg_fds arrays inside |
| * SMgrRelation. The length of these arrays is stored in md_num_open_segs. |
| * Note that a fork's md_num_open_segs having a specific value does not |
| * necessarily mean the relation doesn't have additional segments; we may |
| * just not have opened the next segment yet. (We could not have "all |
| * segments are in the array" as an invariant anyway, since another backend |
| * could extend the relation while we aren't looking.) We do not have |
| * entries for inactive segments, however; as soon as we find a partial |
| * segment, we assume that any subsequent segments are inactive. |
| * |
| * The entire MdfdVec array is palloc'd in the MdCxt memory context. |
| */ |
| |
| typedef struct _MdfdVec |
| { |
| File mdfd_vfd; /* fd number in fd.c's pool */ |
| BlockNumber mdfd_segno; /* segment number, from 0 */ |
| } MdfdVec; |
| |
| static MemoryContext MdCxt; /* context for all MdfdVec objects */ |
| |
| |
/*
 * Fill in a FileTag so that it identifies one md.c segment file.  The whole
 * tag is zeroed first, then the handler, relfilenode, fork number and
 * segment number are assigned.  The expansion is a single comma expression.
 */
#define INIT_MD_FILETAG(tag_, rnode_, forknum_, segno_) \
	(memset(&(tag_), 0, sizeof(FileTag)), \
	 (tag_).handler = SYNC_HANDLER_MD, \
	 (tag_).rnode = (rnode_), \
	 (tag_).forknum = (forknum_), \
	 (tag_).segno = (segno_))
| |
| |
/*
 * Behavior flags understood by mdopenfork() and _mdfd_getseg().  Each flag
 * occupies its own bit so that several can be OR'ed together.
 */

/* missing segment: raise an error via ereport */
#define EXTENSION_FAIL				0x01
/* missing segment: hand back NULL to the caller */
#define EXTENSION_RETURN_NULL		0x02
/* missing segment: create it, as many as are needed */
#define EXTENSION_CREATE			0x04
/* missing segment: create it, but only while in recovery */
#define EXTENSION_CREATE_RECOVERY	0x08

/*
 * Permit opening a segment even though an earlier segment is shorter than
 * RELSEG_SIZE, as happens with inactive segments (see the file-level notes
 * on segment layout).  Be aware that doing so breaks mdnblocks() and related
 * functionality from then on; that is currently acceptable because only the
 * checkpointer needs this, and it never calls mdnblocks().
 */
#define EXTENSION_DONT_CHECK_SIZE	0x10
| |
| |
| /* local routines */ |
| static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, |
| bool isRedo); |
| static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); |
| static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, |
| MdfdVec *seg); |
| static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, |
| BlockNumber segno); |
| static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, |
| BlockNumber segno); |
| static void _fdvec_resize(SMgrRelation reln, |
| ForkNumber forknum, |
| int nseg); |
| static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, |
| BlockNumber segno); |
| static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno, |
| BlockNumber segno, int oflags); |
| static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, |
| BlockNumber blkno, bool skipFsync, int behavior); |
| static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, |
| MdfdVec *seg); |
| |
| |
| /* |
| * mdinit() -- Initialize private state for magnetic disk storage manager. |
| */ |
| void |
| mdinit(void) |
| { |
| MdCxt = AllocSetContextCreate(TopMemoryContext, |
| "MdSmgr", |
| ALLOCSET_DEFAULT_SIZES); |
| } |
| |
| /* |
| * mdexists() -- Does the physical file exist? |
| * |
| * Note: this will return true for lingering files, with pending deletions |
| */ |
| bool |
| mdexists(SMgrRelation reln, ForkNumber forkNum) |
| { |
| /* |
| * Close it first, to ensure that we notice if the fork has been unlinked |
| * since we opened it. |
| */ |
| mdclose(reln, forkNum); |
| |
| return (mdopenfork(reln, forkNum, EXTENSION_RETURN_NULL) != NULL); |
| } |
| |
| /* |
| * mdcreate() -- Create a new relation on magnetic disk. |
| * |
| * If isRedo is true, it's okay for the relation to exist already. |
| */ |
| void |
| mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) |
| { |
| MdfdVec *mdfd; |
| char *path; |
| File fd; |
| |
| if (isRedo && reln->md_num_open_segs[forkNum] > 0) |
| return; /* created and opened already... */ |
| |
| Assert(reln->md_num_open_segs[forkNum] == 0); |
| |
| /* |
| * We may be using the target table space for the first time in this |
| * database, so create a per-database subdirectory if needed. |
| * |
| * XXX this is a fairly ugly violation of module layering, but this seems |
| * to be the best place to put the check. Maybe TablespaceCreateDbspace |
| * should be here and not in commands/tablespace.c? But that would imply |
| * importing a lot of stuff that smgr.c oughtn't know, either. |
| */ |
| TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode, |
| reln->smgr_rnode.node.dbNode, |
| isRedo); |
| |
| path = relpath(reln->smgr_rnode, forkNum); |
| |
| fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); |
| |
| if (fd < 0) |
| { |
| int save_errno = errno; |
| |
| if (isRedo) |
| fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); |
| if (fd < 0) |
| { |
| /* be sure to report the error reported by create, not open */ |
| errno = save_errno; |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not create file \"%s\": %m", path))); |
| } |
| } |
| |
| pfree(path); |
| |
| _fdvec_resize(reln, forkNum, 1); |
| mdfd = &reln->md_seg_fds[forkNum][0]; |
| mdfd->mdfd_vfd = fd; |
| mdfd->mdfd_segno = 0; |
| } |
| |
| /* |
| * mdcreate_ao() -- Create a AO segfile |
| * |
| * If isRedo is true, it's okay for the file to exist already. |
| */ |
| void |
| mdcreate_ao(RelFileNodeBackend rnode, int32 segmentFileNum, bool isRedo) |
| { |
| char *path; |
| File fd; |
| |
| path = aorelpath(rnode, segmentFileNum); |
| |
| fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); |
| |
| if (fd < 0) |
| { |
| int save_errno = errno; |
| |
| /* |
| * During bootstrap, there are cases where a system relation will be |
| * accessed (by internal backend processes) before the bootstrap |
| * script nominally creates it. Therefore, allow the file to exist |
| * already, even if isRedo is not set. (See also mdopen) |
| */ |
| if (isRedo || IsBootstrapProcessingMode()) |
| fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); |
| if (fd < 0) |
| { |
| /* be sure to report the error reported by create, not open */ |
| errno = save_errno; |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not create relation %s: %m", path))); |
| } |
| } |
| |
| pfree(path); |
| } |
| |
| /* |
| * mdunlink() -- Unlink a relation. |
| * |
| * Note that we're passed a RelFileNodeBackend --- by the time this is called, |
| * there won't be an SMgrRelation hashtable entry anymore. |
| * |
| * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber |
| * to delete all forks. |
| * |
| * For regular relations, we don't unlink the first segment file of the rel, |
| * but just truncate it to zero length, and record a request to unlink it after |
| * the next checkpoint. Additional segments can be unlinked immediately, |
| * however. Leaving the empty file in place prevents that relfilenode |
| * number from being reused. The scenario this protects us from is: |
| * 1. We delete a relation (and commit, and actually remove its file). |
| * 2. We create a new relation, which by chance gets the same relfilenode as |
| * the just-deleted one (OIDs must've wrapped around for that to happen). |
| * 3. We crash before another checkpoint occurs. |
| * During replay, we would delete the file and then recreate it, which is fine |
| * if the contents of the file were repopulated by subsequent WAL entries. |
| * But if we didn't WAL-log insertions, but instead relied on fsyncing the |
| * file after populating it (as we do at wal_level=minimal), the contents of |
| * the file would be lost forever. By leaving the empty file until after the |
| * next checkpoint, we prevent reassignment of the relfilenode number until |
| * it's safe, because relfilenode assignment skips over any existing file. |
| * |
| * We do not need to go through this dance for temp relations, though, because |
| * we never make WAL entries for temp rels, and so a temp rel poses no threat |
| * to the health of a regular rel that has taken over its relfilenode number. |
| * The fact that temp rels and regular rels have different file naming |
| * patterns provides additional safety. |
| * |
| * All the above applies only to the relation's main fork; other forks can |
| * just be removed immediately, since they are not needed to prevent the |
| * relfilenode number from being recycled. Also, we do not carefully |
| * track whether other forks have been created or not, but just attempt to |
| * unlink them unconditionally; so we should never complain about ENOENT. |
| * |
| * If isRedo is true, it's unsurprising for the relation to be already gone. |
| * Also, we should remove the file immediately instead of queuing a request |
| * for later, since during redo there's no possibility of creating a |
| * conflicting relation. |
| * |
| * Note: any failure should be reported as WARNING not ERROR, because |
| * we are usually not in a transaction anymore when this is called. |
| */ |
| void |
| mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) |
| { |
| /* Now do the per-fork work */ |
| if (forkNum == InvalidForkNumber) |
| { |
| for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) |
| mdunlinkfork(rnode, forkNum, isRedo); |
| } |
| else |
| mdunlinkfork(rnode, forkNum, isRedo); |
| } |
| |
| /* |
| * Truncate a file to release disk space. |
| */ |
| static int |
| do_truncate(const char *path) |
| { |
| int save_errno; |
| int ret; |
| |
| ret = pg_truncate(path, 0); |
| |
| /* Log a warning here to avoid repetition in callers. */ |
| if (ret < 0 && errno != ENOENT) |
| { |
| save_errno = errno; |
| ereport(WARNING, |
| (errcode_for_file_access(), |
| errmsg("could not truncate file \"%s\": %m", path))); |
| errno = save_errno; |
| } |
| |
| return ret; |
| } |
| |
| static void |
| mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) |
| { |
| char *path; |
| int ret; |
| |
| path = relpath(rnode, forkNum); |
| |
| /* |
| * Delete or truncate the first segment. |
| */ |
| if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode)) |
| { |
| if (!RelFileNodeBackendIsTemp(rnode)) |
| { |
| /* Prevent other backends' fds from holding on to the disk space */ |
| ret = do_truncate(path); |
| |
| /* Forget any pending sync requests for the first segment */ |
| register_forget_request(rnode, forkNum, 0 /* first seg */ ); |
| } |
| else |
| ret = 0; |
| |
| /* Next unlink the file, unless it was already found to be missing */ |
| if (ret == 0 || errno != ENOENT) |
| { |
| ret = unlink(path); |
| if (ret < 0 && errno != ENOENT) |
| ereport(WARNING, |
| (errcode_for_file_access(), |
| errmsg("could not remove file \"%s\": %m", path))); |
| } |
| } |
| else |
| { |
| /* Prevent other backends' fds from holding on to the disk space */ |
| ret = do_truncate(path); |
| |
| /* Register request to unlink first segment later */ |
| register_unlink_segment(rnode, forkNum, 0 /* first seg */ ); |
| } |
| |
| /* |
| * Delete any additional segments. |
| */ |
| if (ret >= 0) |
| { |
| char *segpath = (char *) palloc(strlen(path) + 12); |
| BlockNumber segno; |
| |
| /* |
| * Note that because we loop until getting ENOENT, we will correctly |
| * remove all inactive segments as well as active ones. |
| */ |
| for (segno = 1;; segno++) |
| { |
| sprintf(segpath, "%s.%u", path, segno); |
| |
| if (!RelFileNodeBackendIsTemp(rnode)) |
| { |
| /* |
| * Prevent other backends' fds from holding on to the disk |
| * space. |
| */ |
| if (do_truncate(segpath) < 0 && errno == ENOENT) |
| break; |
| |
| /* |
| * Forget any pending sync requests for this segment before we |
| * try to unlink. |
| */ |
| register_forget_request(rnode, forkNum, segno); |
| } |
| |
| if (unlink(segpath) < 0) |
| { |
| /* ENOENT is expected after the last segment... */ |
| if (errno != ENOENT) |
| ereport(WARNING, |
| (errcode_for_file_access(), |
| errmsg("could not remove file \"%s\": %m", segpath))); |
| break; |
| } |
| } |
| pfree(segpath); |
| } |
| |
| pfree(path); |
| } |
| |
| /* |
| * mdextend() -- Add a block to the specified relation. |
| * |
| * The semantics are nearly the same as mdwrite(): write at the |
| * specified position. However, this is to be used for the case of |
| * extending a relation (i.e., blocknum is at or beyond the current |
| * EOF). Note that we assume writing a block beyond current EOF |
| * causes intervening file space to become filled with zeroes. |
| */ |
| void |
| mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
| char *buffer, bool skipFsync) |
| { |
| off_t seekpos; |
| int nbytes; |
| MdfdVec *v; |
| |
| /* This assert is too expensive to have on normally ... */ |
| #ifdef CHECK_WRITE_VS_EXTEND |
| Assert(blocknum >= mdnblocks(reln, forknum)); |
| #endif |
| |
| /* |
| * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any |
| * more --- we mustn't create a block whose number actually is |
| * InvalidBlockNumber. (Note that this failure should be unreachable |
| * because of upstream checks in bufmgr.c.) |
| */ |
| if (blocknum == InvalidBlockNumber) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("cannot extend file \"%s\" beyond %u blocks", |
| relpath(reln->smgr_rnode, forknum), |
| InvalidBlockNumber))); |
| |
| v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); |
| |
| seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| |
| Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| |
| if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) |
| { |
| if (nbytes < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not extend file \"%s\": %m", |
| FilePathName(v->mdfd_vfd)), |
| errhint("Check free disk space."))); |
| /* short write: complain appropriately */ |
| ereport(ERROR, |
| (errcode(ERRCODE_DISK_FULL), |
| errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", |
| FilePathName(v->mdfd_vfd), |
| nbytes, BLCKSZ, blocknum), |
| errhint("Check free disk space."))); |
| } |
| |
| if (!skipFsync && !SmgrIsTemp(reln)) |
| register_dirty_segment(reln, forknum, v); |
| |
| Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); |
| } |
| |
| /* |
| * mdopenfork() -- Open one fork of the specified relation. |
| * |
| * Note we only open the first segment, when there are multiple segments. |
| * |
| * If first segment is not present, either ereport or return NULL according |
| * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL; |
| * EXTENSION_CREATE means it's OK to extend an existing relation, not to |
| * invent one out of whole cloth. |
| */ |
| static MdfdVec * |
| mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) |
| { |
| MdfdVec *mdfd; |
| char *path; |
| File fd; |
| |
| /* No work if already open */ |
| if (reln->md_num_open_segs[forknum] > 0) |
| return &reln->md_seg_fds[forknum][0]; |
| |
| path = relpath(reln->smgr_rnode, forknum); |
| |
| fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); |
| |
| if (fd < 0) |
| { |
| if ((behavior & EXTENSION_RETURN_NULL) && |
| FILE_POSSIBLY_DELETED(errno)) |
| { |
| pfree(path); |
| return NULL; |
| } |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", path))); |
| } |
| |
| pfree(path); |
| |
| _fdvec_resize(reln, forknum, 1); |
| mdfd = &reln->md_seg_fds[forknum][0]; |
| mdfd->mdfd_vfd = fd; |
| mdfd->mdfd_segno = 0; |
| |
| Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE) || reln->smgr_which == SMGR_AO); |
| |
| return mdfd; |
| } |
| |
| /* |
| * mdopen() -- Initialize newly-opened relation. |
| */ |
| void |
| mdopen(SMgrRelation reln) |
| { |
| /* mark it not open */ |
| for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) |
| reln->md_num_open_segs[forknum] = 0; |
| } |
| |
| /* |
| * mdclose() -- Close the specified relation, if it isn't closed already. |
| */ |
| void |
| mdclose(SMgrRelation reln, ForkNumber forknum) |
| { |
| int nopensegs = reln->md_num_open_segs[forknum]; |
| |
| /* No work if already closed */ |
| if (nopensegs == 0) |
| return; |
| |
| /* close segments starting from the end */ |
| while (nopensegs > 0) |
| { |
| MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; |
| |
| FileClose(v->mdfd_vfd); |
| _fdvec_resize(reln, forknum, nopensegs - 1); |
| nopensegs--; |
| } |
| } |
| |
| /* |
| * mdprefetch() -- Initiate asynchronous read of the specified block of a relation |
| */ |
| bool |
| mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) |
| { |
| #ifdef USE_PREFETCH |
| off_t seekpos; |
| MdfdVec *v; |
| |
| v = _mdfd_getseg(reln, forknum, blocknum, false, |
| InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); |
| if (v == NULL) |
| return false; |
| |
| seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| |
| Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| |
| (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); |
| #endif /* USE_PREFETCH */ |
| |
| return true; |
| } |
| |
| /* |
| * mdwriteback() -- Tell the kernel to write pages back to storage. |
| * |
| * This accepts a range of blocks because flushing several pages at once is |
| * considerably more efficient than doing so individually. |
| */ |
| void |
| mdwriteback(SMgrRelation reln, ForkNumber forknum, |
| BlockNumber blocknum, BlockNumber nblocks) |
| { |
| /* |
| * Issue flush requests in as few requests as possible; have to split at |
| * segment boundaries though, since those are actually separate files. |
| */ |
| while (nblocks > 0) |
| { |
| BlockNumber nflush = nblocks; |
| off_t seekpos; |
| MdfdVec *v; |
| int segnum_start, |
| segnum_end; |
| |
| v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , |
| EXTENSION_RETURN_NULL); |
| |
| /* |
| * We might be flushing buffers of already removed relations, that's |
| * ok, just ignore that case. |
| */ |
| if (!v) |
| return; |
| |
| /* compute offset inside the current segment */ |
| segnum_start = blocknum / RELSEG_SIZE; |
| |
| /* compute number of desired writes within the current segment */ |
| segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; |
| if (segnum_start != segnum_end) |
| nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| |
| Assert(nflush >= 1); |
| Assert(nflush <= nblocks); |
| |
| seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| |
| FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); |
| |
| nblocks -= nflush; |
| blocknum += nflush; |
| } |
| } |
| |
| /* |
| * mdread() -- Read the specified block from a relation. |
| */ |
| void |
| mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
| char *buffer) |
| { |
| off_t seekpos; |
| int nbytes; |
| MdfdVec *v; |
| |
| TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, |
| reln->smgr_rnode.node.spcNode, |
| reln->smgr_rnode.node.dbNode, |
| reln->smgr_rnode.node.relNode, |
| reln->smgr_rnode.backend); |
| |
| v = _mdfd_getseg(reln, forknum, blocknum, false, |
| EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
| |
| seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| |
| Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| |
| nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); |
| |
| TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, |
| reln->smgr_rnode.node.spcNode, |
| reln->smgr_rnode.node.dbNode, |
| reln->smgr_rnode.node.relNode, |
| reln->smgr_rnode.backend, |
| nbytes, |
| BLCKSZ); |
| |
| if (nbytes != BLCKSZ) |
| { |
| if (nbytes < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not read block %u in file \"%s\": %m", |
| blocknum, FilePathName(v->mdfd_vfd)))); |
| |
| /* |
| * Short read: we are at or past EOF, or we read a partial block at |
| * EOF. Normally this is an error; upper levels should never try to |
| * read a nonexistent block. However, if zero_damaged_pages is ON or |
| * we are InRecovery, we should instead return zeroes without |
| * complaining. This allows, for example, the case of trying to |
| * update a block that was later truncated away. |
| */ |
| if (zero_damaged_pages || InRecovery) |
| MemSet(buffer, 0, BLCKSZ); |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_DATA_CORRUPTED), |
| errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", |
| blocknum, FilePathName(v->mdfd_vfd), |
| nbytes, BLCKSZ))); |
| } |
| } |
| |
| /* |
| * mdwrite() -- Write the supplied block at the appropriate location. |
| * |
| * This is to be used only for updating already-existing blocks of a |
| * relation (ie, those before the current EOF). To extend a relation, |
| * use mdextend(). |
| */ |
| void |
| mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, |
| char *buffer, bool skipFsync) |
| { |
| off_t seekpos; |
| int nbytes; |
| MdfdVec *v; |
| |
| /* This assert is too expensive to have on normally ... */ |
| #ifdef CHECK_WRITE_VS_EXTEND |
| Assert(blocknum < mdnblocks(reln, forknum)); |
| #endif |
| |
| TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, |
| reln->smgr_rnode.node.spcNode, |
| reln->smgr_rnode.node.dbNode, |
| reln->smgr_rnode.node.relNode, |
| reln->smgr_rnode.backend); |
| |
| v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, |
| EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); |
| |
| seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); |
| |
| Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); |
| |
| nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); |
| |
| TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, |
| reln->smgr_rnode.node.spcNode, |
| reln->smgr_rnode.node.dbNode, |
| reln->smgr_rnode.node.relNode, |
| reln->smgr_rnode.backend, |
| nbytes, |
| BLCKSZ); |
| |
| if (nbytes != BLCKSZ) |
| { |
| if (nbytes < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not write block %u in file \"%s\": %m", |
| blocknum, FilePathName(v->mdfd_vfd)))); |
| /* short write: complain appropriately */ |
| ereport(ERROR, |
| (errcode(ERRCODE_DISK_FULL), |
| errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", |
| blocknum, |
| FilePathName(v->mdfd_vfd), |
| nbytes, BLCKSZ), |
| errhint("Check free disk space."))); |
| } |
| |
| if (!skipFsync && !SmgrIsTemp(reln)) |
| register_dirty_segment(reln, forknum, v); |
| } |
| |
| /* |
| * mdnblocks() -- Get the number of blocks stored in a relation. |
| * |
| * Important side effect: all active segments of the relation are opened |
| * and added to the md_seg_fds array. If this routine has not been |
| * called, then only segments up to the last one actually touched |
| * are present in the array. |
| */ |
| BlockNumber |
| mdnblocks(SMgrRelation reln, ForkNumber forknum) |
| { |
| MdfdVec *v; |
| BlockNumber nblocks; |
| BlockNumber segno; |
| |
| mdopenfork(reln, forknum, EXTENSION_FAIL); |
| |
| /* mdopen has opened the first segment */ |
| Assert(reln->md_num_open_segs[forknum] > 0); |
| |
| /* |
| * Start from the last open segments, to avoid redundant seeks. We have |
| * previously verified that these segments are exactly RELSEG_SIZE long, |
| * and it's useless to recheck that each time. |
| * |
| * NOTE: this assumption could only be wrong if another backend has |
| * truncated the relation. We rely on higher code levels to handle that |
| * scenario by closing and re-opening the md fd, which is handled via |
| * relcache flush. (Since the checkpointer doesn't participate in |
| * relcache flush, it could have segment entries for inactive segments; |
| * that's OK because the checkpointer never needs to compute relation |
| * size.) |
| */ |
| segno = reln->md_num_open_segs[forknum] - 1; |
| v = &reln->md_seg_fds[forknum][segno]; |
| |
| for (;;) |
| { |
| nblocks = _mdnblocks(reln, forknum, v); |
| if (nblocks > ((BlockNumber) RELSEG_SIZE)) |
| elog(FATAL, "segment too big"); |
| if (nblocks < ((BlockNumber) RELSEG_SIZE)) |
| return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks; |
| |
| /* |
| * If segment is exactly RELSEG_SIZE, advance to next one. |
| */ |
| segno++; |
| |
| /* |
| * We used to pass O_CREAT here, but that has the disadvantage that it |
| * might create a segment which has vanished through some operating |
| * system misadventure. In such a case, creating the segment here |
| * undermines _mdfd_getseg's attempts to notice and report an error |
| * upon access to a missing segment. |
| */ |
| v = _mdfd_openseg(reln, forknum, segno, 0); |
| if (v == NULL) |
| return segno * ((BlockNumber) RELSEG_SIZE); |
| } |
| } |
| |
| /* |
| * mdtruncate() -- Truncate relation to specified number of blocks. |
| */ |
| void |
| mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) |
| { |
| BlockNumber curnblk; |
| BlockNumber priorblocks; |
| int curopensegs; |
| |
| /* |
| * NOTE: mdnblocks makes sure we have opened all active segments, so that |
| * truncation loop will get them all! |
| */ |
| curnblk = mdnblocks(reln, forknum); |
| if (nblocks > curnblk) |
| { |
| /* Bogus request ... but no complaint if InRecovery */ |
| if (InRecovery) |
| return; |
| ereport(ERROR, |
| (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now", |
| relpath(reln->smgr_rnode, forknum), |
| nblocks, curnblk))); |
| } |
| |
| if (nblocks == curnblk && (forknum != MAIN_FORKNUM)) |
| return; /* no work */ |
| |
| /* |
| * Truncate segments, starting at the last one. Starting at the end makes |
| * managing the memory for the fd array easier, should there be errors. |
| */ |
| curopensegs = reln->md_num_open_segs[forknum]; |
| while (curopensegs > 0) |
| { |
| MdfdVec *v; |
| |
| priorblocks = (curopensegs - 1) * RELSEG_SIZE; |
| |
| v = &reln->md_seg_fds[forknum][curopensegs - 1]; |
| |
| if (priorblocks > nblocks) |
| { |
| /* |
| * This segment is no longer active. We truncate the file, but do |
| * not delete it, for reasons explained in the header comments. |
| */ |
| if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not truncate file \"%s\": %m", |
| FilePathName(v->mdfd_vfd)))); |
| |
| if (!SmgrIsTemp(reln)) |
| register_dirty_segment(reln, forknum, v); |
| |
| /* we never drop the 1st segment */ |
| Assert(v != &reln->md_seg_fds[forknum][0]); |
| |
| FileClose(v->mdfd_vfd); |
| _fdvec_resize(reln, forknum, curopensegs - 1); |
| } |
| else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) |
| { |
| /* |
| * This is the last segment we want to keep. Truncate the file to |
| * the right length. NOTE: if nblocks is exactly a multiple K of |
| * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but |
| * keep it. This adheres to the invariant given in the header |
| * comments. |
| */ |
| BlockNumber lastsegblocks = nblocks - priorblocks; |
| |
| if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not truncate file \"%s\" to %u blocks: %m", |
| FilePathName(v->mdfd_vfd), |
| nblocks))); |
| if (!SmgrIsTemp(reln)) |
| register_dirty_segment(reln, forknum, v); |
| } |
| else |
| { |
| /* |
| * We still need this segment, so nothing to do for this and any |
| * earlier segment. |
| */ |
| break; |
| } |
| curopensegs--; |
| } |
| } |
| |
| /* |
| * mdimmedsync() -- Immediately sync a relation to stable storage. |
| * |
| * Only writes that have already been issued are flushed; dirty pages still |
| * sitting in the buffer manager are invisible to this routine. |
| * |
| * Both active and inactive segments are synced, and smgrDoPendingSyncs() |
| * depends on that. Consider a relation skipping WAL: a checkpoint syncs |
| * blocks of some segment, then mdtruncate() renders that segment inactive. |
| * If we crash before the next checkpoint syncs the newly-inactive segment, |
| * that segment may survive recovery, reintroducing unwanted data into the |
| * table. |
| */ |
| void |
| mdimmedsync(SMgrRelation reln, ForkNumber forknum) |
| { |
|     int         first_inactive; |
|     int         nsegs; |
|     int         idx; |
| |
|     /* |
|      * NOTE: mdnblocks makes sure every active segment has been opened, so |
|      * the fsync loop below will reach all of them. |
|      */ |
|     (void) mdnblocks(reln, forknum); |
| |
|     /* Segments at or beyond this index were not open before this call. */ |
|     first_inactive = reln->md_num_open_segs[forknum]; |
| |
|     /* |
|      * Temporarily open the inactive segments; they are closed again right |
|      * after being synced. Some of them may stay open if fsync() raises an |
|      * error, but that is harmless: we do not risk further trouble by |
|      * cleaning up, and the next mdclose() will close them soon enough. |
|      */ |
|     for (nsegs = first_inactive; |
|          _mdfd_openseg(reln, forknum, nsegs, 0) != NULL; |
|          nsegs++) |
|         continue; |
| |
|     /* Walk from the highest opened segment down to segment zero. */ |
|     for (idx = nsegs - 1; idx >= 0; idx--) |
|     { |
|         MdfdVec    *seg = &reln->md_seg_fds[forknum][idx]; |
| |
|         if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) |
|             ereport(data_sync_elevel(ERROR), |
|                     (errcode_for_file_access(), |
|                      errmsg("could not fsync file \"%s\": %m", |
|                             FilePathName(seg->mdfd_vfd)))); |
| |
|         /* Inactive segments are closed as soon as they have been synced. */ |
|         if (idx >= first_inactive) |
|         { |
|             FileClose(seg->mdfd_vfd); |
|             _fdvec_resize(reln, forknum, idx); |
|         } |
|     } |
| } |
| |
| /* |
| * register_dirty_segment() -- Mark a relation segment as needing fsync |
| * |
| * The fsync request for "seg" is handed to RegisterSyncRequest(), which |
| * either records it in a local pending-ops table for ProcessSyncRequests |
| * to handle later or forwards it to the checkpointer process. When the |
| * request cannot be registered, the segment is fsync'd right here before |
| * returning (we hope this will not happen often enough to be a performance |
| * problem). |
| */ |
| static void |
| register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) |
| { |
|     FileTag     tag; |
| |
|     INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno); |
| |
|     /* Temp relations should never be fsync'd */ |
|     Assert(!SmgrIsTemp(reln)); |
| |
|     /* Common case: the request was accepted, nothing more to do. */ |
|     if (RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) |
|         return; |
| |
|     ereport(DEBUG1, |
|             (errmsg_internal("could not forward fsync request because request queue is full"))); |
| |
|     /* Fall back to syncing the segment ourselves. */ |
|     if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) |
|         ereport(data_sync_elevel(ERROR), |
|                 (errcode_for_file_access(), |
|                  errmsg("could not fsync file \"%s\": %m", |
|                         FilePathName(seg->mdfd_vfd)))); |
| } |
| |
| /* |
| * register_dirty_segment_ao() |
| * |
| * Append-optimized counterpart of register_dirty_segment(). The signature |
| * differs because (1) AO tables do not use relation forks, so the request is |
| * always tagged with MAIN_FORKNUM, and (2) AO segment files have no MdfdVec |
| * equivalent, so the caller passes the segment number and its open virtual |
| * file descriptor directly. |
| */ |
| void |
| register_dirty_segment_ao(RelFileNode rnode, int segno, File vfd) |
| { |
|     FileTag     tag; |
| |
|     INIT_FILETAG(tag, rnode, MAIN_FORKNUM, segno, SYNC_HANDLER_AO); |
| |
|     /* Common case: the request was accepted, nothing more to do. */ |
|     if (RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) |
|         return; |
| |
|     ereport(DEBUG1, |
|             (errmsg("could not forward AO fsync request because request queue is full"))); |
| |
|     /* Fall back to syncing the file ourselves. */ |
|     if (FileSync(vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) |
|         ereport(data_sync_elevel(ERROR), |
|                 (errcode_for_file_access(), |
|                  errmsg("could not fsync AO file \"%s\": %m", |
|                         FilePathName(vfd)))); |
| } |
| |
| /* |
| * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint |
| * |
| * Builds an md file tag for the given relation fork segment and registers a |
| * SYNC_UNLINK_REQUEST for it, with retryOnError set. |
| */ |
| static void |
| register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, |
|                         BlockNumber segno) |
| { |
|     FileTag     unlink_tag; |
| |
|     /* Should never be used with temp relations */ |
|     Assert(!RelFileNodeBackendIsTemp(rnode)); |
| |
|     INIT_MD_FILETAG(unlink_tag, rnode.node, forknum, segno); |
| |
|     (void) RegisterSyncRequest(&unlink_tag, SYNC_UNLINK_REQUEST, |
|                                true /* retryOnError */ ); |
| } |
| |
| /* |
| * register_forget_request() -- forget any fsyncs for a relation fork's segment |
| * |
| * Builds an md file tag for the given relation fork segment and registers a |
| * SYNC_FORGET_REQUEST for it, with retryOnError set. |
| */ |
| static void |
| register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, |
|                         BlockNumber segno) |
| { |
|     FileTag     forget_tag; |
| |
|     INIT_MD_FILETAG(forget_tag, rnode.node, forknum, segno); |
| |
|     (void) RegisterSyncRequest(&forget_tag, SYNC_FORGET_REQUEST, |
|                                true /* retryOnError */ ); |
| } |
| |
| /* |
| * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB |
| * |
| * Registers one SYNC_FILTER_REQUEST per sync handler (md first, then AO), |
| * each carrying a tag whose only meaningful field is the database OID. |
| */ |
| void |
| ForgetDatabaseSyncRequests(Oid dbid) |
| { |
|     RelFileNode rnode; |
|     FileTag     md_tag; |
|     FileTag     ao_tag; |
| |
|     /* Only the database OID is significant; the rest is zeroed. */ |
|     rnode.spcNode = 0; |
|     rnode.relNode = 0; |
|     rnode.dbNode = dbid; |
| |
|     INIT_MD_FILETAG(md_tag, rnode, InvalidForkNumber, InvalidBlockNumber); |
| |
|     /* |
|      * A filter request must be registered for every handler, because the |
|      * handler is part of the key that is used to determine equivalence |
|      * between two pending entries. |
|      */ |
|     INIT_FILETAG(ao_tag, rnode, InvalidForkNumber, InvalidBlockNumber, SYNC_HANDLER_AO); |
| |
|     (void) RegisterSyncRequest(&md_tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); |
|     (void) RegisterSyncRequest(&ao_tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); |
| } |
| |
| /* |
| * DropRelationFiles -- drop files of all given relations |
| * |
| * Opens an SMgrRelation for each of the ndelrels entries in delrels, unlinks |
| * all of them with a single smgrdounlinkall() call, and closes them again. |
| * When isRedo is true, XLogDropRelation() is additionally called for every |
| * fork of every relation. Does nothing when ndelrels is zero. |
| */ |
| void |
| DropRelationFiles(RelFileNodePendingDelete *delrels, int ndelrels, bool isRedo) |
| { |
|     SMgrRelation *opened; |
|     int         n; |
| |
|     if (ndelrels == 0) |
|         return; |
| |
|     opened = palloc(sizeof(SMgrRelation) * ndelrels); |
| |
|     for (n = 0; n < ndelrels; n++) |
|     { |
|         /* |
|          * GPDB: backend can only be TempRelBackendId or InvalidBackendId for |
|          * a given relfile since we don't tie temp relations to their |
|          * backends. |
|          */ |
|         opened[n] = smgropen(delrels[n].node, |
|                              delrels[n].isTempRelation ? |
|                              TempRelBackendId : InvalidBackendId, |
|                              delrels[n].smgr_which, NULL); |
| |
|         if (isRedo) |
|         { |
|             ForkNumber  forknum; |
| |
|             for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) |
|                 XLogDropRelation(delrels[n].node, forknum); |
|         } |
|     } |
| |
|     smgrdounlinkall(opened, ndelrels, isRedo); |
| |
|     /* |
|      * Call smgrclose() in the reverse of the order in which smgropen() was |
|      * called. This trick enables remove_from_unowned_list() in smgrclose() |
|      * to search the SMgrRelation from the unowned list with O(1) |
|      * performance. |
|      */ |
|     n = ndelrels; |
|     while (--n >= 0) |
|         smgrclose(opened[n]); |
| |
|     pfree(opened); |
| } |
| |
| /* |
| * _fdvec_resize() -- Resize the fork's open segments array |
| * |
| * Sets the number of open-segment slots for (reln, forknum) to nseg: the |
| * array is freed when shrinking to zero, freshly allocated in MdCxt when |
| * growing from zero, and repalloc'd otherwise. The recorded segment count |
| * is always updated to nseg. |
| */ |
| static void |
| _fdvec_resize(SMgrRelation reln, |
|               ForkNumber forknum, |
|               int nseg) |
| { |
|     const int   nopen = reln->md_num_open_segs[forknum]; |
| |
|     if (nseg != 0 && nopen != 0) |
|     { |
|         /* |
|          * Amortizing repalloc() calls does not seem worth the extra |
|          * complexity. They are far faster than PathNameOpenFile() or |
|          * FileClose(), and the memory context internally will sometimes |
|          * avoid doing an actual reallocation. |
|          */ |
|         reln->md_seg_fds[forknum] = |
|             repalloc(reln->md_seg_fds[forknum], |
|                      sizeof(MdfdVec) * nseg); |
|     } |
|     else if (nseg != 0) |
|     { |
|         /* No array exists yet: create one of the requested size. */ |
|         reln->md_seg_fds[forknum] = |
|             MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); |
|     } |
|     else if (nopen > 0) |
|     { |
|         /* Shrinking to nothing: release the array entirely. */ |
|         pfree(reln->md_seg_fds[forknum]); |
|         reln->md_seg_fds[forknum] = NULL; |
|     } |
| |
|     reln->md_num_open_segs[forknum] = nseg; |
| } |
| |
| /* |
| * Build the filename of the given segment of a relation fork. |
| * |
| * Segment 0 is the fork's base path as-is; any later segment is the base |
| * path followed by ".<segno>". The returned string is palloc'd. |
| */ |
| static char * |
| _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) |
| { |
|     char       *basepath = relpath(reln->smgr_rnode, forknum); |
|     char       *segpath; |
| |
|     /* The first segment carries no numeric suffix. */ |
|     if (segno == 0) |
|         return basepath; |
| |
|     segpath = psprintf("%s.%u", basepath, segno); |
|     pfree(basepath); |
| |
|     return segpath; |
| } |
| |
| /* |
| * Open the given segment of a relation fork and append an MdfdVec entry |
| * for it to the fork's open-segment array. |
| * |
| * oflags is OR'ed into the open flags (O_RDWR | PG_BINARY). Returns the new |
| * entry, or NULL if the file could not be opened. |
| */ |
| static MdfdVec * |
| _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, |
|               int oflags) |
| { |
|     char       *segpath = _mdfd_segpath(reln, forknum, segno); |
|     File        vfd; |
|     MdfdVec    *entry; |
| |
|     /* The path is only needed for the open call itself. */ |
|     vfd = PathNameOpenFile(segpath, O_RDWR | PG_BINARY | oflags); |
|     pfree(segpath); |
| |
|     if (vfd < 0) |
|         return NULL; |
| |
|     /* |
|      * Segments are always opened in order from lowest to highest, so the |
|      * new one must land at the end of the array. |
|      */ |
|     Assert(segno == reln->md_num_open_segs[forknum]); |
| |
|     _fdvec_resize(reln, forknum, segno + 1); |
| |
|     /* Populate the freshly added slot. */ |
|     entry = &reln->md_seg_fds[forknum][segno]; |
|     entry->mdfd_vfd = vfd; |
|     entry->mdfd_segno = segno; |
| |
|     Assert(_mdnblocks(reln, forknum, entry) <= ((BlockNumber) RELSEG_SIZE)); |
| |
|     return entry; |
| } |
| |
| /* |
| * _mdfd_getseg() -- Find the segment of the relation holding the |
| * specified block. |
| * |
| * The target segment number is blkno / RELSEG_SIZE. If that segment is |
| * already in the fork's open-segment array, its entry is returned directly. |
| * Otherwise every segment between the last opened one and the target is |
| * opened (or created) in ascending order. |
| * |
| * If the segment doesn't exist, we ereport, return NULL, or create the |
| * segment, according to "behavior" (a mask of EXTENSION_* flags). Note: |
| * skipFsync is only used when segments are being created, where it is |
| * passed through to mdextend(). |
| * |
| * When NULL is returned because a preceding segment is shorter than |
| * RELSEG_SIZE, errno is explicitly set to ENOENT. |
| */ |
| static MdfdVec * |
| _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, |
| bool skipFsync, int behavior) |
| { |
| MdfdVec *v; |
| BlockNumber targetseg; |
| BlockNumber nextsegno; |
| |
| /* some way to handle non-existent segments needs to be specified */ |
| Assert(behavior & |
| (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL)); |
| |
| targetseg = blkno / ((BlockNumber) RELSEG_SIZE); |
| |
| /* if an existing and opened segment, we're done */ |
| if (targetseg < reln->md_num_open_segs[forknum]) |
| { |
| v = &reln->md_seg_fds[forknum][targetseg]; |
| return v; |
| } |
| |
| /* |
| * The target segment is not yet open. Iterate over all the segments |
| * between the last opened and the target segment. This way missing |
| * segments either raise an error, or get created (according to |
| * 'behavior'). Start with either the last opened, or the first segment if |
| * none was opened before. |
| */ |
| if (reln->md_num_open_segs[forknum] > 0) |
| v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1]; |
| else |
| { |
| v = mdopenfork(reln, forknum, behavior); |
| if (!v) |
| return NULL; /* if behavior & EXTENSION_RETURN_NULL */ |
| } |
| |
| /* |
| * On each iteration v is the segment just before nextsegno; its length |
| * decides whether nextsegno may be opened or created. |
| */ |
| for (nextsegno = reln->md_num_open_segs[forknum]; |
| nextsegno <= targetseg; nextsegno++) |
| { |
| BlockNumber nblocks = _mdnblocks(reln, forknum, v); |
| int flags = 0; |
| |
| Assert(nextsegno == v->mdfd_segno + 1); |
| |
| /* A segment longer than RELSEG_SIZE blocks is treated as fatal. */ |
| if (nblocks > ((BlockNumber) RELSEG_SIZE)) |
| elog(FATAL, "segment too big"); |
| |
| if ((behavior & EXTENSION_CREATE) || |
| (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY))) |
| { |
| /* |
| * Normally we will create new segments only if authorized by the |
| * caller (i.e., we are doing mdextend()). But when doing WAL |
| * recovery, create segments anyway; this allows cases such as |
| * replaying WAL data that has a write into a high-numbered |
| * segment of a relation that was later deleted. We want to go |
| * ahead and create the segments so we can finish out the replay. |
| * |
| * We have to maintain the invariant that segments before the last |
| * active segment are of size RELSEG_SIZE; therefore, if |
| * extending, pad them out with zeroes if needed. (This only |
| * matters if in recovery, or if the caller is extending the |
| * relation discontiguously, but that can happen in hash indexes.) |
| */ |
| if (nblocks < ((BlockNumber) RELSEG_SIZE)) |
| { |
| char *zerobuf = palloc0(BLCKSZ); |
| |
| /* Write a zero page at the last block of the preceding segment. */ |
| mdextend(reln, forknum, |
| nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, |
| zerobuf, skipFsync); |
| pfree(zerobuf); |
| } |
| /* Allow _mdfd_openseg() below to create the next segment file. */ |
| flags = O_CREAT; |
| } |
| else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && |
| nblocks < ((BlockNumber) RELSEG_SIZE)) |
| { |
| /* |
| * When not extending (or explicitly including truncated |
| * segments), only open the next segment if the current one is |
| * exactly RELSEG_SIZE. If not (this branch), either return NULL |
| * or fail. |
| */ |
| if (behavior & EXTENSION_RETURN_NULL) |
| { |
| /* |
| * Some callers discern between reasons for _mdfd_getseg() |
| * returning NULL based on errno. As there's no failing |
| * syscall involved in this case, explicitly set errno to |
| * ENOENT, as that seems the closest interpretation. |
| */ |
| errno = ENOENT; |
| return NULL; |
| } |
| |
| /* |
| * NOTE(review): the palloc'd path from _mdfd_segpath() is not freed |
| * here; presumably it is reclaimed by error cleanup -- confirm. |
| */ |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks", |
| _mdfd_segpath(reln, forknum, nextsegno), |
| blkno, nblocks))); |
| } |
| |
| v = _mdfd_openseg(reln, forknum, nextsegno, flags); |
| |
| if (v == NULL) |
| { |
| /* |
| * Open failed. Return NULL only if the caller asked for that and |
| * errno indicates the file may have been deleted; otherwise error. |
| */ |
| if ((behavior & EXTENSION_RETURN_NULL) && |
| FILE_POSSIBLY_DELETED(errno)) |
| return NULL; |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\" (target block %u): %m", |
| _mdfd_segpath(reln, forknum, nextsegno), |
| blkno))); |
| } |
| } |
| |
| return v; |
| } |
| |
| /* |
| * Get number of blocks present in a single disk file |
| * |
| * Raises an ERROR if the file size cannot be determined. reln and forknum |
| * are not consulted; only the segment's file descriptor is used. |
| */ |
| static BlockNumber |
| _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) |
| { |
|     off_t       nbytes = FileSize(seg->mdfd_vfd); |
| |
|     if (nbytes < 0) |
|         ereport(ERROR, |
|                 (errcode_for_file_access(), |
|                  errmsg("could not seek to end of file \"%s\": %m", |
|                         FilePathName(seg->mdfd_vfd)))); |
| |
|     /* Integer division: any partial block at EOF is ignored. */ |
|     return (BlockNumber) (nbytes / BLCKSZ); |
| } |
| |
| /* |
| * Sync a file to disk, given a file tag. Write the path into an output |
| * buffer so the caller can use it in error messages. |
| * |
| * A segment that is already open in this process is synced through its |
| * existing descriptor; otherwise the file is opened just for the sync and |
| * closed again afterwards. |
| * |
| * Return 0 on success, -1 on failure, with errno set. |
| */ |
| int |
| mdsyncfiletag(const FileTag *ftag, char *path) |
| { |
|     SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0, NULL); |
|     File        vfd; |
|     bool        opened_here = false; |
|     int         rc; |
|     int         sync_errno; |
| |
|     if (ftag->segno >= reln->md_num_open_segs[ftag->forknum]) |
|     { |
|         /* Not open yet: build the path and open the file ourselves. */ |
|         char       *segpath = _mdfd_segpath(reln, ftag->forknum, ftag->segno); |
| |
|         strlcpy(path, segpath, MAXPGPATH); |
|         pfree(segpath); |
| |
|         vfd = PathNameOpenFile(path, O_RDWR | PG_BINARY); |
|         if (vfd < 0) |
|             return -1; |
|         opened_here = true; |
|     } |
|     else |
|     { |
|         /* Already open: reuse the descriptor and report its path. */ |
|         vfd = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd; |
|         strlcpy(path, FilePathName(vfd), MAXPGPATH); |
|     } |
| |
|     /* Sync, remembering errno so that closing cannot clobber it. */ |
|     rc = FileSync(vfd, WAIT_EVENT_DATA_FILE_SYNC); |
|     sync_errno = errno; |
| |
|     if (opened_here) |
|         FileClose(vfd); |
| |
|     errno = sync_errno; |
|     return rc; |
| } |
| |
| |
| /* |
| * Sync an append-optimized segment file to disk, given a file tag. Write |
| * the path into an output buffer so the caller can use it in error messages. |
| * |
| * The file is always opened here and closed again after the sync. |
| * |
| * Return 0 on success, -1 on failure, with errno set. In particular, a |
| * failure to open the file is reported through the return value rather than |
| * by raising an error, matching mdsyncfiletag(), so that the caller can |
| * decide how to treat a file that may have been deleted concurrently. |
| */ |
| int |
| aosyncfiletag(const FileTag *ftag, char *path) |
| { |
|     SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 1, NULL); |
|     File        fd; |
|     char       *p; |
|     int         result, |
|                 save_errno; |
| |
|     /* Provide the path for informational messages. */ |
|     p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); |
|     strlcpy(path, p, MAXPGPATH); |
|     pfree(p); |
| |
|     /* Try to open the file; errno is left as set by the failed open. */ |
|     fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); |
|     if (fd < 0) |
|         return -1; |
| |
|     /* Try to fsync the file. */ |
|     result = FileSync(fd, WAIT_EVENT_DATA_FILE_SYNC); |
|     save_errno = errno; |
| |
|     /* Closing must not disturb the errno reported to the caller. */ |
|     FileClose(fd); |
| |
|     errno = save_errno; |
|     return result; |
| } |
| |
| /* |
| * Unlink a file, given a file tag. Write the path into an output |
| * buffer so the caller can use it in error messages. |
| * |
| * The path is always that of the relation's main fork; the fork and segment |
| * numbers stored in the tag are not consulted. |
| * |
| * Return 0 on success, -1 on failure, with errno set. |
| */ |
| int |
| mdunlinkfiletag(const FileTag *ftag, char *path) |
| { |
|     char       *relfile = relpathperm(ftag->rnode, MAIN_FORKNUM); |
|     int         rc; |
| |
|     /* Hand the path back to the caller before attempting the unlink. */ |
|     strlcpy(path, relfile, MAXPGPATH); |
|     pfree(relfile); |
| |
|     rc = unlink(path); |
|     return rc; |
| } |
| |
| /* |
| * Decide whether a pending request ("candidate") is covered by the tag of a |
| * SYNC_FILTER_REQUEST ("ftag"). This is called for every pending request to |
| * find out whether it should be forgotten. |
| */ |
| bool |
| mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) |
| { |
|     /* |
|      * Filter requests are currently only used to drop all scheduled |
|      * callbacks relating to a given database, when dropping the database. |
|      * Every candidate with the same database OID as the filter tag therefore |
|      * matches, and is forgotten. |
|      */ |
|     if (candidate->rnode.dbNode == ftag->rnode.dbNode) |
|         return true; |
| |
|     return false; |
| } |