| /*------------------------------------------------------------------------- |
| * |
| * fd.c |
| * Virtual file descriptor code. |
| * |
| * Portions Copyright (c) 2007-2009, Greenplum inc |
| * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/storage/file/fd.c |
| * |
| * NOTES: |
| * |
| * This code manages a cache of 'virtual' file descriptors (VFDs). |
| * The server opens many file descriptors for a variety of reasons, |
| * including base tables, scratch files (e.g., sort and hash spool |
| * files), and random calls to C library routines like system(3); it |
| * is quite easy to exceed system limits on the number of open files a |
| * single process can have. (This is around 1024 on many modern |
| * operating systems, but may be lower on others.) |
| * |
| * VFDs are managed as an LRU pool, with actual OS file descriptors |
| * being opened and closed as needed. Obviously, if a file is opened |
| * using these interfaces, all subsequent operations on it must also |
| * be through these interfaces (the File type is not a real file |
| * descriptor). |
| * |
| * For this scheme to work, most (if not all) routines throughout the |
| * server should use these interfaces instead of calling the C library |
| * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we |
| * may find ourselves short of real file descriptors anyway. |
| * |
| * INTERFACE ROUTINES |
| * |
| * PathNameOpenFile and OpenTemporaryFile are used to open virtual files. |
| * A File opened with OpenTemporaryFile is automatically deleted when the |
| * File is closed, either explicitly or implicitly at end of transaction or |
| * process exit. PathNameOpenFile is intended for files that are held open |
| * for a long time, like relation files. It is the caller's responsibility |
| * to close them; there is no automatic mechanism in fd.c for that. |
| * |
| * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage |
| * temporary files that have names so that they can be shared between |
| * backends. Such files are automatically closed and count against the |
| * temporary file limit of the backend that creates them, but unlike anonymous |
| * files they are not automatically deleted. See sharedfileset.c for a shared |
| * ownership mechanism that provides automatic cleanup for shared files when |
| * the last of a group of backends detaches. |
| * |
| * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are |
| * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. |
| * They behave like the corresponding native functions, except that the handle |
| * is registered with the current subtransaction, and will be automatically |
| * closed at abort. These are intended mainly for short operations like |
| * reading a configuration file; there is a limit on the number of files that |
| * can be opened using these functions at any one time. |
| * |
| * Finally, BasicOpenFile is just a thin wrapper around open() that can |
| * release file descriptors in use by the virtual file descriptors if |
| * necessary. There is no automatic cleanup of file descriptors returned by |
| * BasicOpenFile, it is solely the caller's responsibility to close the file |
| * descriptor by calling close(2). |
| * |
| * If a non-virtual file descriptor needs to be held open for any length of |
| * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD |
| * (and eventually ReleaseExternalFD), so that we can take it into account |
| * while deciding how many VFDs can be open. This applies to FDs obtained |
| * with BasicOpenFile as well as those obtained without use of any fd.c API. |
| * |
| *------------------------------------------------------------------------- |
| */ |
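| |
| /* |
| * Illustrative sketch: transaction-bound code using the VFD layer might |
| * look roughly like the following; the path shown is hypothetical and the |
| * read/write calls are abbreviated. |
| * |
| * File vfd = PathNameOpenFile("base/12345/67890", O_RDWR | PG_BINARY); |
| * |
| * if (vfd < 0) |
| * elog(ERROR, "could not open file: %m"); |
| * ... FileRead()/FileWrite() on 'vfd', possibly much later ... |
| * FileClose(vfd); |
| * |
| * Short-lived access to ordinary files should instead use AllocateFile()/ |
| * FreeFile() or OpenTransientFile()/CloseTransientFile(), which are |
| * cleaned up automatically at (sub)transaction abort. |
| */ |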
| |
| #include "postgres.h" |
| |
| #include <dirent.h> |
| #include <sys/file.h> |
| #include <sys/param.h> |
| #include <sys/resource.h> /* for getrlimit */ |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #ifndef WIN32 |
| #include <sys/mman.h> |
| #endif |
| #include <limits.h> |
| #include <unistd.h> |
| #include <fcntl.h> |
| |
| #include "access/xact.h" |
| #include "access/xlog.h" |
| #include "catalog/pg_tablespace.h" |
| #include "cdb/cdbvars.h" |
| #include "common/file_perm.h" |
| #include "common/file_utils.h" |
| #include "common/pg_prng.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "portability/mem.h" |
| #include "postmaster/startup.h" |
| #include "storage/fd.h" |
| #include "storage/ipc.h" |
| #include "utils/guc.h" |
| #include "utils/guc_hooks.h" |
| #include "utils/resowner_private.h" |
| #include "utils/workfile_mgr.h" |
| #include "utils/faultinjector.h" |
| #include "utils/varlena.h" |
| |
| /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ |
| #if defined(HAVE_SYNC_FILE_RANGE) |
| #define PG_FLUSH_DATA_WORKS 1 |
| #elif !defined(WIN32) && defined(MS_ASYNC) |
| #define PG_FLUSH_DATA_WORKS 1 |
| #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) |
| #define PG_FLUSH_DATA_WORKS 1 |
| #endif |
| |
| /* |
| * We must leave some file descriptors free for system(), the dynamic loader, |
| * and other code that tries to open files without consulting fd.c. This |
| * is the number left free. (While we try fairly hard to prevent EMFILE |
| * errors, there's never any guarantee that we won't get ENFILE due to |
| * other processes chewing up FDs. So it's a bad idea to try to open files |
| * without consulting fd.c. Nonetheless we cannot control all code.) |
| * |
| * Because this is just a fixed setting, we are effectively assuming that |
| * no such code will leave FDs open over the long term; otherwise the slop |
| * is likely to be insufficient. Note in particular that we expect that |
| * loading a shared library does not result in any permanent increase in |
| * the number of open files. (This appears to be true on most if not |
| * all platforms as of Feb 2004.) |
| */ |
| #define NUM_RESERVED_FDS 10 |
| |
| /* |
| * If we have fewer than this many usable FDs after allowing for the reserved |
| * ones, choke. (This value is chosen to work with "ulimit -n 64", but not |
| * much less than that. Note that this value ensures numExternalFDs can be |
| * at least 16; as of this writing, the contrib/postgres_fdw regression tests |
| * will not pass unless that can grow to at least 14.) |
| */ |
| #define FD_MINFREE 48 |
| |
| /* |
| * A number of platforms allow individual processes to open many more files |
| * than they can really support when *many* processes do the same thing. |
| * This GUC parameter lets the DBA limit max_safe_fds to something less than |
| * what the postmaster's initial probe suggests will work. |
| */ |
| int max_files_per_process = 1000; |
| |
| /* |
| * Maximum number of file descriptors to open for operations that fd.c knows |
| * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized |
| * to a conservative value, and remains that way indefinitely in bootstrap or |
| * standalone-backend cases. In normal postmaster operation, the postmaster |
| * calls set_max_safe_fds() late in initialization to update the value, and |
| * that value is then inherited by forked subprocesses. |
| * |
| * Note: the value of max_files_per_process is taken into account while |
| * setting this variable, and so need not be tested separately. |
| */ |
| int max_safe_fds = FD_MINFREE; /* default if not changed */ |
| |
| /* Whether it is safe to continue running after fsync() fails. */ |
| bool data_sync_retry = false; |
| |
| /* How SyncDataDirectory() should do its job. */ |
| int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; |
| |
| /* Which kinds of files should be opened with PG_O_DIRECT. */ |
| int io_direct_flags; |
| |
| /* Debugging.... */ |
| |
| #ifdef FDDEBUG |
| #define DO_DB(A) \ |
| do { \ |
| int _do_db_save_errno = errno; \ |
| A; \ |
| errno = _do_db_save_errno; \ |
| } while (0) |
| #else |
| #define DO_DB(A) \ |
| ((void) 0) |
| #endif |
| |
| #define VFD_CLOSED (-1) |
| |
| #define FileIsValid(file) \ |
| ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL) |
| |
| #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED) |
| |
| /* these are the assigned bits in fdstate below: */ |
| #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */ |
| #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */ |
| #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */ |
| /* GPDB private flag */ |
| #define FD_WORKFILE (1 << 3) /* tracked by workfile manager */ |
| |
| typedef struct vfd |
| { |
| int fd; /* current FD, or VFD_CLOSED if none */ |
| unsigned short fdstate; /* bitflags for VFD's state */ |
| ResourceOwner resowner; /* owner, for automatic cleanup */ |
| File nextFree; /* link to next free VFD, if in freelist */ |
| File lruMoreRecently; /* doubly linked recency-of-use list */ |
| File lruLessRecently; |
| off_t fileSize; /* current size of file (0 if not temporary) */ |
| char *fileName; /* name of file, or NULL for unused VFD */ |
| /* NB: fileName is malloc'd, and must be free'd when closing the VFD */ |
| int fileFlags; /* open(2) flags for (re)opening the file */ |
| mode_t fileMode; /* mode to pass to open(2) */ |
| } Vfd; |
| |
| /* |
| * Virtual File Descriptor array pointer and size. This grows as |
| * needed. 'File' values are indexes into this array. |
| * Note that VfdCache[0] is not a usable VFD, just a list header. |
| */ |
| static Vfd *VfdCache; |
| static Size SizeVfdCache = 0; |
| |
| /* |
| * Number of file descriptors known to be in use by VFD entries. |
| */ |
| static int nfile = 0; |
| |
| /* |
| * Flag to tell whether it's worth scanning VfdCache looking for temp files |
| * to close |
| */ |
| static bool have_xact_temporary_files = false; |
| |
| /* |
| * Tracks the total size of all temporary files. Note: when temp_file_limit |
| * is being enforced, this cannot overflow since the limit cannot be more |
| * than INT_MAX kilobytes. When not enforcing, it could theoretically |
| * overflow, but we don't care. |
| */ |
| static uint64 temporary_files_size = 0; |
| |
| /* Temporary file access initialized and not yet shut down? */ |
| #ifdef USE_ASSERT_CHECKING |
| static bool temporary_files_allowed = false; |
| #endif |
| |
| /* |
| * List of OS handles opened with AllocateFile, AllocateDir and |
| * OpenTransientFile. |
| * |
| * Since we don't want to encourage heavy use of those functions, |
| * it seems OK to put a pretty small maximum limit on the number of |
| * simultaneously allocated descs. |
| */ |
| typedef enum |
| { |
| AllocateDescFile, |
| AllocateDescPipe, |
| AllocateDescDir, |
| AllocateDescRawFD |
| } AllocateDescKind; |
| |
| typedef struct |
| { |
| AllocateDescKind kind; |
| SubTransactionId create_subid; |
| union |
| { |
| FILE *file; |
| DIR *dir; |
| int fd; |
| } desc; |
| } AllocateDesc; |
| |
| static int numAllocatedDescs = 0; |
| static int maxAllocatedDescs = 0; |
| static AllocateDesc *allocatedDescs = NULL; |
| |
| /* |
| * Number of open "external" FDs reported to Reserve/ReleaseExternalFD. |
| */ |
| static int numExternalFDs = 0; |
| |
| /* |
| * Number of temporary files opened during the current session; |
| * this is used in generation of tempfile names. |
| */ |
| static long tempFileCounter = 0; |
| |
| /* |
| * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid, |
| * indicating that the current database's default tablespace should be used.) |
| * When numTempTableSpaces is -1, this has not been set in the current |
| * transaction. |
| */ |
| static Oid *tempTableSpaces = NULL; |
| static int numTempTableSpaces = -1; |
| static int nextTempTableSpace = 0; |
| |
| |
| /*-------------------- |
| * |
| * Private Routines |
| * |
| * Delete - delete a file from the Lru ring |
| * LruDelete - remove a file from the Lru ring and close its FD |
| * Insert - put a file at the front of the Lru ring |
| * LruInsert - put a file at the front of the Lru ring and open it |
| * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring |
| * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit |
| * AllocateVfd - grab a free (or new) file record (from VfdCache) |
| * FreeVfd - free a file record |
| * |
| * The Least Recently Used ring is a doubly linked list that begins and |
| * ends on element zero. Element zero is special -- it doesn't represent |
| * a file and its "fd" field always == VFD_CLOSED. Element zero is just an |
| * anchor that shows us the beginning/end of the ring. |
| * Only VFD elements that are currently really open (have an FD assigned) are |
| * in the Lru ring. Elements that are "virtually" open can be recognized |
| * by having a non-null fileName field. |
| * |
| * example: |
| * |
| * /--less----\ /---------\ |
| * v \ v \ |
| * #0 --more---> LeastRecentlyUsed --more-\ \ |
| * ^\ | | |
| * \\less--> MostRecentlyUsedFile <---/ | |
| * \more---/ \--less--/ |
| * |
| *-------------------- |
| */ |
| static void Delete(File file); |
| static void LruDelete(File file); |
| static void Insert(File file); |
| static int LruInsert(File file); |
| static bool ReleaseLruFile(void); |
| static void ReleaseLruFiles(void); |
| static File AllocateVfd(void); |
| static void FreeVfd(File file); |
| |
| static int FileAccess(File file); |
| static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError, |
| const char *filename, bool makenameunique, bool create); |
| static bool reserveAllocatedDesc(void); |
| static int FreeDesc(AllocateDesc *desc); |
| |
| static void BeforeShmemExit_Files(int code, Datum arg); |
| static void CleanupTempFiles(bool isCommit, bool isProcExit); |
| static void RemovePgTempRelationFiles(const char *tsdirname); |
| static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); |
| |
| static void walkdir(const char *path, |
| void (*action) (const char *fname, bool isdir, int elevel), |
| bool process_symlinks, |
| int elevel); |
| #ifdef PG_FLUSH_DATA_WORKS |
| static void pre_sync_fname(const char *fname, bool isdir, int elevel); |
| #endif |
| static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); |
| static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); |
| |
| static int fsync_parent_path(const char *fname, int elevel); |
| |
| |
| /* |
| * pg_fsync --- do fsync with or without writethrough |
| */ |
| int |
| pg_fsync(int fd) |
| { |
| #if !defined(WIN32) && defined(USE_ASSERT_CHECKING) |
| struct stat st; |
| |
| /* |
| * Some operating system implementations of fsync() have requirements |
| * about the file access modes that were used when their file descriptor |
| * argument was opened, and these requirements differ depending on whether |
| * the file descriptor is for a directory. |
| * |
| * For any file descriptor that may eventually be handed to fsync(), we |
| * should have opened it with access modes that are compatible with |
| * fsync() on all supported systems, otherwise the code may not be |
| * portable, even if it runs ok on the current system. |
| * |
| * We assert here that a descriptor for a file was opened with write |
| * permissions (either O_RDWR or O_WRONLY) and for a directory without |
| * write permissions (O_RDONLY). |
| * |
| * Ignore any fstat errors and let the follow-up fsync() do its work. |
| * Doing this sanity check here also covers the case where fsync() is |
| * disabled. |
| */ |
| if (fstat(fd, &st) == 0) |
| { |
| int desc_flags = fcntl(fd, F_GETFL); |
| |
| /* |
| * O_RDONLY is historically 0, so just make sure that for directories |
| * no write flags are used. |
| */ |
| if (S_ISDIR(st.st_mode)) |
| Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); |
| else |
| Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); |
| } |
| errno = 0; |
| #endif |
| |
| /* #if is to skip the sync_method test if there's no need for it */ |
| #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC) |
| if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH) |
| return pg_fsync_writethrough(fd); |
| else |
| #endif |
| return pg_fsync_no_writethrough(fd); |
| } |
| |
| |
| /* |
| * pg_fsync_no_writethrough --- same as fsync except does nothing if |
| * enableFsync is off |
| */ |
| int |
| pg_fsync_no_writethrough(int fd) |
| { |
| int rc; |
| |
| if (!enableFsync) |
| return 0; |
| |
| retry: |
| rc = fsync(fd); |
| |
| if (rc == -1 && errno == EINTR) |
| goto retry; |
| |
| return rc; |
| } |
| |
| /* |
| * pg_fsync_writethrough |
| */ |
| int |
| pg_fsync_writethrough(int fd) |
| { |
| if (enableFsync) |
| { |
| #ifdef WIN32 |
| return _commit(fd); |
| #elif defined(F_FULLFSYNC) |
| return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0; |
| #else |
| errno = ENOSYS; |
| return -1; |
| #endif |
| } |
| else |
| return 0; |
| } |
| |
| /* |
| * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off |
| */ |
| int |
| pg_fdatasync(int fd) |
| { |
| int rc; |
| |
| if (!enableFsync) |
| return 0; |
| |
| retry: |
| rc = fdatasync(fd); |
| |
| if (rc == -1 && errno == EINTR) |
| goto retry; |
| |
| return rc; |
| } |
| |
| /* |
| * pg_flush_data --- advise OS that the described dirty data should be flushed |
| * |
| * offset of 0 with nbytes 0 means that the entire file should be flushed |
| */ |
| void |
| pg_flush_data(int fd, off_t offset, off_t nbytes) |
| { |
| /* |
| * Right now file flushing is primarily used to make later |
| * fsync()/fdatasync() calls cheaper. Thus don't trigger flushes |
| * if fsyncs are disabled - that's a decision we might want to make |
| * configurable at some point. |
| */ |
| if (!enableFsync) |
| return; |
| |
| /* |
| * We compile all alternatives that are supported on the current platform, |
| * to find portability problems more easily. |
| */ |
| #if defined(HAVE_SYNC_FILE_RANGE) |
| { |
| int rc; |
| static bool not_implemented_by_kernel = false; |
| |
| if (not_implemented_by_kernel) |
| return; |
| |
| retry: |
| |
| /* |
| * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific, |
| * tells the OS that writeback for the specified blocks should be |
| * started, but that we don't want to wait for completion. Note that |
| * this call might block if too much dirty data exists in the range. |
| * This is the preferable method on OSs supporting it, as it works |
| * reliably when available (contrast to msync()) and doesn't flush out |
| * clean data (like FADV_DONTNEED). |
| */ |
| rc = sync_file_range(fd, offset, nbytes, |
| SYNC_FILE_RANGE_WRITE); |
| if (rc != 0) |
| { |
| int elevel; |
| |
| if (errno == EINTR) |
| goto retry; |
| |
| /* |
| * For systems that don't have an implementation of |
| * sync_file_range() such as Windows WSL, generate only one |
| * warning and then suppress all further attempts by this process. |
| */ |
| if (errno == ENOSYS) |
| { |
| elevel = WARNING; |
| not_implemented_by_kernel = true; |
| } |
| else |
| elevel = data_sync_elevel(WARNING); |
| |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not flush dirty data: %m"))); |
| } |
| |
| return; |
| } |
| #endif |
| #if !defined(WIN32) && defined(MS_ASYNC) |
| { |
| void *p; |
| static int pagesize = 0; |
| |
| /* |
| * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers |
| * writeback. On linux it only does so if MS_SYNC is specified, but |
| * then it does the writeback synchronously. Luckily all common linux |
| * systems have sync_file_range(). This is preferable over |
| * FADV_DONTNEED because it doesn't flush out clean data. |
| * |
| * We map the file (mmap()), tell the kernel to sync back the contents |
| * (msync()), and then remove the mapping again (munmap()). |
| */ |
| |
| /* mmap() needs actual length if we want to map whole file */ |
| if (offset == 0 && nbytes == 0) |
| { |
| nbytes = lseek(fd, 0, SEEK_END); |
| if (nbytes < 0) |
| { |
| ereport(WARNING, |
| (errcode_for_file_access(), |
| errmsg("could not determine dirty data size: %m"))); |
| return; |
| } |
| } |
| |
| /* |
| * Some platforms reject partial-page mmap() attempts. To deal with |
| * that, just truncate the request to a page boundary. If any extra |
| * bytes don't get flushed, well, it's only a hint anyway. |
| */ |
| |
| /* fetch pagesize only once */ |
| if (pagesize == 0) |
| pagesize = sysconf(_SC_PAGESIZE); |
| |
| /* align length to pagesize, dropping any fractional page */ |
| if (pagesize > 0) |
| nbytes = (nbytes / pagesize) * pagesize; |
| |
| /* fractional-page request is a no-op */ |
| if (nbytes <= 0) |
| return; |
| |
| /* |
| * mmap could well fail, particularly on 32-bit platforms where there |
| * may simply not be enough address space. If so, silently fall |
| * through to the next implementation. |
| */ |
| if (nbytes <= (off_t) SSIZE_MAX) |
| p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset); |
| else |
| p = MAP_FAILED; |
| |
| if (p != MAP_FAILED) |
| { |
| int rc; |
| |
| rc = msync(p, (size_t) nbytes, MS_ASYNC); |
| if (rc != 0) |
| { |
| ereport(data_sync_elevel(WARNING), |
| (errcode_for_file_access(), |
| errmsg("could not flush dirty data: %m"))); |
| /* NB: need to fall through to munmap()! */ |
| } |
| |
| rc = munmap(p, (size_t) nbytes); |
| if (rc != 0) |
| { |
| /* FATAL error because mapping would remain */ |
| ereport(FATAL, |
| (errcode_for_file_access(), |
| errmsg("could not munmap() while flushing data: %m"))); |
| } |
| |
| return; |
| } |
| } |
| #endif |
| #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) |
| { |
| int rc; |
| |
| /* |
| * Signal the kernel that the passed in range should not be cached |
| * anymore. This has the desired side effect of writing out dirty |
| * data, and the undesired side effect of likely discarding useful |
| * clean cached blocks. For the latter reason this is the least |
| * preferable method. |
| */ |
| |
| rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED); |
| |
| if (rc != 0) |
| { |
| /* don't error out, this is just a performance optimization */ |
| ereport(WARNING, |
| (errcode_for_file_access(), |
| errmsg("could not flush dirty data: %m"))); |
| } |
| |
| return; |
| } |
| #endif |
| } |
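| |
| /* |
| * Illustrative usage: pg_flush_data(fd, 0, 0) hints that writeback of the |
| * whole file should start, while pg_flush_data(fd, offset, BLCKSZ) hints |
| * at a single block; in both cases the request is only advisory and no |
| * status is returned to the caller. |
| */ |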
| |
| /* |
| * Retry close() if it is interrupted. If an interrupted close() is not |
| * retried, the still-open descriptor can cause a later unlink to fail. |
| */ |
| int |
| gp_retry_close(int fd) |
| { |
| int err = 0; |
| do |
| { |
| err = close(fd); |
| } while (err == -1 && errno == EINTR); |
| return err; |
| } |
| |
| /* |
| * Truncate an open file to a given length. |
| */ |
| static int |
| pg_ftruncate(int fd, off_t length) |
| { |
| int ret; |
| |
| retry: |
| ret = ftruncate(fd, length); |
| |
| if (ret == -1 && errno == EINTR) |
| goto retry; |
| |
| return ret; |
| } |
| |
| /* |
| * Truncate a file to a given length by name. |
| */ |
| int |
| pg_truncate(const char *path, off_t length) |
| { |
| int ret; |
| #ifdef WIN32 |
| int save_errno; |
| int fd; |
| |
| fd = OpenTransientFile(path, O_RDWR | PG_BINARY); |
| if (fd >= 0) |
| { |
| ret = pg_ftruncate(fd, length); |
| save_errno = errno; |
| CloseTransientFile(fd); |
| errno = save_errno; |
| } |
| else |
| ret = -1; |
| #else |
| |
| retry: |
| ret = truncate(path, length); |
| |
| if (ret == -1 && errno == EINTR) |
| goto retry; |
| #endif |
| |
| return ret; |
| } |
| |
| /* |
| * fsync_fname -- fsync a file or directory, handling errors properly |
| * |
| * Try to fsync a file or directory. When doing the latter, ignore errors that |
| * indicate the OS just doesn't allow/require fsyncing directories. |
| */ |
| void |
| fsync_fname(const char *fname, bool isdir) |
| { |
| fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR)); |
| } |
| |
| /* |
| * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability |
| * |
| * This routine ensures that, after returning, the effect of renaming the file |
| * persists in case of a crash. A crash while this routine is running will |
| * leave you with either the pre-existing or the moved file in place of the |
| * new file; no mixed state or truncated files are possible. |
| * |
| * It does so by using fsync on the old filename and the possibly existing |
| * target filename before the rename, and the target file and directory after. |
| * |
| * Note that rename() cannot be used across arbitrary directories, as they |
| * might not be on the same filesystem. Therefore this routine does not |
| * support renaming across directories. |
| * |
| * Log errors with the caller specified severity. |
| * |
| * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not |
| * valid upon return. |
| */ |
| int |
| durable_rename(const char *oldfile, const char *newfile, int elevel) |
| { |
| int fd; |
| |
| /* |
| * First fsync the old and target path (if it exists), to ensure that they |
| * are properly persistent on disk. Syncing the target file is not |
| * strictly necessary, but it makes it easier to reason about crashes; |
| * because it's then guaranteed that either source or target file exists |
| * after a crash. |
| */ |
| if (fsync_fname_ext(oldfile, false, false, elevel) != 0) |
| return -1; |
| |
| fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR); |
| if (fd < 0) |
| { |
| if (errno != ENOENT) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", newfile))); |
| return -1; |
| } |
| } |
| else |
| { |
| if (pg_fsync(fd) != 0) |
| { |
| int save_errno; |
| |
| /* close file upon error, might not be in transaction context */ |
| save_errno = errno; |
| CloseTransientFile(fd); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not fsync file \"%s\": %m", newfile))); |
| return -1; |
| } |
| |
| if (CloseTransientFile(fd) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not close file \"%s\": %m", newfile))); |
| return -1; |
| } |
| } |
| |
| /* Time to do the real deal... */ |
| if (rename(oldfile, newfile) < 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not rename file \"%s\" to \"%s\": %m", |
| oldfile, newfile))); |
| return -1; |
| } |
| |
| /* |
| * To guarantee renaming the file is persistent, fsync the file with its |
| * new name, and its containing directory. |
| */ |
| if (fsync_fname_ext(newfile, false, false, elevel) != 0) |
| return -1; |
| |
| if (fsync_parent_path(newfile, elevel) != 0) |
| return -1; |
| |
| return 0; |
| } |
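| |
| /* |
| * Illustrative pattern: the usual way to replace a file atomically is to |
| * write and fsync a scratch file first and then rename it into place, e.g. |
| * (file names hypothetical): |
| * |
| * write new contents to "foo.tmp" and pg_fsync() it; |
| * durable_rename("foo.tmp", "foo", ERROR); |
| */ |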
| |
| /* |
| * durable_unlink -- remove a file in a durable manner |
| * |
| * This routine ensures that, after returning, the effect of removing the file |
| * persists in case of a crash. A crash while this routine is running will |
| * not leave the system in a mixed state. |
| * |
| * It does so by using fsync on the parent directory of the file after the |
| * actual removal is done. |
| * |
| * Log errors with the severity specified by caller. |
| * |
| * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not |
| * valid upon return. |
| */ |
| int |
| durable_unlink(const char *fname, int elevel) |
| { |
| if (unlink(fname) < 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not remove file \"%s\": %m", |
| fname))); |
| return -1; |
| } |
| |
| /* |
| * To guarantee that the removal of the file is persistent, fsync its |
| * parent directory. |
| */ |
| if (fsync_parent_path(fname, elevel) != 0) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * durable_rename_excl -- rename a file in a durable manner. |
| * |
| * Similar to durable_rename(), except that this routine tries (but does not |
| * guarantee) not to overwrite the target file. |
| * |
| * Note that a crash at an unfortunate moment can leave you with two links to |
| * the target file. |
| * |
| * Log errors with the caller specified severity. |
| * |
| * On Windows, using a hard link followed by unlink() causes concurrency |
| * issues, while a simple rename() does not cause that, so be careful when |
| * changing the logic of this routine. |
| * |
| * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not |
| * valid upon return. |
| */ |
| int |
| durable_rename_excl(const char *oldfile, const char *newfile, int elevel) |
| { |
| /* |
| * Ensure that, if we crash directly after the rename/link, a file with |
| * valid contents is moved into place. |
| */ |
| if (fsync_fname_ext(oldfile, false, false, elevel) != 0) |
| return -1; |
| |
| #ifdef HAVE_WORKING_LINK |
| if (link(oldfile, newfile) < 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not link file \"%s\" to \"%s\": %m", |
| oldfile, newfile), |
| (AmCheckpointerProcess() ? |
| errhint("This is known to fail occasionally during archive recovery, where it is harmless.") : |
| 0))); |
| return -1; |
| } |
| unlink(oldfile); |
| #else |
| if (rename(oldfile, newfile) < 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not rename file \"%s\" to \"%s\": %m", |
| oldfile, newfile), |
| (AmCheckpointerProcess() ? |
| errhint("This is known to fail occasionally during archive recovery, where it is harmless.") : |
| 0))); |
| return -1; |
| } |
| #endif |
| |
| /* |
| * To make the change persistent in case of an OS crash, both the new entry and |
| * its parent directory need to be flushed. |
| */ |
| if (fsync_fname_ext(newfile, false, false, elevel) != 0) |
| return -1; |
| |
| /* Same for parent directory */ |
| if (fsync_parent_path(newfile, elevel) != 0) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * InitFileAccess --- initialize this module during backend startup |
| * |
| * This is called during either normal or standalone backend start. |
| * It is *not* called in the postmaster. |
| * |
| * Note that this does not initialize temporary file access, that is |
| * separately initialized via InitTemporaryFileAccess(). |
| */ |
| void |
| InitFileAccess(void) |
| { |
| Assert(SizeVfdCache == 0); /* call me only once */ |
| |
| /* initialize cache header entry */ |
| VfdCache = (Vfd *) malloc(sizeof(Vfd)); |
| if (VfdCache == NULL) |
| ereport(FATAL, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| |
| MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd)); |
| VfdCache->fd = VFD_CLOSED; |
| |
| SizeVfdCache = 1; |
| } |
| |
| /* |
| * InitTemporaryFileAccess --- initialize temporary file access during startup |
| * |
| * This is called during either normal or standalone backend start. |
| * It is *not* called in the postmaster. |
| * |
| * This is separate from InitFileAccess() because temporary file cleanup can |
| * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(), |
| * our reporting has to happen before that. Low level file access should be |
| * available for longer, hence the separate initialization / shutdown of |
| * temporary file handling. |
| */ |
| void |
| InitTemporaryFileAccess(void) |
| { |
| Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */ |
| Assert(!temporary_files_allowed); /* call me only once */ |
| |
| /* |
| * Register before-shmem-exit hook to ensure temp files are dropped while |
| * we can still report stats. |
| */ |
| before_shmem_exit(BeforeShmemExit_Files, 0); |
| |
| #ifdef USE_ASSERT_CHECKING |
| temporary_files_allowed = true; |
| #endif |
| } |
| |
| /* |
| * count_usable_fds --- count how many FDs the system will let us open, |
| * and estimate how many are already open. |
| * |
| * We stop counting if usable_fds reaches max_to_probe. Note: a small |
| * value of max_to_probe might result in an underestimate of already_open; |
| * we must fill in any "gaps" in the set of used FDs before the calculation |
| * of already_open will give the right answer. In practice, max_to_probe |
| * of a couple of dozen should be enough to ensure good results. |
| * |
| * We assume stderr (FD 2) is available for dup'ing. While the calling |
| * script could theoretically close that, it would be a really bad idea, |
| * since then one risks loss of error messages from, e.g., libc. |
| */ |
| static void |
| count_usable_fds(int max_to_probe, int *usable_fds, int *already_open) |
| { |
| int *fd; |
| int size; |
| int used = 0; |
| int highestfd = 0; |
| int j; |
| |
| #ifdef HAVE_GETRLIMIT |
| struct rlimit rlim; |
| int getrlimit_status; |
| #endif |
| |
| size = 1024; |
| fd = (int *) palloc(size * sizeof(int)); |
| |
| #ifdef HAVE_GETRLIMIT |
| getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim); |
| if (getrlimit_status != 0) |
| ereport(WARNING, (errmsg("getrlimit failed: %m"))); |
| #endif /* HAVE_GETRLIMIT */ |
| |
| /* dup until failure or probe limit reached */ |
| for (;;) |
| { |
| int thisfd; |
| |
| #ifdef HAVE_GETRLIMIT |
| |
| /* |
| * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on |
| * some platforms |
| */ |
| if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1) |
| break; |
| #endif |
| |
| thisfd = dup(2); |
| if (thisfd < 0) |
| { |
| /* Expect EMFILE or ENFILE, else it's fishy */ |
| if (errno != EMFILE && errno != ENFILE) |
| elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used); |
| break; |
| } |
| |
| if (used >= size) |
| { |
| size *= 2; |
| fd = (int *) repalloc(fd, size * sizeof(int)); |
| } |
| fd[used++] = thisfd; |
| |
| if (highestfd < thisfd) |
| highestfd = thisfd; |
| |
| if (used >= max_to_probe) |
| break; |
| } |
| |
| /* release the files we opened */ |
| for (j = 0; j < used; j++) |
| close(fd[j]); |
| |
| pfree(fd); |
| |
| /* |
| * Return results. usable_fds is just the number of successful dups. We |
| * assume that the system limit is highestfd+1 (remember 0 is a legal FD |
| * number) and so already_open is highestfd+1 - usable_fds. |
| */ |
| *usable_fds = used; |
| *already_open = highestfd + 1 - used; |
| } |
| |
| /* |
| * set_max_safe_fds |
| * Determine number of file descriptors that fd.c is allowed to use |
| */ |
| void |
| set_max_safe_fds(void) |
| { |
| int usable_fds; |
| int already_open; |
| |
| /*---------- |
| * We want to set max_safe_fds to |
| * MIN(usable_fds, max_files_per_process - already_open) |
| * less the slop factor for files that are opened without consulting |
| * fd.c. This ensures that we won't exceed either max_files_per_process |
| * or the experimentally-determined EMFILE limit. |
| *---------- |
| */ |
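| |
| /* |
| * Worked example with illustrative numbers: if max_files_per_process is |
| * 1000 and the probe below finds usable_fds = 960 and already_open = 8, |
| * then max_safe_fds = Min(960, 1000 - 8) - NUM_RESERVED_FDS = 950. |
| */ |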
| count_usable_fds(max_files_per_process, |
| &usable_fds, &already_open); |
| |
| max_safe_fds = Min(usable_fds, max_files_per_process - already_open); |
| |
| /* |
| * Take off the FDs reserved for system() etc. |
| */ |
| max_safe_fds -= NUM_RESERVED_FDS; |
| |
| /* |
| * Make sure we still have enough to get by. |
| */ |
| if (max_safe_fds < FD_MINFREE) |
| ereport(FATAL, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("insufficient file descriptors available to start server process"), |
| errdetail("System allows %d, server needs at least %d.", |
| max_safe_fds + NUM_RESERVED_FDS, |
| FD_MINFREE + NUM_RESERVED_FDS))); |
| |
| elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d", |
| max_safe_fds, usable_fds, already_open); |
| } |
| |
| /* |
| * Open a file with BasicOpenFilePerm() and pass default file mode for the |
| * fileMode parameter. |
| */ |
| int |
| BasicOpenFile(const char *fileName, int fileFlags) |
| { |
| return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode); |
| } |
| |
| /* |
| * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed |
| * |
| * This is exported for use by places that really want a plain kernel FD, |
| * but need to be proof against running out of FDs. Once an FD has been |
| * successfully returned, it is the caller's responsibility to ensure that |
| * it will not be leaked on ereport()! Most users should *not* call this |
| * routine directly, but instead use the VFD abstraction level, which |
| * provides protection against descriptor leaks as well as management of |
| * files that need to be open for more than a short period of time. |
| * |
| * Ideally this should be the *only* direct call of open() in the backend. |
| * In practice, the postmaster calls open() directly, and there are some |
| * direct open() calls done early in backend startup. Those are OK since |
| * this module wouldn't have any open files to close at that point anyway. |
| */ |
| int |
| BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) |
| { |
| int fd; |
| |
| tryAgain: |
| #ifdef PG_O_DIRECT_USE_F_NOCACHE |
| |
| /* |
| * The value we defined to stand in for O_DIRECT when simulating it with |
| * F_NOCACHE had better not collide with any of the standard flags. |
| */ |
| StaticAssertStmt((PG_O_DIRECT & |
| (O_APPEND | |
| O_CLOEXEC | |
| O_CREAT | |
| O_DSYNC | |
| O_EXCL | |
| O_RDWR | |
| O_RDONLY | |
| O_SYNC | |
| O_TRUNC | |
| O_WRONLY)) == 0, |
| "PG_O_DIRECT value collides with standard flag"); |
| fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode); |
| #else |
| fd = open(fileName, fileFlags, fileMode); |
| #endif |
| |
| if (fd >= 0) |
| { |
| #ifdef PG_O_DIRECT_USE_F_NOCACHE |
| if (fileFlags & PG_O_DIRECT) |
| { |
| if (fcntl(fd, F_NOCACHE, 1) < 0) |
| { |
| int save_errno = errno; |
| |
| close(fd); |
| errno = save_errno; |
| return -1; |
| } |
| } |
| #endif |
| |
| return fd; /* success! */ |
| } |
| |
| if (errno == EMFILE || errno == ENFILE) |
| { |
| int save_errno = errno; |
| |
| ereport(LOG, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("out of file descriptors: %m; release and retry"))); |
| errno = 0; |
| if (ReleaseLruFile()) |
| goto tryAgain; |
| errno = save_errno; |
| } |
| |
| return -1; /* failure */ |
| } |
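| |
| /* |
| * Illustrative sketch: a caller that really needs a kernel FD must close |
| * it itself (the path here is hypothetical): |
| * |
| * int fd = BasicOpenFile("some_file", O_RDONLY | PG_BINARY); |
| * |
| * if (fd < 0) |
| * ereport(ERROR, ...); |
| * ... read from 'fd', taking care not to leak it on ereport() ... |
| * close(fd); |
| */ |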
| |
| /* |
| * AcquireExternalFD - attempt to reserve an external file descriptor |
| * |
| * This should be used by callers that need to hold a file descriptor open |
| * over more than a short interval, but cannot use any of the other facilities |
| * provided by this module. |
| * |
| * The difference between this and the underlying ReserveExternalFD function |
| * is that this will report failure (by setting errno and returning false) |
| * if "too many" external FDs are already reserved. This should be used in |
| * any code where the total number of FDs to be reserved is not predictable |
| * and small. |
| */ |
| bool |
| AcquireExternalFD(void) |
| { |
| /* |
| * We don't want more than max_safe_fds / 3 FDs to be consumed for |
| * "external" FDs. |
| */ |
| if (numExternalFDs < max_safe_fds / 3) |
| { |
| ReserveExternalFD(); |
| return true; |
| } |
| errno = EMFILE; |
| return false; |
| } |
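| |
| /* |
| * Illustrative pattern: code holding a long-lived non-VFD descriptor (for |
| * example a socket) would typically bracket it like this; error handling |
| * is elided and 'sock' is hypothetical: |
| * |
| * if (!AcquireExternalFD()) |
| * ereport(ERROR, ...); -- too many external FDs in use |
| * sock = <open the descriptor>; |
| * ... use 'sock' ... |
| * close(sock); |
| * ReleaseExternalFD(); |
| */ |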
| |
| /* |
| * ReserveExternalFD - report external consumption of a file descriptor |
| * |
| * This should be used by callers that need to hold a file descriptor open |
| * over more than a short interval, but cannot use any of the other facilities |
| * provided by this module. This just tracks the use of the FD and closes |
| * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available. |
| * |
| * Call this directly only in code where failure to reserve the FD would be |
| * fatal; for example, the WAL-writing code does so, since the alternative is |
| * session failure. Also, it's very unwise to do so in code that could |
| * consume more than one FD per process. |
| * |
| * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain |
| * available, it doesn't matter too much whether this is called before or |
| * after actually opening the FD; but doing so beforehand reduces the risk of |
| * an EMFILE failure if not everybody played nice. In any case, it's solely the |
| * caller's responsibility to keep the external-FD count in sync with reality. |
| */ |
| void |
| ReserveExternalFD(void) |
| { |
| /* |
| * Release VFDs if needed to stay safe. Because we do this before |
| * incrementing numExternalFDs, the final state will be as desired, i.e., |
| * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds. |
| */ |
| ReleaseLruFiles(); |
| |
| numExternalFDs++; |
| } |
| |
| /* |
| * ReleaseExternalFD - report release of an external file descriptor |
| * |
| * This is guaranteed not to change errno, so it can be used in failure paths. |
| */ |
| void |
| ReleaseExternalFD(void) |
| { |
| Assert(numExternalFDs > 0); |
| numExternalFDs--; |
| } |
| |
| |
| #if defined(FDDEBUG) |
| |
| static void |
| _dump_lru(void) |
| { |
| int mru = VfdCache[0].lruLessRecently; |
| Vfd *vfdP = &VfdCache[mru]; |
| char buf[2048]; |
| |
| snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru); |
| while (mru != 0) |
| { |
| mru = vfdP->lruLessRecently; |
| vfdP = &VfdCache[mru]; |
| snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru); |
| } |
| snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST"); |
| elog(LOG, "%s", buf); |
| } |
| #endif /* FDDEBUG */ |
| |
| static void |
| Delete(File file) |
| { |
| Vfd *vfdP; |
| |
| Assert(file != 0); |
| |
| DO_DB(elog(LOG, "Delete %d (%s)", |
| file, VfdCache[file].fileName)); |
| DO_DB(_dump_lru()); |
| |
| vfdP = &VfdCache[file]; |
| |
| VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently; |
| VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently; |
| |
| DO_DB(_dump_lru()); |
| } |
| |
| static void |
| LruDelete(File file) |
| { |
| Vfd *vfdP; |
| |
| Assert(file != 0); |
| |
| DO_DB(elog(LOG, "LruDelete %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| vfdP = &VfdCache[file]; |
| |
| /* |
| * Close the file. We aren't expecting this to fail; if it does, better |
| * to leak the FD than to mess up our internal state. |
| */ |
| if (close(vfdP->fd) != 0) |
| elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), |
| "could not close file \"%s\": %m", vfdP->fileName); |
| vfdP->fd = VFD_CLOSED; |
| --nfile; |
| |
| /* delete the vfd record from the LRU ring */ |
| Delete(file); |
| } |
| |
| static void |
| Insert(File file) |
| { |
| Vfd *vfdP; |
| |
| Assert(file != 0); |
| |
| DO_DB(elog(LOG, "Insert %d (%s)", |
| file, VfdCache[file].fileName)); |
| DO_DB(_dump_lru()); |
| |
| vfdP = &VfdCache[file]; |
| |
| vfdP->lruMoreRecently = 0; |
| vfdP->lruLessRecently = VfdCache[0].lruLessRecently; |
| VfdCache[0].lruLessRecently = file; |
| VfdCache[vfdP->lruLessRecently].lruMoreRecently = file; |
| |
| DO_DB(_dump_lru()); |
| } |
| |
| /* returns 0 on success, -1 on re-open failure (with errno set) */ |
| static int |
| LruInsert(File file) |
| { |
| Vfd *vfdP; |
| |
| Assert(file != 0); |
| |
| DO_DB(elog(LOG, "LruInsert %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| vfdP = &VfdCache[file]; |
| |
| if (FileIsNotOpen(file)) |
| { |
| /* Close excess kernel FDs. */ |
| ReleaseLruFiles(); |
| |
| /* |
| * The open could still fail for lack of file descriptors, eg due to |
| * overall system file table being full. So, be prepared to release |
| * another FD if necessary... |
| */ |
| vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, |
| vfdP->fileMode); |
| if (vfdP->fd < 0) |
| { |
| DO_DB(elog(LOG, "re-open failed: %m")); |
| return -1; |
| } |
| else |
| { |
| ++nfile; |
| } |
| } |
| |
| /* |
| * put it at the head of the Lru ring |
| */ |
| |
| Insert(file); |
| |
| return 0; |
| } |
| |
| /* |
| * Release one kernel FD by closing the least-recently-used VFD. |
| */ |
| static bool |
| ReleaseLruFile(void) |
| { |
| DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile)); |
| |
| if (nfile > 0) |
| { |
| /* |
| * There are opened files and so there should be at least one used vfd |
| * in the ring. |
| */ |
| Assert(VfdCache[0].lruMoreRecently != 0); |
| LruDelete(VfdCache[0].lruMoreRecently); |
| return true; /* freed a file */ |
| } |
| return false; /* no files available to free */ |
| } |
| |
| /* |
| * Release kernel FDs as needed to get under the max_safe_fds limit. |
| * After calling this, it's OK to try to open another file. |
| */ |
| static void |
| ReleaseLruFiles(void) |
| { |
| while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds) |
| { |
| if (!ReleaseLruFile()) |
| break; |
| } |
| } |
| |
| static File |
| AllocateVfd(void) |
| { |
| Index i; |
| File file; |
| |
| DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache)); |
| |
| Assert(SizeVfdCache > 0); /* InitFileAccess not called? */ |
| |
| if (VfdCache[0].nextFree == 0) |
| { |
| /* |
| * The free list is empty so it is time to increase the size of the |
| * array. We choose to double it each time this happens. However, |
| * there's not much point in starting *real* small. |
| */ |
| Size newCacheSize = SizeVfdCache * 2; |
| Vfd *newVfdCache; |
| |
| if (newCacheSize < 32) |
| newCacheSize = 32; |
| |
| /* |
| * Be careful not to clobber VfdCache ptr if realloc fails. |
| */ |
| newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize); |
| if (newVfdCache == NULL) |
| ereport(ERROR, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| VfdCache = newVfdCache; |
| |
| /* |
| * Initialize the new entries and link them into the free list. |
| */ |
| for (i = SizeVfdCache; i < newCacheSize; i++) |
| { |
| MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd)); |
| VfdCache[i].nextFree = i + 1; |
| VfdCache[i].fd = VFD_CLOSED; |
| } |
| VfdCache[newCacheSize - 1].nextFree = 0; |
| VfdCache[0].nextFree = SizeVfdCache; |
| |
| /* |
| * Record the new size |
| */ |
| SizeVfdCache = newCacheSize; |
| } |
| |
| file = VfdCache[0].nextFree; |
| |
| VfdCache[0].nextFree = VfdCache[file].nextFree; |
| |
| return file; |
| } |
| |
| static void |
| FreeVfd(File file) |
| { |
| Vfd *vfdP = &VfdCache[file]; |
| |
| DO_DB(elog(LOG, "FreeVfd: %d (%s)", |
| file, vfdP->fileName ? vfdP->fileName : "")); |
| |
| if (vfdP->fileName != NULL) |
| { |
| free(vfdP->fileName); |
| vfdP->fileName = NULL; |
| } |
| vfdP->fdstate = 0x0; |
| |
| vfdP->nextFree = VfdCache[0].nextFree; |
| VfdCache[0].nextFree = file; |
| } |
| |
| /* returns 0 on success, -1 on re-open failure (with errno set) */ |
| static int |
| FileAccess(File file) |
| { |
| int returnValue; |
| |
| DO_DB(elog(LOG, "FileAccess %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| /* |
| * Is the file open? If not, open it and put it at the head of the LRU |
| * ring (possibly closing the least recently used file to get an FD). |
| */ |
| |
| if (FileIsNotOpen(file)) |
| { |
| returnValue = LruInsert(file); |
| if (returnValue != 0) |
| return returnValue; |
| } |
| else if (VfdCache[0].lruLessRecently != file) |
| { |
| /* |
| * We now know that the file is open and that it is not the last one |
| * accessed, so we need to move it to the head of the Lru ring. |
| */ |
| |
| Delete(file); |
| Insert(file); |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Called whenever a temporary file is deleted to report its size. |
| */ |
| static void |
| ReportTemporaryFileUsage(const char *path, off_t size) |
| { |
| pgstat_report_tempfile(size); |
| |
| if (log_temp_files >= 0) |
| { |
| if ((size / 1024) >= log_temp_files) |
| ereport(LOG, |
| (errmsg("temporary file: path \"%s\", size %lu", |
| path, (unsigned long) size))); |
| } |
| } |
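| |
| /* |
| * Illustrative example of the log_temp_files threshold above: with |
| * log_temp_files = 1024 (kB), deleting a 2 MB temp file (2048 kB >= 1024) |
| * produces a LOG entry while a 512 kB one does not; a setting of 0 logs |
| * every temporary file, and -1 disables the logging entirely. |
| */ |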
| |
| /* |
| * Called to register a temporary file for automatic close. |
| * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called |
| * before the file was opened. |
| */ |
| static void |
| RegisterTemporaryFile(File file) |
| { |
| ResourceOwnerRememberFile(CurrentResourceOwner, file); |
| VfdCache[file].resowner = CurrentResourceOwner; |
| |
| /* Backup mechanism for closing at end of xact. */ |
| VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT; |
| have_xact_temporary_files = true; |
| } |
| |
| /* |
| * Called when we get a shared invalidation message on some relation. |
| */ |
| #ifdef NOT_USED |
| void |
| FileInvalidate(File file) |
| { |
| Assert(FileIsValid(file)); |
| if (!FileIsNotOpen(file)) |
| LruDelete(file); |
| } |
| #endif |
| |
| /* |
| * Open a file with PathNameOpenFilePerm() and pass default file mode for the |
| * fileMode parameter. |
| */ |
| File |
| PathNameOpenFile(const char *fileName, int fileFlags) |
| { |
| return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode); |
| } |
| |
| /* |
| * open a file in an arbitrary directory |
| * |
| * NB: if the passed pathname is relative (which it usually is), |
| * it will be interpreted relative to the process' working directory |
| * (which should always be $PGDATA when this code is running). |
| */ |
| File |
| PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) |
| { |
| char *fnamecopy; |
| File file; |
| Vfd *vfdP; |
| |
| DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o", |
| fileName, fileFlags, fileMode)); |
| |
| /* |
| * We need a malloc'd copy of the file name; fail cleanly if no room. |
| */ |
| fnamecopy = strdup(fileName); |
| if (fnamecopy == NULL) |
| ereport(ERROR, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| |
| file = AllocateVfd(); |
| vfdP = &VfdCache[file]; |
| |
| /* Close excess kernel FDs. */ |
| ReleaseLruFiles(); |
| |
| /* |
| * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The |
| * client shouldn't be expected to know which kernel descriptors are |
| * currently open, so it wouldn't make sense for them to be inherited by |
| * executed subprograms. |
| */ |
| fileFlags |= O_CLOEXEC; |
| |
| vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); |
| |
| if (vfdP->fd < 0) |
| { |
| int save_errno = errno; |
| |
| FreeVfd(file); |
| free(fnamecopy); |
| errno = save_errno; |
| return -1; |
| } |
| ++nfile; |
| DO_DB(elog(LOG, "PathNameOpenFile: success %d", |
| vfdP->fd)); |
| |
| vfdP->fileName = fnamecopy; |
| /* Saved flags are adjusted to be OK for re-opening file */ |
| vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL); |
| vfdP->fileMode = fileMode; |
| vfdP->fileSize = 0; |
| vfdP->fdstate = 0x0; |
| vfdP->resowner = NULL; |
| |
| Insert(file); |
| |
| return file; |
| } |
| |
| /* |
| * Create directory 'directory'. If necessary, create 'basedir', which must |
| * be the directory above it. This is designed for creating the top-level |
| * temporary directory on demand before creating a directory underneath it. |
| * Do nothing if the directory already exists. |
| * |
| * Directories created within the top-level temporary directory should begin |
| * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and |
| * deleted at startup by RemovePgTempFiles(). Further subdirectories below |
| * that do not need any particular prefix. |
| */ |
| void |
| PathNameCreateTemporaryDir(const char *basedir, const char *directory) |
| { |
| if (MakePGDirectory(directory) < 0) |
| { |
| if (errno == EEXIST) |
| return; |
| |
| /* |
| * Failed. Try to create basedir first in case it's missing. Tolerate |
| * EEXIST to close a race against another process following the same |
| * algorithm. |
| */ |
| if (MakePGDirectory(basedir) < 0 && errno != EEXIST) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("cannot create temporary directory \"%s\": %m", |
| basedir))); |
| |
| /* Try again. */ |
| if (MakePGDirectory(directory) < 0 && errno != EEXIST) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("cannot create temporary subdirectory \"%s\": %m", |
| directory))); |
| } |
| } |
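| |
| /* |
| * Illustrative call (paths hypothetical): a backend preparing a shared |
| * temporary directory under the default tablespace might use |
| * |
| * PathNameCreateTemporaryDir("base/pgsql_tmp", |
| * "base/pgsql_tmp/pgsql_tmp_shared.0"); |
| */ |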
| |
| /* |
| * Delete a directory and everything in it, if it exists. |
| */ |
| void |
| PathNameDeleteTemporaryDir(const char *dirname) |
| { |
| struct stat statbuf; |
| |
| /* Silently ignore missing directory. */ |
| if (stat(dirname, &statbuf) != 0 && errno == ENOENT) |
| return; |
| |
| /* |
| * Currently, walkdir doesn't offer a way for our passed in function to |
| * maintain state. Perhaps it should, so that we could tell the caller |
| * whether this operation succeeded or failed. Since this operation is |
| * used in a cleanup path, we wouldn't actually behave differently: we'd |
| * just log failures. |
| */ |
| walkdir(dirname, unlink_if_exists_fname, false, LOG); |
| } |
| |
| /* |
| * Open a temporary file that will disappear when we close it. |
| * |
| * This routine takes care of generating an appropriate tempfile name. |
| * There's no need to pass in fileFlags or fileMode either, since only |
| * one setting makes any sense for a temp file. |
| * |
| * Unless interXact is true, the file is remembered by CurrentResourceOwner |
| * to ensure it's closed and deleted when it's no longer needed, typically at |
| * the end-of-transaction. In most cases, you don't want temporary files to |
| * outlive the transaction that created them, so this should be false -- but |
| * if you need "somewhat" temporary storage, this might be useful. In either |
| * case, the file is removed when the File is explicitly closed. |
| * |
| * GPDB: As a convenience for monitoring and debugging, the given 'filePrefix' |
| * string is embedded in the file name. It can be NULL. |
| */ |
| File |
| OpenTemporaryFile(bool interXact, const char *filePrefix) |
| { |
| File file = 0; |
| |
| Assert(temporary_files_allowed); /* check temp file access is up */ |
| |
| /* |
| * Make sure the current resource owner has space for this File before we |
| * open it, if we'll be registering it below. |
| */ |
| if (!interXact) |
| ResourceOwnerEnlargeFiles(CurrentResourceOwner); |
| |
| /* |
| * If some temp tablespace(s) have been given to us, try to use the next |
| * one. If a given tablespace can't be found, we silently fall back to |
| * the database's default tablespace. |
| * |
| * BUT: if the temp file is slated to outlive the current transaction, |
| * force it into the database's default tablespace, so that it will not |
| * pose a threat to possible tablespace drop attempts. |
| */ |
| if (numTempTableSpaces > 0 && !interXact) |
| { |
| Oid tblspcOid = GetNextTempTableSpace(); |
| |
| if (OidIsValid(tblspcOid)) |
| file = OpenTemporaryFileInTablespace(tblspcOid, |
| false, /* rejectError */ |
| filePrefix, |
| true, /* makenameunique */ |
| true); /* create */ |
| } |
| |
| /* |
| * If not, or if tablespace is bad, create in database's default |
| * tablespace. MyDatabaseTableSpace should normally be set before we get |
| * here, but just in case it isn't, fall back to pg_default tablespace. |
| */ |
| if (file <= 0) |
| file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ? |
| MyDatabaseTableSpace : |
| DEFAULTTABLESPACE_OID, |
| true, |
| filePrefix, |
| true, /* makenameunique */ |
| true); /* create */ |
| |
| /* Mark it for deletion at close and temporary file size limit */ |
| VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT; |
| |
| /* Register it with the current resource owner */ |
| if (!interXact) |
| RegisterTemporaryFile(file); |
| |
| return file; |
| } |
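| |
| /* |
| * Illustrative sketch: a spill file for a sort or hash is typically |
| * obtained like this (the prefix string is arbitrary and the I/O calls are |
| * abbreviated): |
| * |
| * File tmp = OpenTemporaryFile(false, "sort"); |
| * |
| * ... FileWrite()/FileRead() on 'tmp' ... |
| * FileClose(tmp); -- closes, unlinks, and reports the file's size |
| */ |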
| |
| /* |
| * Return the path of the temp directory in a given tablespace. |
| */ |
| void |
| TempTablespacePath(char *path, Oid tablespace) |
| { |
| /* |
| * Identify the tempfile directory for this tablespace. |
| * |
| * If someone tries to specify pg_global, use pg_default instead. |
| */ |
| if (tablespace == InvalidOid || |
| tablespace == DEFAULTTABLESPACE_OID || |
| tablespace == GLOBALTABLESPACE_OID) |
| snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR); |
| else |
| { |
| /* All other tablespaces are accessed via symlinks */ |
| snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s", |
| tablespace, GP_TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); |
| } |
| } |
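| |
| /* |
| * Illustrative results of the above: the default tablespace yields |
| * "base/pgsql_tmp", while a user tablespace with (hypothetical) OID 16385 |
| * yields "pg_tblspc/16385/<GP_TABLESPACE_VERSION_DIRECTORY>/pgsql_tmp". |
| */ |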
| |
| /* |
| * Open a temporary file in a specific tablespace. |
| * Subroutine for OpenTemporaryFile, which see for details. |
| */ |
| static File |
| OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError, |
| const char *filename, bool makenameunique, bool create) |
| { |
| char tempdirpath[MAXPGPATH]; |
| char tempfilepath[MAXPGPATH]; |
| File file; |
| int flags; |
| |
| TempTablespacePath(tempdirpath, tblspcOid); |
| |
| /* |
| * Generate a tempfile name that should be unique within the current |
| * database instance. |
| */ |
| if (filename == NULL) |
| { |
| Assert (makenameunique); |
| filename = ""; |
| } |
| |
| if (makenameunique) |
| { |
| Assert(create); |
| snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%s%d.%ld", |
| tempdirpath, PG_TEMP_FILE_PREFIX, filename, MyProcPid, tempFileCounter++); |
| } |
| else |
| snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s_%s", |
| tempdirpath, PG_TEMP_FILE_PREFIX, filename); |
| |
| /* |
| * Open the file. Note: we don't use O_EXCL, in case there is an orphaned |
| * temp file that can be reused. |
| */ |
| flags = O_RDWR | PG_BINARY; |
| if (create) |
| flags |= O_CREAT | O_TRUNC; |
| file = PathNameOpenFile(tempfilepath, |
| flags); |
| if (file <= 0) |
| { |
| /* |
| * We might need to create the tablespace's tempfile directory, if no |
| * one has yet done so. |
| * |
| * Don't check for an error from MakePGDirectory; it could fail if |
| * someone else just did the same thing. If it doesn't work then |
| * we'll bomb out on the second create attempt, instead. |
| */ |
| (void) MakePGDirectory(tempdirpath); |
| |
| file = PathNameOpenFile(tempfilepath, |
| flags); |
| if (file <= 0 && rejectError) |
| { |
| if (create) |
| elog(ERROR, "could not create temporary file \"%s\": %m", |
| tempfilepath); |
| else |
| elog(ERROR, "could not open existing temporary file \"%s\": %m", |
| tempfilepath); |
| } |
| } |
| |
| return file; |
| } |
| |
| |
| /* |
| * Create a new file. The directory containing it must already exist. Files |
| * created this way are subject to temp_file_limit and are automatically |
| * closed at end of transaction, but are not automatically deleted on close |
| * because they are intended to be shared between cooperating backends. |
| * |
| * If the file is inside the top-level temporary directory, its name should |
| * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary |
| * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be |
| * inside a directory created with PathNameCreateTemporaryDir(), in which case |
| * the prefix isn't needed. |
| */ |
| File |
| PathNameCreateTemporaryFile(const char *path, bool error_on_failure) |
| { |
| File file; |
| |
| Assert(temporary_files_allowed); /* check temp file access is up */ |
| |
| ResourceOwnerEnlargeFiles(CurrentResourceOwner); |
| |
| /* |
| * Open the file. Note: we don't use O_EXCL, in case there is an orphaned |
| * temp file that can be reused. |
| */ |
| file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); |
| if (file <= 0) |
| { |
| if (error_on_failure) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not create temporary file \"%s\": %m", |
| path))); |
| else |
| return file; |
| } |
| |
| /* Mark it for temp_file_limit accounting. */ |
| VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT; |
| |
| /* Register it for automatic close. */ |
| RegisterTemporaryFile(file); |
| |
| return file; |
| } |
| |
| /* |
| * Open a file that was created with PathNameCreateTemporaryFile, possibly in |
| * another backend. Files opened this way don't count against the |
| * temp_file_limit of the caller, are automatically closed at the end of the |
| * transaction but are not deleted on close. |
| */ |
| File |
| PathNameOpenTemporaryFile(const char *path, int mode) |
| { |
| File file; |
| |
| Assert(temporary_files_allowed); /* check temp file access is up */ |
| |
| ResourceOwnerEnlargeFiles(CurrentResourceOwner); |
| |
| file = PathNameOpenFile(path, mode | PG_BINARY); |
| |
| /* If no such file, then we don't raise an error. */ |
| if (file <= 0 && errno != ENOENT) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not open temporary file \"%s\": %m", |
| path))); |
| |
| if (file > 0) |
| { |
| /* Register it for automatic close. */ |
| RegisterTemporaryFile(file); |
| } |
| |
| return file; |
| } |
| |
| /* |
| * Delete a file by pathname. Return true if the file existed, false if it |
| * didn't. |
| */ |
| bool |
| PathNameDeleteTemporaryFile(const char *path, bool error_on_failure) |
| { |
| struct stat filestats; |
| int stat_errno; |
| |
| /* Get the final size for pgstat reporting. */ |
| if (stat(path, &filestats) != 0) |
| stat_errno = errno; |
| else |
| stat_errno = 0; |
| |
| /* |
| * Unlike FileClose's automatic file deletion code, we tolerate |
| * non-existence to support BufFileDeleteFileSet, which doesn't know how |
| * many segments it has to delete until it runs out. |
| */ |
| if (stat_errno == ENOENT) |
| return false; |
| |
| if (unlink(path) < 0) |
| { |
| if (errno != ENOENT) |
| ereport(error_on_failure ? ERROR : LOG, |
| (errcode_for_file_access(), |
| errmsg("could not unlink temporary file \"%s\": %m", |
| path))); |
| return false; |
| } |
| |
| if (stat_errno == 0) |
| ReportTemporaryFileUsage(path, filestats.st_size); |
| else |
| { |
| errno = stat_errno; |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not stat file \"%s\": %m", path))); |
| } |
| |
| return true; |
| } |
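| |
| /* |
| * Illustrative lifecycle of a name-addressed temp file (the path, buffer |
| * and wait events are hypothetical; error handling omitted): |
| * |
| *		File	f; |
| * |
| *		f = PathNameCreateTemporaryFile("base/pgsql_tmp/pgsql_tmp_shared.0", true); |
| *		FileWrite(f, buf, len, 0, WAIT_EVENT_BUFFILE_WRITE); |
| *		FileClose(f);				(closed, but still present on disk) |
| * |
| *		later, possibly from another backend: |
| * |
| *		f = PathNameOpenTemporaryFile("base/pgsql_tmp/pgsql_tmp_shared.0", O_RDWR); |
| *		FileRead(f, buf, len, 0, WAIT_EVENT_BUFFILE_READ); |
| *		FileClose(f); |
| *		PathNameDeleteTemporaryFile("base/pgsql_tmp/pgsql_tmp_shared.0", false); |
| */ |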
| |
| /* |
| * close a file when done with it |
| */ |
| void |
| FileClose(File file) |
| { |
| Vfd *vfdP; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileClose: %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| vfdP = &VfdCache[file]; |
| |
| if (!FileIsNotOpen(file)) |
| { |
| /* close the file */ |
| if (gp_retry_close(vfdP->fd)) |
| { |
| /* |
| * We may need to panic on failure to close non-temporary files; |
| * see LruDelete. |
| */ |
| elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), |
| "could not close file \"%s\": %m", vfdP->fileName); |
| } |
| |
| --nfile; |
| vfdP->fd = VFD_CLOSED; |
| |
| /* remove the file from the lru ring */ |
| Delete(file); |
| } |
| |
| if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) |
| { |
| /* Subtract its size from current usage (do first in case of error) */ |
| temporary_files_size -= vfdP->fileSize; |
| vfdP->fileSize = 0; |
| } |
| |
| /* |
| * Delete the file if it was temporary, and make a log entry if wanted |
| */ |
| if (vfdP->fdstate & FD_DELETE_AT_CLOSE) |
| { |
| struct stat filestats; |
| int stat_errno; |
| |
| /* |
| * If we get an error, as could happen within the ereport/elog calls, |
| * we'll come right back here during transaction abort. Reset the |
| * flag to ensure that we can't get into an infinite loop. This code |
| * is arranged to ensure that the worst-case consequence is failing to |
| * emit log message(s), not failing to attempt the unlink. |
| */ |
| vfdP->fdstate &= ~FD_DELETE_AT_CLOSE; |
| |
| /* first try the stat() */ |
| if (stat(vfdP->fileName, &filestats)) |
| stat_errno = errno; |
| else |
| stat_errno = 0; |
| |
| /* in any case do the unlink */ |
| if (unlink(vfdP->fileName)) |
| ereport(DEBUG1, |
| (errcode_for_file_access(), |
| errmsg("could not delete file \"%s\": %m", vfdP->fileName))); |
| |
| /* and last report the stat results */ |
| if (stat_errno == 0) |
| ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size); |
| else |
| { |
| errno = stat_errno; |
| ereport(DEBUG1, |
| (errcode_for_file_access(), |
| errmsg("could not stat file \"%s\": %m", vfdP->fileName))); |
| } |
| } |
| |
| /* Unregister it from the resource owner */ |
| if (vfdP->resowner) |
| ResourceOwnerForgetFile(vfdP->resowner, file); |
| |
| /* Unregister it from the workfile set */ |
| if (vfdP->fdstate & FD_WORKFILE) |
| WorkFileDeleted(file, true); |
| |
| /* |
| * Return the Vfd slot to the free list |
| */ |
| FreeVfd(file); |
| } |
| |
| /* |
| * FilePrefetch - initiate asynchronous read of a given range of the file. |
| * |
| * Currently the only implementation of this function uses posix_fadvise(), |
| * which is the simplest standardized interface that accomplishes this. |
| * We could add an implementation using libaio in the future; but note that |
| * this API is inappropriate for libaio, which wants to have a buffer provided |
| * to read into. |
| */ |
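| /* |
| * Illustrative call (the block number and wait event are hypothetical): |
| * |
| *		FilePrefetch(file, (off_t) blocknum * BLCKSZ, BLCKSZ, |
| *					 WAIT_EVENT_DATA_FILE_PREFETCH); |
| */ |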
| int |
| FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) |
| { |
| #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED) |
| int returnCode; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT, |
| file, VfdCache[file].fileName, |
| (int64) offset, (int64) amount)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| retry: |
| pgstat_report_wait_start(wait_event_info); |
| returnCode = posix_fadvise(VfdCache[file].fd, offset, amount, |
| POSIX_FADV_WILLNEED); |
| pgstat_report_wait_end(); |
| |
| if (returnCode == EINTR) |
| goto retry; |
| |
| return returnCode; |
| #else |
| Assert(FileIsValid(file)); |
| return 0; |
| #endif |
| } |
| |
| void |
| FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) |
| { |
| int returnCode; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT, |
| file, VfdCache[file].fileName, |
| (int64) offset, (int64) nbytes)); |
| |
| if (nbytes <= 0) |
| return; |
| |
| if (VfdCache[file].fileFlags & PG_O_DIRECT) |
| return; |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return; |
| |
| pgstat_report_wait_start(wait_event_info); |
| pg_flush_data(VfdCache[file].fd, offset, nbytes); |
| pgstat_report_wait_end(); |
| } |
| |
| int |
| FileRead(File file, void *buffer, size_t amount, off_t offset, |
| uint32 wait_event_info) |
| { |
| int returnCode; |
| Vfd *vfdP; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p", |
| file, VfdCache[file].fileName, |
| (int64) offset, |
| amount, buffer)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| vfdP = &VfdCache[file]; |
| |
| retry: |
| pgstat_report_wait_start(wait_event_info); |
| returnCode = pg_pread(vfdP->fd, buffer, amount, offset); |
| pgstat_report_wait_end(); |
| |
| if (returnCode < 0) |
| { |
| /* |
| * Windows may run out of kernel buffers and return "Insufficient |
| * system resources" error. Wait a bit and retry to solve it. |
| * |
| * It is rumored that EINTR is also possible on some Unix filesystems, |
| * in which case immediate retry is indicated. |
| */ |
| #ifdef WIN32 |
| DWORD error = GetLastError(); |
| |
| switch (error) |
| { |
| case ERROR_NO_SYSTEM_RESOURCES: |
| pg_usleep(1000L); |
| errno = EINTR; |
| break; |
| default: |
| _dosmaperr(error); |
| break; |
| } |
| #endif |
| /* OK to retry if interrupted */ |
| if (errno == EINTR) |
| goto retry; |
| } |
| |
| return returnCode; |
| } |
| |
| int |
| FileWrite(File file, const void *buffer, size_t amount, off_t offset, |
| uint32 wait_event_info) |
| { |
| int returnCode; |
| Vfd *vfdP; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p", |
| file, VfdCache[file].fileName, |
| (int64) offset, |
| amount, buffer)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| vfdP = &VfdCache[file]; |
| |
| /* |
| * If enforcing temp_file_limit and it's a temp file, check to see if the |
| * write would overrun temp_file_limit, and throw error if so. Note: it's |
| * really a modularity violation to throw error here; we should set errno |
| * and return -1. However, there's no way to report a suitable error |
| * message if we do that. All current callers would just throw error |
| * immediately anyway, so this is safe at present. |
| */ |
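| /* |
| * For example (illustrative numbers): with temp_file_limit = 1024, i.e. |
| * 1024 kB, this backend's total temporary file usage may not exceed |
| * 1024 * 1024 bytes; a write that would push the total past that raises |
| * the error below. |
| */ |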
| if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) |
| { |
| off_t past_write = offset + amount; |
| |
| if (past_write > vfdP->fileSize) |
| { |
| uint64 newTotal = temporary_files_size; |
| |
| newTotal += past_write - vfdP->fileSize; |
| if (newTotal > (uint64) temp_file_limit * (uint64) 1024) |
| ereport(ERROR, |
| (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), |
| errmsg("temporary file size exceeds temp_file_limit (%dkB)", |
| temp_file_limit))); |
| } |
| } |
| |
| /* |
| * Also update the stats in the workfile manager. This might also |
| * throw an error, if we're over the limits. |
| * |
| * Because we update the stats in the workfile manager first, if the write |
| * fails, the workfile manager's status will be out of sync with reality. |
| * That's OK: the inaccuracy doesn't accumulate, and it doesn't need to be |
| * totally accurate. |
| */ |
| if ((VfdCache[file].fdstate & FD_WORKFILE) != 0) |
| { |
| off_t newPos = offset + amount; |
| |
| if (newPos > VfdCache[file].fileSize) |
| UpdateWorkFileSize(file, newPos); |
| } |
| |
| retry: |
| errno = 0; |
| pgstat_report_wait_start(wait_event_info); |
| returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset); |
| pgstat_report_wait_end(); |
| |
| /* if write didn't set errno, assume problem is no disk space */ |
| if (returnCode != amount && errno == 0) |
| errno = ENOSPC; |
| |
| if (returnCode >= 0) |
| { |
| /* |
| * Maintain fileSize and temporary_files_size if it's a temp file. |
| */ |
| if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) |
| { |
| off_t past_write = offset + amount; |
| |
| if (past_write > vfdP->fileSize) |
| { |
| temporary_files_size += past_write - vfdP->fileSize; |
| vfdP->fileSize = past_write; |
| } |
| } |
| } |
| else |
| { |
| /* |
| * See comments in FileRead() |
| */ |
| #ifdef WIN32 |
| DWORD error = GetLastError(); |
| |
| switch (error) |
| { |
| case ERROR_NO_SYSTEM_RESOURCES: |
| pg_usleep(1000L); |
| errno = EINTR; |
| break; |
| default: |
| _dosmaperr(error); |
| break; |
| } |
| #endif |
| /* OK to retry if interrupted */ |
| if (errno == EINTR) |
| goto retry; |
| } |
| |
| return returnCode; |
| } |
| |
| int |
| FileSync(File file, uint32 wait_event_info) |
| { |
| int returnCode; |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileSync: %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| pgstat_report_wait_start(wait_event_info); |
| returnCode = pg_fsync(VfdCache[file].fd); |
| pgstat_report_wait_end(); |
| |
| return returnCode; |
| } |
| |
| /* |
| * Get the size of a physical file by using fstat() |
| * |
| * Returns size in bytes if successful, < 0 otherwise |
| */ |
| int64 |
| FileDiskSize(File file) |
| { |
| int returnCode = 0; |
| struct stat buf; |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| returnCode = fstat(VfdCache[file].fd, &buf); |
| if (returnCode < 0) |
| return returnCode; |
| |
| return (int64) buf.st_size; |
| } |
| |
| /* |
| * Zero a region of the file. |
| * |
| * Returns 0 on success, -1 otherwise. In the latter case errno is set to the |
| * appropriate error. |
| */ |
| int |
| FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) |
| { |
| int returnCode; |
| ssize_t written; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT, |
| file, VfdCache[file].fileName, |
| (int64) offset, (int64) amount)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| pgstat_report_wait_start(wait_event_info); |
| written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset); |
| pgstat_report_wait_end(); |
| |
| if (written < 0) |
| return -1; |
| else if (written != amount) |
| { |
| /* if errno is unset, assume problem is no disk space */ |
| if (errno == 0) |
| errno = ENOSPC; |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Try to reserve file space with posix_fallocate(). If posix_fallocate() is |
| * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP, |
| * use FileZero() instead. |
| * |
| * Note that at least glibc() implements posix_fallocate() in userspace if not |
| * implemented by the filesystem. That's not the case for all environments |
| * though. |
| * |
| * Returns 0 on success, -1 otherwise. In the latter case errno is set to the |
| * appropriate error. |
| */ |
| int |
| FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) |
| { |
| #ifdef HAVE_POSIX_FALLOCATE |
| int returnCode; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT, |
| file, VfdCache[file].fileName, |
| (int64) offset, (int64) amount)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return -1; |
| |
| retry: |
| pgstat_report_wait_start(wait_event_info); |
| returnCode = posix_fallocate(VfdCache[file].fd, offset, amount); |
| pgstat_report_wait_end(); |
| |
| if (returnCode == 0) |
| return 0; |
| else if (returnCode == EINTR) |
| goto retry; |
| |
| /* for compatibility with %m printing etc */ |
| errno = returnCode; |
| |
| /* |
| * Return in case of a "real" failure; if fallocate is not supported, |
| * fall through to the FileZero() backed implementation. |
| */ |
| if (returnCode != EINVAL && returnCode != EOPNOTSUPP) |
| return -1; |
| #endif |
| |
| return FileZero(file, offset, amount, wait_event_info); |
| } |
| |
| off_t |
| FileSize(File file) |
| { |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileSize %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| if (FileIsNotOpen(file)) |
| { |
| if (FileAccess(file) < 0) |
| return (off_t) -1; |
| } |
| |
| return lseek(VfdCache[file].fd, 0, SEEK_END); |
| } |
| |
| int |
| FileTruncate(File file, int64 offset, uint32 wait_event_info) |
| { |
| int returnCode; |
| |
| Assert(FileIsValid(file)); |
| |
| DO_DB(elog(LOG, "FileTruncate %d (%s)", |
| file, VfdCache[file].fileName)); |
| |
| returnCode = FileAccess(file); |
| if (returnCode < 0) |
| return returnCode; |
| |
| /* |
| * Call ftruncate with an int64 value. |
| * |
| * WARNING: DO NOT typecast this down to a 32-bit long, or append-only |
| * vacuum full adjustment of the EOF will erroneously remove table data. |
| */ |
| pgstat_report_wait_start(wait_event_info); |
| returnCode = pg_ftruncate(VfdCache[file].fd, offset); |
| pgstat_report_wait_end(); |
| |
| if (returnCode == 0 && VfdCache[file].fileSize > offset) |
| { |
| /* adjust our state for truncation of a temp file */ |
| Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); |
| temporary_files_size -= VfdCache[file].fileSize - offset; |
| VfdCache[file].fileSize = offset; |
| } |
| |
| return returnCode; |
| } |
| |
| /* |
| * Return the pathname associated with an open file. |
| * |
| * The returned string points to an internal buffer, which is valid until |
| * the file is closed. |
| */ |
| char * |
| FilePathName(File file) |
| { |
| Assert(FileIsValid(file)); |
| |
| return VfdCache[file].fileName; |
| } |
| |
| /* |
| * Return the raw file descriptor of an opened file. |
| * |
| * The returned file descriptor will be valid until the file is closed, but |
| * there are a lot of things that can make that happen. So the caller should |
| * be careful not to do much of anything else before it finishes using the |
| * returned file descriptor. |
| */ |
| int |
| FileGetRawDesc(File file) |
| { |
| Assert(FileIsValid(file)); |
| return VfdCache[file].fd; |
| } |
| |
| /* |
| * FileGetRawFlags - returns the file flags on open(2) |
| */ |
| int |
| FileGetRawFlags(File file) |
| { |
| Assert(FileIsValid(file)); |
| return VfdCache[file].fileFlags; |
| } |
| |
| /* |
| * FileGetRawMode - returns the mode bitmask passed to open(2) |
| */ |
| mode_t |
| FileGetRawMode(File file) |
| { |
| Assert(FileIsValid(file)); |
| return VfdCache[file].fileMode; |
| } |
| |
| /* |
| * Make room for another allocatedDescs[] array entry if needed and possible. |
| * Returns true if an array element is available. |
| */ |
| static bool |
| reserveAllocatedDesc(void) |
| { |
| AllocateDesc *newDescs; |
| int newMax; |
| |
| /* Quick out if array already has a free slot. */ |
| if (numAllocatedDescs < maxAllocatedDescs) |
| return true; |
| |
| /* |
| * If the array hasn't yet been created in the current process, initialize |
| * it with FD_MINFREE / 3 elements. In many scenarios this is as many as |
| * we will ever need, anyway. We don't want to look at max_safe_fds |
| * immediately because set_max_safe_fds() may not have run yet. |
| */ |
| if (allocatedDescs == NULL) |
| { |
| newMax = FD_MINFREE / 3; |
| newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc)); |
| /* Out of memory already? Treat as fatal error. */ |
| if (newDescs == NULL) |
| ereport(ERROR, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| allocatedDescs = newDescs; |
| maxAllocatedDescs = newMax; |
| return true; |
| } |
| |
| /* |
| * Consider enlarging the array beyond the initial allocation used above. |
| * By the time this happens, max_safe_fds should be known accurately. |
| * |
| * We mustn't let allocated descriptors hog all the available FDs, and in |
| * practice we'd better leave a reasonable number of FDs for VFD use. So |
| * set the maximum to max_safe_fds / 3. (This should certainly be at |
| * least as large as the initial size, FD_MINFREE / 3, so we aren't |
| * tightening the restriction here.) Recall that "external" FDs are |
| * allowed to consume another third of max_safe_fds. |
| */ |
| newMax = max_safe_fds / 3; |
| if (newMax > maxAllocatedDescs) |
| { |
| newDescs = (AllocateDesc *) realloc(allocatedDescs, |
| newMax * sizeof(AllocateDesc)); |
| /* Treat out-of-memory as a non-fatal error. */ |
| if (newDescs == NULL) |
| return false; |
| allocatedDescs = newDescs; |
| maxAllocatedDescs = newMax; |
| return true; |
| } |
| |
| /* Can't enlarge allocatedDescs[] any more. */ |
| return false; |
| } |
| |
| /* |
| * Routines that want to use stdio (ie, FILE*) should use AllocateFile |
| * rather than plain fopen(). This lets fd.c deal with freeing FDs if |
| * necessary to open the file. When done, call FreeFile rather than fclose. |
| * |
| * Note that files that will be open for any significant length of time |
| * should NOT be handled this way, since they cannot share kernel file |
| * descriptors with other files; there is grave risk of running out of FDs |
| * if anyone locks down too many FDs. Most callers of this routine are |
| * simply reading a config file that they will read and close immediately. |
| * |
| * fd.c will automatically close all files opened with AllocateFile at |
| * transaction commit or abort; this prevents FD leakage if a routine |
| * that calls AllocateFile is terminated prematurely by ereport(ERROR). |
| * |
| * Ideally this should be the *only* direct call of fopen() in the backend. |
| */ |
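| /* |
| * Illustrative usage (the file name is hypothetical; error handling |
| * omitted): |
| * |
| *		FILE   *fp = AllocateFile("my_config_file.conf", "r"); |
| * |
| *		... read the contents with fgets(), etc ... |
| * |
| *		FreeFile(fp); |
| */ |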
| FILE * |
| AllocateFile(const char *name, const char *mode) |
| { |
| FILE *file; |
| |
| DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)", |
| numAllocatedDescs, name)); |
| |
| /* Can we allocate another non-virtual FD? */ |
| if (!reserveAllocatedDesc()) |
| ereport(ERROR, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", |
| maxAllocatedDescs, name))); |
| |
| /* Close excess kernel FDs. */ |
| ReleaseLruFiles(); |
| |
| TryAgain: |
| if ((file = fopen(name, mode)) != NULL) |
| { |
| AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; |
| |
| desc->kind = AllocateDescFile; |
| desc->desc.file = file; |
| desc->create_subid = GetCurrentSubTransactionId(); |
| numAllocatedDescs++; |
| return desc->desc.file; |
| } |
| |
| if (errno == EMFILE || errno == ENFILE) |
| { |
| int save_errno = errno; |
| |
| ereport(LOG, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("out of file descriptors: %m; release and retry"))); |
| errno = 0; |
| if (ReleaseLruFile()) |
| goto TryAgain; |
| errno = save_errno; |
| } |
| |
| /* |
| * TEMPORARY hack to log the Windows error code on fopen failures, in |
| * hopes of diagnosing some hard-to-reproduce problems. |
| */ |
| #ifdef WIN32 |
| { |
| int save_errno = errno; |
| |
| elog(LOG, "Windows fopen(\"%s\",\"%s\") failed: code %lu, errno %d", |
| name, mode, GetLastError(), save_errno); |
| errno = save_errno; |
| } |
| #endif |
| |
| return NULL; |
| } |
| |
| /* |
| * Open a file with OpenTransientFilePerm() and pass default file mode for |
| * the fileMode parameter. |
| */ |
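| /* |
| * A minimal usage sketch (the path is hypothetical; error handling |
| * omitted): |
| * |
| *		int		fd = OpenTransientFile("some_state_file", O_RDONLY | PG_BINARY); |
| * |
| *		... read(fd, ...) ... |
| * |
| *		CloseTransientFile(fd); |
| */ |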
| int |
| OpenTransientFile(const char *fileName, int fileFlags) |
| { |
| return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode); |
| } |
| |
| /* |
| * Like AllocateFile, but returns an unbuffered fd like open(2) |
| */ |
| int |
| OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode) |
| { |
| int fd; |
| |
| DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)", |
| numAllocatedDescs, fileName)); |
| |
| /* Can we allocate another non-virtual FD? */ |
| if (!reserveAllocatedDesc()) |
| ereport(ERROR, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", |
| maxAllocatedDescs, fileName))); |
| |
| /* Close excess kernel FDs. */ |
| ReleaseLruFiles(); |
| |
| fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); |
| |
| if (fd >= 0) |
| { |
| AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; |
| |
| desc->kind = AllocateDescRawFD; |
| desc->desc.fd = fd; |
| desc->create_subid = GetCurrentSubTransactionId(); |
| numAllocatedDescs++; |
| |
| return fd; |
| } |
| |
| return -1; /* failure */ |
| } |
| |
| /* |
| * Routines that want to initiate a pipe stream should use OpenPipeStream |
| * rather than plain popen(). This lets fd.c deal with freeing FDs if |
| * necessary. When done, call ClosePipeStream rather than pclose. |
| * |
| * This function also ensures that the popen'd program is run with default |
| * SIGPIPE processing, rather than the SIG_IGN setting the backend normally |
| * uses. This ensures desirable response to, eg, closing a read pipe early. |
| */ |
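| /* |
| * Illustrative usage (the command is hypothetical; error handling |
| * omitted): |
| * |
| *		FILE   *p = OpenPipeStream("gzip > /tmp/archive.gz", "w"); |
| * |
| *		... fwrite() data to p ... |
| * |
| *		ClosePipeStream(p); |
| */ |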
| FILE * |
| OpenPipeStream(const char *command, const char *mode) |
| { |
| FILE *file; |
| int save_errno; |
| |
| DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)", |
| numAllocatedDescs, command)); |
| |
| /* Can we allocate another non-virtual FD? */ |
| if (!reserveAllocatedDesc()) |
| ereport(ERROR, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"", |
| maxAllocatedDescs, command))); |
| |
| /* Close excess kernel FDs. */ |
| ReleaseLruFiles(); |
| |
| TryAgain: |
| fflush(NULL); |
| pqsignal(SIGPIPE, SIG_DFL); |
| errno = 0; |
| file = popen(command, mode); |
| save_errno = errno; |
| pqsignal(SIGPIPE, SIG_IGN); |
| errno = save_errno; |
| if (file != NULL) |
| { |
| AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; |
| |
| desc->kind = AllocateDescPipe; |
| desc->desc.file = file; |
| desc->create_subid = GetCurrentSubTransactionId(); |
| numAllocatedDescs++; |
| return desc->desc.file; |
| } |
| |
| if (errno == EMFILE || errno == ENFILE) |
| { |
| ereport(LOG, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("out of file descriptors: %m; release and retry"))); |
| if (ReleaseLruFile()) |
| goto TryAgain; |
| errno = save_errno; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * Free an AllocateDesc of any type. |
| * |
| * The argument *must* point into the allocatedDescs[] array. |
| */ |
| static int |
| FreeDesc(AllocateDesc *desc) |
| { |
| int result; |
| |
| /* Close the underlying object */ |
| switch (desc->kind) |
| { |
| case AllocateDescFile: |
| result = fclose(desc->desc.file); |
| break; |
| case AllocateDescPipe: |
| result = pclose(desc->desc.file); |
| break; |
| case AllocateDescDir: |
| result = closedir(desc->desc.dir); |
| break; |
| case AllocateDescRawFD: |
| result = close(desc->desc.fd); |
| break; |
| default: |
| elog(ERROR, "AllocateDesc kind not recognized"); |
| result = 0; /* keep compiler quiet */ |
| break; |
| } |
| |
| /* Compact storage in the allocatedDescs array */ |
| numAllocatedDescs--; |
| *desc = allocatedDescs[numAllocatedDescs]; |
| |
| return result; |
| } |
| |
| /* |
| * Close a file returned by AllocateFile. |
| * |
| * Note we do not check fclose's return value --- it is up to the caller |
| * to handle close errors. |
| */ |
| int |
| FreeFile(FILE *file) |
| { |
| int i; |
| |
| DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs)); |
| |
| /* Remove file from list of allocated files, if it's present */ |
| for (i = numAllocatedDescs; --i >= 0;) |
| { |
| AllocateDesc *desc = &allocatedDescs[i]; |
| |
| if (desc->kind == AllocateDescFile && desc->desc.file == file) |
| return FreeDesc(desc); |
| } |
| |
| /* Only get here if someone passes us a file not in allocatedDescs */ |
| elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile"); |
| Assert(false); |
| |
| return fclose(file); |
| } |
| |
| /* |
| * Close a file returned by OpenTransientFile. |
| * |
| * Note we do not check close's return value --- it is up to the caller |
| * to handle close errors. |
| */ |
| int |
| CloseTransientFile(int fd) |
| { |
| int i; |
| |
| DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs)); |
| |
| /* Remove fd from list of allocated files, if it's present */ |
| for (i = numAllocatedDescs; --i >= 0;) |
| { |
| AllocateDesc *desc = &allocatedDescs[i]; |
| |
| if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd) |
| return FreeDesc(desc); |
| } |
| |
| /* Only get here if someone passes us a file not in allocatedDescs */ |
| elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile"); |
| |
| return close(fd); |
| } |
| |
| /* |
| * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir |
| * rather than plain opendir(). This lets fd.c deal with freeing FDs if |
| * necessary to open the directory, and with closing it after an elog. |
| * When done, call FreeDir rather than closedir. |
| * |
| * Returns NULL, with errno set, on failure. Note that failure detection |
| * is commonly left to the following call of ReadDir or ReadDirExtended; |
| * see the comments for ReadDir. |
| * |
| * Ideally this should be the *only* direct call of opendir() in the backend. |
| */ |
| DIR * |
| AllocateDir(const char *dirname) |
| { |
| DIR *dir; |
| |
| DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)", |
| numAllocatedDescs, dirname)); |
| |
| /* Can we allocate another non-virtual FD? */ |
| if (!reserveAllocatedDesc()) |
| ereport(ERROR, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"", |
| maxAllocatedDescs, dirname))); |
| |
| /* Close excess kernel FDs. */ |
| ReleaseLruFiles(); |
| |
| TryAgain: |
| if ((dir = opendir(dirname)) != NULL) |
| { |
| AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; |
| |
| desc->kind = AllocateDescDir; |
| desc->desc.dir = dir; |
| desc->create_subid = GetCurrentSubTransactionId(); |
| numAllocatedDescs++; |
| return desc->desc.dir; |
| } |
| |
| if (errno == EMFILE || errno == ENFILE) |
| { |
| int save_errno = errno; |
| |
| ereport(LOG, |
| (errcode(ERRCODE_INSUFFICIENT_RESOURCES), |
| errmsg("out of file descriptors: %m; release and retry"))); |
| errno = 0; |
| if (ReleaseLruFile()) |
| goto TryAgain; |
| errno = save_errno; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * Read a directory opened with AllocateDir, ereport'ing any error. |
| * |
| * This is easier to use than raw readdir() since it takes care of some |
| * otherwise rather tedious and error-prone manipulation of errno. Also, |
| * if you are happy with a generic error message for AllocateDir failure, |
| * you can just do |
| * |
| * dir = AllocateDir(path); |
| * while ((dirent = ReadDir(dir, path)) != NULL) |
| * process dirent; |
| * FreeDir(dir); |
| * |
| * since a NULL dir parameter is taken as indicating AllocateDir failed. |
| * (Make sure errno isn't changed between AllocateDir and ReadDir if you |
| * use this shortcut.) |
| * |
| * The pathname passed to AllocateDir must be passed to this routine too, |
| * but it is only used for error reporting. |
| */ |
| struct dirent * |
| ReadDir(DIR *dir, const char *dirname) |
| { |
| return ReadDirExtended(dir, dirname, ERROR); |
| } |
| |
| /* |
| * Alternate version of ReadDir that allows caller to specify the elevel |
| * for any error report (whether it's reporting an initial failure of |
| * AllocateDir or a subsequent directory read failure). |
| * |
| * If elevel < ERROR, returns NULL after any error. With the normal coding |
| * pattern, this will result in falling out of the loop immediately as |
| * though the directory contained no (more) entries. |
| */ |
| struct dirent * |
| ReadDirExtended(DIR *dir, const char *dirname, int elevel) |
| { |
| struct dirent *dent; |
| |
| /* Give a generic message for AllocateDir failure, if caller didn't */ |
| if (dir == NULL) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not open directory \"%s\": %m", |
| dirname))); |
| return NULL; |
| } |
| |
| errno = 0; |
| if ((dent = readdir(dir)) != NULL) |
| return dent; |
| |
| if (errno) |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not read directory \"%s\": %m", |
| dirname))); |
| return NULL; |
| } |
| |
| /* |
| * Close a directory opened with AllocateDir. |
| * |
| * Returns closedir's return value (with errno set if it's not 0). |
| * Note we do not check the return value --- it is up to the caller |
| * to handle close errors if wanted. |
| * |
| * Does nothing if dir == NULL; we assume that directory open failure was |
| * already reported if desired. |
| */ |
| int |
| FreeDir(DIR *dir) |
| { |
| int i; |
| |
| /* Nothing to do if AllocateDir failed */ |
| if (dir == NULL) |
| return 0; |
| |
| DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs)); |
| |
| /* Remove dir from list of allocated dirs, if it's present */ |
| for (i = numAllocatedDescs; --i >= 0;) |
| { |
| AllocateDesc *desc = &allocatedDescs[i]; |
| |
| if (desc->kind == AllocateDescDir && desc->desc.dir == dir) |
| return FreeDesc(desc); |
| } |
| |
| /* Only get here if someone passes us a dir not in allocatedDescs */ |
| elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir"); |
| Assert(false); |
| |
| return closedir(dir); |
| } |
| |
| |
| /* |
| * Close a pipe stream returned by OpenPipeStream. |
| */ |
| int |
| ClosePipeStream(FILE *file) |
| { |
| int i; |
| |
| DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs)); |
| |
| /* Remove file from list of allocated files, if it's present */ |
| for (i = numAllocatedDescs; --i >= 0;) |
| { |
| AllocateDesc *desc = &allocatedDescs[i]; |
| |
| if (desc->kind == AllocateDescPipe && desc->desc.file == file) |
| return FreeDesc(desc); |
| } |
| |
| /* Only get here if someone passes us a file not in allocatedDescs */ |
| elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream"); |
| |
| return pclose(file); |
| } |
| |
| /* |
| * closeAllVfds |
| * |
| * Force all VFDs into the physically-closed state, so that the fewest |
| * possible number of kernel file descriptors are in use. There is no |
| * change in the logical state of the VFDs. |
| */ |
| void |
| closeAllVfds(void) |
| { |
| Index i; |
| |
| if (SizeVfdCache > 0) |
| { |
| Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ |
| for (i = 1; i < SizeVfdCache; i++) |
| { |
| if (!FileIsNotOpen(i)) |
| LruDelete(i); |
| } |
| } |
| } |
| |
| |
| /* |
| * SetTempTablespaces |
| * |
| * Define a list (actually an array) of OIDs of tablespaces to use for |
| * temporary files. This list will be used until end of transaction, |
| * unless this function is called again before then. It is caller's |
| * responsibility that the passed-in array has adequate lifespan (typically |
| * it'd be allocated in TopTransactionContext). |
| * |
| * Some entries of the array may be InvalidOid, indicating that the current |
| * database's default tablespace should be used. |
| */ |
| void |
| SetTempTablespaces(Oid *tableSpaces, int numSpaces) |
| { |
| Assert(numSpaces >= 0); |
| tempTableSpaces = tableSpaces; |
| numTempTableSpaces = numSpaces; |
| |
| /* |
| * Select a random starting point in the list. This is to minimize |
| * conflicts between backends that are most likely sharing the same list |
| * of temp tablespaces. Note that if we create multiple temp files in the |
| * same transaction, we'll advance circularly through the list --- this |
| * ensures that large temporary sort files are nicely spread across all |
| * available tablespaces. |
| */ |
| if (numSpaces > 1) |
| nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state, |
| 0, numSpaces - 1); |
| else |
| nextTempTableSpace = 0; |
| } |
| |
| /* |
| * TempTablespacesAreSet |
| * |
| * Returns true if SetTempTablespaces has been called in current transaction. |
| * (This is just so that tablespaces.c doesn't need its own per-transaction |
| * state.) |
| */ |
| bool |
| TempTablespacesAreSet(void) |
| { |
| return (numTempTableSpaces >= 0); |
| } |
| |
| /* |
| * GetTempTablespaces |
| * |
| * Populate an array with the OIDs of the tablespaces that should be used for |
| * temporary files. (Some entries may be InvalidOid, indicating that the |
| * current database's default tablespace should be used.) At most numSpaces |
| * entries will be filled. |
| * Returns the number of OIDs that were copied into the output array. |
| */ |
| int |
| GetTempTablespaces(Oid *tableSpaces, int numSpaces) |
| { |
| int i; |
| |
| /* |
| * GPDB: This function is called only by SharedFileSetInit(), in which |
| * we call PrepareTempTablespaces() just before this function. In upstream |
| * Postgres, we would only go through this code path inside a transaction. |
| * However, in GPDB, SharedFileSetInit() may also get called in the process |
| * of ExecSquelchShareInputScan(), which could happen during transaction |
| * abort. If we are not in a transaction, PrepareTempTablespaces() would |
| * have to return early without setting the temp tablespaces. The shared |
| * fileset in this case will be written to the default tablespace rather |
| * than the temp tablespaces. |
| */ |
| Assert(TempTablespacesAreSet() || IsAbortInProgress()); |
| for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i) |
| tableSpaces[i] = tempTableSpaces[i]; |
| |
| return i; |
| } |
| |
| /* |
| * GetNextTempTableSpace |
| * |
| * Select the next temp tablespace to use. A result of InvalidOid means |
| * to use the current database's default tablespace. |
| */ |
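| /* |
| * For example (illustrative): with tempTableSpaces = {A, B, C} and |
| * nextTempTableSpace currently 1, successive calls return C, A, B, C, ... |
| */ |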
| Oid |
| GetNextTempTableSpace(void) |
| { |
| if (numTempTableSpaces > 0) |
| { |
| /* Advance nextTempTableSpace counter with wraparound */ |
| if (++nextTempTableSpace >= numTempTableSpaces) |
| nextTempTableSpace = 0; |
| return tempTableSpaces[nextTempTableSpace]; |
| } |
| return InvalidOid; |
| } |
| |
| |
| /* |
| * AtEOSubXact_Files |
| * |
| * Take care of subtransaction commit/abort. At abort, we close temp files |
| * that the subtransaction may have opened. At commit, we reassign the |
| * files that were opened to the parent subtransaction. |
| */ |
| void |
| AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, |
| SubTransactionId parentSubid) |
| { |
| Index i; |
| |
| for (i = 0; i < numAllocatedDescs; i++) |
| { |
| if (allocatedDescs[i].create_subid == mySubid) |
| { |
| if (isCommit) |
| allocatedDescs[i].create_subid = parentSubid; |
| else |
| { |
| /* have to recheck the item after FreeDesc (ugly) */ |
| FreeDesc(&allocatedDescs[i--]); |
| } |
| } |
| } |
| } |
| |
| /* |
| * AtEOXact_Files |
| * |
| * This routine is called during transaction commit or abort. All still-open |
| * per-transaction temporary file VFDs are closed, which also causes the |
| * underlying files to be deleted (although they should've been closed already |
| * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are |
| * closed. We also forget any transaction-local temp tablespace list. |
| * |
| * The isCommit flag is used only to decide whether to emit warnings about |
| * unclosed files. |
| */ |
| void |
| AtEOXact_Files(bool isCommit) |
| { |
| CleanupTempFiles(isCommit, false); |
| tempTableSpaces = NULL; |
| numTempTableSpaces = -1; |
| } |
| |
| /* |
| * BeforeShmemExit_Files |
| * |
| * before_shmem_exit hook to clean up temp files during backend shutdown. |
| * Here, we want to clean up *all* temp files including interXact ones. |
| */ |
| static void |
| BeforeShmemExit_Files(int code, Datum arg) |
| { |
| CleanupTempFiles(false, true); |
| |
| /* prevent further temp files from being created */ |
| #ifdef USE_ASSERT_CHECKING |
| temporary_files_allowed = false; |
| #endif |
| } |
| |
| /* |
| * Close temporary files and delete their underlying files. |
| * |
| * isCommit: if true, this is normal transaction commit, and we don't |
| * expect any remaining files; warn if there are some. |
| * |
| * isProcExit: if true, this is being called as the backend process is |
| * exiting. If that's the case, we should remove all temporary files; if |
| * that's not the case, we are being called for transaction commit/abort |
| * and should only remove transaction-local temp files. In either case, |
| * also clean up "allocated" stdio files, dirs and fds. |
| */ |
| static void |
| CleanupTempFiles(bool isCommit, bool isProcExit) |
| { |
| Index i; |
| |
| /* |
| * Careful here: at proc_exit we need extra cleanup, not just |
| * xact_temporary files. |
| */ |
| if (isProcExit || have_xact_temporary_files) |
| { |
| Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ |
| for (i = 1; i < SizeVfdCache; i++) |
| { |
| unsigned short fdstate = VfdCache[i].fdstate; |
| |
| if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) && |
| VfdCache[i].fileName != NULL) |
| { |
| /* |
| * If we're in the process of exiting a backend process, close |
| * all temporary files. Otherwise, only close temporary files |
| * local to the current transaction. They should be closed by |
| * the ResourceOwner mechanism already, so this is just a |
| * debugging cross-check. |
| */ |
| if (isProcExit) |
| FileClose(i); |
| else if (fdstate & FD_CLOSE_AT_EOXACT) |
| { |
| elog(WARNING, |
| "temporary file %s not closed at end-of-transaction", |
| VfdCache[i].fileName); |
| FileClose(i); |
| } |
| } |
| } |
| |
| have_xact_temporary_files = false; |
| } |
| |
| /* Complain if any allocated files remain open at commit. */ |
| if (isCommit && numAllocatedDescs > 0) |
| elog(WARNING, "%d temporary files and directories not closed at end-of-transaction", |
| numAllocatedDescs); |
| |
| /* Clean up "allocated" stdio files, dirs and fds. */ |
| while (numAllocatedDescs > 0) |
| FreeDesc(&allocatedDescs[0]); |
| } |
| |
| |
| /* |
| * Remove temporary and temporary relation files left over from a prior |
| * postmaster session |
| * |
| * This should be called during postmaster startup. It will forcibly |
| * remove any leftover files created by OpenTemporaryFile and any leftover |
| * temporary relation files created by mdcreate. |
| * |
| * During post-backend-crash restart cycle, this routine is called when |
| * remove_temp_files_after_crash GUC is enabled. Multiple crashes while |
| * queries are using temp files could result in useless storage usage that can |
| * only be reclaimed by a service restart. The argument against enabling it is |
| * that someone might want to examine the temporary files for debugging |
| * purposes. This does however mean that OpenTemporaryFile had better allow for |
| * collision with an existing temp file name. |
| * |
| * NOTE: this function and its subroutines generally report syscall failures |
| * with ereport(LOG) and keep going. Removing temp files is not so critical |
| * that we should fail to start the database when we can't do it. |
| */ |
| void |
| RemovePgTempFiles(void) |
| { |
| char temp_path[MAXPGPATH + 11 + MAX_DBID_STRING_LENGTH + 1 + sizeof(GP_TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)]; |
| DIR *spc_dir; |
| struct dirent *spc_de; |
| |
| /* |
| * First process temp files in pg_default ($PGDATA/base) |
| */ |
| snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR); |
| RemovePgTempFilesInDir(temp_path, true, false); |
| RemovePgTempRelationFiles("base"); |
| |
| /* |
| * Cycle through temp directories for all non-default tablespaces. |
| */ |
| spc_dir = AllocateDir("pg_tblspc"); |
| |
| while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL) |
| { |
| if (strcmp(spc_de->d_name, ".") == 0 || |
| strcmp(spc_de->d_name, "..") == 0) |
| continue; |
| |
| snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s", |
| spc_de->d_name, GP_TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); |
| RemovePgTempFilesInDir(temp_path, true, false); |
| |
| snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", |
| spc_de->d_name, GP_TABLESPACE_VERSION_DIRECTORY); |
| RemovePgTempRelationFiles(temp_path); |
| } |
| |
| FreeDir(spc_dir); |
| |
| /* |
| * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of |
| * DataDir as well. However, that is *not* cleaned here because doing so |
| * would create a race condition. It's done separately, earlier in |
| * postmaster startup. |
| */ |
| } |
| |
| /* |
| * Process one pgsql_tmp directory for RemovePgTempFiles. |
| * |
| * If missing_ok is true, it's all right for the named directory to not exist. |
| * Any other problem results in a LOG message. (missing_ok should be true at |
| * the top level, since pgsql_tmp directories are not created until needed.) |
| * |
| * At the top level, this should be called with unlink_all = false, so that |
| * only files matching the temporary name prefix will be unlinked. When |
| * recursing it will be called with unlink_all = true to unlink everything |
| * under a top-level temporary directory. |
| * |
| * (These two flags could be replaced by one, but it seems clearer to keep |
| * them separate.) |
| */ |
| void |
| RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all) |
| { |
| DIR *temp_dir; |
| struct dirent *temp_de; |
| char rm_path[MAXPGPATH * 2]; |
| |
| temp_dir = AllocateDir(tmpdirname); |
| |
| if (temp_dir == NULL && errno == ENOENT && missing_ok) |
| return; |
| |
| while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL) |
| { |
| if (strcmp(temp_de->d_name, ".") == 0 || |
| strcmp(temp_de->d_name, "..") == 0) |
| continue; |
| |
| snprintf(rm_path, sizeof(rm_path), "%s/%s", |
| tmpdirname, temp_de->d_name); |
| |
| if (unlink_all || |
| strncmp(temp_de->d_name, |
| PG_TEMP_FILE_PREFIX, |
| strlen(PG_TEMP_FILE_PREFIX)) == 0) |
| { |
| PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG); |
| |
| if (type == PGFILETYPE_ERROR) |
| continue; |
| else if (type == PGFILETYPE_DIR) |
| { |
| /* recursively remove contents, then directory itself */ |
| RemovePgTempFilesInDir(rm_path, false, true); |
| |
| if (rmdir(rm_path) < 0) |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not remove directory \"%s\": %m", |
| rm_path))); |
| } |
| else |
| { |
| if (unlink(rm_path) < 0) |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not remove file \"%s\": %m", |
| rm_path))); |
| } |
| } |
| else |
| ereport(LOG, |
| (errmsg("unexpected file found in temporary-files directory: \"%s\"", |
| rm_path))); |
| } |
| |
| FreeDir(temp_dir); |
| } |
| |
| /* Process one tablespace directory, look for per-DB subdirectories */ |
| static void |
| RemovePgTempRelationFiles(const char *tsdirname) |
| { |
| DIR *ts_dir; |
| struct dirent *de; |
| char dbspace_path[MAXPGPATH * 2]; |
| |
| ts_dir = AllocateDir(tsdirname); |
| |
| while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL) |
| { |
| /* |
| * We're only interested in the per-database directories, which have |
| * numeric names. Note that this code will also (properly) ignore "." |
| * and "..". |
| */ |
| if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) |
| continue; |
| |
| snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", |
| tsdirname, de->d_name); |
| RemovePgTempRelationFilesInDbspace(dbspace_path); |
| } |
| |
| FreeDir(ts_dir); |
| } |
| |
| /* Process one per-dbspace directory for RemovePgTempRelationFiles */ |
| static void |
| RemovePgTempRelationFilesInDbspace(const char *dbspacedirname) |
| { |
| DIR *dbspace_dir; |
| struct dirent *de; |
| char rm_path[MAXPGPATH * 2]; |
| |
| dbspace_dir = AllocateDir(dbspacedirname); |
| |
| while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL) |
| { |
| if (!looks_like_temp_rel_name(de->d_name)) |
| continue; |
| |
| snprintf(rm_path, sizeof(rm_path), "%s/%s", |
| dbspacedirname, de->d_name); |
| |
| if (unlink(rm_path) < 0) |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not remove file \"%s\": %m", |
| rm_path))); |
| } |
| |
| FreeDir(dbspace_dir); |
| } |
| |
| /* |
| * In PostgreSQL, the pattern is: |
| * |
| * t<digits>_<digits>, or t<digits>_<digits>_<forkname> |
| * |
| * In GPDB, however, we leave out the first <digits>. In PostgreSQL it's |
| * used for the backend ID, but we don't use that in GPDB because even |
| * temporary relations are kept in shared buffers, and need to be accessible |
| * from multiple backends. So the pattern in GPDB is: |
| * |
| * t_<digits>, or t_<digits>_<forkname> |
| */ |
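| /* |
| * For example (illustrative names): "t_16384", "t_16384_fsm" and |
| * "t_16384.1" all match this pattern, while "t16384_16385" (the upstream |
| * PostgreSQL form) and "pg_internal.init" do not. |
| */ |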
| bool |
| looks_like_temp_rel_name(const char *name) |
| { |
| int pos; |
| int savepos; |
| |
| /* Must start with "t". */ |
| if (name[0] != 't') |
| return false; |
| |
| /* Followed by an underscore. */ |
| if (name[1] != '_') |
| return false; |
| pos = 1; |
| |
| /* Followed by a nonempty string of digits. */ |
| for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos) |
| ; |
| if (savepos == pos) |
| return false; |
| |
| /* We might have _forkname or .segment or both. */ |
| if (name[pos] == '_') |
| { |
| int forkchar = forkname_chars(&name[pos + 1], NULL); |
| |
| if (forkchar <= 0) |
| return false; |
| pos += forkchar + 1; |
| } |
| if (name[pos] == '.') |
| { |
| int segchar; |
| |
| for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) |
| ; |
| if (segchar <= 1) |
| return false; |
| pos += segchar; |
| } |
| |
| /* Now we should be at the end. */ |
| if (name[pos] != '\0') |
| return false; |
| return true; |
| } |
| |
| /* |
| * Synchronize all xlog files and pg_wal itself in pg_wal |
| * |
| * This is called at the beginning of recovery. |
| */ |
| void |
| SyncAllXLogFiles(void) |
| { |
| /* We can skip this whole thing if fsync is disabled. */ |
| if (!enableFsync) |
| return; |
| |
| ereport(LOG, (errmsg("synchronization of the wal directory starting"))); |
| walkdir("pg_wal", datadir_fsync_fname, false, LOG); |
| ereport(LOG, (errmsg("synchronization of the wal directory finished"))); |
| } |
| |
| #ifdef HAVE_SYNCFS |
| static void |
| do_syncfs(const char *path) |
| { |
| int fd; |
| |
| ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s", |
| path); |
| |
| fd = OpenTransientFile(path, O_RDONLY); |
| if (fd < 0) |
| { |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", path))); |
| return; |
| } |
| if (syncfs(fd) < 0) |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not synchronize file system for file \"%s\": %m", path))); |
| CloseTransientFile(fd); |
| } |
| #endif |
| |
| /* |
| * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for |
| * all potential filesystems, depending on the recovery_init_sync_method setting. |
| * |
| * We fsync regular files and directories wherever they are, but we |
| * follow symlinks only for pg_wal and immediately under pg_tblspc. |
| * Other symlinks are presumed to point at files we're not responsible |
| * for fsyncing, and might not have privileges to write at all. |
| * |
| * Errors are logged but not considered fatal; that's because this is used |
| * only during database startup, to deal with the possibility that there are |
| * issued-but-unsynced writes pending against the data directory. We want to |
| * ensure that such writes reach disk before anything that's done in the new |
| * run. However, aborting on error would result in failure to start for |
| * harmless cases such as read-only files in the data directory, and that's |
| * not good either. |
| * |
| * Note that if we previously crashed due to a PANIC on fsync(), we'll be |
| * rewriting all changes again during recovery. |
| * |
| * Note we assume we're chdir'd into PGDATA to begin with. |
| */ |
| void |
| SyncDataDirectory(void) |
| { |
| bool xlog_is_symlink; |
| |
| /* We can skip this whole thing if fsync is disabled. */ |
| if (!enableFsync) |
| return; |
| |
| /* |
| * If pg_wal is a symlink, we'll need to recurse into it separately, |
| * because the first walkdir below will ignore it. |
| */ |
| xlog_is_symlink = false; |
| |
| { |
| struct stat st; |
| |
| if (lstat("pg_wal", &st) < 0) |
| ereport(LOG, |
| (errcode_for_file_access(), |
| errmsg("could not stat file \"%s\": %m", |
| "pg_wal"))); |
| else if (S_ISLNK(st.st_mode)) |
| xlog_is_symlink = true; |
| } |
| |
| #ifdef HAVE_SYNCFS |
| if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS) |
| { |
| DIR *dir; |
| struct dirent *de; |
| |
| /* |
| * On Linux, we don't have to open every single file one by one. We |
| * can use syncfs() to sync whole filesystems. We only expect |
| * filesystem boundaries to exist where we tolerate symlinks, namely |
| * pg_wal and the tablespaces, so we call syncfs() for each of those |
| * directories. |
| */ |
| |
| /* Prepare to report progress syncing the data directory via syncfs. */ |
| begin_startup_progress_phase(); |
| |
| /* Sync the top level pgdata directory. */ |
| do_syncfs("."); |
| /* If any tablespaces are configured, sync each of those. */ |
| dir = AllocateDir("pg_tblspc"); |
| while ((de = ReadDirExtended(dir, "pg_tblspc", LOG))) |
| { |
| char path[MAXPGPATH]; |
| |
| if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) |
| continue; |
| |
| snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name); |
| do_syncfs(path); |
| } |
| FreeDir(dir); |
| /* If pg_wal is a symlink, process that too. */ |
| if (xlog_is_symlink) |
| do_syncfs("pg_wal"); |
| return; |
| } |
| #endif /* HAVE_SYNCFS */ |
| |
| #ifdef PG_FLUSH_DATA_WORKS |
| /* Prepare to report progress of the pre-fsync phase. */ |
| begin_startup_progress_phase(); |
| |
| /* |
| * If possible, hint to the kernel that we're soon going to fsync the data |
| * directory and its contents. Errors in this step are even less |
| * interesting than normal, so log them only at DEBUG1. |
| */ |
| walkdir(".", pre_sync_fname, false, DEBUG1); |
| if (xlog_is_symlink) |
| walkdir("pg_wal", pre_sync_fname, false, DEBUG1); |
| walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1); |
| #endif |
| |
| /* Prepare to report progress syncing the data directory via fsync. */ |
| begin_startup_progress_phase(); |
| |
| /* |
| * Now we do the fsync()s in the same order. |
| * |
| * The main call ignores symlinks, so in addition to specially processing |
| * pg_wal if it's a symlink, pg_tblspc has to be visited separately with |
| * process_symlinks = true. Note that if there are any plain directories |
| * in pg_tblspc, they'll get fsync'd twice. That's not an expected case |
| * so we don't worry about optimizing it. |
| */ |
| walkdir(".", datadir_fsync_fname, false, LOG); |
| if (xlog_is_symlink) |
| walkdir("pg_wal", datadir_fsync_fname, false, LOG); |
| walkdir("pg_tblspc", datadir_fsync_fname, true, LOG); |
| } |
| |
| /* |
| * walkdir: recursively walk a directory, applying the action to each |
| * regular file and directory (including the named directory itself). |
| * |
| * If process_symlinks is true, the action and recursion are also applied |
| * to regular files and directories that are pointed to by symlinks in the |
| * given directory; otherwise symlinks are ignored. Symlinks are always |
| * ignored in subdirectories, ie we intentionally don't pass down the |
| * process_symlinks flag to recursive calls. |
| * |
| * Errors are reported at level elevel, which might be ERROR or less. |
| * |
| * See also walkdir in file_utils.c, which is a frontend version of this |
| * logic. |
| */ |
| static void |
| walkdir(const char *path, |
| void (*action) (const char *fname, bool isdir, int elevel), |
| bool process_symlinks, |
| int elevel) |
| { |
| DIR *dir; |
| struct dirent *de; |
| |
| dir = AllocateDir(path); |
| |
| while ((de = ReadDirExtended(dir, path, elevel)) != NULL) |
| { |
| char subpath[MAXPGPATH * 2]; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| if (strcmp(de->d_name, ".") == 0 || |
| strcmp(de->d_name, "..") == 0) |
| continue; |
| |
| snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name); |
| |
| switch (get_dirent_type(subpath, de, process_symlinks, elevel)) |
| { |
| case PGFILETYPE_REG: |
| (*action) (subpath, false, elevel); |
| break; |
| case PGFILETYPE_DIR: |
| walkdir(subpath, action, false, elevel); |
| break; |
| default: |
| |
| /* |
| * Errors are already reported directly by get_dirent_type(), |
| * and any remaining symlinks and unknown file types are |
| * ignored. |
| */ |
| break; |
| } |
| } |
| |
| FreeDir(dir); /* we ignore any error here */ |
| |
| /* |
| * It's important to fsync the destination directory itself as individual |
| * file fsyncs don't guarantee that the directory entry for the file is |
| * synced. However, skip this if AllocateDir failed; the action function |
| * might not be robust against that. |
| */ |
| if (dir) |
| (*action) (path, true, elevel); |
| } |
| |
| |
| /* |
| * Hint to the OS that it should get ready to fsync() this file. |
| * |
| * Ignores errors trying to open unreadable files, and logs other errors at a |
| * caller-specified level. |
| */ |
| #ifdef PG_FLUSH_DATA_WORKS |
| |
| static void |
| pre_sync_fname(const char *fname, bool isdir, int elevel) |
| { |
| int fd; |
| |
| /* Don't try to flush directories, it'll likely just fail */ |
| if (isdir) |
| return; |
| |
| ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s", |
| fname); |
| |
| fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY); |
| |
| if (fd < 0) |
| { |
| if (errno == EACCES) |
| return; |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", fname))); |
| return; |
| } |
| |
| /* |
| * pg_flush_data() ignores errors, which is ok because this is only a |
| * hint. |
| */ |
| pg_flush_data(fd, 0, 0); |
| |
| if (CloseTransientFile(fd) != 0) |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not close file \"%s\": %m", fname))); |
| } |
| |
| #endif /* PG_FLUSH_DATA_WORKS */ |
| |
| static void |
| datadir_fsync_fname(const char *fname, bool isdir, int elevel) |
| { |
| ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s", |
| fname); |
| |
| /* |
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext().
| */ |
| fsync_fname_ext(fname, isdir, true, elevel); |
| } |
| |
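/*
 * unlink_if_exists_fname -- walkdir action callback that removes the given
 * file or directory, silently ignoring the case where it is already gone.
 */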
| static void |
| unlink_if_exists_fname(const char *fname, bool isdir, int elevel) |
| { |
| if (isdir) |
| { |
| if (rmdir(fname) != 0 && errno != ENOENT) |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not remove directory \"%s\": %m", fname))); |
| } |
| else |
| { |
| /* Use PathNameDeleteTemporaryFile to report filesize */ |
| PathNameDeleteTemporaryFile(fname, false); |
| } |
| } |
| |
| /* |
| * fsync_fname_ext -- Try to fsync a file or directory |
| * |
| * If ignore_perm is true, ignore errors upon trying to open unreadable |
| * files. Logs other errors at a caller-specified level. |
| * |
| * Returns 0 if the operation succeeded, -1 otherwise. |
| */ |
| int |
| fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) |
| { |
| int fd; |
| int flags; |
| int returncode; |
| |
| /* |
| * Some OSs require directories to be opened read-only whereas other |
| * systems don't allow us to fsync files opened read-only; so we need both |
| * cases here. Using O_RDWR will cause us to fail to fsync files that are |
| * not writable by our userid, but we assume that's OK. |
| */ |
| flags = PG_BINARY; |
| if (!isdir) |
| flags |= O_RDWR; |
| else |
| flags |= O_RDONLY; |
| |
| fd = OpenTransientFile(fname, flags); |
| |
| /* |
| * Some OSs don't allow us to open directories at all (Windows returns |
	 * EACCES); just ignore the error in that case.  If desired, also silently
	 * ignore errors about unreadable files.  Log others.
| */ |
| if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) |
| return 0; |
| else if (fd < 0 && ignore_perm && errno == EACCES) |
| return 0; |
| else if (fd < 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", fname))); |
| return -1; |
| } |
| |
| returncode = pg_fsync(fd); |
| |
| /* |
| * Some OSes don't allow us to fsync directories at all, so we can ignore |
| * those errors. Anything else needs to be logged. |
| */ |
| if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL))) |
| { |
| int save_errno; |
| |
| /* close file upon error, might not be in transaction context */ |
| save_errno = errno; |
| (void) CloseTransientFile(fd); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not fsync file \"%s\": %m", fname))); |
| return -1; |
| } |
| |
| if (CloseTransientFile(fd) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not close file \"%s\": %m", fname))); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * fsync_parent_path -- fsync the parent path of a file or directory |
| * |
| * This is aimed at making file operations persistent on disk in case of |
| * an OS crash or power failure. |
| */ |
| static int |
| fsync_parent_path(const char *fname, int elevel) |
| { |
| char parentpath[MAXPGPATH]; |
| |
| strlcpy(parentpath, fname, MAXPGPATH); |
| get_parent_directory(parentpath); |
| |
| /* |
| * get_parent_directory() returns an empty string if the input argument is |
| * just a file name (see comments in path.c), so handle that as being the |
| * current directory. |
| */ |
| if (strlen(parentpath) == 0) |
| strlcpy(parentpath, ".", MAXPGPATH); |
| |
| if (fsync_fname_ext(parentpath, true, false, elevel) != 0) |
| return -1; |
| |
| return 0; |
| } |
| |
/*
 * Return the file name associated with an open virtual file descriptor.
 */
const char *
FileGetFilename(File file)
{
	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileGetFilename: %d (%s)",
			   file, VfdCache[file].fileName));

	return VfdCache[file].fileName;
}
| |
| /* |
| * Mark the file as a "work file" that should be tracked by the workfile manager. |
| */ |
void
FileSetIsWorkfile(File file)
{
	Assert(FileIsValid(file));

	VfdCache[file].fdstate |= FD_WORKFILE;
}
| |
| /* |
| * Create a PostgreSQL data sub-directory |
| * |
| * The data directory itself, and most of its sub-directories, are created at |
| * initdb time, but we do have some occasions when we create directories in |
| * the backend (CREATE TABLESPACE, for example). In those cases, we want to |
| * make sure that those directories are created consistently. Today, that means |
| * making sure that the created directory has the correct permissions, which is |
| * what pg_dir_create_mode tracks for us. |
| * |
| * Note that we also set the umask() based on what we understand the correct |
| * permissions to be (see file_perm.c). |
| * |
| * For permissions other than the default, mkdir() can be used directly, but |
| * be sure to consider carefully such cases -- a sub-directory with incorrect |
| * permissions in a PostgreSQL data directory could cause backups and other |
| * processes to fail. |
| */ |
| int |
| MakePGDirectory(const char *directoryName) |
| { |
| return mkdir(directoryName, pg_dir_create_mode); |
| } |
| |
| /* |
| * Return the passed-in error level, or PANIC if data_sync_retry is off. |
| * |
| * Failure to fsync any data file is cause for immediate panic, unless |
| * data_sync_retry is enabled. Data may have been written to the operating |
| * system and removed from our buffer pool already, and if we are running on |
| * an operating system that forgets dirty data on write-back failure, there |
| * may be only one copy of the data remaining: in the WAL. A later attempt to |
| * fsync again might falsely report success. Therefore we must not allow any |
| * further checkpoints to be attempted. data_sync_retry can in theory be |
| * enabled on systems known not to drop dirty buffered data on write-back |
| * failure (with the likely outcome that checkpoints will continue to fail |
| * until the underlying problem is fixed). |
| * |
| * Any code that reports a failure from fsync() or related functions should |
| * filter the error level with this function. |
| */ |
| int |
| data_sync_elevel(int elevel) |
| { |
| return data_sync_retry ? elevel : PANIC; |
| } |
| |
| bool |
| check_debug_io_direct(char **newval, void **extra, GucSource source) |
| { |
| bool result = true; |
| int flags; |
| |
| #if PG_O_DIRECT == 0 |
| if (strcmp(*newval, "") != 0) |
| { |
| GUC_check_errdetail("debug_io_direct is not supported on this platform."); |
| result = false; |
| } |
| flags = 0; |
| #else |
| List *elemlist; |
| ListCell *l; |
| char *rawstring; |
| |
| /* Need a modifiable copy of string */ |
| rawstring = pstrdup(*newval); |
| |
| if (!SplitGUCList(rawstring, ',', &elemlist)) |
| { |
| GUC_check_errdetail("invalid list syntax in parameter \"%s\"", |
| "debug_io_direct"); |
| pfree(rawstring); |
| list_free(elemlist); |
| return false; |
| } |
| |
| flags = 0; |
| foreach(l, elemlist) |
| { |
| char *item = (char *) lfirst(l); |
| |
| if (pg_strcasecmp(item, "data") == 0) |
| flags |= IO_DIRECT_DATA; |
| else if (pg_strcasecmp(item, "wal") == 0) |
| flags |= IO_DIRECT_WAL; |
| else if (pg_strcasecmp(item, "wal_init") == 0) |
| flags |= IO_DIRECT_WAL_INIT; |
| else |
| { |
| GUC_check_errdetail("invalid option \"%s\"", item); |
| result = false; |
| break; |
| } |
| } |
| |
| /* |
| * It's possible to configure block sizes smaller than our assumed I/O |
| * alignment size, which could result in invalid I/O requests. |
| */ |
| #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE |
| if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT))) |
| { |
| GUC_check_errdetail("debug_io_direct is not supported for WAL because XLOG_BLCKSZ is too small"); |
| result = false; |
| } |
| #endif |
| #if BLCKSZ < PG_IO_ALIGN_SIZE |
| if (result && (flags & IO_DIRECT_DATA)) |
| { |
| GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small"); |
| result = false; |
| } |
| #endif |
| |
| pfree(rawstring); |
| list_free(elemlist); |
| #endif |
| |
| if (!result) |
| return result; |
| |
| /* Save the flags in *extra, for use by assign_debug_io_direct */ |
| *extra = guc_malloc(ERROR, sizeof(int)); |
| *((int *) *extra) = flags; |
| |
| return result; |
| } |
| |
void
| assign_debug_io_direct(const char *newval, void *extra) |
| { |
| int *flags = (int *) extra; |
| |
| io_direct_flags = *flags; |
| } |
| |
/*
 * Mark or unmark the file for automatic deletion when it is closed.
 */
void
FileSetTempfile(File file, bool isTemp)
{
	Assert(FileIsValid(file));

	if (isTemp)
		VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE;
	else
		VfdCache[file].fdstate &= ~FD_DELETE_AT_CLOSE;
}