| /*------------------------------------------------------------------------- |
| * |
| * dynahash.c |
| * dynamic hash tables |
| * |
| * dynahash.c supports both local-to-a-backend hash tables and hash tables in |
| * shared memory. For shared hash tables, it is the caller's responsibility |
| * to provide appropriate access interlocking. The simplest convention is |
| * that a single LWLock protects the whole hash table. Searches (HASH_FIND or |
| * hash_seq_search) need only shared lock, but any update requires exclusive |
| * lock. For heavily-used shared tables, the single-lock approach creates a |
| * concurrency bottleneck, so we also support "partitioned" locking wherein |
| * there are multiple LWLocks guarding distinct subsets of the table. To use |
| * a hash table in partitioned mode, the HASH_PARTITION flag must be given |
| * to hash_create. This prevents any attempt to split buckets on-the-fly. |
| * Therefore, each hash bucket chain operates independently, and no fields |
| * of the hash header change after init except nentries and freeList. |
| * A partitioned table uses a spinlock to guard changes of those two fields. |
| * This lets any subset of the hash buckets be treated as a separately |
| * lockable partition. We expect callers to use the low-order bits of a |
| * lookup key's hash value as a partition number --- this will work because |
| * of the way calc_bucket() maps hash values to bucket numbers. |
| * |
| * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.79 2009/01/01 17:23:51 momjian Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * Original comments: |
| * |
| * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. |
| * Coded into C, with minor code improvements, and with hsearch(3) interface, |
| * by ejp@ausmelb.oz, Jul 26, 1988: 13:16; |
| * also, hcreate/hdestroy routines added to simulate hsearch(3). |
| * |
| * These routines simulate hsearch(3) and family, with the important |
| * difference that the hash table is dynamic - can grow indefinitely |
| * beyond its original size (as supplied to hcreate()). |
| * |
| * Performance appears to be comparable to that of hsearch(3). |
| * The 'source-code' options referred to in hsearch(3)'s 'man' page |
| * are not implemented; otherwise functionality is identical. |
| * |
| * Compilation controls: |
| * DEBUG controls some informative traces, mainly for debugging. |
| * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained; |
| * when combined with HASH_DEBUG, these are displayed by hdestroy(). |
| * |
| * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor |
| * concatenation property, in probably unnecessary code 'optimisation'. |
| * |
| * Modified margo@postgres.berkeley.edu February 1990 |
| * added multiple table interface |
| * Modified by sullivan@postgres.berkeley.edu April 1990 |
| * changed ctl structure for shared memory |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/xact.h" |
| #include "storage/shmem.h" |
| #include "storage/spin.h" |
| #include "utils/dynahash.h" |
| #include "utils/memutils.h" |
| |
| |
| /* |
| * Constants |
| * |
| * A hash table has a top-level "directory", each of whose entries points |
| * to a "segment" of ssize bucket headers. The maximum number of hash |
| * buckets is thus dsize * ssize (but dsize may be expansible). Of course, |
| * the number of records in the table can be larger, but we don't want a |
| * whole lot of records per bucket or performance goes down. |
| * |
| * In a hash table allocated in shared memory, the directory cannot be |
| * expanded because it must stay at a fixed address. The directory size |
| * should be selected using hash_select_dirsize (and you'd better have |
| * a good idea of the maximum number of entries!). For non-shared hash |
| * tables, the initial directory size can be left at the default. |
| */ |
| #define DEF_SEGSIZE 256 |
| #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */ |
| #define DEF_DIRSIZE 256 |
| #define DEF_FFACTOR 1 /* default fill factor */ |
| |
| |
| /* A hash bucket is a linked list of HASHELEMENTs */ |
| typedef HASHELEMENT *HASHBUCKET; |
| |
| /* A hash segment is an array of bucket headers */ |
| typedef HASHBUCKET *HASHSEGMENT; |
| |
| /* |
| * Header structure for a hash table --- contains all changeable info |
| * |
| * In a shared-memory hash table, the HASHHDR is in shared memory, while |
| * each backend has a local HTAB struct. For a non-shared table, there isn't |
| * any functional difference between HASHHDR and HTAB, but we separate them |
| * anyway to share code between shared and non-shared tables. |
| */ |
struct HASHHDR
{
	/* In a partitioned table, take this lock to touch nentries or freeList */
	slock_t		mutex;			/* unused if not partitioned table */

	/* These fields change during entry addition/deletion */
	long		nentries;		/* number of entries in hash table */
	HASHELEMENT *freeList;		/* linked list of free elements */

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	long		dsize;			/* directory size */
	long		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	long		num_partitions; /* # partitions (must be power of 2), or 0
								 * for an unpartitioned table */
	long		ffactor;		/* target fill factor (entries per bucket) */
	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	long		accesses;
	long		collisions;
#endif
};
| |
| #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) |
| |
| /* |
| * Top control structure for a hashtable --- in a shared table, each backend |
| * has its own copy (OK since no fields change at runtime) |
| */ |
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used */
	char	   *tabname;		/* table name (for error messages) */
	bool		isshared;		/* true if table is in shared memory */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	/* (they duplicate the corresponding HASHHDR fields, set at creation) */
	Size		keysize;		/* hash key length in bytes */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
};
| |
| /* |
| * Key (also entry) part of a HASHELEMENT |
| */ |
| #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT))) |
| |
| /* |
| * Fast MOD arithmetic, assuming that y is a power of 2 ! |
| */ |
| #define MOD(x,y) ((x) & ((y)-1)) |
| |
| #if HASH_STATISTICS |
| static long hash_accesses, |
| hash_collisions, |
| hash_expansions; |
| #endif |
| |
| /* |
| * Private function prototypes |
| */ |
| static void *DynaHashAlloc(Size size); |
| static HASHSEGMENT seg_alloc(HTAB *hashp); |
| static bool element_alloc(HTAB *hashp, int nelem); |
| static bool dir_realloc(HTAB *hashp); |
| static bool expand_table(HTAB *hashp); |
| static HASHBUCKET get_hash_entry(HTAB *hashp); |
| static void hdefault(HTAB *hashp); |
| static int choose_nelem_alloc(Size entrysize); |
| static bool init_htab(HTAB *hashp, long nelem); |
| static void hash_corrupted(HTAB *hashp); |
| static void register_seq_scan(HTAB *hashp); |
| static void deregister_seq_scan(HTAB *hashp); |
| static bool has_seq_scans(HTAB *hashp); |
| |
| |
| /* |
| * memory allocation support |
| */ |
| static MemoryContext CurrentDynaHashCxt = NULL; |
| |
/*
 * Default allocator: palloc from CurrentDynaHashCxt, which the CREATE
 * routines point at the appropriate context before calling us.
 */
static void *
DynaHashAlloc(Size size)
{
	Assert(MemoryContextIsValid(CurrentDynaHashCxt));
	return MemoryContextAlloc(CurrentDynaHashCxt, size);
}
| |
| |
| /* |
| * HashCompareFunc for string keys |
| * |
| * Because we copy keys with strlcpy(), they will be truncated at keysize-1 |
| * bytes, so we can only compare that many ... hence strncmp is almost but |
| * not quite the right thing. |
| */ |
| static int |
| string_compare(const char *key1, const char *key2, Size keysize) |
| { |
| return strncmp(key1, key2, keysize - 1); |
| } |
| |
| |
| /************************** CREATE ROUTINES **********************/ |
| |
| /* |
| * hash_create -- create a new dynamic hash table |
| * |
| * tabname: a name for the table (for debugging purposes) |
| * nelem: maximum number of elements expected |
| * *info: additional table parameters, as indicated by flags |
| * flags: bitmask indicating which parameters to take from *info |
| * |
| * Note: for a shared-memory hashtable, nelem needs to be a pretty good |
| * estimate, since we can't expand the table on the fly. But an unshared |
| * hashtable can be expanded on-the-fly, so it's better for nelem to be |
| * on the small side and let the table grow if it's exceeded. An overly |
| * large nelem will penalize hash_seq_search speed without buying much. |
| */ |
HTAB *
hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple. The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		CurrentDynaHashCxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			CurrentDynaHashCxt = info->hcxt;
		else
			CurrentDynaHashCxt = TopMemoryContext;
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   tabname,
												   ALLOCSET_DEFAULT_MINSIZE,
												   ALLOCSET_DEFAULT_INITSIZE,
												   ALLOCSET_DEFAULT_MAXSIZE);
	}

	/*
	 * Initialize the hash header, plus a copy of the table name.  The name
	 * is stored immediately after the HTAB struct in the same allocation.
	 */
	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) +1);
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	if (flags & HASH_FUNCTION)
		hashp->hash = info->hash;
	else
		hashp->hash = string_hash;		/* default hash function */

	/*
	 * If you don't specify a match function, it defaults to string_compare if
	 * you used string_hash (either explicitly or by default) and to memcmp
	 * otherwise. (Prior to PostgreSQL 7.4, memcmp was always used.)
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
		hashp->keycopy = (HashCopyFunc) strlcpy;
	else
		hashp->keycopy = memcpy;

	/* And likewise for the allocator, defaulting to palloc-based. */
	if (flags & HASH_ALLOC)
		hashp->alloc = info->alloc;
	else
		hashp->alloc = DynaHashAlloc;

	if (flags & HASH_SHARED_MEM)
	{
		/*
		 * ctl structure and directory are preallocated for shared memory
		 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
		 * well.
		 */
		hashp->hctl = info->hctl;
		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* make local copies of some heavily-used values */
			hctl = hashp->hctl;
			hashp->keysize = hctl->keysize;
			hashp->ssize = hctl->ssize;
			hashp->sshift = hctl->sshift;

			/* attaching backend does no further initialization */
			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = CurrentDynaHashCxt;
		hashp->isshared = false;
	}

	/* Non-shared table: allocate the header via the chosen allocator */
	if (!hashp->hctl)
	{
		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
		if (!hashp->hctl)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	hashp->frozen = false;

	/* fill header with defaults, then override per caller's flags below */
	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);
		/* # of partitions had better be a power of 2 */
		Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));

		hctl->num_partitions = info->num_partitions;
	}

	if (flags & HASH_SEGMENT)
	{
		hctl->ssize = info->ssize;
		hctl->sshift = my_log2(info->ssize);
		/* ssize had better be a power of 2 */
		Assert(hctl->ssize == (1L << hctl->sshift));
	}
	if (flags & HASH_FFACTOR)
		hctl->ffactor = info->ffactor;

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */
	if (flags & HASH_DIRSIZE)
	{
		hctl->max_dsize = info->max_dsize;
		hctl->dsize = info->dsize;
	}

	/*
	 * hash table now allocates space for key and data but you have to say how
	 * much space to allocate
	 */
	if (flags & HASH_ELEM)
	{
		Assert(info->entrysize >= info->keysize);
		hctl->keysize = info->keysize;
		hctl->entrysize = info->entrysize;
	}

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;
	hashp->ssize = hctl->ssize;
	hashp->sshift = hctl->sshift;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
	 * space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		if (!element_alloc(hashp, (int) nelem))
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	return hashp;
}
| |
| /* |
| * Set default HASHHDR parameters. |
| */ |
| static void |
| hdefault(HTAB *hashp) |
| { |
| HASHHDR *hctl = hashp->hctl; |
| |
| MemSet(hctl, 0, sizeof(HASHHDR)); |
| |
| hctl->nentries = 0; |
| hctl->freeList = NULL; |
| |
| hctl->dsize = DEF_DIRSIZE; |
| hctl->nsegs = 0; |
| |
| /* rather pointless defaults for key & entry size */ |
| hctl->keysize = sizeof(char *); |
| hctl->entrysize = 2 * sizeof(char *); |
| |
| hctl->num_partitions = 0; /* not partitioned */ |
| |
| hctl->ffactor = DEF_FFACTOR; |
| |
| /* table has no fixed maximum size */ |
| hctl->max_dsize = NO_MAX_DSIZE; |
| |
| hctl->ssize = DEF_SEGSIZE; |
| hctl->sshift = DEF_SEGSIZE_SHIFT; |
| |
| #ifdef HASH_STATISTICS |
| hctl->accesses = hctl->collisions = 0; |
| #endif |
| } |
| |
| /* |
| * Given the user-specified entry size, choose nelem_alloc, ie, how many |
| * elements to add to the hash table when we need more. |
| */ |
| static int |
| choose_nelem_alloc(Size entrysize) |
| { |
| int nelem_alloc; |
| Size elementSize; |
| Size allocSize; |
| |
| /* Each element has a HASHELEMENT header plus user data. */ |
| /* NB: this had better match element_alloc() */ |
| elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); |
| |
| /* |
| * The idea here is to choose nelem_alloc at least 32, but round up so |
| * that the allocation request will be a power of 2 or just less. This |
| * makes little difference for hash tables in shared memory, but for hash |
| * tables managed by palloc, the allocation request will be rounded up to |
| * a power of 2 anyway. If we fail to take this into account, we'll waste |
| * as much as half the allocated space. |
| */ |
| allocSize = 32 * 4; /* assume elementSize at least 8 */ |
| do |
| { |
| allocSize <<= 1; |
| nelem_alloc = allocSize / elementSize; |
| } while (nelem_alloc < 32); |
| |
| return nelem_alloc; |
| } |
| |
| /* |
| * Compute derived fields of hctl and build the initial directory/segment |
| * arrays |
| */ |
static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	long		lnbuckets;
	int			nbuckets;
	int			nsegs;

	/*
	 * initialize mutex if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		SpinLockInit(&hctl->mutex);

	/*
	 * Divide number of elements by the fill factor to determine a desired
	 * number of buckets.  Allocate space for the next greater power of two
	 * number of buckets
	 */
	lnbuckets = (nelem - 1) / hctl->ffactor + 1;

	nbuckets = 1 << my_log2(lnbuckets);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	/*
	 * Set up linear-hashing state: all nbuckets buckets are initially in
	 * use, low_mask selects within the current table size and high_mask
	 * within the doubled size (see calc_bucket()).
	 */
	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = 1 << my_log2(nsegs);

	/*
	 * Make sure directory is big enough. If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)
	{
		if (!(hashp->dir))
			hctl->dsize = nsegs;
		else
			return false;
	}

	/* Allocate a directory (shared tables arrive with dir preallocated) */
	if (!(hashp->dir))
	{
		CurrentDynaHashCxt = hashp->hcxt;
		hashp->dir = (HASHSEGMENT *)
			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
		if (!hashp->dir)
			return false;
	}

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#if HASH_DEBUG
	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n%s%ld\n",
			"TABLE POINTER   ", hashp,
			"DIRECTORY SIZE  ", hctl->dsize,
			"SEGMENT SIZE    ", hctl->ssize,
			"SEGMENT SHIFT   ", hctl->sshift,
			"FILL FACTOR     ", hctl->ffactor,
			"MAX BUCKET      ", hctl->max_bucket,
			"HIGH MASK       ", hctl->high_mask,
			"LOW  MASK       ", hctl->low_mask,
			"NSEGS           ", hctl->nsegs,
			"NENTRIES        ", hctl->nentries);
#endif
	return true;
}
| |
| /* |
| * Estimate the space needed for a hashtable containing the given number |
| * of entries of given size. |
| * NOTE: this is used to estimate the footprint of hashtables in shared |
| * memory; therefore it does not count HTAB which is in local memory. |
| * NB: assumes that all hash structure parameters have default values! |
| */ |
Size
hash_estimate_size(long num_entries, Size entrysize)
{
	Size		size;
	long		nBuckets,
				nSegments,
				nDirEntries,
				nElementAllocs,
				elementSize,
				elementAllocCnt;

	/* estimate number of buckets wanted */
	nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
	/* # of segments needed for nBuckets */
	nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	/* fixed control info */
	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
	/* directory */
	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
	/* segments */
	size = add_size(size, mul_size(nSegments,
								   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	/* (add_size/mul_size check for Size overflow along the way) */
	elementAllocCnt = choose_nelem_alloc(entrysize);
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

	return size;
}
| |
| /* |
| * Select an appropriate directory size for a hashtable with the given |
| * maximum number of entries. |
| * This is only needed for hashtables in shared memory, whose directories |
| * cannot be expanded dynamically. |
| * NB: assumes that all hash structure parameters have default values! |
| * |
| * XXX this had better agree with the behavior of init_htab()... |
| */ |
| long |
| hash_select_dirsize(long num_entries) |
| { |
| long nBuckets, |
| nSegments, |
| nDirEntries; |
| |
| /* estimate number of buckets wanted */ |
| nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1); |
| /* # of segments needed for nBuckets */ |
| nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1); |
| /* directory entries */ |
| nDirEntries = DEF_DIRSIZE; |
| while (nDirEntries < nSegments) |
| nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ |
| |
| return nDirEntries; |
| } |
| |
| /* |
| * Compute the required initial memory allocation for a shared-memory |
| * hashtable with the given parameters. We need space for the HASHHDR |
| * and for the (non expansible) directory. |
| */ |
| Size |
| hash_get_shared_size(HASHCTL *info, int flags) |
| { |
| Assert(flags & HASH_DIRSIZE); |
| Assert(info->dsize == info->max_dsize); |
| return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT); |
| } |
| |
| |
| /********************** DESTROY ROUTINES ************************/ |
| |
/*
 * hash_destroy -- free a non-shared hashtable and all its storage
 */
void
hash_destroy(HTAB *hashp)
{
	if (hashp != NULL)
	{
		/* allocation method must be one we know how to free, too */
		Assert(hashp->alloc == DynaHashAlloc);
		/* so this hashtable must have its own context */
		Assert(hashp->hcxt != NULL);

		hash_stats("destroy", hashp);

		/*
		 * Free everything by destroying the hash table's memory context.
		 * (The HTAB struct and directory were allocated there too.)
		 */
		MemoryContextDelete(hashp->hcxt);
	}
}
| |
/*
 * hash_stats -- print access statistics for one table, plus process-wide
 * totals, to stderr.  A no-op unless built with HASH_STATISTICS.
 *
 * NOTE(review): this uses "#if HASH_STATISTICS" while the HASHHDR fields
 * are guarded with "#ifdef" --- confirm HASH_STATISTICS is always defined
 * with a nonzero value when the feature is enabled.
 */
void
hash_stats(const char *where, HTAB *hashp)
{
#if HASH_STATISTICS
	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
			where, hashp->hctl->accesses, hashp->hctl->collisions);

	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
			hashp->hctl->nentries, (long) hashp->hctl->keysize,
			hashp->hctl->max_bucket, hashp->hctl->nsegs);
	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
			where, hash_accesses, hash_collisions);
	fprintf(stderr, "hash_stats: total expansions %ld\n",
			hash_expansions);
#endif
}
| |
| /*******************************SEARCH ROUTINES *****************************/ |
| |
| |
| /* |
| * get_hash_value -- exported routine to calculate a key's hash value |
| * |
| * We export this because for partitioned tables, callers need to compute |
| * the partition number (from the low-order bits of the hash value) before |
| * searching. |
| */ |
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
	/* hashp->keysize is the local copy made at create/attach time */
	return hashp->hash(keyPtr, hashp->keysize);
}
| |
| /* Convert a hash value to a bucket number */ |
| static inline uint32 |
| calc_bucket(HASHHDR *hctl, uint32 hash_val) |
| { |
| uint32 bucket; |
| |
| bucket = hash_val & hctl->high_mask; |
| if (bucket > hctl->max_bucket) |
| bucket = bucket & hctl->low_mask; |
| |
| return bucket; |
| } |
| |
| /* |
| * hash_search -- look up key in table and perform action |
| * hash_search_with_hash_value -- same, with key's hash value already computed |
| * |
| * action is one of: |
| * HASH_FIND: look up key in table |
| * HASH_ENTER: look up key in table, creating entry if not present |
| * HASH_ENTER_NULL: same, but return NULL if out of memory |
| * HASH_REMOVE: look up key in table, remove entry if present |
| * |
| * Return value is a pointer to the element found/entered/removed if any, |
| * or NULL if no match was found. (NB: in the case of the REMOVE action, |
| * the result is a dangling pointer that shouldn't be dereferenced!) |
| * |
| * HASH_ENTER will normally ereport a generic "out of memory" error if |
| * it is unable to create a new entry. The HASH_ENTER_NULL operation is |
| * the same except it will return NULL if out of memory. Note that |
| * HASH_ENTER_NULL cannot be used with the default palloc-based allocator, |
| * since palloc internally ereports on out-of-memory. |
| * |
| * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an |
| * existing entry in the table, FALSE otherwise. This is needed in the |
| * HASH_ENTER case, but is redundant with the return value otherwise. |
| * |
| * For hash_search_with_hash_value, the hashvalue parameter must have been |
| * calculated with get_hash_value(). |
| */ |
| void * |
| hash_search(HTAB *hashp, |
| const void *keyPtr, |
| HASHACTION action, |
| bool *foundPtr) |
| { |
| return hash_search_with_hash_value(hashp, |
| keyPtr, |
| hashp->hash(keyPtr, hashp->keysize), |
| action, |
| foundPtr); |
| } |
| |
void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		keysize;
	uint32		bucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#if HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/*
	 * Do the initial lookup: map hash value to bucket, then bucket to its
	 * containing segment and the slot within that segment.
	 */
	bucket = calc_bucket(hctl, hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	/* a NULL segment pointer here means the table structure is broken */
	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key.  We track
	 * prevBucketPtr so HASH_REMOVE can unlink without rescanning.
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		/* compare stored hash first: cheap filter before the key compare */
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#if HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* use volatile pointer to prevent code rearrangement */
				volatile HASHHDR *hctlv = hctl;

				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctlv))
					SpinLockAcquire(&hctlv->mutex);

				Assert(hctlv->nentries > 0);
				hctlv->nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the freelist for this table.  */
				currBucket->link = hctlv->freeList;
				hctlv->freeList = currBucket;

				if (IS_PARTITIONED(hctlv))
					SpinLockRelease(&hctlv->mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return (void *) ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER_NULL:
			/* ENTER_NULL does not work with palloc-based allocator */
			Assert(hashp->alloc != DynaHashAlloc);
			/* FALL THRU */

		case HASH_ENTER:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain (at the slot found above) */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/* caller is expected to fill the data field on return */

			/*
			 * Check if it is time to split a bucket.  Can't split if running
			 * in partitioned mode, nor if table is the subject of any active
			 * hash_seq_search scans.  Strange order of these tests is to try
			 * to check cheaper conditions first.
			 */
			if (!IS_PARTITIONED(hctl) &&
			hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
				!has_seq_scans(hashp))
			{
				/*
				 * NOTE: failure to expand table is not a fatal error, it just
				 * means we have to run at higher fill factor than we wanted.
				 */
				expand_table(hashp);
			}

			return (void *) ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}
| |
| /* |
| * create a new entry if possible |
| */ |
static HASHBUCKET
get_hash_entry(HTAB *hashp)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile HASHHDR *hctlv = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctlv))
			SpinLockAcquire(&hctlv->mutex);

		/*
		 * try to get an entry from the freelist; on success we BREAK OUT
		 * still holding the spinlock (if partitioned), released below
		 */
		newElement = hctlv->freeList;
		if (newElement != NULL)
			break;

		/* no free elements.  allocate another chunk of buckets */
		/* (drop the lock first: element_alloc may be slow, and it will
		 * retake the lock itself to splice in the new elements) */
		if (IS_PARTITIONED(hctlv))
			SpinLockRelease(&hctlv->mutex);

		if (!element_alloc(hashp, hctlv->nelem_alloc))
		{
			/* out of memory */
			return NULL;
		}
		/* loop back and retry the freelist */
	}

	/* remove entry from freelist, bump nentries */
	newElement = hctlv->freeList;
	hctlv->freeList = newElement->link;
	hctlv->nentries++;

	if (IS_PARTITIONED(hctlv))
		SpinLockRelease(&hctlv->mutex);

	return newElement;
}
| |
| /* |
| * hash_get_num_entries -- get the number of entries in a hashtable |
| */ |
| long |
| hash_get_num_entries(HTAB *hashp) |
| { |
| /* |
| * We currently don't bother with the mutex; it's only sensible to call |
| * this function if you've got lock on all partitions of the table. |
| */ |
| return hashp->hctl->nentries; |
| } |
| |
| /* |
| * hash_seq_init/_search/_term |
| * Sequentially search through hash table and return |
| * all the elements one by one, return NULL when no more. |
| * |
| * hash_seq_term should be called if and only if the scan is abandoned before |
| * completion; if hash_seq_search returns NULL then it has already done the |
| * end-of-scan cleanup. |
| * |
| * NOTE: caller may delete the returned element before continuing the scan. |
| * However, deleting any other element while the scan is in progress is |
| * UNDEFINED (it might be the one that curIndex is pointing at!). Also, |
| * if elements are added to the table while the scan is in progress, it is |
| * unspecified whether they will be visited by the scan or not. |
| * |
| * NOTE: it is possible to use hash_seq_init/hash_seq_search without any |
| * worry about hash_seq_term cleanup, if the hashtable is first locked against |
| * further insertions by calling hash_freeze. This is used by nodeAgg.c, |
| * wherein it is inconvenient to track whether a scan is still open, and |
| * there's no possibility of further insertions after readout has begun. |
| * |
| * NOTE: to use this with a partitioned hashtable, caller had better hold |
| * at least shared lock on all partitions of the table throughout the scan! |
| * We can cope with insertions or deletions by our own backend, but *not* |
| * with concurrent insertions or deletions by another. |
| */ |
| void |
| hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) |
| { |
| status->hashp = hashp; |
| status->curBucket = 0; |
| status->curEntry = NULL; |
| |
| if (hashp && !hashp->frozen) |
| register_seq_scan(hashp); |
| } |
| |
/*
 * hash_seq_search -- return the next entry of an in-progress scan,
 * or NULL when the table is exhausted (after doing end-of-scan cleanup).
 */
void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	long		ssize;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	/* Fast path: still walking a bucket chain saved from the last call */
	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return (void *) ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	/* defend against a scan status that was initialized with a NULL table */
	if(!hashp)
		return NULL;

	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		/* past the last bucket: deregister the scan before returning */
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			/* record final position so hash_seq_term sees consistent state */
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		/* step into the next directory segment when this one is exhausted */
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)		/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return (void *) ELEMENTKEY(curElem);
}
| |
| void |
| hash_seq_term(HASH_SEQ_STATUS *status) |
| { |
| if (!status->hashp->frozen) |
| deregister_seq_scan(status->hashp); |
| } |
| |
| /* |
| * hash_freeze |
| * Freeze a hashtable against future insertions (deletions are |
| * still allowed) |
| * |
| * The reason for doing this is that by preventing any more bucket splits, |
| * we no longer need to worry about registering hash_seq_search scans, |
| * and thus caller need not be careful about ensuring hash_seq_term gets |
| * called at the right times. |
| * |
| * Multiple calls to hash_freeze() are allowed, but you can't freeze a table |
| * with active scans (since hash_seq_term would then do the wrong thing). |
| */ |
| void |
| hash_freeze(HTAB *hashp) |
| { |
| if (hashp->isshared) |
| elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname); |
| if (!hashp->frozen && has_seq_scans(hashp)) |
| elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans", |
| hashp->tabname); |
| hashp->frozen = true; |
| } |
| |
| |
| /********************************* UTILITIES ************************/ |
| |
| /* |
| * Expand the table by adding one more hash bucket. |
| */ |
| static bool |
| expand_table(HTAB *hashp) |
| { |
| HASHHDR *hctl = hashp->hctl; |
| HASHSEGMENT old_seg, |
| new_seg; |
| long old_bucket, |
| new_bucket; |
| long new_segnum, |
| new_segndx; |
| long old_segnum, |
| old_segndx; |
| HASHBUCKET *oldlink, |
| *newlink; |
| HASHBUCKET currElement, |
| nextElement; |
| |
| Assert(!IS_PARTITIONED(hctl)); |
| |
| #ifdef HASH_STATISTICS |
| hash_expansions++; |
| #endif |
| |
| new_bucket = hctl->max_bucket + 1; |
| new_segnum = new_bucket >> hashp->sshift; |
| new_segndx = MOD(new_bucket, hashp->ssize); |
| |
| if (new_segnum >= hctl->nsegs) |
| { |
| /* Allocate new segment if necessary -- could fail if dir full */ |
| if (new_segnum >= hctl->dsize) |
| if (!dir_realloc(hashp)) |
| return false; |
| if (!(hashp->dir[new_segnum] = seg_alloc(hashp))) |
| return false; |
| hctl->nsegs++; |
| } |
| |
| /* OK, we created a new bucket */ |
| hctl->max_bucket++; |
| |
| /* |
| * *Before* changing masks, find old bucket corresponding to same hash |
| * values; values in that bucket may need to be relocated to new bucket. |
| * Note that new_bucket is certainly larger than low_mask at this point, |
| * so we can skip the first step of the regular hash mask calc. |
| */ |
| old_bucket = (new_bucket & hctl->low_mask); |
| |
| /* |
| * If we crossed a power of 2, readjust masks. |
| */ |
| if ((uint32) new_bucket > hctl->high_mask) |
| { |
| hctl->low_mask = hctl->high_mask; |
| hctl->high_mask = (uint32) new_bucket | hctl->low_mask; |
| } |
| |
| /* |
| * Relocate records to the new bucket. NOTE: because of the way the hash |
| * masking is done in calc_bucket, only one old bucket can need to be |
| * split at this point. With a different way of reducing the hash value, |
| * that might not be true! |
| */ |
| old_segnum = old_bucket >> hashp->sshift; |
| old_segndx = MOD(old_bucket, hashp->ssize); |
| |
| old_seg = hashp->dir[old_segnum]; |
| new_seg = hashp->dir[new_segnum]; |
| |
| oldlink = &old_seg[old_segndx]; |
| newlink = &new_seg[new_segndx]; |
| |
| for (currElement = *oldlink; |
| currElement != NULL; |
| currElement = nextElement) |
| { |
| nextElement = currElement->link; |
| if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket) |
| { |
| *oldlink = currElement; |
| oldlink = &currElement->link; |
| } |
| else |
| { |
| *newlink = currElement; |
| newlink = &currElement->link; |
| } |
| } |
| /* don't forget to terminate the rebuilt hash chains... */ |
| *oldlink = NULL; |
| *newlink = NULL; |
| |
| return true; |
| } |
| |
| |
| static bool |
| dir_realloc(HTAB *hashp) |
| { |
| HASHSEGMENT *p; |
| HASHSEGMENT *old_p; |
| long new_dsize; |
| long old_dirsize; |
| long new_dirsize; |
| |
| if (hashp->hctl->max_dsize != NO_MAX_DSIZE) |
| return false; |
| |
| /* Reallocate directory */ |
| new_dsize = hashp->hctl->dsize << 1; |
| old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT); |
| new_dirsize = new_dsize * sizeof(HASHSEGMENT); |
| |
| old_p = hashp->dir; |
| CurrentDynaHashCxt = hashp->hcxt; |
| p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize); |
| |
| if (p != NULL) |
| { |
| memcpy(p, old_p, old_dirsize); |
| MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize); |
| hashp->dir = p; |
| hashp->hctl->dsize = new_dsize; |
| |
| /* XXX assume the allocator is palloc, so we know how to free */ |
| Assert(hashp->alloc == DynaHashAlloc); |
| pfree(old_p); |
| |
| return true; |
| } |
| |
| return false; |
| } |
| |
| |
| static HASHSEGMENT |
| seg_alloc(HTAB *hashp) |
| { |
| HASHSEGMENT segp; |
| |
| CurrentDynaHashCxt = hashp->hcxt; |
| segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize); |
| |
| if (!segp) |
| return NULL; |
| |
| MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize); |
| |
| return segp; |
| } |
| |
| /* |
| * allocate some new elements and link them into the free list |
| */ |
| static bool |
| element_alloc(HTAB *hashp, int nelem) |
| { |
| /* use volatile pointer to prevent code rearrangement */ |
| volatile HASHHDR *hctlv = hashp->hctl; |
| Size elementSize; |
| HASHELEMENT *firstElement; |
| HASHELEMENT *tmpElement; |
| HASHELEMENT *prevElement; |
| int i; |
| |
| /* Each element has a HASHELEMENT header plus user data. */ |
| elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize); |
| |
| CurrentDynaHashCxt = hashp->hcxt; |
| firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); |
| |
| if (!firstElement) |
| return false; |
| |
| /* prepare to link all the new entries into the freelist */ |
| prevElement = NULL; |
| tmpElement = firstElement; |
| for (i = 0; i < nelem; i++) |
| { |
| tmpElement->link = prevElement; |
| prevElement = tmpElement; |
| tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize); |
| } |
| |
| /* if partitioned, must lock to touch freeList */ |
| if (IS_PARTITIONED(hctlv)) |
| SpinLockAcquire(&hctlv->mutex); |
| |
| /* freelist could be nonempty if two backends did this concurrently */ |
| firstElement->link = hctlv->freeList; |
| hctlv->freeList = prevElement; |
| |
| if (IS_PARTITIONED(hctlv)) |
| SpinLockRelease(&hctlv->mutex); |
| |
| return true; |
| } |
| |
| /* complain when we have detected a corrupted hashtable */ |
| static void |
| hash_corrupted(HTAB *hashp) |
| { |
| /* |
| * If the corruption is in a shared hashtable, we'd better force a |
| * systemwide restart. Otherwise, just shut down this one backend. |
| */ |
| if (hashp->isshared) |
| elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname); |
| else |
| elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname); |
| } |
| |
/*
 * my_log2 -- calculate ceil(log base 2) of num
 *
 * Returns the smallest i such that (1 << i) >= num.  Inputs <= 1 yield 0.
 */
int
my_log2(long num)
{
	int			i;
	long		limit;

	/*
	 * Guard against too-large input: without this, "limit <<= 1" would
	 * eventually overflow (undefined behavior for signed types) and the
	 * loop might never terminate.  Clamping changes the result only for
	 * values of num that no caller can actually allocate for anyway.
	 */
	if (num > LONG_MAX / 2)
		num = LONG_MAX / 2;

	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
		;
	return i;
}
| |
| |
| /************************* SEQ SCAN TRACKING ************************/ |
| |
| /* |
| * We track active hash_seq_search scans here. The need for this mechanism |
| * comes from the fact that a scan will get confused if a bucket split occurs |
| * while it's in progress: it might visit entries twice, or even miss some |
| * entirely (if it's partway through the same bucket that splits). Hence |
| * we want to inhibit bucket splits if there are any active scans on the |
| * table being inserted into. This is a fairly rare case in current usage, |
| * so just postponing the split until the next insertion seems sufficient. |
| * |
| * Given present usages of the function, only a few scans are likely to be |
| * open concurrently; so a finite-size stack of open scans seems sufficient, |
| * and we don't worry that linear search is too slow. Note that we do |
| * allow multiple scans of the same hashtable to be open concurrently. |
| * |
| * This mechanism can support concurrent scan and insertion in a shared |
| * hashtable if it's the same backend doing both. It would fail otherwise, |
| * but locking reasons seem to preclude any such scenario anyway, so we don't |
| * worry. |
| * |
| * This arrangement is reasonably robust if a transient hashtable is deleted |
| * without notifying us. The absolute worst case is we might inhibit splits |
| * in another table created later at exactly the same address. We will give |
| * a warning at transaction end for reference leaks, so any bugs leading to |
| * lack of notification should be easy to catch. |
| */ |
| |
/* maximum number of concurrently-open scans we can track */
#define MAX_SEQ_SCANS 100

/*
 * Parallel arrays forming a stack of open scans; valid entries are
 * indices 0 .. num_seq_scans-1.  Deletion is by swap-with-top, so the
 * entries are NOT ordered by age or nest level.
 */
static HTAB *seq_scan_tables[MAX_SEQ_SCANS];	/* tables being scanned */
static int	seq_scan_level[MAX_SEQ_SCANS];		/* subtransaction nest level */
static int	num_seq_scans = 0;
| |
| |
| /* Register a table as having an active hash_seq_search scan */ |
| static void |
| register_seq_scan(HTAB *hashp) |
| { |
| if (num_seq_scans >= MAX_SEQ_SCANS) |
| elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"", |
| hashp->tabname); |
| seq_scan_tables[num_seq_scans] = hashp; |
| seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); |
| num_seq_scans++; |
| } |
| |
| /* Deregister an active scan */ |
| static void |
| deregister_seq_scan(HTAB *hashp) |
| { |
| int i; |
| |
| /* Search backward since it's most likely at the stack top */ |
| for (i = num_seq_scans - 1; i >= 0; i--) |
| { |
| if (seq_scan_tables[i] == hashp) |
| { |
| seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; |
| seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; |
| num_seq_scans--; |
| return; |
| } |
| } |
| elog(ERROR, "no hash_seq_search scan for hash table \"%s\"", |
| hashp->tabname); |
| } |
| |
| /* Check if a table has any active scan */ |
| static bool |
| has_seq_scans(HTAB *hashp) |
| { |
| int i; |
| |
| for (i = 0; i < num_seq_scans; i++) |
| { |
| if (seq_scan_tables[i] == hashp) |
| return true; |
| } |
| return false; |
| } |
| |
| /* Clean up any open scans at end of transaction */ |
| void |
| AtEOXact_HashTables(bool isCommit) |
| { |
| /* |
| * During abort cleanup, open scans are expected; just silently clean 'em |
| * out. An open scan at commit means someone forgot a hash_seq_term() |
| * call, so complain. |
| * |
| * Note: it's tempting to try to print the tabname here, but refrain for |
| * fear of touching deallocated memory. This isn't a user-facing message |
| * anyway, so it needn't be pretty. |
| */ |
| if (isCommit) |
| { |
| int i; |
| |
| for (i = 0; i < num_seq_scans; i++) |
| { |
| elog(WARNING, "leaked hash_seq_search scan for hash table %p", |
| seq_scan_tables[i]); |
| } |
| } |
| num_seq_scans = 0; |
| } |
| |
| /* Clean up any open scans at end of subtransaction */ |
| void |
| AtEOSubXact_HashTables(bool isCommit, int nestDepth) |
| { |
| int i; |
| |
| /* |
| * Search backward to make cleanup easy. Note we must check all entries, |
| * not only those at the end of the array, because deletion technique |
| * doesn't keep them in order. |
| */ |
| for (i = num_seq_scans - 1; i >= 0; i--) |
| { |
| if (seq_scan_level[i] >= nestDepth) |
| { |
| if (isCommit) |
| elog(WARNING, "leaked hash_seq_search scan for hash table %p", |
| seq_scan_tables[i]); |
| seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; |
| seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; |
| num_seq_scans--; |
| } |
| } |
| } |