| /*------------------------------------------------------------------------- |
| * |
| * ginpostinglist.c |
| * routines for dealing with posting lists. |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/gin/ginpostinglist.c |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/gin_private.h" |
| |
| #ifdef USE_ASSERT_CHECKING |
| #define CHECK_ENCODING_ROUNDTRIP |
| #endif |
| |
| /* |
| * For encoding purposes, item pointers are represented as 64-bit unsigned |
| * integers. The lowest 11 bits represent the offset number, and the next |
| * lowest 32 bits are the block number. That leaves 21 bits unused, i.e. |
| * only 43 low bits are used. |
| * |
| * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage < |
| * 2^11 on all supported block sizes. We are frugal with the bits, because |
| * smaller integers use fewer bytes in the varbyte encoding, saving disk |
| * space. (If we get a new table AM in the future that wants to use the full |
| * range of possible offset numbers, we'll need to change this.) |
| * |
| * These 43-bit integers are encoded using varbyte encoding. In each byte, |
| * the 7 low bits contain data, while the highest bit is a continuation bit. |
| * When the continuation bit is set, the next byte is part of the same |
| * integer, otherwise this is the last byte of this integer. 43 bits need |
| * at most 7 bytes in this encoding: |
| * |
| * 0XXXXXXX |
| * 1XXXXXXX 0XXXXYYY |
| * 1XXXXXXX 1XXXXYYY 0YYYYYYY |
| * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY |
| * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY |
| * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY |
| * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY |
| * |
| * X = bits used for offset number |
| * Y = bits used for block number |
| * u = unused bit |
| * |
| * The bytes are in stored in little-endian order. |
| * |
| * An important property of this encoding is that removing an item from list |
| * never increases the size of the resulting compressed posting list. Proof: |
| * |
| * Removing number is actually replacement of two numbers with their sum. We |
| * have to prove that varbyte encoding of a sum can't be longer than varbyte |
| * encoding of its summands. Sum of two numbers is at most one bit wider than |
| * the larger of the summands. Widening a number by one bit enlarges its length |
| * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum |
| * is at most one byte longer than varbyte encoding of larger summand. Lesser |
| * summand is at least one byte, so the sum cannot take more space than the |
| * summands, Q.E.D. |
| * |
| * This property greatly simplifies VACUUM, which can assume that posting |
| * lists always fit on the same page after vacuuming. Note that even though |
| * that holds for removing items from a posting list, you must also be |
| * careful to not cause expansion e.g. when merging uncompressed items on the |
| * page into the compressed lists, when vacuuming. |
| */ |
| |
| /* |
| * How many bits do you need to encode offset number? OffsetNumber is a 16-bit |
| * integer, but you can't fit that many items on a page. 11 ought to be more |
| * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and |
| * use the minimum number of bits, but that would require changing the on-disk |
| * format if MaxHeapTuplesPerPage changes. Better to leave some slack. |
| * |
| * |
| * Cloudberry modification: |
| * |
| * Cloudberry's append-only tables use the full 16 bit offset number range, so |
| * Cloudberry removes the storage optimization made by Postgres for heap tables. |
| * |
| */ |
| #define MaxHeapTuplesPerPageBits 16 |
| |
| /* Max. number of bytes needed to encode the largest supported integer. */ |
| #define MaxBytesPerInteger 7 |
| |
| static inline uint64 |
| itemptr_to_uint64(const ItemPointer iptr) |
| { |
| uint64 val; |
| |
| Assert(ItemPointerIsValid(iptr)); |
| /* |
| * Cloudberry allow 16 bits for the offsetnumber, which turns the below |
| * upstream assertion into an always-true comparison which generates a |
| * compiler warning; thus we need to keep this commented out. |
| */ |
| #if 0 |
| Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits)); |
| #endif |
| |
| val = GinItemPointerGetBlockNumber(iptr); |
| val <<= MaxHeapTuplesPerPageBits; |
| val |= GinItemPointerGetOffsetNumber(iptr); |
| |
| return val; |
| } |
| |
| static inline void |
| uint64_to_itemptr(uint64 val, ItemPointer iptr) |
| { |
| GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1)); |
| val = val >> MaxHeapTuplesPerPageBits; |
| GinItemPointerSetBlockNumber(iptr, val); |
| |
| Assert(ItemPointerIsValid(iptr)); |
| } |
| |
| /* |
| * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer. |
| */ |
| static void |
| encode_varbyte(uint64 val, unsigned char **ptr) |
| { |
| unsigned char *p = *ptr; |
| |
| while (val > 0x7F) |
| { |
| *(p++) = 0x80 | (val & 0x7F); |
| val >>= 7; |
| } |
| *(p++) = (unsigned char) val; |
| |
| *ptr = p; |
| } |
| |
| /* |
| * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. |
| */ |
| static uint64 |
| decode_varbyte(unsigned char **ptr) |
| { |
| uint64 val; |
| unsigned char *p = *ptr; |
| uint64 c; |
| |
| /* 1st byte */ |
| c = *(p++); |
| val = c & 0x7F; |
| if (c & 0x80) |
| { |
| /* 2nd byte */ |
| c = *(p++); |
| val |= (c & 0x7F) << 7; |
| if (c & 0x80) |
| { |
| /* 3rd byte */ |
| c = *(p++); |
| val |= (c & 0x7F) << 14; |
| if (c & 0x80) |
| { |
| /* 4th byte */ |
| c = *(p++); |
| val |= (c & 0x7F) << 21; |
| if (c & 0x80) |
| { |
| /* 5th byte */ |
| c = *(p++); |
| val |= (c & 0x7F) << 28; |
| if (c & 0x80) |
| { |
| /* 6th byte */ |
| c = *(p++); |
| val |= (c & 0x7F) << 35; |
| if (c & 0x80) |
| { |
| /* 7th byte, should not have continuation bit */ |
| c = *(p++); |
| val |= c << 42; |
| Assert((c & 0x80) == 0); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| *ptr = p; |
| |
| return val; |
| } |
| |
| /* |
| * Encode a posting list. |
| * |
| * The encoded list is returned in a palloc'd struct, which will be at most |
| * 'maxsize' bytes in size. The number items in the returned segment is |
| * returned in *nwritten. If it's not equal to nipd, not all the items fit |
| * in 'maxsize', and only the first *nwritten were encoded. |
| * |
| * The allocated size of the returned struct is short-aligned, and the padding |
| * byte at the end, if any, is zero. |
| */ |
| GinPostingList * |
| ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, |
| int *nwritten) |
| { |
| uint64 prev; |
| int totalpacked = 0; |
| int maxbytes; |
| GinPostingList *result; |
| unsigned char *ptr; |
| unsigned char *endptr; |
| |
| maxsize = SHORTALIGN_DOWN(maxsize); |
| |
| result = palloc(maxsize); |
| |
| maxbytes = maxsize - offsetof(GinPostingList, bytes); |
| Assert(maxbytes > 0); |
| |
| /* Store the first special item */ |
| result->first = ipd[0]; |
| |
| prev = itemptr_to_uint64(&result->first); |
| |
| ptr = result->bytes; |
| endptr = result->bytes + maxbytes; |
| for (totalpacked = 1; totalpacked < nipd; totalpacked++) |
| { |
| uint64 val = itemptr_to_uint64(&ipd[totalpacked]); |
| uint64 delta = val - prev; |
| |
| Assert(val > prev); |
| |
| if (endptr - ptr >= MaxBytesPerInteger) |
| encode_varbyte(delta, &ptr); |
| else |
| { |
| /* |
| * There are less than 7 bytes left. Have to check if the next |
| * item fits in that space before writing it out. |
| */ |
| unsigned char buf[MaxBytesPerInteger]; |
| unsigned char *p = buf; |
| |
| encode_varbyte(delta, &p); |
| if (p - buf > (endptr - ptr)) |
| break; /* output is full */ |
| |
| memcpy(ptr, buf, p - buf); |
| ptr += (p - buf); |
| } |
| prev = val; |
| } |
| result->nbytes = ptr - result->bytes; |
| |
| /* |
| * If we wrote an odd number of bytes, zero out the padding byte at the |
| * end. |
| */ |
| if (result->nbytes != SHORTALIGN(result->nbytes)) |
| result->bytes[result->nbytes] = 0; |
| |
| if (nwritten) |
| *nwritten = totalpacked; |
| |
| Assert(SizeOfGinPostingList(result) <= maxsize); |
| |
| /* |
| * Check that the encoded segment decodes back to the original items. |
| */ |
| #if defined (CHECK_ENCODING_ROUNDTRIP) |
| { |
| int ndecoded; |
| ItemPointer tmp = ginPostingListDecode(result, &ndecoded); |
| |
| Assert(ndecoded == totalpacked); |
| Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0); |
| pfree(tmp); |
| } |
| #endif |
| |
| return result; |
| } |
| |
| /* |
| * Decode a compressed posting list into an array of item pointers. |
| * The number of items is returned in *ndecoded. |
| */ |
| ItemPointer |
| ginPostingListDecode(GinPostingList *plist, int *ndecoded_out) |
| { |
| return ginPostingListDecodeAllSegments(plist, |
| SizeOfGinPostingList(plist), |
| ndecoded_out); |
| } |
| |
| /* |
| * Decode multiple posting list segments into an array of item pointers. |
| * The number of items is returned in *ndecoded_out. The segments are stored |
| * one after each other, with total size 'len' bytes. |
| */ |
| ItemPointer |
| ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) |
| { |
| ItemPointer result; |
| int nallocated; |
| uint64 val; |
| char *endseg = ((char *) segment) + len; |
| int ndecoded; |
| unsigned char *ptr; |
| unsigned char *endptr; |
| |
| /* |
| * Guess an initial size of the array. |
| */ |
| nallocated = segment->nbytes * 2 + 1; |
| result = palloc(nallocated * sizeof(ItemPointerData)); |
| |
| ndecoded = 0; |
| while ((char *) segment < endseg) |
| { |
| /* enlarge output array if needed */ |
| if (ndecoded >= nallocated) |
| { |
| nallocated *= 2; |
| result = repalloc(result, nallocated * sizeof(ItemPointerData)); |
| } |
| |
| /* copy the first item */ |
| Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); |
| Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); |
| result[ndecoded] = segment->first; |
| ndecoded++; |
| |
| val = itemptr_to_uint64(&segment->first); |
| ptr = segment->bytes; |
| endptr = segment->bytes + segment->nbytes; |
| while (ptr < endptr) |
| { |
| /* enlarge output array if needed */ |
| if (ndecoded >= nallocated) |
| { |
| nallocated *= 2; |
| result = repalloc(result, nallocated * sizeof(ItemPointerData)); |
| } |
| |
| val += decode_varbyte(&ptr); |
| |
| uint64_to_itemptr(val, &result[ndecoded]); |
| ndecoded++; |
| } |
| segment = GinNextPostingListSegment(segment); |
| } |
| |
| if (ndecoded_out) |
| *ndecoded_out = ndecoded; |
| return result; |
| } |
| |
| /* |
| * Add all item pointers from a bunch of posting lists to a TIDBitmap. |
| */ |
| int |
| ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, |
| TIDBitmap *tbm) |
| { |
| int ndecoded; |
| ItemPointer items; |
| |
| items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); |
| tbm_add_tuples(tbm, items, ndecoded, false); |
| pfree(items); |
| |
| return ndecoded; |
| } |
| |
| /* |
| * Merge two ordered arrays of itempointers, eliminating any duplicates. |
| * |
| * Returns a palloc'd array, and *nmerged is set to the number of items in |
| * the result, after eliminating duplicates. |
| */ |
| ItemPointer |
| ginMergeItemPointers(ItemPointerData *a, uint32 na, |
| ItemPointerData *b, uint32 nb, |
| int *nmerged) |
| { |
| ItemPointerData *dst; |
| |
| dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData)); |
| |
| /* |
| * If the argument arrays don't overlap, we can just append them to each |
| * other. |
| */ |
| if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) |
| { |
| memcpy(dst, a, na * sizeof(ItemPointerData)); |
| memcpy(&dst[na], b, nb * sizeof(ItemPointerData)); |
| *nmerged = na + nb; |
| } |
| else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) |
| { |
| memcpy(dst, b, nb * sizeof(ItemPointerData)); |
| memcpy(&dst[nb], a, na * sizeof(ItemPointerData)); |
| *nmerged = na + nb; |
| } |
| else |
| { |
| ItemPointerData *dptr = dst; |
| ItemPointerData *aptr = a; |
| ItemPointerData *bptr = b; |
| |
| while (aptr - a < na && bptr - b < nb) |
| { |
| int cmp = ginCompareItemPointers(aptr, bptr); |
| |
| if (cmp > 0) |
| *dptr++ = *bptr++; |
| else if (cmp == 0) |
| { |
| /* only keep one copy of the identical items */ |
| *dptr++ = *bptr++; |
| aptr++; |
| } |
| else |
| *dptr++ = *aptr++; |
| } |
| |
| while (aptr - a < na) |
| *dptr++ = *aptr++; |
| |
| while (bptr - b < nb) |
| *dptr++ = *bptr++; |
| |
| *nmerged = dptr - dst; |
| } |
| |
| return dst; |
| } |