| /*------------------------------------------------------------------------- |
| * |
| * toast_internals.c |
| * Functions for internal use by the TOAST system. |
| * |
| * Copyright (c) 2000-2023, PostgreSQL Global Development Group |
| * |
| * IDENTIFICATION |
| * src/backend/access/common/toast_internals.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/detoast.h" |
| #include "access/genam.h" |
| #include "access/heapam.h" |
| #include "access/heaptoast.h" |
| #include "access/table.h" |
| #include "access/toast_internals.h" |
| #include "access/xact.h" |
| #include "catalog/catalog.h" |
| #include "common/pg_lzcompress.h" |
| #include "miscadmin.h" |
| #include "utils/fmgroids.h" |
| #include "utils/rel.h" |
| #include "utils/snapmgr.h" |
| |
| static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); |
| static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); |
| |
| /* ---------- |
| * toast_compress_datum - |
| * |
| * Create a compressed version of a varlena datum |
| * |
| * If we fail (ie, compressed result is actually bigger than original) |
| * then return NULL. We must not use compressed data if it'd expand |
| * the tuple! |
| * |
| * We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without |
| * copying them. But we can't handle external or compressed datums. |
| * ---------- |
| */ |
| Datum |
| toast_compress_datum(Datum value, char cmethod) |
| { |
| struct varlena *tmp = NULL; |
| int32 valsize; |
| ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; |
| |
| Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value))); |
| Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); |
| |
| valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); |
| |
| /* If the compression method is not valid, use the current default */ |
| if (!CompressionMethodIsValid(cmethod)) |
| cmethod = default_toast_compression; |
| |
| /* |
| * Call appropriate compression routine for the compression method. |
| */ |
| switch (cmethod) |
| { |
| case TOAST_PGLZ_COMPRESSION: |
| tmp = pglz_compress_datum((const struct varlena *) value); |
| cmid = TOAST_PGLZ_COMPRESSION_ID; |
| break; |
| case TOAST_LZ4_COMPRESSION: |
| tmp = lz4_compress_datum((const struct varlena *) value); |
| cmid = TOAST_LZ4_COMPRESSION_ID; |
| break; |
| default: |
| elog(ERROR, "invalid compression method %c", cmethod); |
| } |
| |
| if (tmp == NULL) |
| return PointerGetDatum(NULL); |
| |
| /* |
| * We recheck the actual size even if compression reports success, because |
| * it might be satisfied with having saved as little as one byte in the |
| * compressed data --- which could turn into a net loss once you consider |
| * header and alignment padding. Worst case, the compressed format might |
| * require three padding bytes (plus header, which is included in |
| * VARSIZE(tmp)), whereas the uncompressed format would take only one |
| * header byte and no padding if the value is short enough. So we insist |
| * on a savings of more than 2 bytes to ensure we have a gain. |
| */ |
| if (VARSIZE(tmp) < valsize - 2) |
| { |
| /* successful compression */ |
| Assert(cmid != TOAST_INVALID_COMPRESSION_ID); |
| TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(tmp, valsize, cmid); |
| return PointerGetDatum(tmp); |
| } |
| else |
| { |
| /* incompressible data */ |
| pfree(tmp); |
| return PointerGetDatum(NULL); |
| } |
| } |
| |
| /* ---------- |
| * toast_save_datum - |
| * |
| * Save one single datum into the secondary relation and return |
| * a Datum reference for it. |
| * |
| * rel: the main relation we're working with (not the toast rel!) |
| * value: datum to be pushed to toast storage |
| * oldexternal: if not NULL, toast pointer previously representing the datum |
| * options: options to be passed to heap_insert() for toast rows |
| * ---------- |
| */ |
| Datum |
| toast_save_datum(Relation rel, Datum value, |
| struct varlena *oldexternal, int options) |
| { |
| Relation toastrel; |
| Relation *toastidxs; |
| HeapTuple toasttup; |
| TupleDesc toasttupDesc; |
| Datum t_values[3]; |
| bool t_isnull[3]; |
| TransactionId xid = GetCurrentTransactionId(); |
| CommandId mycid = GetCurrentCommandId(true); |
| struct varlena *result; |
| struct varatt_external toast_pointer; |
| union |
| { |
| struct varlena hdr; |
| /* this is to make the union big enough for a chunk: */ |
| char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; |
| /* ensure union is aligned well enough: */ |
| int32 align_it; |
| } chunk_data; |
| int32 chunk_size; |
| int32 chunk_seq = 0; |
| char *data_p; |
| int32 data_todo; |
| Pointer dval = DatumGetPointer(value); |
| int num_indexes; |
| int validIndex; |
| |
| Assert(!VARATT_IS_EXTERNAL(value)); |
| |
| /* |
| * Open the toast relation and its indexes. We can use the index to check |
| * uniqueness of the OID we assign to the toasted item, even though it has |
| * additional columns besides OID. |
| */ |
| toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); |
| toasttupDesc = toastrel->rd_att; |
| |
| /* Open all the toast indexes and look for the valid one */ |
| validIndex = toast_open_indexes(toastrel, |
| RowExclusiveLock, |
| &toastidxs, |
| &num_indexes); |
| |
| /* |
| * Get the data pointer and length, and compute va_rawsize and va_extinfo. |
| * |
| * va_rawsize is the size of the equivalent fully uncompressed datum, so |
| * we have to adjust for short headers. |
| * |
| * va_extinfo stored the actual size of the data payload in the toast |
| * records and the compression method in first 2 bits if data is |
| * compressed. |
| */ |
| if (VARATT_IS_SHORT(dval)) |
| { |
| data_p = VARDATA_SHORT(dval); |
| data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; |
| toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ |
| toast_pointer.va_extinfo = data_todo; |
| } |
| else if (VARATT_IS_COMPRESSED(dval)) |
| { |
| data_p = VARDATA(dval); |
| data_todo = VARSIZE(dval) - VARHDRSZ; |
| /* rawsize in a compressed datum is just the size of the payload */ |
| toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; |
| |
| /* set external size and compression method */ |
| VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, |
| VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); |
| /* Assert that the numbers look like it's compressed */ |
| Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); |
| } |
| else |
| { |
| data_p = VARDATA(dval); |
| data_todo = VARSIZE(dval) - VARHDRSZ; |
| toast_pointer.va_rawsize = VARSIZE(dval); |
| toast_pointer.va_extinfo = data_todo; |
| } |
| |
| /* |
| * Insert the correct table OID into the result TOAST pointer. |
| * |
| * Normally this is the actual OID of the target toast table, but during |
| * table-rewriting operations such as CLUSTER, we have to insert the OID |
| * of the table's real permanent toast table instead. rd_toastoid is set |
| * if we have to substitute such an OID. |
| */ |
| if (OidIsValid(rel->rd_toastoid)) |
| toast_pointer.va_toastrelid = rel->rd_toastoid; |
| else |
| toast_pointer.va_toastrelid = RelationGetRelid(toastrel); |
| |
| /* |
| * Choose an OID to use as the value ID for this toast value. |
| * |
| * Normally we just choose an unused OID within the toast table. But |
| * during table-rewriting operations where we are preserving an existing |
| * toast table OID, we want to preserve toast value OIDs too. So, if |
| * rd_toastoid is set and we had a prior external value from that same |
| * toast table, re-use its value ID. If we didn't have a prior external |
| * value (which is a corner case, but possible if the table's attstorage |
| * options have been changed), we have to pick a value ID that doesn't |
| * conflict with either new or existing toast value OIDs. |
| */ |
| if (!OidIsValid(rel->rd_toastoid)) |
| { |
| /* normal case: just choose an unused OID */ |
| toast_pointer.va_valueid = |
| GetNewOidWithIndex(toastrel, |
| RelationGetRelid(toastidxs[validIndex]), |
| (AttrNumber) 1); |
| } |
| else |
| { |
| /* rewrite case: check to see if value was in old toast table */ |
| toast_pointer.va_valueid = InvalidOid; |
| if (oldexternal != NULL) |
| { |
| struct varatt_external old_toast_pointer; |
| |
| Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); |
| /* Must copy to access aligned fields */ |
| VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); |
| if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) |
| { |
| /* This value came from the old toast table; reuse its OID */ |
| toast_pointer.va_valueid = old_toast_pointer.va_valueid; |
| |
| /* |
| * There is a corner case here: the table rewrite might have |
| * to copy both live and recently-dead versions of a row, and |
| * those versions could easily reference the same toast value. |
| * When we copy the second or later version of such a row, |
| * reusing the OID will mean we select an OID that's already |
| * in the new toast table. Check for that, and if so, just |
| * fall through without writing the data again. |
| * |
| * While annoying and ugly-looking, this is a good thing |
| * because it ensures that we wind up with only one copy of |
| * the toast value when there is only one copy in the old |
| * toast table. Before we detected this case, we'd have made |
| * multiple copies, wasting space; and what's worse, the |
| * copies belonging to already-deleted heap tuples would not |
| * be reclaimed by VACUUM. |
| */ |
| if (toastrel_valueid_exists(toastrel, |
| toast_pointer.va_valueid)) |
| { |
| /* Match, so short-circuit the data storage loop below */ |
| data_todo = 0; |
| } |
| } |
| } |
| if (toast_pointer.va_valueid == InvalidOid) |
| { |
| /* |
| * new value; must choose an OID that doesn't conflict in either |
| * old or new toast table |
| */ |
| do |
| { |
| toast_pointer.va_valueid = |
| GetNewOidWithIndex(toastrel, |
| RelationGetRelid(toastidxs[validIndex]), |
| (AttrNumber) 1); |
| } while (toastid_valueid_exists(rel->rd_toastoid, |
| toast_pointer.va_valueid)); |
| } |
| } |
| |
| /* |
| * Initialize constant parts of the tuple data |
| */ |
| t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); |
| t_values[2] = PointerGetDatum(&chunk_data); |
| t_isnull[0] = false; |
| t_isnull[1] = false; |
| t_isnull[2] = false; |
| |
| /* |
| * Split up the item into chunks |
| */ |
| while (data_todo > 0) |
| { |
| int i; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * Calculate the size of this chunk |
| */ |
| chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); |
| |
| /* |
| * Build a tuple and store it |
| */ |
| t_values[1] = Int32GetDatum(chunk_seq++); |
| SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); |
| memcpy(VARDATA(&chunk_data), data_p, chunk_size); |
| toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); |
| |
| heap_insert(toastrel, toasttup, mycid, options, NULL, xid); |
| |
| /* |
| * Create the index entry. We cheat a little here by not using |
| * FormIndexDatum: this relies on the knowledge that the index columns |
| * are the same as the initial columns of the table for all the |
| * indexes. We also cheat by not providing an IndexInfo: this is okay |
| * for now because btree doesn't need one, but we might have to be |
| * more honest someday. |
| * |
| * Note also that there had better not be any user-created index on |
| * the TOAST table, since we don't bother to update anything else. |
| */ |
| for (i = 0; i < num_indexes; i++) |
| { |
| /* Only index relations marked as ready can be updated */ |
| if (toastidxs[i]->rd_index->indisready) |
| index_insert(toastidxs[i], t_values, t_isnull, |
| &(toasttup->t_self), |
| toastrel, |
| toastidxs[i]->rd_index->indisunique ? |
| UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, |
| false, NULL); |
| } |
| |
| /* |
| * Free memory |
| */ |
| heap_freetuple(toasttup); |
| |
| /* |
| * Move on to next chunk |
| */ |
| data_todo -= chunk_size; |
| data_p += chunk_size; |
| } |
| |
| /* |
| * Done - close toast relation and its indexes but keep the lock until |
| * commit, so as a concurrent reindex done directly on the toast relation |
| * would be able to wait for this transaction. |
| */ |
| toast_close_indexes(toastidxs, num_indexes, NoLock); |
| table_close(toastrel, NoLock); |
| |
| /* |
| * Create the TOAST pointer value that we'll return |
| */ |
| result = (struct varlena *) palloc(TOAST_POINTER_SIZE); |
| SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); |
| memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); |
| |
| return PointerGetDatum(result); |
| } |
| |
| /* ---------- |
| * toast_delete_datum - |
| * |
| * Delete a single external stored value. |
| * ---------- |
| */ |
| void |
| toast_delete_datum(Relation rel, Datum value, bool is_speculative) |
| { |
| struct varlena *attr = (struct varlena *) DatumGetPointer(value); |
| struct varatt_external toast_pointer; |
| Relation toastrel; |
| Relation *toastidxs; |
| ScanKeyData toastkey; |
| SysScanDesc toastscan; |
| HeapTuple toasttup; |
| int num_indexes; |
| int validIndex; |
| SnapshotData SnapshotToast; |
| |
| if (!VARATT_IS_EXTERNAL_ONDISK(attr)) |
| return; |
| |
| /* Must copy to access aligned fields */ |
| VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); |
| |
| /* |
| * Open the toast relation and its indexes |
| */ |
| toastrel = table_open(toast_pointer.va_toastrelid, RowExclusiveLock); |
| |
| /* Fetch valid relation used for process */ |
| validIndex = toast_open_indexes(toastrel, |
| RowExclusiveLock, |
| &toastidxs, |
| &num_indexes); |
| |
| /* |
| * Setup a scan key to find chunks with matching va_valueid |
| */ |
| ScanKeyInit(&toastkey, |
| (AttrNumber) 1, |
| BTEqualStrategyNumber, F_OIDEQ, |
| ObjectIdGetDatum(toast_pointer.va_valueid)); |
| |
| /* |
| * Find all the chunks. (We don't actually care whether we see them in |
| * sequence or not, but since we've already locked the index we might as |
| * well use systable_beginscan_ordered.) |
| */ |
| init_toast_snapshot(&SnapshotToast); |
| toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], |
| &SnapshotToast, 1, &toastkey); |
| while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) |
| { |
| /* |
| * Have a chunk, delete it |
| */ |
| if (is_speculative) |
| heap_abort_speculative(toastrel, &toasttup->t_self); |
| else |
| simple_heap_delete(toastrel, &toasttup->t_self); |
| } |
| |
| /* |
| * End scan and close relations but keep the lock until commit, so as a |
| * concurrent reindex done directly on the toast relation would be able to |
| * wait for this transaction. |
| */ |
| systable_endscan_ordered(toastscan); |
| toast_close_indexes(toastidxs, num_indexes, NoLock); |
| table_close(toastrel, NoLock); |
| } |
| |
| /* ---------- |
| * toastrel_valueid_exists - |
| * |
| * Test whether a toast value with the given ID exists in the toast relation. |
| * For safety, we consider a value to exist if there are either live or dead |
| * toast rows with that ID; see notes for GetNewOidWithIndex(). |
| * ---------- |
| */ |
| static bool |
| toastrel_valueid_exists(Relation toastrel, Oid valueid) |
| { |
| bool result = false; |
| ScanKeyData toastkey; |
| SysScanDesc toastscan; |
| int num_indexes; |
| int validIndex; |
| Relation *toastidxs; |
| |
| /* Fetch a valid index relation */ |
| validIndex = toast_open_indexes(toastrel, |
| RowExclusiveLock, |
| &toastidxs, |
| &num_indexes); |
| |
| /* |
| * Setup a scan key to find chunks with matching va_valueid |
| */ |
| ScanKeyInit(&toastkey, |
| (AttrNumber) 1, |
| BTEqualStrategyNumber, F_OIDEQ, |
| ObjectIdGetDatum(valueid)); |
| |
| /* |
| * Is there any such chunk? |
| */ |
| toastscan = systable_beginscan(toastrel, |
| RelationGetRelid(toastidxs[validIndex]), |
| true, SnapshotAny, 1, &toastkey); |
| |
| if (systable_getnext(toastscan) != NULL) |
| result = true; |
| |
| systable_endscan(toastscan); |
| |
| /* Clean up */ |
| toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); |
| |
| return result; |
| } |
| |
| /* ---------- |
| * toastid_valueid_exists - |
| * |
| * As above, but work from toast rel's OID not an open relation |
| * ---------- |
| */ |
| static bool |
| toastid_valueid_exists(Oid toastrelid, Oid valueid) |
| { |
| bool result; |
| Relation toastrel; |
| |
| toastrel = table_open(toastrelid, AccessShareLock); |
| |
| result = toastrel_valueid_exists(toastrel, valueid); |
| |
| table_close(toastrel, AccessShareLock); |
| |
| return result; |
| } |
| |
| /* ---------- |
| * toast_get_valid_index |
| * |
| * Get OID of valid index associated to given toast relation. A toast |
| * relation can have only one valid index at the same time. |
| */ |
| Oid |
| toast_get_valid_index(Oid toastoid, LOCKMODE lock) |
| { |
| int num_indexes; |
| int validIndex; |
| Oid validIndexOid; |
| Relation *toastidxs; |
| Relation toastrel; |
| |
| /* Open the toast relation */ |
| toastrel = table_open(toastoid, lock); |
| |
| /* Look for the valid index of the toast relation */ |
| validIndex = toast_open_indexes(toastrel, |
| lock, |
| &toastidxs, |
| &num_indexes); |
| validIndexOid = RelationGetRelid(toastidxs[validIndex]); |
| |
| /* Close the toast relation and all its indexes */ |
| toast_close_indexes(toastidxs, num_indexes, NoLock); |
| table_close(toastrel, NoLock); |
| |
| return validIndexOid; |
| } |
| |
| /* ---------- |
| * toast_open_indexes |
| * |
| * Get an array of the indexes associated to the given toast relation |
| * and return as well the position of the valid index used by the toast |
| * relation in this array. It is the responsibility of the caller of this |
| * function to close the indexes as well as free them. |
| */ |
| int |
| toast_open_indexes(Relation toastrel, |
| LOCKMODE lock, |
| Relation **toastidxs, |
| int *num_indexes) |
| { |
| int i = 0; |
| int res = 0; |
| bool found = false; |
| List *indexlist; |
| ListCell *lc; |
| |
| /* Get index list of the toast relation */ |
| indexlist = RelationGetIndexList(toastrel); |
| Assert(indexlist != NIL); |
| |
| *num_indexes = list_length(indexlist); |
| |
| /* Open all the index relations */ |
| *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); |
| foreach(lc, indexlist) |
| (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); |
| |
| /* Fetch the first valid index in list */ |
| for (i = 0; i < *num_indexes; i++) |
| { |
| Relation toastidx = (*toastidxs)[i]; |
| |
| if (toastidx->rd_index->indisvalid) |
| { |
| res = i; |
| found = true; |
| break; |
| } |
| } |
| |
| /* |
| * Free index list, not necessary anymore as relations are opened and a |
| * valid index has been found. |
| */ |
| list_free(indexlist); |
| |
| /* |
| * The toast relation should have one valid index, so something is going |
| * wrong if there is nothing. |
| */ |
| if (!found) |
| elog(ERROR, "no valid index found for toast relation with Oid %u", |
| RelationGetRelid(toastrel)); |
| |
| return res; |
| } |
| |
| /* ---------- |
| * toast_close_indexes |
| * |
| * Close an array of indexes for a toast relation and free it. This should |
| * be called for a set of indexes opened previously with toast_open_indexes. |
| */ |
| void |
| toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock) |
| { |
| int i; |
| |
| /* Close relations and clean up things */ |
| for (i = 0; i < num_indexes; i++) |
| index_close(toastidxs[i], lock); |
| pfree(toastidxs); |
| } |
| |
| /* ---------- |
| * init_toast_snapshot |
| * |
| * Initialize an appropriate TOAST snapshot. We must use an MVCC snapshot |
| * to initialize the TOAST snapshot; since we don't know which one to use, |
| * just use the oldest one. This is safe: at worst, we will get a "snapshot |
| * too old" error that might have been avoided otherwise. |
| */ |
| void |
| init_toast_snapshot(Snapshot toast_snapshot) |
| { |
| Snapshot snapshot = GetOldestSnapshot(); |
| |
| /* |
| * GetOldestSnapshot returns NULL if the session has no active snapshots. |
| * We can get that if, for example, a procedure fetches a toasted value |
| * into a local variable, commits, and then tries to detoast the value. |
| * Such coding is unsafe, because once we commit there is nothing to |
| * prevent the toast data from being deleted. Detoasting *must* happen in |
| * the same transaction that originally fetched the toast pointer. Hence, |
| * rather than trying to band-aid over the problem, throw an error. (This |
| * is not very much protection, because in many scenarios the procedure |
| * would have already created a new transaction snapshot, preventing us |
| * from detecting the problem. But it's better than nothing, and for sure |
| * we shouldn't expend code on masking the problem more.) |
| */ |
| if (snapshot == NULL) |
| elog(ERROR, "cannot fetch toast data without an active snapshot"); |
| |
| /* |
| * Catalog snapshots can be returned by GetOldestSnapshot() even if not |
| * registered or active. That easily hides bugs around not having a |
| * snapshot set up - most of the time there is a valid catalog snapshot. |
| * So additionally insist that the current snapshot is registered or |
| * active. |
| */ |
| Assert(HaveRegisteredOrActiveSnapshot()); |
| |
| InitToastSnapshot(*toast_snapshot, snapshot->lsn, snapshot->whenTaken); |
| } |