 /* src/backend/storage/large_object/inv_api.c - cloudberry - Git at Google */

 /*-------------------------------------------------------------------------
  *
  * inv_api.c
  *	  routines for manipulating inversion fs large objects. This file
  *	  contains the user-level large object application interface routines.
  *
  *
  * Note: we access pg_largeobject.data using its C struct declaration.
  * This is safe because it immediately follows pageno which is an int4 field,
  * and therefore the data field will always be 4-byte aligned, even if it
  * is in the short 1-byte-header format.  We have to detoast it since it's
  * quite likely to be in compressed or short format.  We also need to check
  * for NULLs, since initdb will mark loid and pageno but not data as NOT NULL.
  *
  * Note: many of these routines leak memory in CurrentMemoryContext, as indeed
  * does most of the backend code.  We expect that CurrentMemoryContext will
  * be a short-lived context.  Data that must persist across function calls
  * is kept either in CacheMemoryContext (the Relation structs) or in the
  * memory context given to inv_open (for LargeObjectDesc structs).
  *
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
  *	  src/backend/storage/large_object/inv_api.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include <limits.h>

 #include "access/detoast.h"
 #include "access/genam.h"
 #include "access/htup_details.h"
 #include "access/sysattr.h"
 #include "access/table.h"
 #include "access/xact.h"
 #include "catalog/dependency.h"
 #include "catalog/indexing.h"
 #include "catalog/objectaccess.h"
 #include "catalog/pg_largeobject.h"
 #include "catalog/pg_largeobject_metadata.h"
 #include "libpq/libpq-fs.h"
 #include "miscadmin.h"
 #include "storage/large_object.h"
 #include "utils/acl.h"
 #include "utils/fmgroids.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"


 /*
  * GUC: backwards-compatibility flag to suppress LO permission checks.
  *
  * When true, inv_open() skips its ACL_SELECT / ACL_UPDATE checks on the
  * large object being opened.
  */
 bool		lo_compat_privileges;

 /*
  * All accesses to pg_largeobject and its index make use of a single Relation
  * reference each, so that we only need to open pg_largeobject once per
  * transaction.  To avoid problems when the first such reference occurs inside
  * a subtransaction, we execute a slightly klugy maneuver to assign ownership
  * of the Relation references to TopTransactionResourceOwner.
  *
  * Both pointers are filled in by open_lo_relation() and reset to NULL by
  * close_lo_relation(); NULL means "not opened in the current transaction".
  */
 static Relation lo_heap_r = NULL;
 static Relation lo_index_r = NULL;


 /*
  * Make sure pg_largeobject and its index are open for the current
  * transaction, caching the references in lo_heap_r and lo_index_r.
  *
  * The references are acquired while CurrentResourceOwner is temporarily
  * switched to TopTransactionResourceOwner, so that they are owned by the top
  * transaction even when the first access happens inside a subtransaction.
  */
 static void
 open_lo_relation(void)
 {
 	ResourceOwner savedOwner;

 	/* Nothing to do if both were already opened earlier in this xact */
 	if (lo_heap_r != NULL && lo_index_r != NULL)
 		return;

 	/* Temporarily make the top transaction the owner of new references */
 	savedOwner = CurrentResourceOwner;
 	CurrentResourceOwner = TopTransactionResourceOwner;

 	/* RowExclusiveLock is used because we might either read or write */
 	if (!lo_heap_r)
 		lo_heap_r = table_open(LargeObjectRelationId, RowExclusiveLock);
 	if (!lo_index_r)
 		lo_index_r = index_open(LargeObjectLOidPNIndexId, RowExclusiveLock);

 	CurrentResourceOwner = savedOwner;
 }

 /*
  * End-of-main-transaction cleanup for the cached pg_largeobject references.
  *
  * isCommit: true when the transaction is committing.  Only in that case are
  * the relations explicitly closed here (with NoLock); otherwise the cached
  * pointers are merely forgotten and abort cleanup will handle the rest.
  */
 void
 close_lo_relation(bool isCommit)
 {
 	/* Neither relation was opened in this transaction: nothing to clean up */
 	if (lo_heap_r == NULL && lo_index_r == NULL)
 		return;

 	if (isCommit)
 	{
 		ResourceOwner savedOwner = CurrentResourceOwner;

 		/* Close under the owner that open_lo_relation() assigned them to */
 		CurrentResourceOwner = TopTransactionResourceOwner;

 		if (lo_index_r != NULL)
 			index_close(lo_index_r, NoLock);
 		if (lo_heap_r != NULL)
 			table_close(lo_heap_r, NoLock);

 		CurrentResourceOwner = savedOwner;
 	}

 	/* Commit or abort, the cached pointers must not be reused */
 	lo_heap_r = NULL;
 	lo_index_r = NULL;
 }


 /*
  * Variant of pg_largeobject.c's LargeObjectExists() that lets the caller
  * choose the snapshot used for the lookup.
  *
  * Returns true if a scan of pg_largeobject_metadata for OID "loid" finds a
  * tuple under "snapshot", false otherwise.
  */
 static bool
 myLargeObjectExists(Oid loid, Snapshot snapshot)
 {
 	Relation	metaRel;
 	ScanKeyData oidKey;
 	SysScanDesc scan;
 	HeapTuple	hit;
 	bool		found;

 	ScanKeyInit(&oidKey,
 				Anum_pg_largeobject_metadata_oid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(loid));

 	metaRel = table_open(LargeObjectMetadataRelationId, AccessShareLock);
 	scan = systable_beginscan(metaRel,
 							  LargeObjectMetadataOidIndexId, true,
 							  snapshot, 1, &oidKey);

 	/* One matching tuple is all we need to know about */
 	hit = systable_getnext(scan);
 	found = HeapTupleIsValid(hit) ? true : false;

 	systable_endscan(scan);
 	table_close(metaRel, AccessShareLock);

 	return found;
 }


 /*
  * Fetch the data column of a pg_largeobject row.
  *
  *	tuple - the row, viewed through its C struct
  *	pdatafield - receives a bytea pointer to the (detoasted) data
  *	plen - receives the payload length in bytes, excluding the varlena header
  *	pfreeit - receives true if *pdatafield was produced by detoast_attr() and
  *			  the caller should pfree it when done
  *
  * Raises ERRCODE_DATA_CORRUPTED if the payload length is negative or larger
  * than LOBLKSIZE.
  */
 static void
 getdatafield(Form_pg_largeobject tuple,
 			 bytea **pdatafield,
 			 int *plen,
 			 bool *pfreeit)
 {
 	bytea	   *field = &(tuple->data); /* see note at top of file */
 	bool		detoasted = false;
 	int			datalen;

 	if (VARATT_IS_EXTENDED(field))
 	{
 		field = (bytea *) detoast_attr((struct varlena *) field);
 		detoasted = true;
 	}

 	/* Sanity-check the payload size before anyone copies it into a buffer */
 	datalen = VARSIZE(field) - VARHDRSZ;
 	if (!(datalen >= 0 && datalen <= LOBLKSIZE))
 		ereport(ERROR,
 				(errcode(ERRCODE_DATA_CORRUPTED),
 				 errmsg("pg_largeobject entry for OID %u, page %d has invalid data field size %d",
 						tuple->loid, tuple->pageno, datalen)));

 	*pdatafield = field;
 	*plen = datalen;
 	*pfreeit = detoasted;
 }


 /*
  *	inv_create -- create a new large object
  *
  *	Arguments:
  *	  lobjId - OID to use for new large object, or InvalidOid to pick one
  *
  *	Returns:
  *	  OID of new object
  *
  * If lobjId is not InvalidOid, then an error occurs if the OID is already
  * in use.
  *
  * Besides creating the object, this records the owner dependency, fires the
  * post-create hook and bumps the command counter.
  */
 Oid
 inv_create(Oid lobjId)
 {
 	/* Create a new largeobject with empty data pages */
 	Oid			newOid = LargeObjectCreate(lobjId);

 	/*
 	 * Record the dependency on the owner (the current user).
 	 *
 	 * LO dependencies use classId LargeObjectRelationId for
 	 * backwards-compatibility reasons.  LargeObjectMetadataRelationId would
 	 * be simpler for the backend, but it'd complicate pg_dump and possibly
 	 * break other clients.
 	 */
 	recordDependencyOnOwner(LargeObjectRelationId, newOid, GetUserId());

 	/* Let any post-creation hook see the new large object */
 	InvokeObjectPostCreateHook(LargeObjectRelationId, newOid, 0);

 	/* Make the new tuple visible to later operations */
 	CommandCounterIncrement();

 	return newOid;
 }

 /*
  *	inv_open -- access an existing large object.
  *
  *	Arguments:
  *	  lobjId - OID of the large object to open
  *	  flags - INV_READ and/or INV_WRITE
  *	  mcxt - memory context in which to allocate the descriptor
  *
  * Returns a freshly allocated, filled-in large object descriptor.  The
  * context must be suitably long-lived for the caller's purposes.  If the
  * returned descriptor has a snapshot associated with it, the caller must
  * ensure that it also lives long enough, e.g. by calling
  * RegisterSnapshotOnOwner.
  *
  * Errors are raised for flags containing neither INV_READ nor INV_WRITE, for
  * a nonexistent object, and for insufficient privileges.
  */
 LargeObjectDesc *
 inv_open(Oid lobjId, int flags, MemoryContext mcxt)
 {
 	LargeObjectDesc *desc;
 	Snapshot	snap;
 	int			lockflags = 0;

 	/*
 	 * Historically INV_WRITE alone behaves like INV_WRITE | INV_READ: a
 	 * writable descriptor may always be read as well.
 	 */
 	if (flags & INV_READ)
 		lockflags |= IFS_RDLOCK;
 	if (flags & INV_WRITE)
 		lockflags |= IFS_RDLOCK | IFS_WRLOCK;

 	if (lockflags == 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid flags for opening a large object: %d",
 						flags)));

 	/* Writers use an instantaneous snapshot (NULL); readers the active one */
 	snap = (lockflags & IFS_WRLOCK) ? NULL : GetActiveSnapshot();

 	/* Can't use LargeObjectExists here because we need to specify snapshot */
 	if (!myLargeObjectExists(lobjId, snap))
 		ereport(ERROR,
 				(errcode(ERRCODE_UNDEFINED_OBJECT),
 				 errmsg("large object %u does not exist", lobjId)));

 	/* Permission checks, again under the chosen snapshot */
 	if (!lo_compat_privileges)
 	{
 		if ((lockflags & IFS_RDLOCK) != 0 &&
 			pg_largeobject_aclcheck_snapshot(lobjId,
 											 GetUserId(),
 											 ACL_SELECT,
 											 snap) != ACLCHECK_OK)
 			ereport(ERROR,
 					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 					 errmsg("permission denied for large object %u",
 							lobjId)));

 		if ((lockflags & IFS_WRLOCK) != 0 &&
 			pg_largeobject_aclcheck_snapshot(lobjId,
 											 GetUserId(),
 											 ACL_UPDATE,
 											 snap) != ACLCHECK_OK)
 			ereport(ERROR,
 					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 					 errmsg("permission denied for large object %u",
 							lobjId)));
 	}

 	/* All checks passed: build the descriptor in the caller's context */
 	desc = (LargeObjectDesc *) MemoryContextAlloc(mcxt,
 												  sizeof(LargeObjectDesc));
 	desc->id = lobjId;
 	desc->offset = 0;
 	desc->flags = lockflags;

 	/* caller sets if needed, not used by the functions in this file */
 	desc->subid = InvalidSubTransactionId;

 	/*
 	 * The snapshot (if any) is just the currently active snapshot.  The
 	 * caller will replace it with a longer-lived copy if needed.
 	 */
 	desc->snapshot = snap;

 	return desc;
 }

 /*
  * Closes a large object descriptor previously made by inv_open(), and
  * releases the long-term memory used by it.
  *
  * Only the descriptor struct itself is pfree'd here; nothing is done with
  * the snapshot it may reference.  obj_desc must not be used after this call.
  */
 void
 inv_close(LargeObjectDesc *obj_desc)
 {
 	Assert(PointerIsValid(obj_desc));
 	pfree(obj_desc);
 }

 /*
  * inv_drop -- destroy an existing large object (the object itself, not
  * merely a descriptor for it).
  *
  *	lobjId - OID of the large object to remove
  *
  * The caller is expected to have done any required permissions check.
  * Always returns 1.
  */
 int
 inv_drop(Oid lobjId)
 {
 	ObjectAddress target;

 	/* Address the large object for the dependency machinery */
 	target.objectSubId = 0;
 	target.objectId = lobjId;
 	target.classId = LargeObjectRelationId;

 	/* Delete any comments and dependencies on the large object */
 	performDeletion(&target, DROP_CASCADE, 0);

 	/*
 	 * Bump the command counter so later large-object operations in this
 	 * transaction see the tuple removal.
 	 */
 	CommandCounterIncrement();

 	/* For historical reasons, we always return 1 on success. */
 	return 1;
 }

 /*
  * inv_getsize -- determine the size of a large object.
  *
  * NOTE: LOs can contain gaps, just like Unix files.  What is returned is the
  * offset of the last byte + 1, or 0 when no page is found for the object.
  */
 static uint64
 inv_getsize(LargeObjectDesc *obj_desc)
 {
 	uint64		size = 0;
 	ScanKeyData loidKey;
 	SysScanDesc scan;
 	HeapTuple	lastPage;

 	Assert(PointerIsValid(obj_desc));

 	open_lo_relation();

 	ScanKeyInit(&loidKey,
 				Anum_pg_largeobject_loid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(obj_desc->id));

 	scan = systable_beginscan_ordered(lo_heap_r, lo_index_r,
 									  obj_desc->snapshot, 1, &loidKey);

 	/*
 	 * The index is on (loid, pageno) and only loid is constrained, so a
 	 * backwards scan visits the object's pages in descending pageno order.
 	 * The first tuple returned is therefore the last page.
 	 */
 	lastPage = systable_getnext_ordered(scan, BackwardScanDirection);
 	if (HeapTupleIsValid(lastPage))
 	{
 		Form_pg_largeobject pageForm;
 		bytea	   *chunk;
 		int			chunklen;
 		bool		mustfree;

 		if (HeapTupleHasNulls(lastPage))	/* paranoia */
 			elog(ERROR, "null field found in pg_largeobject");

 		pageForm = (Form_pg_largeobject) GETSTRUCT(lastPage);
 		getdatafield(pageForm, &chunk, &chunklen, &mustfree);

 		/* bytes in all earlier page slots plus the valid part of this one */
 		size = (uint64) pageForm->pageno * LOBLKSIZE + chunklen;

 		if (mustfree)
 			pfree(chunk);
 	}

 	systable_endscan_ordered(scan);

 	return size;
 }

 /*
  * inv_seek -- reposition the read/write pointer of an open large object.
  *
  *	obj_desc - descriptor from inv_open()
  *	offset - displacement, interpreted according to whence
  *	whence - SEEK_SET (absolute), SEEK_CUR (relative to the current offset)
  *			 or SEEK_END (relative to the object's current size)
  *
  * Returns the new absolute offset, which is also stored in the descriptor.
  * Raises ERRCODE_INVALID_PARAMETER_VALUE for an unknown whence or when the
  * target is negative or exceeds MAX_LARGE_OBJECT_SIZE.
  */
 int64
 inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence)
 {
 	int64		newoffset;

 	Assert(PointerIsValid(obj_desc));

 	/*
 	 * We allow seek/tell if you have either read or write permission, so no
 	 * need for a permission check here.
 	 */

 	/*
 	 * Note: overflow in the additions is possible, but since we will reject
 	 * negative results, we don't need any extra test for that.  The sums are
 	 * formed in unsigned arithmetic so that such overflow wraps around in a
 	 * well-defined way instead of being signed-overflow undefined behavior.
 	 */
 	switch (whence)
 	{
 		case SEEK_SET:
 			newoffset = offset;
 			break;
 		case SEEK_CUR:
 			newoffset = (int64) ((uint64) obj_desc->offset + (uint64) offset);
 			break;
 		case SEEK_END:
 			newoffset = (int64) (inv_getsize(obj_desc) + (uint64) offset);
 			break;
 		default:
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("invalid whence setting: %d", whence)));
 			newoffset = 0;		/* keep compiler quiet */
 			break;
 	}

 	/*
 	 * use errmsg_internal here because we don't want to expose INT64_FORMAT
 	 * in translatable strings; doing better is not worth the trouble
 	 */
 	if (newoffset < 0 || newoffset > MAX_LARGE_OBJECT_SIZE)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg_internal("invalid large object seek target: " INT64_FORMAT,
 								 newoffset)));

 	obj_desc->offset = newoffset;
 	return newoffset;
 }

 /*
  * inv_tell -- report the current read/write offset of an open large object.
  *
  * Seek/tell are allowed with either read or write permission, so no
  * permission check is made here.
  */
 int64
 inv_tell(LargeObjectDesc *obj_desc)
 {
 	int64		curpos;

 	Assert(PointerIsValid(obj_desc));

 	curpos = obj_desc->offset;

 	return curpos;
 }

 /*
  * inv_read -- read up to nbytes bytes of a large object into buf, starting
  * at the descriptor's current offset.
  *
  *	obj_desc - descriptor from inv_open(); must have IFS_RDLOCK set
  *	buf - destination buffer with room for nbytes bytes
  *	nbytes - maximum number of bytes to read; nothing is read if <= 0
  *
  * Returns the number of bytes stored into buf; this is less than nbytes if
  * the scan runs out of pages first.  The descriptor's offset is advanced by
  * the same amount.  Missing pages in front of an existing page ("holes")
  * read out as zeroes.
  *
  * Raises ERRCODE_INSUFFICIENT_PRIVILEGE if the descriptor is not readable.
  */
 int
 inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 {
 	int			nread = 0;
 	int64		n;
 	int64		off;
 	int			len;
 	int32		pageno;
 	uint64		pageoff;
 	ScanKeyData skey[2];
 	SysScanDesc sd;
 	HeapTuple	tuple;

 	Assert(PointerIsValid(obj_desc));
 	Assert(buf != NULL);

 	if ((obj_desc->flags & IFS_RDLOCK) == 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 errmsg("permission denied for large object %u",
 						obj_desc->id)));

 	if (nbytes <= 0)
 		return 0;

 	/*
 	 * First page that can hold data at the current offset.  This is computed
 	 * only after the assertions above, so that a NULL descriptor trips the
 	 * Assert rather than a raw dereference.
 	 */
 	pageno = (int32) (obj_desc->offset / LOBLKSIZE);

 	open_lo_relation();

 	/* Scan pages of this LO with pageno >= starting page, in index order */
 	ScanKeyInit(&skey[0],
 				Anum_pg_largeobject_loid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(obj_desc->id));

 	ScanKeyInit(&skey[1],
 				Anum_pg_largeobject_pageno,
 				BTGreaterEqualStrategyNumber, F_INT4GE,
 				Int32GetDatum(pageno));

 	sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
 									obj_desc->snapshot, 2, skey);

 	while ((tuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
 	{
 		Form_pg_largeobject data;
 		bytea	   *datafield;
 		bool		pfreeit;

 		if (HeapTupleHasNulls(tuple))	/* paranoia */
 			elog(ERROR, "null field found in pg_largeobject");
 		data = (Form_pg_largeobject) GETSTRUCT(tuple);

 		/*
 		 * We expect the indexscan will deliver pages in order.  However,
 		 * there may be missing pages if the LO contains unwritten "holes". We
 		 * want missing sections to read out as zeroes.
 		 */
 		pageoff = ((uint64) data->pageno) * LOBLKSIZE;
 		if (pageoff > obj_desc->offset)
 		{
 			/* zero-fill up to this page's start, or to the end of request */
 			n = pageoff - obj_desc->offset;
 			n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
 			MemSet(buf + nread, 0, n);
 			nread += n;
 			obj_desc->offset += n;
 		}

 		if (nread < nbytes)
 		{
 			Assert(obj_desc->offset >= pageoff);
 			off = (int) (obj_desc->offset - pageoff);
 			Assert(off >= 0 && off < LOBLKSIZE);

 			/* copy whatever valid data this page has beyond "off" */
 			getdatafield(data, &datafield, &len, &pfreeit);
 			if (len > off)
 			{
 				n = len - off;
 				n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
 				memcpy(buf + nread, VARDATA(datafield) + off, n);
 				nread += n;
 				obj_desc->offset += n;
 			}
 			if (pfreeit)
 				pfree(datafield);
 		}

 		if (nread >= nbytes)
 			break;
 	}

 	systable_endscan_ordered(sd);

 	return nread;
 }

 /*
  * inv_write -- write nbytes bytes from buf into a large object, starting at
  * the descriptor's current offset.
  *
  *	obj_desc - descriptor from inv_open(); must have IFS_WRLOCK set
  *	buf - source data
  *	nbytes - number of bytes to write; nothing is written if <= 0
  *
  * Returns the number of bytes written and advances the descriptor's offset
  * accordingly.  Existing pages in the affected range are updated in place,
  * missing ones are inserted; any gap in front of the written data within a
  * page is zero-filled.
  *
  * Raises ERRCODE_INSUFFICIENT_PRIVILEGE if the descriptor is not writable,
  * and ERRCODE_INVALID_PARAMETER_VALUE if the write would extend past
  * MAX_LARGE_OBJECT_SIZE.
  */
 int
 inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
 {
 	int			nwritten = 0;
 	int			n;
 	int			off;
 	int			len;
 	int32		pageno;
 	ScanKeyData skey[2];
 	SysScanDesc sd;
 	HeapTuple	oldtuple;
 	Form_pg_largeobject olddata;
 	bool		neednextpage;
 	bytea	   *datafield;
 	bool		pfreeit;
 	union
 	{
 		bytea		hdr;
 		/* this is to make the union big enough for a LO data chunk: */
 		char		data[LOBLKSIZE + VARHDRSZ];
 		/* ensure union is aligned well enough: */
 		int32		align_it;
 	}			workbuf;
 	char	   *workb = VARDATA(&workbuf.hdr);
 	HeapTuple	newtup;
 	Datum		values[Natts_pg_largeobject];
 	bool		nulls[Natts_pg_largeobject];
 	bool		replace[Natts_pg_largeobject];
 	CatalogIndexState indstate;

 	Assert(PointerIsValid(obj_desc));
 	Assert(buf != NULL);

 	/* enforce writability because snapshot is probably wrong otherwise */
 	if ((obj_desc->flags & IFS_WRLOCK) == 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 errmsg("permission denied for large object %u",
 						obj_desc->id)));

 	if (nbytes <= 0)
 		return 0;

 	/* this addition can't overflow because nbytes is only int32 */
 	if ((nbytes + obj_desc->offset) > MAX_LARGE_OBJECT_SIZE)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("invalid large object write request size: %d",
 						nbytes)));

 	/*
 	 * First page touched by the write.  This is computed only after the
 	 * assertions above, so that a NULL descriptor trips the Assert rather
 	 * than a raw dereference.
 	 */
 	pageno = (int32) (obj_desc->offset / LOBLKSIZE);

 	open_lo_relation();

 	indstate = CatalogOpenIndexes(lo_heap_r);

 	/* Scan pages of this LO with pageno >= starting page, in index order */
 	ScanKeyInit(&skey[0],
 				Anum_pg_largeobject_loid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(obj_desc->id));

 	ScanKeyInit(&skey[1],
 				Anum_pg_largeobject_pageno,
 				BTGreaterEqualStrategyNumber, F_INT4GE,
 				Int32GetDatum(pageno));

 	sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
 									obj_desc->snapshot, 2, skey);

 	oldtuple = NULL;
 	olddata = NULL;
 	neednextpage = true;

 	/* Each iteration writes (part of) exactly one page, then moves on */
 	while (nwritten < nbytes)
 	{
 		/*
 		 * If possible, get next pre-existing page of the LO.  We expect the
 		 * indexscan will deliver these in order --- but there may be holes.
 		 */
 		if (neednextpage)
 		{
 			if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
 			{
 				if (HeapTupleHasNulls(oldtuple))	/* paranoia */
 					elog(ERROR, "null field found in pg_largeobject");
 				olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
 				Assert(olddata->pageno >= pageno);
 			}
 			neednextpage = false;
 		}

 		/*
 		 * If we have a pre-existing page, see if it is the page we want to
 		 * write, or a later one.
 		 */
 		if (olddata != NULL && olddata->pageno == pageno)
 		{
 			/*
 			 * Update an existing page with fresh data.
 			 *
 			 * First, load old data into workbuf
 			 */
 			getdatafield(olddata, &datafield, &len, &pfreeit);
 			memcpy(workb, VARDATA(datafield), len);
 			if (pfreeit)
 				pfree(datafield);

 			/*
 			 * Fill any hole
 			 */
 			off = (int) (obj_desc->offset % LOBLKSIZE);
 			if (off > len)
 				MemSet(workb + len, 0, off - len);

 			/*
 			 * Insert appropriate portion of new data
 			 */
 			n = LOBLKSIZE - off;
 			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
 			memcpy(workb + off, buf + nwritten, n);
 			nwritten += n;
 			obj_desc->offset += n;
 			off += n;
 			/* compute valid length of new page */
 			len = (len >= off) ? len : off;
 			SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);

 			/*
 			 * Form and insert updated tuple
 			 */
 			memset(values, 0, sizeof(values));
 			memset(nulls, false, sizeof(nulls));
 			memset(replace, false, sizeof(replace));
 			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
 			replace[Anum_pg_largeobject_data - 1] = true;
 			newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
 									   values, nulls, replace);
 			CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup,
 									   indstate);
 			heap_freetuple(newtup);

 			/*
 			 * We're done with this old page.
 			 */
 			oldtuple = NULL;
 			olddata = NULL;
 			neednextpage = true;
 		}
 		else
 		{
 			/*
 			 * Write a brand new page.
 			 *
 			 * First, fill any hole
 			 */
 			off = (int) (obj_desc->offset % LOBLKSIZE);
 			if (off > 0)
 				MemSet(workb, 0, off);

 			/*
 			 * Insert appropriate portion of new data
 			 */
 			n = LOBLKSIZE - off;
 			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
 			memcpy(workb + off, buf + nwritten, n);
 			nwritten += n;
 			obj_desc->offset += n;
 			/* compute valid length of new page */
 			len = off + n;
 			SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);

 			/*
 			 * Form and insert new tuple
 			 */
 			memset(values, 0, sizeof(values));
 			memset(nulls, false, sizeof(nulls));
 			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
 			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
 			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
 			newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
 			CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate);
 			heap_freetuple(newtup);
 		}
 		pageno++;
 	}

 	systable_endscan_ordered(sd);

 	CatalogCloseIndexes(indstate);

 	/*
 	 * Advance command counter so that my tuple updates will be seen by later
 	 * large-object operations in this transaction.
 	 */
 	CommandCounterIncrement();

 	return nwritten;
 }

 /*
  * inv_truncate -- set the length of a large object to exactly len bytes.
  *
  *	obj_desc - descriptor from inv_open(); must have IFS_WRLOCK set
  *	len - new length in bytes, 0 .. MAX_LARGE_OBJECT_SIZE
  *
  * The page containing the truncation point is rewritten (or created, if it
  * does not exist) so that its data ends exactly at len, zero-filling if the
  * old data was shorter.  All later pages of the object are deleted.  The
  * descriptor's offset is not changed by this function.
  *
  * Raises ERRCODE_INSUFFICIENT_PRIVILEGE if the descriptor is not writable,
  * and ERRCODE_INVALID_PARAMETER_VALUE if len is out of range.
  */
 void
 inv_truncate(LargeObjectDesc *obj_desc, int64 len)
 {
 	/* page holding the truncation point; only used once len is validated */
 	int32		pageno = (int32) (len / LOBLKSIZE);
 	int32		off;
 	ScanKeyData skey[2];
 	SysScanDesc sd;
 	HeapTuple	oldtuple;
 	Form_pg_largeobject olddata;
 	union
 	{
 		bytea		hdr;
 		/* this is to make the union big enough for a LO data chunk: */
 		char		data[LOBLKSIZE + VARHDRSZ];
 		/* ensure union is aligned well enough: */
 		int32		align_it;
 	}			workbuf;
 	/* payload area of workbuf, just past the varlena header */
 	char	   *workb = VARDATA(&workbuf.hdr);
 	HeapTuple	newtup;
 	Datum		values[Natts_pg_largeobject];
 	bool		nulls[Natts_pg_largeobject];
 	bool		replace[Natts_pg_largeobject];
 	CatalogIndexState indstate;

 	Assert(PointerIsValid(obj_desc));

 	/* enforce writability because snapshot is probably wrong otherwise */
 	if ((obj_desc->flags & IFS_WRLOCK) == 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 errmsg("permission denied for large object %u",
 						obj_desc->id)));

 	/*
 	 * use errmsg_internal here because we don't want to expose INT64_FORMAT
 	 * in translatable strings; doing better is not worth the trouble
 	 */
 	if (len < 0 || len > MAX_LARGE_OBJECT_SIZE)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg_internal("invalid large object truncation target: " INT64_FORMAT,
 								 len)));

 	open_lo_relation();

 	indstate = CatalogOpenIndexes(lo_heap_r);

 	/*
 	 * Set up to find all pages with desired loid and pageno >= target
 	 */
 	ScanKeyInit(&skey[0],
 				Anum_pg_largeobject_loid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(obj_desc->id));

 	ScanKeyInit(&skey[1],
 				Anum_pg_largeobject_pageno,
 				BTGreaterEqualStrategyNumber, F_INT4GE,
 				Int32GetDatum(pageno));

 	sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
 									obj_desc->snapshot, 2, skey);

 	/*
 	 * If possible, get the page the truncation point is in. The truncation
 	 * point may be beyond the end of the LO or in a hole.
 	 */
 	olddata = NULL;
 	if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
 	{
 		if (HeapTupleHasNulls(oldtuple))	/* paranoia */
 			elog(ERROR, "null field found in pg_largeobject");
 		olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
 		Assert(olddata->pageno >= pageno);
 	}

 	/*
 	 * If we found the page of the truncation point we need to truncate the
 	 * data in it.  Otherwise if we're in a hole, we need to create a page to
 	 * mark the end of data.
 	 */
 	if (olddata != NULL && olddata->pageno == pageno)
 	{
 		/* First, load old data into workbuf */
 		bytea	   *datafield;
 		int			pagelen;
 		bool		pfreeit;

 		getdatafield(olddata, &datafield, &pagelen, &pfreeit);
 		memcpy(workb, VARDATA(datafield), pagelen);
 		if (pfreeit)
 			pfree(datafield);

 		/*
 		 * Fill any hole
 		 */
 		off = len % LOBLKSIZE;
 		if (off > pagelen)
 			MemSet(workb + pagelen, 0, off - pagelen);

 		/* compute length of new page */
 		SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);

 		/*
 		 * Form and insert updated tuple; only the data column is replaced
 		 */
 		memset(values, 0, sizeof(values));
 		memset(nulls, false, sizeof(nulls));
 		memset(replace, false, sizeof(replace));
 		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
 		replace[Anum_pg_largeobject_data - 1] = true;
 		newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
 								   values, nulls, replace);
 		CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup,
 								   indstate);
 		heap_freetuple(newtup);
 	}
 	else
 	{
 		/*
 		 * If the first page we found was after the truncation point, we're in
 		 * a hole that we'll fill, but we need to delete the later page
 		 * because the loop below won't visit it again.
 		 */
 		if (olddata != NULL)
 		{
 			Assert(olddata->pageno > pageno);
 			CatalogTupleDelete(lo_heap_r, &oldtuple->t_self);
 		}

 		/*
 		 * Write a brand new page.
 		 *
 		 * Fill the hole up to the truncation point
 		 */
 		off = len % LOBLKSIZE;
 		if (off > 0)
 			MemSet(workb, 0, off);

 		/* compute length of new page */
 		SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);

 		/*
 		 * Form and insert new tuple
 		 */
 		memset(values, 0, sizeof(values));
 		memset(nulls, false, sizeof(nulls));
 		values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
 		values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
 		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
 		newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
 		CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate);
 		heap_freetuple(newtup);
 	}

 	/*
 	 * Delete any pages after the truncation point.  If the initial search
 	 * didn't find a page, then of course there's nothing more to do.
 	 */
 	if (olddata != NULL)
 	{
 		while ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
 		{
 			CatalogTupleDelete(lo_heap_r, &oldtuple->t_self);
 		}
 	}

 	systable_endscan_ordered(sd);

 	CatalogCloseIndexes(indstate);

 	/*
 	 * Advance command counter so that tuple updates will be seen by later
 	 * large-object operations in this transaction.
 	 */
 	CommandCounterIncrement();
 }