| /*------------------------------------------------------------------------- |
| * |
| * hio.c |
| * POSTGRES heap access method input/output code. |
| * |
| * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.63 2006/07/03 22:45:37 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/heapam.h" |
| #include "access/hio.h" |
| #include "storage/freespace.h" |
| |
| |
| /* |
| * RelationPutHeapTuple - place tuple at specified page |
| * |
| * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! Must PANIC on failure!!! |
| * |
| * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer. |
| */ |
| void |
| RelationPutHeapTuple(Relation relation __attribute__((unused)), |
| Buffer buffer, |
| HeapTuple tuple) |
| { |
| Page pageHeader; |
| OffsetNumber offnum; |
| ItemId itemId; |
| Item item; |
| |
| /* Add the tuple to the page */ |
| pageHeader = BufferGetPage(buffer); |
| |
| offnum = PageAddItem(pageHeader, (Item) tuple->t_data, |
| tuple->t_len, InvalidOffsetNumber, LP_USED); |
| |
| if (offnum == InvalidOffsetNumber) |
| elog(PANIC, "failed to add tuple to page"); |
| |
| /* Update tuple->t_self to the actual position where it was stored */ |
| ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum); |
| |
| /* Insert the correct position into CTID of the stored tuple, too */ |
| itemId = PageGetItemId(pageHeader, offnum); |
| item = PageGetItem(pageHeader, itemId); |
| ((HeapTupleHeader) item)->t_ctid = tuple->t_self; |
| } |
| |
| /* |
| * RelationGetBufferForTuple |
| * |
| * Returns pinned and exclusive-locked buffer of a page in given relation |
| * with free space >= given len. |
| * |
| * If otherBuffer is not InvalidBuffer, then it references a previously |
| * pinned buffer of another page in the same relation; on return, this |
| * buffer will also be exclusive-locked. (This case is used by heap_update; |
| * the otherBuffer contains the tuple being updated.) |
| * |
| * The reason for passing otherBuffer is that if two backends are doing |
| * concurrent heap_update operations, a deadlock could occur if they try |
| * to lock the same two buffers in opposite orders. To ensure that this |
| * can't happen, we impose the rule that buffers of a relation must be |
| * locked in increasing page number order. This is most conveniently done |
| * by having RelationGetBufferForTuple lock them both, with suitable care |
| * for ordering. |
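 * For example, if one backend locked page 3 and then page 7 while another
 * locked page 7 and then page 3, each could wait forever for the other's
 * second lock; taking locks in increasing page number order makes such a
 * cycle impossible.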
| * |
| * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the |
| * same buffer we select for insertion of the new tuple (this could only |
| * happen if space is freed in that page after heap_update finds there's not |
| * enough there). In that case, the page will be pinned and locked only once. |
| * |
| * If use_fsm is true (the normal case), we use FSM to help us find free |
| * space. If use_fsm is false, we always append a new empty page to the |
| * end of the relation if the tuple won't fit on the current target page. |
| * This can save some cycles when we know the relation is new and doesn't |
| * contain useful amounts of free space. |
| * |
| * The use_fsm = false case is also useful for non-WAL-logged additions to a |
| * relation, if the caller holds exclusive lock and is careful to invalidate |
| * relation->rd_targblock before the first insertion --- that ensures that |
| * all insertions will occur into newly added pages and not be intermixed |
| * with tuples from other transactions. That way, a crash can't risk losing |
| * any committed data of other transactions. (See heap_insert's comments |
| * for additional constraints needed for safe usage of this behavior.) |
| * |
| * We always try to avoid filling existing pages further than the fillfactor. |
| * This is OK since this routine is not consulted when updating a tuple and |
| * keeping it on the same page, which is the scenario fillfactor is meant |
| * to reserve space for. |
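 * (A rough worked example, assuming the usual 8K block size: a fillfactor
 * of 70 reserves BLCKSZ * 30 / 100 = 2457 bytes per page, so saveFreeSpace
 * below would be 2457, while the default heap fillfactor of 100 reserves
 * nothing.)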
| * |
| * ereport(ERROR) is allowed here, so this routine *must* be called |
| * before any (unlogged) changes are made in buffer pool. |
| */ |
| Buffer |
| RelationGetBufferForTuple(Relation relation, Size len, |
| Buffer otherBuffer, bool use_fsm) |
| { |
| Buffer buffer = InvalidBuffer; |
| Page pageHeader; |
| Size pageFreeSpace, |
| saveFreeSpace; |
| BlockNumber targetBlock, |
| otherBlock; |
| bool needLock; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| len = MAXALIGN(len); /* be conservative */ |
| |
| /* |
	 * If we're going to fail for an oversize tuple, do it right away
| */ |
| if (len > MaxHeapTupleSize) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("row is too big: size %lu, maximum size %lu", |
| (unsigned long) len, |
| (unsigned long) MaxHeapTupleSize))); |
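
	/*
	 * (MaxHeapTupleSize is roughly one block less the page header overhead,
	 * so with an 8K block size only tuples approaching 8 KB trip this check;
	 * in practice heap_insert and heap_update TOAST oversize attributes
	 * before we get here.)
	 */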
| |
| /* Compute desired extra freespace due to fillfactor option */ |
| saveFreeSpace = RelationGetTargetPageFreeSpace(relation, |
| HEAP_DEFAULT_FILLFACTOR); |
| |
| if (otherBuffer != InvalidBuffer) |
| otherBlock = BufferGetBlockNumber(otherBuffer); |
| else |
| otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */ |
| |
| /* |
| * We first try to put the tuple on the same page we last inserted a tuple |
| * on, as cached in the relcache entry. If that doesn't work, we ask the |
| * shared Free Space Map to locate a suitable page. Since the FSM's info |
| * might be out of date, we have to be prepared to loop around and retry |
	 * multiple times. (To ensure this isn't an infinite loop, we must update
| * the FSM with the correct amount of free space on each page that proves |
| * not to be suitable.) If the FSM has no record of a page with enough |
| * free space, we give up and extend the relation. |
| * |
| * When use_fsm is false, we either put the tuple onto the existing target |
| * page or extend the relation. |
| */ |
| if (len + saveFreeSpace <= MaxHeapTupleSize) |
| targetBlock = relation->rd_targblock; |
| else |
| { |
| /* can't fit, don't screw up FSM request tracking by trying */ |
| targetBlock = InvalidBlockNumber; |
| use_fsm = false; |
| } |
| |
| if (targetBlock == InvalidBlockNumber && use_fsm) |
| { |
| /* |
| * We have no cached target page, so ask the FSM for an initial |
| * target. |
| */ |
| targetBlock = GetPageWithFreeSpace(&relation->rd_node, |
| len + saveFreeSpace); |
| |
| /* |
| * If the FSM knows nothing of the rel, try the last page before we |
| * give up and extend. This avoids one-tuple-per-page syndrome during |
| * bootstrapping or in a recently-started system. |
| */ |
| if (targetBlock == InvalidBlockNumber) |
| { |
| BlockNumber nblocks = RelationGetNumberOfBlocks(relation); |
| |
| if (nblocks > 0) |
| targetBlock = nblocks - 1; |
| } |
| } |
| |
| while (targetBlock != InvalidBlockNumber) |
| { |
| /* |
| * Read and exclusive-lock the target block, as well as the other |
| * block if one was given, taking suitable care with lock ordering and |
| * the possibility they are the same block. |
| */ |
| if (otherBuffer == InvalidBuffer) |
| { |
| /* easy case */ |
| buffer = ReadBuffer(relation, targetBlock); |
| LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
| } |
| else if (otherBlock == targetBlock) |
| { |
| /* also easy case */ |
| buffer = otherBuffer; |
| LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
| } |
| else if (otherBlock < targetBlock) |
| { |
			/* lock other buffer first (ReadBuffer just pins the target) */
| buffer = ReadBuffer(relation, targetBlock); |
| LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); |
| LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
| } |
| else |
| { |
| /* lock target buffer first */ |
| buffer = ReadBuffer(relation, targetBlock); |
| LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
| LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); |
| } |
| |
| /* |
| * Now we can check to see if there's enough free space here. If so, |
| * we're done. |
| */ |
| pageHeader = (Page) BufferGetPage(buffer); |
| pageFreeSpace = PageGetFreeSpace(pageHeader); |
| if (len + saveFreeSpace <= pageFreeSpace) |
| { |
| /* use this page as future insert target, too */ |
| relation->rd_targblock = targetBlock; |
| return buffer; |
| } |
| |
| /* |
| * Not enough space, so we must give up our page locks and pin (if |
| * any) and prepare to look elsewhere. We don't care which order we |
| * unlock the two buffers in, so this can be slightly simpler than the |
| * code above. |
| */ |
| LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
| if (otherBuffer == InvalidBuffer) |
| ReleaseBuffer(buffer); |
| else if (otherBlock != targetBlock) |
| { |
| LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); |
| ReleaseBuffer(buffer); |
| } |
| |
| /* Without FSM, always fall out of the loop and extend */ |
| if (!use_fsm) |
| break; |
| |
| /* |
| * Update FSM as to condition of this page, and ask for another page |
| * to try. |
| */ |
| targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node, |
| targetBlock, |
| pageFreeSpace, |
| len + saveFreeSpace); |
| } |
| |
| /* |
| * Have to extend the relation. |
| * |
| * We have to use a lock to ensure no one else is extending the rel at the |
| * same time, else we will both try to initialize the same new page. We |
| * can skip locking for new or temp relations, however, since no one else |
| * could be accessing them. |
| */ |
| needLock = !RELATION_IS_LOCAL(relation); |
| |
| if (needLock) |
| LockRelationForExtension(relation, ExclusiveLock); |
| |
| /* |
| * XXX This does an lseek - rather expensive - but at the moment it is the |
| * only way to accurately determine how many blocks are in a relation. Is |
| * it worth keeping an accurate file length in shared memory someplace, |
| * rather than relying on the kernel to do it for us? |
| */ |
| buffer = ReadBuffer(relation, P_NEW); |
| |
| /* |
| * We can be certain that locking the otherBuffer first is OK, since it |
| * must have a lower page number. |
| */ |
| if (otherBuffer != InvalidBuffer) |
| LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* |
| * Now acquire lock on the new page. |
| */ |
| LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* |
| * Release the file-extension lock; it's now OK for someone else to extend |
| * the relation some more. Note that we cannot release this lock before |
| * we have buffer lock on the new page, or we risk a race condition |
| * against vacuumlazy.c --- see comments therein. |
| */ |
| if (needLock) |
| UnlockRelationForExtension(relation, ExclusiveLock); |
| |
| /* |
	 * We need to initialize the empty new page. Double-check that it really
	 * is empty (a nonempty page here should never happen, but if it does we
	 * don't want to risk wiping out valid data).
| */ |
| pageHeader = (Page) BufferGetPage(buffer); |
| |
| if (!PageIsNew((PageHeader) pageHeader)) |
| elog(ERROR, "page %u of relation \"%s\" should be empty but is not", |
| BufferGetBlockNumber(buffer), |
| RelationGetRelationName(relation)); |
| |
| PageInit(pageHeader, BufferGetPageSize(buffer), 0); |
| |
| if (len > PageGetFreeSpace(pageHeader)) |
| { |
| /* We should not get here given the test at the top */ |
| elog(PANIC, "tuple is too big: size %lu", (unsigned long) len); |
| } |
| |
| /* |
| * Remember the new page as our target for future insertions. |
| * |
| * XXX should we enter the new page into the free space map immediately, |
| * or just keep it for this backend's exclusive use in the short run |
| * (until VACUUM sees it)? Seems to depend on whether you expect the |
| * current backend to make more insertions or not, which is probably a |
| * good bet most of the time. So for now, don't add it to FSM yet. |
| */ |
| relation->rd_targblock = BufferGetBlockNumber(buffer); |
| |
| return buffer; |
| } |