1 /*-------------------------------------------------------------------------
4 * heap access method code
6 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.292.2.1 2010/07/29 16:14:45 rhaas Exp $
15 * relation_open - open any relation by relation OID
16 * relation_openrv - open any relation specified by a RangeVar
17 * relation_close - close any relation
18 * heap_open - open a heap relation by relation OID
19 * heap_openrv - open a heap relation specified by a RangeVar
20 * heap_close - (now just a macro for relation_close)
21 * heap_beginscan - begin relation scan
22 * heap_rescan - restart a relation scan
23 * heap_endscan - end relation scan
24 * heap_getnext - retrieve next tuple in scan
25 * heap_fetch - retrieve tuple with given tid
26 * heap_insert - insert tuple into a relation
27 * heap_delete - delete a tuple from a relation
28 * heap_update - replace a tuple in a relation with another tuple
29 * heap_markpos - mark scan position
30 * heap_restrpos - restore position to marked location
31 * heap_sync - sync heap, for when no WAL has been written
34 * This file contains the heap_ routines which implement
35 * the POSTGRES heap access method used for all POSTGRES
38 *-------------------------------------------------------------------------
42 #include "access/heapam.h"
43 #include "access/hio.h"
44 #include "access/multixact.h"
45 #include "access/relscan.h"
46 #include "access/sysattr.h"
47 #include "access/transam.h"
48 #include "access/tuptoaster.h"
49 #include "access/valid.h"
50 #include "access/visibilitymap.h"
51 #include "access/xact.h"
52 #include "access/xlogutils.h"
53 #include "catalog/catalog.h"
54 #include "catalog/namespace.h"
55 #include "miscadmin.h"
57 #include "storage/bufmgr.h"
58 #include "storage/freespace.h"
59 #include "storage/lmgr.h"
60 #include "storage/procarray.h"
61 #include "storage/smgr.h"
62 #include "storage/standby.h"
63 #include "utils/datum.h"
64 #include "utils/inval.h"
65 #include "utils/lsyscache.h"
66 #include "utils/relcache.h"
67 #include "utils/snapmgr.h"
68 #include "utils/syscache.h"
69 #include "utils/tqual.h"
73 bool synchronize_seqscans = true;
76 static HeapScanDesc heap_beginscan_internal(Relation relation,
78 int nkeys, ScanKey key,
79 bool allow_strat, bool allow_sync,
81 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
82 ItemPointerData from, Buffer newbuf, HeapTuple newtup,
83 bool all_visible_cleared, bool new_all_visible_cleared);
84 static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
85 HeapTuple oldtup, HeapTuple newtup);
88 /* ----------------------------------------------------------------
89 * heap support routines
90 * ----------------------------------------------------------------
94 * initscan - scan code common to heap_beginscan and heap_rescan
98 initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
104 * Determine the number of blocks we have to scan.
106 * It is sufficient to do this once at scan start, since any tuples added
107 * while the scan is in progress will be invisible to my snapshot anyway.
108 * (That is not true when using a non-MVCC snapshot. However, we couldn't
109 * guarantee to return tuples added after scan start anyway, since they
110 * might go into pages we already scanned. To guarantee consistent
111 * results for a non-MVCC snapshot, the caller must hold some higher-level
112 * lock that ensures the interesting tuple(s) won't change.)
114 scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
117 * If the table is large relative to NBuffers, use a bulk-read access
118 * strategy and enable synchronized scanning (see syncscan.c). Although
119 * the thresholds for these features could be different, we make them the
120 * same so that there are only two behaviors to tune rather than four.
121 * (However, some callers need to be able to disable one or both of these
122 * behaviors, independently of the size of the table; also there is a GUC
123 * variable that can disable synchronized scanning.)
125 * During a rescan, don't make a new strategy object if we don't have to.
127 if (!scan->rs_rd->rd_istemp &&
128 scan->rs_nblocks > NBuffers / 4)
130 allow_strat = scan->rs_allow_strat;
131 allow_sync = scan->rs_allow_sync;
134 allow_strat = allow_sync = false;
138 if (scan->rs_strategy == NULL)
139 scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
143 if (scan->rs_strategy != NULL)
144 FreeAccessStrategy(scan->rs_strategy);
145 scan->rs_strategy = NULL;
151 * If rescan, keep the previous startblock setting so that rewinding a
152 * cursor doesn't generate surprising results. Reset the syncscan
155 scan->rs_syncscan = (allow_sync && synchronize_seqscans);
157 else if (allow_sync && synchronize_seqscans)
159 scan->rs_syncscan = true;
160 scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
164 scan->rs_syncscan = false;
165 scan->rs_startblock = 0;
168 scan->rs_inited = false;
169 scan->rs_ctup.t_data = NULL;
170 ItemPointerSetInvalid(&scan->rs_ctup.t_self);
171 scan->rs_cbuf = InvalidBuffer;
172 scan->rs_cblock = InvalidBlockNumber;
174 /* we don't have a marked position... */
175 ItemPointerSetInvalid(&(scan->rs_mctid));
177 /* page-at-a-time fields are always invalid when not rs_inited */
180 * copy the scan key, if appropriate
183 memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
186 * Currently, we don't have a stats counter for bitmap heap scans (but the
187 * underlying bitmap index scans will be counted).
189 if (!scan->rs_bitmapscan)
190 pgstat_count_heap_scan(scan->rs_rd);
194 * heapgetpage - subroutine for heapgettup()
196 * This routine reads and pins the specified page of the relation.
197 * In page-at-a-time mode it performs additional work, namely determining
198 * which tuples on the page are visible.
201 heapgetpage(HeapScanDesc scan, BlockNumber page)
208 OffsetNumber lineoff;
212 Assert(page < scan->rs_nblocks);
214 /* release previous scan buffer, if any */
215 if (BufferIsValid(scan->rs_cbuf))
217 ReleaseBuffer(scan->rs_cbuf);
218 scan->rs_cbuf = InvalidBuffer;
221 /* read page using selected strategy */
222 scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
223 RBM_NORMAL, scan->rs_strategy);
224 scan->rs_cblock = page;
226 if (!scan->rs_pageatatime)
229 buffer = scan->rs_cbuf;
230 snapshot = scan->rs_snapshot;
233 * Prune and repair fragmentation for the whole page, if possible.
235 Assert(TransactionIdIsValid(RecentGlobalXmin));
236 heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
239 * We must hold share lock on the buffer content while examining tuple
240 * visibility. Afterwards, however, the tuples we have found to be
241 * visible are guaranteed good as long as we hold the buffer pin.
243 LockBuffer(buffer, BUFFER_LOCK_SHARE);
245 dp = (Page) BufferGetPage(buffer);
246 lines = PageGetMaxOffsetNumber(dp);
250 * If the all-visible flag indicates that all tuples on the page are
251 * visible to everyone, we can skip the per-tuple visibility tests. But
252 * not in hot standby mode. A tuple that's already visible to all
253 * transactions in the master might still be invisible to a read-only
254 * transaction in the standby.
256 all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
258 for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
262 if (ItemIdIsNormal(lpp))
270 HeapTupleData loctup;
272 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
273 loctup.t_len = ItemIdGetLength(lpp);
274 ItemPointerSet(&(loctup.t_self), page, lineoff);
276 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
279 scan->rs_vistuples[ntup++] = lineoff;
283 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
285 Assert(ntup <= MaxHeapTuplesPerPage);
286 scan->rs_ntuples = ntup;
290 * heapgettup - fetch next heap tuple
292 * Initialize the scan if not already done; then advance to the next
293 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
294 * or set scan->rs_ctup.t_data = NULL if no more tuples.
296 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
299 * Note: the reason nkeys/key are passed separately, even though they are
300 * kept in the scan descriptor, is that the caller may not want us to check
303 * Note: when we fall off the end of the scan in either direction, we
304 * reset rs_inited. This means that a further request with the same
305 * scan direction will restart the scan, which is a bit odd, but a
306 * request with the opposite scan direction will start a fresh scan
307 * in the proper direction. The latter is required behavior for cursors,
308 * while the former case is generally undefined behavior in Postgres
309 * so we don't care too much.
313 heapgettup(HeapScanDesc scan,
318 HeapTuple tuple = &(scan->rs_ctup);
319 Snapshot snapshot = scan->rs_snapshot;
320 bool backward = ScanDirectionIsBackward(dir);
325 OffsetNumber lineoff;
330 * calculate next starting lineoff, given scan direction
332 if (ScanDirectionIsForward(dir))
334 if (!scan->rs_inited)
337 * return null immediately if relation is empty
339 if (scan->rs_nblocks == 0)
341 Assert(!BufferIsValid(scan->rs_cbuf));
342 tuple->t_data = NULL;
345 page = scan->rs_startblock; /* first page */
346 heapgetpage(scan, page);
347 lineoff = FirstOffsetNumber; /* first offnum */
348 scan->rs_inited = true;
352 /* continue from previously returned page/tuple */
353 page = scan->rs_cblock; /* current page */
354 lineoff = /* next offnum */
355 OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
358 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
360 dp = (Page) BufferGetPage(scan->rs_cbuf);
361 lines = PageGetMaxOffsetNumber(dp);
362 /* page and lineoff now reference the physically next tid */
364 linesleft = lines - lineoff + 1;
368 if (!scan->rs_inited)
371 * return null immediately if relation is empty
373 if (scan->rs_nblocks == 0)
375 Assert(!BufferIsValid(scan->rs_cbuf));
376 tuple->t_data = NULL;
381 * Disable reporting to syncscan logic in a backwards scan; it's
382 * not very likely anyone else is doing the same thing at the same
383 * time, and much more likely that we'll just bollix things for
386 scan->rs_syncscan = false;
387 /* start from last page of the scan */
388 if (scan->rs_startblock > 0)
389 page = scan->rs_startblock - 1;
391 page = scan->rs_nblocks - 1;
392 heapgetpage(scan, page);
396 /* continue from previously returned page/tuple */
397 page = scan->rs_cblock; /* current page */
400 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
402 dp = (Page) BufferGetPage(scan->rs_cbuf);
403 lines = PageGetMaxOffsetNumber(dp);
405 if (!scan->rs_inited)
407 lineoff = lines; /* final offnum */
408 scan->rs_inited = true;
412 lineoff = /* previous offnum */
413 OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
415 /* page and lineoff now reference the physically previous tid */
422 * ``no movement'' scan direction: refetch prior tuple
424 if (!scan->rs_inited)
426 Assert(!BufferIsValid(scan->rs_cbuf));
427 tuple->t_data = NULL;
431 page = ItemPointerGetBlockNumber(&(tuple->t_self));
432 if (page != scan->rs_cblock)
433 heapgetpage(scan, page);
435 /* Since the tuple was previously fetched, needn't lock page here */
436 dp = (Page) BufferGetPage(scan->rs_cbuf);
437 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
438 lpp = PageGetItemId(dp, lineoff);
439 Assert(ItemIdIsNormal(lpp));
441 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
442 tuple->t_len = ItemIdGetLength(lpp);
448 * advance the scan until we find a qualifying tuple or run out of stuff
451 lpp = PageGetItemId(dp, lineoff);
454 while (linesleft > 0)
456 if (ItemIdIsNormal(lpp))
460 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
461 tuple->t_len = ItemIdGetLength(lpp);
462 ItemPointerSet(&(tuple->t_self), page, lineoff);
465 * if current tuple qualifies, return it.
467 valid = HeapTupleSatisfiesVisibility(tuple,
471 if (valid && key != NULL)
472 HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
477 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
483 * otherwise move to the next item on the page
488 --lpp; /* move back in this page's ItemId array */
493 ++lpp; /* move forward in this page's ItemId array */
499 * if we get here, it means we've exhausted the items on this page and
500 * it's time to move to the next.
502 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
505 * advance to next/prior page and detect end of scan
509 finished = (page == scan->rs_startblock);
511 page = scan->rs_nblocks;
517 if (page >= scan->rs_nblocks)
519 finished = (page == scan->rs_startblock);
522 * Report our new scan position for synchronization purposes. We
523 * don't do that when moving backwards, however. That would just
524 * mess up any other forward-moving scanners.
526 * Note: we do this before checking for end of scan so that the
527 * final state of the position hint is back at the start of the
528 * rel. That's not strictly necessary, but otherwise when you run
529 * the same query multiple times the starting position would shift
530 * a little bit backwards on every invocation, which is confusing.
531 * We don't guarantee any specific ordering in general, though.
533 if (scan->rs_syncscan)
534 ss_report_location(scan->rs_rd, page);
538 * return NULL if we've exhausted all the pages
542 if (BufferIsValid(scan->rs_cbuf))
543 ReleaseBuffer(scan->rs_cbuf);
544 scan->rs_cbuf = InvalidBuffer;
545 scan->rs_cblock = InvalidBlockNumber;
546 tuple->t_data = NULL;
547 scan->rs_inited = false;
551 heapgetpage(scan, page);
553 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
555 dp = (Page) BufferGetPage(scan->rs_cbuf);
556 lines = PageGetMaxOffsetNumber((Page) dp);
561 lpp = PageGetItemId(dp, lines);
565 lineoff = FirstOffsetNumber;
566 lpp = PageGetItemId(dp, FirstOffsetNumber);
572 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
574 * Same API as heapgettup, but used in page-at-a-time mode
576 * The internal logic is much the same as heapgettup's too, but there are some
577 * differences: we do not take the buffer content lock (that only needs to
578 * happen inside heapgetpage), and we iterate through just the tuples listed
579 * in rs_vistuples[] rather than all tuples on the page. Notice that
580 * lineindex is 0-based, where the corresponding loop variable lineoff in
581 * heapgettup is 1-based.
585 heapgettup_pagemode(HeapScanDesc scan,
590 HeapTuple tuple = &(scan->rs_ctup);
591 bool backward = ScanDirectionIsBackward(dir);
597 OffsetNumber lineoff;
602 * calculate next starting lineindex, given scan direction
604 if (ScanDirectionIsForward(dir))
606 if (!scan->rs_inited)
609 * return null immediately if relation is empty
611 if (scan->rs_nblocks == 0)
613 Assert(!BufferIsValid(scan->rs_cbuf));
614 tuple->t_data = NULL;
617 page = scan->rs_startblock; /* first page */
618 heapgetpage(scan, page);
620 scan->rs_inited = true;
624 /* continue from previously returned page/tuple */
625 page = scan->rs_cblock; /* current page */
626 lineindex = scan->rs_cindex + 1;
629 dp = (Page) BufferGetPage(scan->rs_cbuf);
630 lines = scan->rs_ntuples;
631 /* page and lineindex now reference the next visible tid */
633 linesleft = lines - lineindex;
637 if (!scan->rs_inited)
640 * return null immediately if relation is empty
642 if (scan->rs_nblocks == 0)
644 Assert(!BufferIsValid(scan->rs_cbuf));
645 tuple->t_data = NULL;
650 * Disable reporting to syncscan logic in a backwards scan; it's
651 * not very likely anyone else is doing the same thing at the same
652 * time, and much more likely that we'll just bollix things for
655 scan->rs_syncscan = false;
656 /* start from last page of the scan */
657 if (scan->rs_startblock > 0)
658 page = scan->rs_startblock - 1;
660 page = scan->rs_nblocks - 1;
661 heapgetpage(scan, page);
665 /* continue from previously returned page/tuple */
666 page = scan->rs_cblock; /* current page */
669 dp = (Page) BufferGetPage(scan->rs_cbuf);
670 lines = scan->rs_ntuples;
672 if (!scan->rs_inited)
674 lineindex = lines - 1;
675 scan->rs_inited = true;
679 lineindex = scan->rs_cindex - 1;
681 /* page and lineindex now reference the previous visible tid */
683 linesleft = lineindex + 1;
688 * ``no movement'' scan direction: refetch prior tuple
690 if (!scan->rs_inited)
692 Assert(!BufferIsValid(scan->rs_cbuf));
693 tuple->t_data = NULL;
697 page = ItemPointerGetBlockNumber(&(tuple->t_self));
698 if (page != scan->rs_cblock)
699 heapgetpage(scan, page);
701 /* Since the tuple was previously fetched, needn't lock page here */
702 dp = (Page) BufferGetPage(scan->rs_cbuf);
703 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
704 lpp = PageGetItemId(dp, lineoff);
705 Assert(ItemIdIsNormal(lpp));
707 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
708 tuple->t_len = ItemIdGetLength(lpp);
710 /* check that rs_cindex is in sync */
711 Assert(scan->rs_cindex < scan->rs_ntuples);
712 Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
718 * advance the scan until we find a qualifying tuple or run out of stuff
723 while (linesleft > 0)
725 lineoff = scan->rs_vistuples[lineindex];
726 lpp = PageGetItemId(dp, lineoff);
727 Assert(ItemIdIsNormal(lpp));
729 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
730 tuple->t_len = ItemIdGetLength(lpp);
731 ItemPointerSet(&(tuple->t_self), page, lineoff);
734 * if current tuple qualifies, return it.
740 HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
744 scan->rs_cindex = lineindex;
750 scan->rs_cindex = lineindex;
755 * otherwise move to the next item on the page
765 * if we get here, it means we've exhausted the items on this page and
766 * it's time to move to the next.
770 finished = (page == scan->rs_startblock);
772 page = scan->rs_nblocks;
778 if (page >= scan->rs_nblocks)
780 finished = (page == scan->rs_startblock);
783 * Report our new scan position for synchronization purposes. We
784 * don't do that when moving backwards, however. That would just
785 * mess up any other forward-moving scanners.
787 * Note: we do this before checking for end of scan so that the
788 * final state of the position hint is back at the start of the
789 * rel. That's not strictly necessary, but otherwise when you run
790 * the same query multiple times the starting position would shift
791 * a little bit backwards on every invocation, which is confusing.
792 * We don't guarantee any specific ordering in general, though.
794 if (scan->rs_syncscan)
795 ss_report_location(scan->rs_rd, page);
799 * return NULL if we've exhausted all the pages
803 if (BufferIsValid(scan->rs_cbuf))
804 ReleaseBuffer(scan->rs_cbuf);
805 scan->rs_cbuf = InvalidBuffer;
806 scan->rs_cblock = InvalidBlockNumber;
807 tuple->t_data = NULL;
808 scan->rs_inited = false;
812 heapgetpage(scan, page);
814 dp = (Page) BufferGetPage(scan->rs_cbuf);
815 lines = scan->rs_ntuples;
818 lineindex = lines - 1;
825 #if defined(DISABLE_COMPLEX_MACRO)
827 * This is formatted oddly so that the correspondence to the macro
828 * definition in access/heapam.h is maintained.
831 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
838 HeapTupleNoNulls(tup) ?
840 (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ?
842 fetchatt((tupleDesc)->attrs[(attnum) - 1],
843 (char *) (tup)->t_data + (tup)->t_data->t_hoff +
844 (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
847 nocachegetattr((tup), (attnum), (tupleDesc))
851 att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
858 nocachegetattr((tup), (attnum), (tupleDesc))
868 #endif /* defined(DISABLE_COMPLEX_MACRO) */
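/*
 * A rough usage sketch for fastgetattr(): extracting a user attribute from a
 * previously fetched tuple.  The column number (1) and variable names are
 * hypothetical; fastgetattr() requires attnum > 0, so use heap_getattr()
 * instead when a system attribute might be requested.
 *
 *		Datum		d;
 *		bool		isnull;
 *
 *		d = fastgetattr(tuple, 1, RelationGetDescr(rel), &isnull);
 *		if (!isnull)
 *			process DatumGetInt32(d), DatumGetTextP(d), or whatever the
 *			column's declared type calls for;
 */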
871 /* ----------------------------------------------------------------
872 * heap access method interface
873 * ----------------------------------------------------------------
877 * relation_open - open any relation by relation OID
879 * If lockmode is not "NoLock", the specified kind of lock is
880 * obtained on the relation. (Generally, NoLock should only be
881 * used if the caller knows it has some appropriate lock on the
884 * An error is raised if the relation does not exist.
886 * NB: a "relation" is anything with a pg_class entry. The caller is
887 * expected to check whether the relkind is something it can handle.
891 relation_open(Oid relationId, LOCKMODE lockmode)
895 Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
897 /* Get the lock before trying to open the relcache entry */
898 if (lockmode != NoLock)
899 LockRelationOid(relationId, lockmode);
901 /* The relcache does all the real work... */
902 r = RelationIdGetRelation(relationId);
904 if (!RelationIsValid(r))
905 elog(ERROR, "could not open relation with OID %u", relationId);
907 /* Make note that we've accessed a temporary relation */
909 MyXactAccessedTempRel = true;
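/*
 * A minimal usage sketch for relation_open()/relation_close(), assuming the
 * caller already knows the relation's OID ("relid" here is hypothetical).
 * Closing with NoLock instead would retain the lock until transaction end.
 *
 *		Relation	rel;
 *
 *		rel = relation_open(relid, AccessShareLock);
 *		inspect rel->rd_rel, RelationGetDescr(rel), etc.;
 *		relation_close(rel, AccessShareLock);
 */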
917 * try_relation_open - open any relation by relation OID
919 * Same as relation_open, except return NULL instead of failing
920 * if the relation does not exist.
924 try_relation_open(Oid relationId, LOCKMODE lockmode)
928 Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
930 /* Get the lock first */
931 if (lockmode != NoLock)
932 LockRelationOid(relationId, lockmode);
935 * Now that we have the lock, probe to see if the relation really exists
938 if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
940 /* Release useless lock */
941 if (lockmode != NoLock)
942 UnlockRelationOid(relationId, lockmode);
947 /* Should be safe to do a relcache load */
948 r = RelationIdGetRelation(relationId);
950 if (!RelationIsValid(r))
951 elog(ERROR, "could not open relation with OID %u", relationId);
953 /* Make note that we've accessed a temporary relation */
955 MyXactAccessedTempRel = true;
963 * relation_openrv - open any relation specified by a RangeVar
965 * Same as relation_open, but the relation is specified by a RangeVar.
969 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
974 * Check for shared-cache-inval messages before trying to open the
975 * relation. This is needed to cover the case where the name identifies a
976 * rel that has been dropped and recreated since the start of our
977 * transaction: if we don't flush the old syscache entry then we'll latch
978 * onto that entry and suffer an error when we do RelationIdGetRelation.
979 * Note that relation_open does not need to do this, since a relation's
982 * We skip this if asked for NoLock, on the assumption that the caller has
983 * already ensured some appropriate lock is held.
985 if (lockmode != NoLock)
986 AcceptInvalidationMessages();
988 /* Look up the appropriate relation using namespace search */
989 relOid = RangeVarGetRelid(relation, false);
991 /* Let relation_open do the rest */
992 return relation_open(relOid, lockmode);
996 * try_relation_openrv - open any relation specified by a RangeVar
998 * Same as relation_openrv, but return NULL instead of failing for
999 * relation-not-found. (Note that some other causes, such as
1000 * permissions problems, will still result in an ereport.)
1004 try_relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1009 * Check for shared-cache-inval messages before trying to open the
1010 * relation. This is needed to cover the case where the name identifies a
1011 * rel that has been dropped and recreated since the start of our
1012 * transaction: if we don't flush the old syscache entry then we'll latch
1013 * onto that entry and suffer an error when we do RelationIdGetRelation.
1014 * Note that relation_open does not need to do this, since a relation's
1015 * OID never changes.
1017 * We skip this if asked for NoLock, on the assumption that the caller has
1018 * already ensured some appropriate lock is held.
1020 if (lockmode != NoLock)
1021 AcceptInvalidationMessages();
1023 /* Look up the appropriate relation using namespace search */
1024 relOid = RangeVarGetRelid(relation, true);
1026 /* Return NULL on not-found */
1027 if (!OidIsValid(relOid))
1030 /* Let relation_open do the rest */
1031 return relation_open(relOid, lockmode);
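/*
 * A usage sketch for the "try" variant, for callers that must tolerate a
 * missing relation.  The relation name is hypothetical.
 *
 *		RangeVar   *rv = makeRangeVar(NULL, "my_table", -1);
 *		Relation	rel = try_relation_openrv(rv, AccessShareLock);
 *
 *		if (rel == NULL)
 *			report or skip: the relation does not exist;
 *		else
 *			relation_close(rel, AccessShareLock);
 */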
1035 * relation_close - close any relation
1037 * If lockmode is not "NoLock", we then release the specified lock.
1039 * Note that it is often sensible to hold a lock beyond relation_close;
1040 * in that case, the lock is released automatically at xact end.
1044 relation_close(Relation relation, LOCKMODE lockmode)
1046 LockRelId relid = relation->rd_lockInfo.lockRelId;
1048 Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1050 /* The relcache does the real work... */
1051 RelationClose(relation);
1053 if (lockmode != NoLock)
1054 UnlockRelationId(&relid, lockmode);
1059 * heap_open - open a heap relation by relation OID
1061 * This is essentially relation_open plus check that the relation
1062 * is not an index nor a composite type. (The caller should also
1063 * check that it's not a view before assuming it has storage.)
1067 heap_open(Oid relationId, LOCKMODE lockmode)
1071 r = relation_open(relationId, lockmode);
1073 if (r->rd_rel->relkind == RELKIND_INDEX)
1075 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1076 errmsg("\"%s\" is an index",
1077 RelationGetRelationName(r))));
1078 else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1080 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1081 errmsg("\"%s\" is a composite type",
1082 RelationGetRelationName(r))));
1088 * heap_openrv - open a heap relation specified
1089 * by a RangeVar node
1091 * As above, but relation is specified by a RangeVar.
1095 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1099 r = relation_openrv(relation, lockmode);
1101 if (r->rd_rel->relkind == RELKIND_INDEX)
1103 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1104 errmsg("\"%s\" is an index",
1105 RelationGetRelationName(r))));
1106 else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1108 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1109 errmsg("\"%s\" is a composite type",
1110 RelationGetRelationName(r))));
1116 * try_heap_openrv - open a heap relation specified
1117 * by a RangeVar node
1119 * As above, but return NULL instead of failing for relation-not-found.
1123 try_heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1127 r = try_relation_openrv(relation, lockmode);
1131 if (r->rd_rel->relkind == RELKIND_INDEX)
1133 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1134 errmsg("\"%s\" is an index",
1135 RelationGetRelationName(r))));
1136 else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1138 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1139 errmsg("\"%s\" is a composite type",
1140 RelationGetRelationName(r))));
1148 * heap_beginscan - begin relation scan
1150 * heap_beginscan_strat offers an extended API that lets the caller control
1151 * whether a nondefault buffer access strategy can be used, and whether
1152 * syncscan can be chosen (possibly resulting in the scan not starting from
1153 * block zero). Both of these default to TRUE with plain heap_beginscan.
1155 * heap_beginscan_bm is an alternative entry point for setting up a
1156 * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1157 * really quite unlike a standard seqscan, there is just enough commonality
1158 * to make it worth using the same data structure.
1162 heap_beginscan(Relation relation, Snapshot snapshot,
1163 int nkeys, ScanKey key)
1165 return heap_beginscan_internal(relation, snapshot, nkeys, key,
1170 heap_beginscan_strat(Relation relation, Snapshot snapshot,
1171 int nkeys, ScanKey key,
1172 bool allow_strat, bool allow_sync)
1174 return heap_beginscan_internal(relation, snapshot, nkeys, key,
1175 allow_strat, allow_sync, false);
1179 heap_beginscan_bm(Relation relation, Snapshot snapshot,
1180 int nkeys, ScanKey key)
1182 return heap_beginscan_internal(relation, snapshot, nkeys, key,
1183 false, false, true);
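/*
 * A usage sketch for heap_beginscan_strat(): passing allow_sync = false
 * guarantees the scan starts at block zero, and allow_strat = false avoids
 * switching to a bulk-read buffer strategy.  The snapshot choice shown here
 * (SnapshotNow) is only an example.
 *
 *		HeapScanDesc scan;
 *
 *		scan = heap_beginscan_strat(rel, SnapshotNow, 0, NULL,
 *									false, false);	(allow_strat, allow_sync)
 *		... heap_getnext() as usual ...
 *		heap_endscan(scan);
 */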
1187 heap_beginscan_internal(Relation relation, Snapshot snapshot,
1188 int nkeys, ScanKey key,
1189 bool allow_strat, bool allow_sync,
1195 * increment relation ref count while scanning relation
1197 * This is just to make really sure the relcache entry won't go away while
1198 * the scan has a pointer to it. Caller should be holding the rel open
1199 * anyway, so this is redundant in all normal scenarios...
1201 RelationIncrementReferenceCount(relation);
1204 * allocate and initialize scan descriptor
1206 scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1208 scan->rs_rd = relation;
1209 scan->rs_snapshot = snapshot;
1210 scan->rs_nkeys = nkeys;
1211 scan->rs_bitmapscan = is_bitmapscan;
1212 scan->rs_strategy = NULL; /* set in initscan */
1213 scan->rs_allow_strat = allow_strat;
1214 scan->rs_allow_sync = allow_sync;
1217 * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1219 scan->rs_pageatatime = IsMVCCSnapshot(snapshot);
1221 /* we only need to set this up once */
1222 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1225 * we do this here instead of in initscan() because heap_rescan also calls
1226 * initscan() and we don't want to allocate memory again
1229 scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1231 scan->rs_key = NULL;
1233 initscan(scan, key, false);
1239 * heap_rescan - restart a relation scan
1243 heap_rescan(HeapScanDesc scan,
1247 * unpin scan buffers
1249 if (BufferIsValid(scan->rs_cbuf))
1250 ReleaseBuffer(scan->rs_cbuf);
1253 * reinitialize scan descriptor
1255 initscan(scan, key, true);
1259 * heap_endscan - end relation scan
1261 * See how to integrate with index scans.
1262 * Check handling of reldesc caching.
1266 heap_endscan(HeapScanDesc scan)
1268 /* Note: no locking manipulations needed */
1271 * unpin scan buffers
1273 if (BufferIsValid(scan->rs_cbuf))
1274 ReleaseBuffer(scan->rs_cbuf);
1277 * decrement relation reference count and free scan descriptor storage
1279 RelationDecrementReferenceCount(scan->rs_rd);
1282 pfree(scan->rs_key);
1284 if (scan->rs_strategy != NULL)
1285 FreeAccessStrategy(scan->rs_strategy);
1291 * heap_getnext - retrieve next tuple in scan
1293 * Fix to work with index relations.
1294 * We don't return the buffer anymore, but you can get it from the
1295 * returned HeapTuple.
1300 #define HEAPDEBUG_1 \
1301 elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1302 RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1303 #define HEAPDEBUG_2 \
1304 elog(DEBUG2, "heap_getnext returning EOS")
1305 #define HEAPDEBUG_3 \
1306 elog(DEBUG2, "heap_getnext returning tuple")
1311 #endif /* !defined(HEAPDEBUGALL) */
1315 heap_getnext(HeapScanDesc scan, ScanDirection direction)
1317 /* Note: no locking manipulations needed */
1319 HEAPDEBUG_1; /* heap_getnext( info ) */
1321 if (scan->rs_pageatatime)
1322 heapgettup_pagemode(scan, direction,
1323 scan->rs_nkeys, scan->rs_key);
1325 heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1327 if (scan->rs_ctup.t_data == NULL)
1329 HEAPDEBUG_2; /* heap_getnext returning EOS */
1334 * if we get here it means we have a new current scan tuple, so point to
1335 * the proper return buffer and return the tuple.
1337 HEAPDEBUG_3; /* heap_getnext returning tuple */
1339 pgstat_count_heap_getnext(scan->rs_rd);
1341 return &(scan->rs_ctup);
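/*
 * The typical scan loop, as a rough sketch (assumes an active snapshot has
 * been pushed and that "relid" is known; error handling omitted).  The
 * returned tuple points into the scan's current buffer, so it is valid only
 * until the next heap_getnext()/heap_endscan(); use heap_copytuple() if it
 * must survive longer.
 *
 *		Relation	rel = heap_open(relid, AccessShareLock);
 *		HeapScanDesc scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
 *		HeapTuple	tuple;
 *
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			examine tuple, e.g. with heap_getattr();
 *		}
 *		heap_endscan(scan);
 *		heap_close(rel, AccessShareLock);
 */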
1345 * heap_fetch - retrieve tuple with given tid
1347 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1348 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1349 * against the specified snapshot.
1351 * If successful (tuple found and passes snapshot time qual), then *userbuf
1352 * is set to the buffer holding the tuple and TRUE is returned. The caller
1353 * must unpin the buffer when done with the tuple.
1355 * If the tuple is not found (ie, item number references a deleted slot),
1356 * then tuple->t_data is set to NULL and FALSE is returned.
1358 * If the tuple is found but fails the time qual check, then FALSE is returned
1359 * but tuple->t_data is left pointing to the tuple.
1361 * keep_buf determines what is done with the buffer in the FALSE-result cases.
1362 * When the caller specifies keep_buf = true, we retain the pin on the buffer
1363 * and return it in *userbuf (so the caller must eventually unpin it); when
1364 * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1366 * stats_relation is the relation to charge the heap_fetch operation against
1367 * for statistical purposes. (This could be the heap rel itself, an
1368 * associated index, or NULL to not count the fetch at all.)
1370 * heap_fetch does not follow HOT chains: only the exact TID requested will
1373 * It is somewhat inconsistent that we ereport() on invalid block number but
1374 * return false on invalid item number. There are a couple of reasons though.
1375 * One is that the caller can relatively easily check the block number for
1376 * validity, but cannot check the item number without reading the page
1377 * himself. Another is that when we are following a t_ctid link, we can be
1378 * reasonably confident that the page number is valid (since VACUUM shouldn't
1379 * truncate off the destination page without having killed the referencing
1380 * tuple first), but the item number might well not be good.
1383 heap_fetch(Relation relation,
1388 Relation stats_relation)
1390 ItemPointer tid = &(tuple->t_self);
1394 OffsetNumber offnum;
1398 * Fetch and pin the appropriate page of the relation.
1400 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1403 * Need share lock on buffer to examine tuple commit status.
1405 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1406 page = BufferGetPage(buffer);
1409 * We'd better check for out-of-range offnum in case of VACUUM since the
1412 offnum = ItemPointerGetOffsetNumber(tid);
1413 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1415 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1420 ReleaseBuffer(buffer);
1421 *userbuf = InvalidBuffer;
1423 tuple->t_data = NULL;
1428 * get the item line pointer corresponding to the requested tid
1430 lp = PageGetItemId(page, offnum);
1433 * Must check for deleted tuple.
1435 if (!ItemIdIsNormal(lp))
1437 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1442 ReleaseBuffer(buffer);
1443 *userbuf = InvalidBuffer;
1445 tuple->t_data = NULL;
1450 * fill in *tuple fields
1452 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1453 tuple->t_len = ItemIdGetLength(lp);
1454 tuple->t_tableOid = RelationGetRelid(relation);
1457 * check time qualification of tuple, then release lock
1459 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1461 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1466 * All checks passed, so return the tuple as valid. Caller is now
1467 * responsible for releasing the buffer.
1471 /* Count the successful fetch against appropriate rel, if any */
1472 if (stats_relation != NULL)
1473 pgstat_count_heap_fetch(stats_relation);
1478 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1483 ReleaseBuffer(buffer);
1484 *userbuf = InvalidBuffer;
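/*
 * A rough usage sketch for heap_fetch() with keep_buf = false.  "tid" is a
 * hypothetical ItemPointer obtained elsewhere (e.g. from an index entry);
 * on success the caller owns a pin on the returned buffer and must release
 * it when done with the tuple.
 *
 *		HeapTupleData tuple;
 *		Buffer		buffer;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(rel, snapshot, &tuple, &buffer, false, NULL))
 *		{
 *			use tuple.t_data while the pin is held;
 *			ReleaseBuffer(buffer);
 *		}
 *		else
 *			tuple is gone, or not visible to the snapshot;
 */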
1491 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1493 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1494 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1495 * for the first chain member satisfying the given snapshot. If one is
1496 * found, we update *tid to reference that tuple's offset number, and
1497 * return TRUE. If no match, return FALSE without modifying *tid.
1499 * If all_dead is not NULL, we check non-visible tuples to see if they are
1500 * globally dead; *all_dead is set TRUE if all members of the HOT chain
1501 * are vacuumable, FALSE if not.
1503 * Unlike heap_fetch, the caller must already have pin and (at least) share
1504 * lock on the buffer; it is still pinned/locked at exit. Also unlike
1505 * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1508 heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
1511 Page dp = (Page) BufferGetPage(buffer);
1512 TransactionId prev_xmax = InvalidTransactionId;
1513 OffsetNumber offnum;
1514 bool at_chain_start;
1519 Assert(TransactionIdIsValid(RecentGlobalXmin));
1521 Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
1522 offnum = ItemPointerGetOffsetNumber(tid);
1523 at_chain_start = true;
1525 /* Scan through possible multiple members of HOT-chain */
1529 HeapTupleData heapTuple;
1531 /* check for bogus TID */
1532 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1535 lp = PageGetItemId(dp, offnum);
1537 /* check for unused, dead, or redirected items */
1538 if (!ItemIdIsNormal(lp))
1540 /* We should only see a redirect at start of chain */
1541 if (ItemIdIsRedirected(lp) && at_chain_start)
1543 /* Follow the redirect */
1544 offnum = ItemIdGetRedirect(lp);
1545 at_chain_start = false;
1548 /* else must be end of chain */
1552 heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1553 heapTuple.t_len = ItemIdGetLength(lp);
1556 * Shouldn't see a HEAP_ONLY tuple at chain start.
1558 if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple))
1562 * The xmin should match the previous xmax value, else chain is
1565 if (TransactionIdIsValid(prev_xmax) &&
1566 !TransactionIdEquals(prev_xmax,
1567 HeapTupleHeaderGetXmin(heapTuple.t_data)))
1570 /* If it's visible per the snapshot, we must return it */
1571 if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
1573 ItemPointerSetOffsetNumber(tid, offnum);
1580 * If we can't see it, maybe no one else can either. At caller
1581 * request, check whether all chain members are dead to all
1584 if (all_dead && *all_dead &&
1585 HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin,
1586 buffer) != HEAPTUPLE_DEAD)
1590 * Check to see if HOT chain continues past this tuple; if so fetch
1591 * the next offnum and loop around.
1593 if (HeapTupleIsHotUpdated(&heapTuple))
1595 Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) ==
1596 ItemPointerGetBlockNumber(tid));
1597 offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
1598 at_chain_start = false;
1599 prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data);
1602 break; /* end of chain */
1609 * heap_hot_search - search HOT chain for tuple satisfying snapshot
1611 * This has the same API as heap_hot_search_buffer, except that the caller
1612 * does not provide the buffer containing the page, rather we access it
1616 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
1622 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1623 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1624 result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
1625 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1626 ReleaseBuffer(buffer);
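/*
 * A rough usage sketch for heap_hot_search(), in the style of an index AM
 * probing a heap TID.  If no chain member is visible but all of them are
 * vacuumable, the caller may mark its index entry killed.
 *
 *		bool		all_dead;
 *
 *		if (heap_hot_search(&tid, rel, snapshot, &all_dead))
 *			found it: tid now names the visible chain member;
 *		else if (all_dead)
 *			no visible member, and the whole HOT chain is dead;
 */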
1631 * heap_get_latest_tid - get the latest tid of a specified tuple
1633 * Actually, this gets the latest version that is visible according to
1634 * the passed snapshot. You can pass SnapshotDirty to get the very latest,
1635 * possibly uncommitted version.
1637 * *tid is both an input and an output parameter: it is updated to
1638 * show the latest version of the row. Note that it will not be changed
1639 * if no version of the row passes the snapshot test.
1642 heap_get_latest_tid(Relation relation,
1647 ItemPointerData ctid;
1648 TransactionId priorXmax;
1650 /* this is to avoid Assert failures on bad input */
1651 if (!ItemPointerIsValid(tid))
1655 * Since this can be called with user-supplied TID, don't trust the input
1656 * too much. (RelationGetNumberOfBlocks is an expensive check, so we
1657 * don't check t_ctid links again this way. Note that it would not do to
1658 * call it just once and save the result, either.)
1660 blk = ItemPointerGetBlockNumber(tid);
1661 if (blk >= RelationGetNumberOfBlocks(relation))
1662 elog(ERROR, "block number %u is out of range for relation \"%s\"",
1663 blk, RelationGetRelationName(relation));
1666 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1667 * need to examine, and *tid is the TID we will return if ctid turns out
1670 * Note that we will loop until we reach the end of the t_ctid chain.
1671 * Depending on the snapshot passed, there might be at most one visible
1672 * version of the row, but we don't try to optimize for that.
1675 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1680 OffsetNumber offnum;
1686 * Read, pin, and lock the page.
1688 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1689 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1690 page = BufferGetPage(buffer);
1693 * Check for bogus item number. This is not treated as an error
1694 * condition because it can happen while following a t_ctid link. We
1695 * just assume that the prior tid is OK and return it unchanged.
1697 offnum = ItemPointerGetOffsetNumber(&ctid);
1698 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1700 UnlockReleaseBuffer(buffer);
1703 lp = PageGetItemId(page, offnum);
1704 if (!ItemIdIsNormal(lp))
1706 UnlockReleaseBuffer(buffer);
1710 /* OK to access the tuple */
1712 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1713 tp.t_len = ItemIdGetLength(lp);
1716 * After following a t_ctid link, we might arrive at an unrelated
1717 * tuple. Check for XMIN match.
1719 if (TransactionIdIsValid(priorXmax) &&
1720 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1722 UnlockReleaseBuffer(buffer);
1727 * Check time qualification of tuple; if visible, set it as the new
1730 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1735 * If there's a valid t_ctid link, follow it, else we're done.
1737 if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) ||
1738 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1740 UnlockReleaseBuffer(buffer);
1744 ctid = tp.t_data->t_ctid;
1745 priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
1746 UnlockReleaseBuffer(buffer);
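/*
 * A rough usage sketch for heap_get_latest_tid(), as used for example by the
 * currtid_byreloid() support function.  blkno/offnum form a hypothetical
 * starting TID; on return, tid names the newest visible version, or is left
 * unchanged if no version passes the snapshot test.
 *
 *		ItemPointerData tid;
 *
 *		ItemPointerSet(&tid, blkno, offnum);
 *		heap_get_latest_tid(rel, snapshot, &tid);
 */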
1752 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1754 * This is called after we have waited for the XMAX transaction to terminate.
1755 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1756 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1757 * hint bit if possible --- but beware that that may not yet be possible,
1758 * if the transaction committed asynchronously. Hence callers should look
1759 * only at XMAX_INVALID.
1762 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1764 Assert(TransactionIdEquals(HeapTupleHeaderGetXmax(tuple), xid));
1766 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1768 if (TransactionIdDidCommit(xid))
1769 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1772 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1773 InvalidTransactionId);
1779 * GetBulkInsertState - prepare status object for a bulk insert
1782 GetBulkInsertState(void)
1784 BulkInsertState bistate;
1786 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1787 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1788 bistate->current_buf = InvalidBuffer;
1793 * FreeBulkInsertState - clean up after finishing a bulk insert
1796 FreeBulkInsertState(BulkInsertState bistate)
1798 if (bistate->current_buf != InvalidBuffer)
1799 ReleaseBuffer(bistate->current_buf);
1800 FreeAccessStrategy(bistate->strategy);
1806 * heap_insert - insert tuple into a heap
1808 * The new tuple is stamped with current transaction ID and the specified
1811 * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
1812 * logged in WAL, even for a non-temp relation. Safe usage of this behavior
1813 * requires that we arrange that all new tuples go into new pages not
1814 * containing any tuples from other transactions, and that the relation gets
1815 * fsync'd before commit. (See also heap_sync() comments)
1817 * The HEAP_INSERT_SKIP_FSM option is passed directly to
1818 * RelationGetBufferForTuple, which see for more info.
1820 * Note that these options will be applied when inserting into the heap's
1821 * TOAST table, too, if the tuple requires any out-of-line data.
1823 * The BulkInsertState object (if any; bistate can be NULL for default
1824 * behavior) is also just passed through to RelationGetBufferForTuple.
1826 * The return value is the OID assigned to the tuple (either here or by the
1827 * caller), or InvalidOid if no OID. The header fields of *tup are updated
1828 * to match the stored tuple; in particular tup->t_self receives the actual
1829 * TID where the tuple was stored. But note that any toasting of fields
1830 * within the tuple data is NOT reflected into *tup.
1833 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1834 int options, BulkInsertState bistate)
1836 TransactionId xid = GetCurrentTransactionId();
1839 bool all_visible_cleared = false;
1841 if (relation->rd_rel->relhasoids)
1844 /* this is redundant with an Assert in HeapTupleSetOid */
1845 Assert(tup->t_data->t_infomask & HEAP_HASOID);
1849 * If the object id of this tuple has already been assigned, trust the
1850 * caller. There are a couple of ways this can happen. At initial db
1851 * creation, the backend program sets oids for tuples. When we define
1852 * an index, we set the oid. Finally, in the future, we may allow
1853 * users to set their own object ids in order to support a persistent
1854 * object store (objects need to contain pointers to one another).
1856 if (!OidIsValid(HeapTupleGetOid(tup)))
1857 HeapTupleSetOid(tup, GetNewOid(relation));
1862 /* check there is no space for an OID */
1862 Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
1865 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
1866 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
1867 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
1868 HeapTupleHeaderSetXmin(tup->t_data, xid);
1869 HeapTupleHeaderSetCmin(tup->t_data, cid);
1870 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
1871 tup->t_tableOid = RelationGetRelid(relation);
1874 * If the new tuple is too big for storage or contains already toasted
1875 * out-of-line attributes from some other relation, invoke the toaster.
1877 * Note: below this point, heaptup is the data we actually intend to store
1878 * into the relation; tup is the caller's original untoasted data.
1880 if (relation->rd_rel->relkind != RELKIND_RELATION)
1882 /* toast table entries should never be recursively toasted */
1883 Assert(!HeapTupleHasExternal(tup));
1886 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
1887 heaptup = toast_insert_or_update(relation, tup, NULL, options);
1891 /* Find buffer to insert this tuple into */
1892 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1893 InvalidBuffer, options, bistate);
1895 /* NO EREPORT(ERROR) from here till changes are logged */
1896 START_CRIT_SECTION();
1898 RelationPutHeapTuple(relation, buffer, heaptup);
1900 if (PageIsAllVisible(BufferGetPage(buffer)))
1902 all_visible_cleared = true;
1903 PageClearAllVisible(BufferGetPage(buffer));
1907 * XXX Should we set PageSetPrunable on this page?
1909 * The inserting transaction may eventually abort thus making this tuple
1910 * DEAD and hence available for pruning. Though we don't want to optimize
1911 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1912 * aborted tuple will never be pruned until next vacuum is triggered.
1914 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1917 MarkBufferDirty(buffer);
1920 if (!(options & HEAP_INSERT_SKIP_WAL) && !relation->rd_istemp)
1922 xl_heap_insert xlrec;
1923 xl_heap_header xlhdr;
1925 XLogRecData rdata[3];
1926 Page page = BufferGetPage(buffer);
1927 uint8 info = XLOG_HEAP_INSERT;
1929 xlrec.all_visible_cleared = all_visible_cleared;
1930 xlrec.target.node = relation->rd_node;
1931 xlrec.target.tid = heaptup->t_self;
1932 rdata[0].data = (char *) &xlrec;
1933 rdata[0].len = SizeOfHeapInsert;
1934 rdata[0].buffer = InvalidBuffer;
1935 rdata[0].next = &(rdata[1]);
1937 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1938 xlhdr.t_infomask = heaptup->t_data->t_infomask;
1939 xlhdr.t_hoff = heaptup->t_data->t_hoff;
1942 * note we mark rdata[1] as belonging to buffer; if XLogInsert decides
1943 * to write the whole page to the xlog, we don't need to store
1944 * xl_heap_header in the xlog.
1946 rdata[1].data = (char *) &xlhdr;
1947 rdata[1].len = SizeOfHeapHeader;
1948 rdata[1].buffer = buffer;
1949 rdata[1].buffer_std = true;
1950 rdata[1].next = &(rdata[2]);
1952 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
1953 rdata[2].data = (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits);
1954 rdata[2].len = heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits);
1955 rdata[2].buffer = buffer;
1956 rdata[2].buffer_std = true;
1957 rdata[2].next = NULL;
1960 * If this is the single and first tuple on page, we can reinit the
1961 * page instead of restoring the whole thing. Set flag, and hide
1962 * buffer references from XLogInsert.
1964 if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1965 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1967 info |= XLOG_HEAP_INIT_PAGE;
1968 rdata[1].buffer = rdata[2].buffer = InvalidBuffer;
1971 recptr = XLogInsert(RM_HEAP_ID, info, rdata);
1973 PageSetLSN(page, recptr);
1974 PageSetTLI(page, ThisTimeLineID);
1979 UnlockReleaseBuffer(buffer);
1981 /* Clear the bit in the visibility map if necessary */
1982 if (all_visible_cleared)
1983 visibilitymap_clear(relation,
1984 ItemPointerGetBlockNumber(&(heaptup->t_self)));
1987 * If tuple is cachable, mark it for invalidation from the caches in case
1988 * we abort. Note it is OK to do this after releasing the buffer, because
1989 * the heaptup data structure is all in local memory, not in the shared
1992 CacheInvalidateHeapTuple(relation, heaptup);
1994 pgstat_count_heap_insert(relation);
1997 * If heaptup is a private copy, release it. Don't forget to copy t_self
1998 * back to the caller's image, too.
2002 tup->t_self = heaptup->t_self;
2003 heap_freetuple(heaptup);
2006 return HeapTupleGetOid(tup);
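/*
 * A rough bulk-loading sketch in the style of COPY: reuse one
 * BulkInsertState across many heap_insert() calls and skip FSM lookups.
 * (Adding HEAP_INSERT_SKIP_WAL would additionally require the caller to
 * arrange an fsync via heap_sync() before commit, per the comments above.)
 *
 *		BulkInsertState bistate = GetBulkInsertState();
 *		CommandId	cid = GetCurrentCommandId(true);
 *
 *		for each tuple "tup" to be loaded:
 *			heap_insert(rel, tup, cid, HEAP_INSERT_SKIP_FSM, bistate);
 *
 *		FreeBulkInsertState(bistate);
 */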
2010 * simple_heap_insert - insert a tuple
2012 * Currently, this routine differs from heap_insert only in supplying
2013 * a default command ID and not allowing access to the speedup options.
2015 * This should be used rather than using heap_insert directly in most places
2016 * where we are modifying system catalogs.
2019 simple_heap_insert(Relation relation, HeapTuple tup)
2021 return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
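/*
 * A rough sketch of the usual catalog-update pattern built on
 * simple_heap_insert().  Natts_hypothetical and the attribute values are
 * placeholders for a real catalog's schema.
 *
 *		Datum		values[Natts_hypothetical];
 *		bool		nulls[Natts_hypothetical];
 *		HeapTuple	tup;
 *
 *		memset(nulls, false, sizeof(nulls));
 *		values[0] = Int32GetDatum(42);
 *		tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
 *		simple_heap_insert(rel, tup);
 *		CatalogUpdateIndexes(rel, tup);		(if the catalog has indexes)
 *		heap_freetuple(tup);
 */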
2025 * heap_delete - delete a tuple
2027 * NB: do not call this directly unless you are prepared to deal with
2028 * concurrent-update conditions. Use simple_heap_delete instead.
2030 * relation - table to be modified (caller must hold suitable lock)
2031 * tid - TID of tuple to be deleted
2032 * ctid - output parameter, used only for failure case (see below)
2033 * update_xmax - output parameter, used only for failure case (see below)
2034 * cid - delete command ID (used for visibility test, and stored into
2035 * cmax if successful)
2036 * crosscheck - if not InvalidSnapshot, also check tuple against this
2037 * wait - true if should wait for any conflicting update to commit/abort
2039 * Normal, successful return value is HeapTupleMayBeUpdated, which
2040 * actually means we did delete it. Failure return codes are
2041 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
2042 * (the last only possible if wait == false).
2044 * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
2045 * If t_ctid is the same as tid, the tuple was deleted; if different, the
2046 * tuple was updated, and t_ctid is the location of the replacement tuple.
2047 * (t_xmax is needed to verify that the replacement tuple matches.)
2050 heap_delete(Relation relation, ItemPointer tid,
2051 ItemPointer ctid, TransactionId *update_xmax,
2052 CommandId cid, Snapshot crosscheck, bool wait)
2055 TransactionId xid = GetCurrentTransactionId();
2060 bool have_tuple_lock = false;
2062 bool all_visible_cleared = false;
2064 Assert(ItemPointerIsValid(tid));
2066 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2067 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2069 page = BufferGetPage(buffer);
2070 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2071 Assert(ItemIdIsNormal(lp));
2073 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2074 tp.t_len = ItemIdGetLength(lp);
2078 result = HeapTupleSatisfiesUpdate(tp.t_data, cid, buffer);
2080 if (result == HeapTupleInvisible)
2082 UnlockReleaseBuffer(buffer);
2083 elog(ERROR, "attempted to delete invisible tuple");
2085 else if (result == HeapTupleBeingUpdated && wait)
2087 TransactionId xwait;
2090 /* must copy state data before unlocking buffer */
2091 xwait = HeapTupleHeaderGetXmax(tp.t_data);
2092 infomask = tp.t_data->t_infomask;
2094 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2097 * Acquire tuple lock to establish our priority for the tuple (see
2098 * heap_lock_tuple). LockTuple will release us when we are
2099 * next-in-line for the tuple.
2101 * If we are forced to "start over" below, we keep the tuple lock;
2102 * this arranges that we stay at the head of the line while rechecking
2105 if (!have_tuple_lock)
2107 LockTuple(relation, &(tp.t_self), ExclusiveLock);
2108 have_tuple_lock = true;
2112 * Sleep until concurrent transaction ends. Note that we don't care
2113 * if the locker has an exclusive or shared lock, because we need
2117 if (infomask & HEAP_XMAX_IS_MULTI)
2119 /* wait for multixact */
2120 MultiXactIdWait((MultiXactId) xwait);
2121 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2124 * If xwait had just locked the tuple then some other xact could
2125 * update this tuple before we get to this point. Check for xmax
2126 * change, and start over if so.
2128 if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
2129 !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
2134 * You might think the multixact is necessarily done here, but not
2135 * so: it could have surviving members, namely our own xact or
2136 * other subxacts of this backend. It is legal for us to delete
2137 * the tuple in either case, however (the latter case is
2138 * essentially a situation of upgrading our former shared lock to
2139 * exclusive). We don't bother changing the on-disk hint bits
2140 * since we are about to overwrite the xmax altogether.
2145 /* wait for regular transaction to end */
2146 XactLockTableWait(xwait);
2147 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2150 * xwait is done, but if xwait had just locked the tuple then some
2151 * other xact could update this tuple before we get to this point.
2152 * Check for xmax change, and start over if so.
2154 if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
2155 !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
2159 /* Otherwise check if it committed or aborted */
2160 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2164 * We may overwrite if previous xmax aborted, or if it committed but
2165 * only locked the tuple without updating it.
2167 if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
2169 result = HeapTupleMayBeUpdated;
2171 result = HeapTupleUpdated;
2174 if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
2176 /* Perform additional check for serializable RI updates */
2177 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2178 result = HeapTupleUpdated;
2181 if (result != HeapTupleMayBeUpdated)
2183 Assert(result == HeapTupleSelfUpdated ||
2184 result == HeapTupleUpdated ||
2185 result == HeapTupleBeingUpdated);
2186 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2187 *ctid = tp.t_data->t_ctid;
2188 *update_xmax = HeapTupleHeaderGetXmax(tp.t_data);
2189 UnlockReleaseBuffer(buffer);
2190 if (have_tuple_lock)
2191 UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
2195 /* replace cid with a combo cid if necessary */
2196 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2198 START_CRIT_SECTION();
2201 * If this transaction commits, the tuple will become DEAD sooner or
2202 * later. Set flag that this page is a candidate for pruning once our xid
2203 * falls below the OldestXmin horizon. If the transaction finally aborts,
2204 * the subsequent page pruning will be a no-op and the hint will be
2207 PageSetPrunable(page, xid);
2209 if (PageIsAllVisible(page))
2211 all_visible_cleared = true;
2212 PageClearAllVisible(page);
2215 /* store transaction information of xact deleting the tuple */
2216 tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
2218 HEAP_XMAX_IS_MULTI |
2221 HeapTupleHeaderClearHotUpdated(tp.t_data);
2222 HeapTupleHeaderSetXmax(tp.t_data, xid);
2223 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2224 /* Make sure there is no forward chain link in t_ctid */
2225 tp.t_data->t_ctid = tp.t_self;
2227 MarkBufferDirty(buffer);
2230 if (!relation->rd_istemp)
2232 xl_heap_delete xlrec;
2234 XLogRecData rdata[2];
2236 xlrec.all_visible_cleared = all_visible_cleared;
2237 xlrec.target.node = relation->rd_node;
2238 xlrec.target.tid = tp.t_self;
2239 rdata[0].data = (char *) &xlrec;
2240 rdata[0].len = SizeOfHeapDelete;
2241 rdata[0].buffer = InvalidBuffer;
2242 rdata[0].next = &(rdata[1]);
2244 rdata[1].data = NULL;
2246 rdata[1].buffer = buffer;
2247 rdata[1].buffer_std = true;
2248 rdata[1].next = NULL;
2250 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE, rdata);
2252 PageSetLSN(page, recptr);
2253 PageSetTLI(page, ThisTimeLineID);
2258 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2261 * If the tuple has toasted out-of-line attributes, we need to delete
2262 * those items too. We have to do this before releasing the buffer
2263 * because we need to look at the contents of the tuple, but it's OK to
2264 * release the content lock on the buffer first.
2266 if (relation->rd_rel->relkind != RELKIND_RELATION)
2268 /* toast table entries should never be recursively toasted */
2269 Assert(!HeapTupleHasExternal(&tp));
2271 else if (HeapTupleHasExternal(&tp))
2272 toast_delete(relation, &tp);
2275 * Mark tuple for invalidation from system caches at next command
2276 * boundary. We have to do this before releasing the buffer because we
2277 * need to look at the contents of the tuple.
2279 CacheInvalidateHeapTuple(relation, &tp);
2281 /* Clear the bit in the visibility map if necessary */
2282 if (all_visible_cleared)
2283 visibilitymap_clear(relation, BufferGetBlockNumber(buffer));
2285 /* Now we can release the buffer */
2286 ReleaseBuffer(buffer);
2289 * Release the lmgr tuple lock, if we had it.
2291 if (have_tuple_lock)
2292 UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
2294 pgstat_count_heap_delete(relation);
2296 return HeapTupleMayBeUpdated;
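/*
 * A minimal sketch of how a caller might consume heap_delete's failure
 * outputs (hypothetical caller code, not part of this file).  On a
 * HeapTupleUpdated result, the returned ctid/update_xmax identify the
 * replacement version; comparing ctid with the original TID tells a
 * concurrent delete apart from a concurrent update:
 *
 *		result = heap_delete(rel, tid, &update_ctid, &update_xmax,
 *							 GetCurrentCommandId(true), InvalidSnapshot, true);
 *		if (result == HeapTupleUpdated)
 *		{
 *			if (ItemPointerEquals(tid, &update_ctid))
 *				... the row was concurrently deleted ...
 *			else
 *				... follow update_ctid/update_xmax to the newer version ...
 *		}
 */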
2300 * simple_heap_delete - delete a tuple
2302 * This routine may be used to delete a tuple when concurrent updates of
2303 * the target tuple are not expected (for example, because we have a lock
2304 * on the relation associated with the tuple). Any failure is reported via ereport().
2308 simple_heap_delete(Relation relation, ItemPointer tid)
2311 ItemPointerData update_ctid;
2312 TransactionId update_xmax;
2314 result = heap_delete(relation, tid,
2315 &update_ctid, &update_xmax,
2316 GetCurrentCommandId(true), InvalidSnapshot,
2317 true /* wait for commit */ );
2320 case HeapTupleSelfUpdated:
2321 /* Tuple was already updated in current command? */
2322 elog(ERROR, "tuple already updated by self");
2325 case HeapTupleMayBeUpdated:
2326 /* done successfully */
2329 case HeapTupleUpdated:
2330 elog(ERROR, "tuple concurrently updated");
2334 elog(ERROR, "unrecognized heap_delete status: %u", result);
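/*
 * Hypothetical sketch of a typical simple_heap_delete call site (not code
 * from this file): the caller holds a lock on the target catalog that rules
 * out concurrent updates, so any non-success result really is an error.
 *
 *		Relation	rel = heap_open(SomeCatalogId, RowExclusiveLock);
 *		HeapTuple	tup = ... look up the doomed tuple ...;
 *
 *		simple_heap_delete(rel, &tup->t_self);
 *		heap_close(rel, RowExclusiveLock);
 *
 * (SomeCatalogId is a placeholder, not a real OID macro.)
 */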
2340 * heap_update - replace a tuple
2342 * NB: do not call this directly unless you are prepared to deal with
2343 * concurrent-update conditions. Use simple_heap_update instead.
2345 * relation - table to be modified (caller must hold suitable lock)
2346 * otid - TID of old tuple to be replaced
2347 * newtup - newly constructed tuple data to store
2348 * ctid - output parameter, used only for failure case (see below)
2349 * update_xmax - output parameter, used only for failure case (see below)
2350 * cid - update command ID (used for visibility test, and stored into
2351 * cmax/cmin if successful)
2352 * crosscheck - if not InvalidSnapshot, also check old tuple against this
2353 * wait - true if should wait for any conflicting update to commit/abort
2355 * Normal, successful return value is HeapTupleMayBeUpdated, which
2356 * actually means we *did* update it. Failure return codes are
2357 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
2358 * (the last only possible if wait == false).
2360 * On success, the header fields of *newtup are updated to match the new
2361 * stored tuple; in particular, newtup->t_self is set to the TID where the
2362 * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
2363 * update was done. However, any TOAST changes in the new tuple's
2364 * data are not reflected into *newtup.
2366 * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
2367 * If t_ctid is the same as otid, the tuple was deleted; if different, the
2368 * tuple was updated, and t_ctid is the location of the replacement tuple.
2369 * (t_xmax is needed to verify that the replacement tuple matches.)
2372 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2373 ItemPointer ctid, TransactionId *update_xmax,
2374 CommandId cid, Snapshot crosscheck, bool wait)
2377 TransactionId xid = GetCurrentTransactionId();
2378 Bitmapset *hot_attrs;
2380 HeapTupleData oldtup;
2389 bool have_tuple_lock = false;
2391 bool use_hot_update = false;
2392 bool all_visible_cleared = false;
2393 bool all_visible_cleared_new = false;
2395 Assert(ItemPointerIsValid(otid));
2398 * Fetch the list of attributes to be checked for HOT update. This is
2399 * wasted effort if we fail to update or have to put the new tuple on a
2400 * different page. But we must compute the list before obtaining buffer
2401 * lock --- in the worst case, if we are doing an update on one of the
2402 * relevant system catalogs, we could deadlock if we try to fetch the list
2403 * later. In any case, the relcache caches the data so this is usually pretty cheap.
2406 * Note that we get a copy here, so we need not worry about relcache flush
2407 * happening midway through.
2409 hot_attrs = RelationGetIndexAttrBitmap(relation);
2411 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
2412 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2414 page = BufferGetPage(buffer);
2415 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
2416 Assert(ItemIdIsNormal(lp));
2418 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2419 oldtup.t_len = ItemIdGetLength(lp);
2420 oldtup.t_self = *otid;
2423 * Note: beyond this point, use oldtup not otid to refer to old tuple.
2424 * otid may very well point at newtup->t_self, which we will overwrite
2425 * with the new tuple's location, so there's great risk of confusion if we use otid anymore.
2430 result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer);
2432 if (result == HeapTupleInvisible)
2434 UnlockReleaseBuffer(buffer);
2435 elog(ERROR, "attempted to update invisible tuple");
2437 else if (result == HeapTupleBeingUpdated && wait)
2439 TransactionId xwait;
2442 /* must copy state data before unlocking buffer */
2443 xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
2444 infomask = oldtup.t_data->t_infomask;
2446 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2449 * Acquire tuple lock to establish our priority for the tuple (see
2450 * heap_lock_tuple). LockTuple will release us when we are
2451 * next-in-line for the tuple.
2453 * If we are forced to "start over" below, we keep the tuple lock;
2454 * this arranges that we stay at the head of the line while rechecking tuple state.
2457 if (!have_tuple_lock)
2459 LockTuple(relation, &(oldtup.t_self), ExclusiveLock);
2460 have_tuple_lock = true;
2464 * Sleep until concurrent transaction ends. Note that we don't care
2465 * if the locker has an exclusive or shared lock, because we need exclusive.
2469 if (infomask & HEAP_XMAX_IS_MULTI)
2471 /* wait for multixact */
2472 MultiXactIdWait((MultiXactId) xwait);
2473 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2476 * If xwait had just locked the tuple then some other xact could
2477 * update this tuple before we get to this point. Check for xmax
2478 * change, and start over if so.
2480 if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
2481 !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
2486 * You might think the multixact is necessarily done here, but not
2487 * so: it could have surviving members, namely our own xact or
2488 * other subxacts of this backend. It is legal for us to update
2489 * the tuple in either case, however (the latter case is
2490 * essentially a situation of upgrading our former shared lock to
2491 * exclusive). We don't bother changing the on-disk hint bits
2492 * since we are about to overwrite the xmax altogether.
2497 /* wait for regular transaction to end */
2498 XactLockTableWait(xwait);
2499 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2502 * xwait is done, but if xwait had just locked the tuple then some
2503 * other xact could update this tuple before we get to this point.
2504 * Check for xmax change, and start over if so.
2506 if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
2507 !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
2511 /* Otherwise check if it committed or aborted */
2512 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
2516 * We may overwrite if previous xmax aborted, or if it committed but
2517 * only locked the tuple without updating it.
2519 if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
2521 result = HeapTupleMayBeUpdated;
2523 result = HeapTupleUpdated;
2526 if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
2528 /* Perform additional check for serializable RI updates */
2529 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
2530 result = HeapTupleUpdated;
2533 if (result != HeapTupleMayBeUpdated)
2535 Assert(result == HeapTupleSelfUpdated ||
2536 result == HeapTupleUpdated ||
2537 result == HeapTupleBeingUpdated);
2538 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
2539 *ctid = oldtup.t_data->t_ctid;
2540 *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
2541 UnlockReleaseBuffer(buffer);
2542 if (have_tuple_lock)
2543 UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
2544 bms_free(hot_attrs);
2548 /* Fill in OID and transaction status data for newtup */
2549 if (relation->rd_rel->relhasoids)
2552 /* this is redundant with an Assert in HeapTupleSetOid */
2553 Assert(newtup->t_data->t_infomask & HEAP_HASOID);
2555 HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
2559 /* check there is no space for an OID */
2560 Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
2563 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2564 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2565 newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
2566 HeapTupleHeaderSetXmin(newtup->t_data, xid);
2567 HeapTupleHeaderSetCmin(newtup->t_data, cid);
2568 HeapTupleHeaderSetXmax(newtup->t_data, 0); /* for cleanliness */
2569 newtup->t_tableOid = RelationGetRelid(relation);
2572 * Replace cid with a combo cid if necessary. Note that we already put
2573 * the plain cid into the new tuple.
2575 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
2578 * If the toaster needs to be activated, OR if the new tuple will not fit
2579 * on the same page as the old, then we need to release the content lock
2580 * (but not the pin!) on the old tuple's buffer while we are off doing
2581 * TOAST and/or table-file-extension work. We must mark the old tuple to
2582 * show that it's already being updated, else other processes may try to
2583 * update it themselves.
2585 * We need to invoke the toaster if there are already any out-of-line
2586 * toasted values present, or if the new tuple is over-threshold.
2588 if (relation->rd_rel->relkind != RELKIND_RELATION)
2590 /* toast table entries should never be recursively toasted */
2591 Assert(!HeapTupleHasExternal(&oldtup));
2592 Assert(!HeapTupleHasExternal(newtup));
2596 need_toast = (HeapTupleHasExternal(&oldtup) ||
2597 HeapTupleHasExternal(newtup) ||
2598 newtup->t_len > TOAST_TUPLE_THRESHOLD);
2600 pagefree = PageGetHeapFreeSpace(page);
2602 newtupsize = MAXALIGN(newtup->t_len);
2604 if (need_toast || newtupsize > pagefree)
2606 /* Clear obsolete visibility flags ... */
2607 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
2609 HEAP_XMAX_IS_MULTI |
2612 HeapTupleClearHotUpdated(&oldtup);
2613 /* ... and store info about transaction updating this tuple */
2614 HeapTupleHeaderSetXmax(oldtup.t_data, xid);
2615 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
2616 /* temporarily make it look not-updated */
2617 oldtup.t_data->t_ctid = oldtup.t_self;
2618 already_marked = true;
2619 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2622 * Let the toaster do its thing, if needed.
2624 * Note: below this point, heaptup is the data we actually intend to
2625 * store into the relation; newtup is the caller's original untoasted data.
2630 /* Note we always use WAL and FSM during updates */
2631 heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
2632 newtupsize = MAXALIGN(heaptup->t_len);
2638 * Now, do we need a new page for the tuple, or not? This is a bit
2639 * tricky since someone else could have added tuples to the page while
2640 * we weren't looking. We have to recheck the available space after
2641 * reacquiring the buffer lock. But don't bother to do that if the
2642 * former amount of free space is still not enough; it's unlikely
2643 * there's more free now than before.
2645 * What's more, if we need to get a new page, we will need to acquire
2646 * buffer locks on both old and new pages. To avoid deadlock against
2647 * some other backend trying to get the same two locks in the other
2648 * order, we must be consistent about the order we get the locks in.
2649 * We use the rule "lock the lower-numbered page of the relation
2650 * first". To implement this, we must do RelationGetBufferForTuple
2651 * while not holding the lock on the old page, and we must rely on it
2652 * to get the locks on both pages in the correct order.
2654 if (newtupsize > pagefree)
2656 /* Assume there's no chance to put heaptup on same page. */
2657 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
2662 /* Re-acquire the lock on the old tuple's page. */
2663 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2664 /* Re-check using the up-to-date free space */
2665 pagefree = PageGetHeapFreeSpace(page);
2666 if (newtupsize > pagefree)
2669 * Rats, it doesn't fit anymore. We must now unlock and
2670 * relock to avoid deadlock. Fortunately, this path should seldom be taken.
2673 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2674 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
2679 /* OK, it fits here, so we're done. */
2686 /* No TOAST work needed, and it'll fit on same page */
2687 already_marked = false;
2693 * At this point newbuf and buffer are both pinned and locked, and newbuf
2694 * has enough space for the new tuple. If they are the same buffer, only one pin is held.
2698 if (newbuf == buffer)
2701 * Since the new tuple is going into the same page, we might be able
2702 * to do a HOT update. Check if any of the index columns have been
2703 * changed. If not, then HOT update is possible.
2705 if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
2706 use_hot_update = true;
2710 /* Set a hint that the old page could use prune/defrag */
2714 /* NO EREPORT(ERROR) from here till changes are logged */
2715 START_CRIT_SECTION();
2718 * If this transaction commits, the old tuple will become DEAD sooner or
2719 * later. Set flag that this page is a candidate for pruning once our xid
2720 * falls below the OldestXmin horizon. If the transaction finally aborts,
2721 * the subsequent page pruning will be a no-op and the hint will be cleared.
2724 * XXX Should we set hint on newbuf as well? If the transaction aborts,
2725 * there would be a prunable tuple in the newbuf; but for now we choose
2726 * not to optimize for aborts. Note that heap_xlog_update must be kept in
2727 * sync if this decision changes.
2729 PageSetPrunable(page, xid);
2733 /* Mark the old tuple as HOT-updated */
2734 HeapTupleSetHotUpdated(&oldtup);
2735 /* And mark the new tuple as heap-only */
2736 HeapTupleSetHeapOnly(heaptup);
2737 /* Mark the caller's copy too, in case different from heaptup */
2738 HeapTupleSetHeapOnly(newtup);
2742 /* Make sure tuples are correctly marked as not-HOT */
2743 HeapTupleClearHotUpdated(&oldtup);
2744 HeapTupleClearHeapOnly(heaptup);
2745 HeapTupleClearHeapOnly(newtup);
2748 RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */
2750 if (!already_marked)
2752 /* Clear obsolete visibility flags ... */
2753 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
2755 HEAP_XMAX_IS_MULTI |
2758 /* ... and store info about transaction updating this tuple */
2759 HeapTupleHeaderSetXmax(oldtup.t_data, xid);
2760 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
2763 /* record address of new tuple in t_ctid of old one */
2764 oldtup.t_data->t_ctid = heaptup->t_self;
2766 /* clear PD_ALL_VISIBLE flags */
2767 if (PageIsAllVisible(BufferGetPage(buffer)))
2769 all_visible_cleared = true;
2770 PageClearAllVisible(BufferGetPage(buffer));
2772 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
2774 all_visible_cleared_new = true;
2775 PageClearAllVisible(BufferGetPage(newbuf));
2778 if (newbuf != buffer)
2779 MarkBufferDirty(newbuf);
2780 MarkBufferDirty(buffer);
2783 if (!relation->rd_istemp)
2785 XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self,
2787 all_visible_cleared,
2788 all_visible_cleared_new);
2790 if (newbuf != buffer)
2792 PageSetLSN(BufferGetPage(newbuf), recptr);
2793 PageSetTLI(BufferGetPage(newbuf), ThisTimeLineID);
2795 PageSetLSN(BufferGetPage(buffer), recptr);
2796 PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
2801 if (newbuf != buffer)
2802 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
2803 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2806 * Mark old tuple for invalidation from system caches at next command
2807 * boundary. We have to do this before releasing the buffer because we
2808 * need to look at the contents of the tuple.
2810 CacheInvalidateHeapTuple(relation, &oldtup);
2812 /* Clear bits in visibility map */
2813 if (all_visible_cleared)
2814 visibilitymap_clear(relation, BufferGetBlockNumber(buffer));
2815 if (all_visible_cleared_new)
2816 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf));
2818 /* Now we can release the buffer(s) */
2819 if (newbuf != buffer)
2820 ReleaseBuffer(newbuf);
2821 ReleaseBuffer(buffer);
2824 * If new tuple is cachable, mark it for invalidation from the caches in
2825 * case we abort. Note it is OK to do this after releasing the buffer,
2826 * because the heaptup data structure is all in local memory, not in the shared buffer.
2829 CacheInvalidateHeapTuple(relation, heaptup);
2832 * Release the lmgr tuple lock, if we had it.
2834 if (have_tuple_lock)
2835 UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
2837 pgstat_count_heap_update(relation, use_hot_update);
2840 * If heaptup is a private copy, release it. Don't forget to copy t_self
2841 * back to the caller's image, too.
2843 if (heaptup != newtup)
2845 newtup->t_self = heaptup->t_self;
2846 heap_freetuple(heaptup);
2849 bms_free(hot_attrs);
2851 return HeapTupleMayBeUpdated;
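/*
 * Illustrative sketch of the success path from a hypothetical caller's
 * point of view (not code from this file): per the header comment above,
 * *newtup is updated in place, so the caller can read back where the new
 * version went and whether new index entries are needed.
 *
 *		if (heap_update(rel, otid, newtup, &update_ctid, &update_xmax,
 *						GetCurrentCommandId(true), InvalidSnapshot,
 *						true) == HeapTupleMayBeUpdated)
 *		{
 *			... newtup->t_self now points at the new version ...
 *			if (!HeapTupleIsHeapOnly(newtup))
 *				... not a HOT update: new index entries are required ...
 *		}
 */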
2855 * Check if the specified attribute's value is the same in both given tuples.
2856 * Subroutine for HeapSatisfiesHOTUpdate.
2859 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
2860 HeapTuple tup1, HeapTuple tup2)
2866 Form_pg_attribute att;
2869 * If it's a whole-tuple reference, say "not equal". It's not really
2870 * worth supporting this case, since it could only succeed after a no-op
2871 * update, which is hardly a case worth optimizing for.
2877 * Likewise, automatically say "not equal" for any system attribute other
2878 * than OID and tableOID; we cannot expect these to be consistent in a HOT
2879 * chain, or even to be set correctly yet in the new tuple.
2883 if (attrnum != ObjectIdAttributeNumber &&
2884 attrnum != TableOidAttributeNumber)
2889 * Extract the corresponding values. XXX this is pretty inefficient if
2890 * there are many indexed columns. Should HeapSatisfiesHOTUpdate do a
2891 * single heap_deform_tuple call on each tuple, instead? But that doesn't
2892 * work for system columns ...
2894 value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
2895 value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
2898 * If one value is NULL and the other is not, then they are certainly not equal.
2901 if (isnull1 != isnull2)
2905 * If both are NULL, they can be considered equal.
2911 * We do simple binary comparison of the two datums. This may be overly
2912 * strict because there can be multiple binary representations for the
2913 * same logical value. But we should be OK as long as there are no false
2914 * positives. Using a type-specific equality operator is messy because
2915 * there could be multiple notions of equality in different operator
2916 * classes; furthermore, we cannot safely invoke user-defined functions
2917 * while holding exclusive buffer lock.
2921 /* The only allowed system columns are OIDs, so do this */
2922 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
2926 Assert(attrnum <= tupdesc->natts);
2927 att = tupdesc->attrs[attrnum - 1];
2928 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
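/*
 * To illustrate the "overly strict" caveat above with a hypothetical
 * example: the same logical value may be stored once in-line and once as a
 * toasted (compressed or out-of-line) datum, and the raw bytes then differ,
 * so
 *
 *		datumIsEqual(inline_datum, toasted_datum, false, -1)
 *
 * returns false even though the values are logically equal.  That merely
 * costs us a HOT-update opportunity; it never reports a false "equal".
 */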
2933 * Check if the old and new tuples represent a HOT-safe update. To be able
2934 * to do a HOT update, we must not have changed any columns used in index definitions.
2937 * The set of attributes to be checked is passed in (we dare not try to
2938 * compute it while holding exclusive buffer lock...) NOTE that hot_attrs
2939 * is destructively modified! That is OK since this is invoked at most once by heap_update().
2942 * Returns true if safe to do HOT update.
2945 HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
2946 HeapTuple oldtup, HeapTuple newtup)
2950 while ((attrnum = bms_first_member(hot_attrs)) >= 0)
2952 /* Adjust for system attributes */
2953 attrnum += FirstLowInvalidHeapAttributeNumber;
2955 /* If the attribute value has changed, we can't do HOT update */
2956 if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
2965 * simple_heap_update - replace a tuple
2967 * This routine may be used to update a tuple when concurrent updates of
2968 * the target tuple are not expected (for example, because we have a lock
2969 * on the relation associated with the tuple). Any failure is reported via ereport().
2973 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
2976 ItemPointerData update_ctid;
2977 TransactionId update_xmax;
2979 result = heap_update(relation, otid, tup,
2980 &update_ctid, &update_xmax,
2981 GetCurrentCommandId(true), InvalidSnapshot,
2982 true /* wait for commit */ );
2985 case HeapTupleSelfUpdated:
2986 /* Tuple was already updated in current command? */
2987 elog(ERROR, "tuple already updated by self");
2990 case HeapTupleMayBeUpdated:
2991 /* done successfully */
2994 case HeapTupleUpdated:
2995 elog(ERROR, "tuple concurrently updated");
2999 elog(ERROR, "unrecognized heap_update status: %u", result);
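/*
 * Hypothetical sketch of the usual call pattern (not code from this file):
 * the caller modifies a palloc'd copy of an existing catalog tuple, stores
 * it with simple_heap_update, and then refreshes the catalog's indexes.
 *
 *		HeapTuple	ctup = ... palloc'd copy of the existing tuple ...;
 *
 *		... scribble on ctup's user data ...
 *		simple_heap_update(rel, &ctup->t_self, ctup);
 *		CatalogUpdateIndexes(rel, ctup);
 */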
3005 * heap_lock_tuple - lock a tuple in shared or exclusive mode
3007 * Note that this acquires a buffer pin, which the caller must release.
3010 * relation: relation containing tuple (caller must hold suitable lock)
3011 * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
3012 * cid: current command ID (used for visibility test, and stored into
3013 * tuple's cmax if lock is successful)
3014 * mode: indicates if shared or exclusive tuple lock is desired
3015 * nowait: if true, ereport rather than blocking if lock not available
3017 * Output parameters:
3018 * *tuple: all fields filled in
3019 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
3020 * *ctid: set to tuple's t_ctid, but only in failure cases
3021 * *update_xmax: set to tuple's xmax, but only in failure cases
3023 * Function result may be:
3024 * HeapTupleMayBeUpdated: lock was successfully acquired
3025 * HeapTupleSelfUpdated: lock failed because tuple updated by self
3026 * HeapTupleUpdated: lock failed because tuple updated by other xact
3028 * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
3029 * If t_ctid is the same as t_self, the tuple was deleted; if different, the
3030 * tuple was updated, and t_ctid is the location of the replacement tuple.
3031 * (t_xmax is needed to verify that the replacement tuple matches.)
3034 * NOTES: because the shared-memory lock table is of finite size, but users
3035 * could reasonably want to lock large numbers of tuples, we do not rely on
3036 * the standard lock manager to store tuple-level locks over the long term.
3037 * Instead, a tuple is marked as locked by setting the current transaction's
3038 * XID as its XMAX, and setting additional infomask bits to distinguish this
3039 * usage from the more normal case of having deleted the tuple. When
3040 * multiple transactions concurrently share-lock a tuple, the first locker's
3041 * XID is replaced in XMAX with a MultiXactId representing the set of
3042 * XIDs currently holding share-locks.
3044 * When it is necessary to wait for a tuple-level lock to be released, the
3045 * basic delay is provided by XactLockTableWait or MultiXactIdWait on the
3046 * contents of the tuple's XMAX. However, that mechanism will release all
3047 * waiters concurrently, so there would be a race condition as to which
3048 * waiter gets the tuple, potentially leading to indefinite starvation of
3049 * some waiters. The possibility of share-locking makes the problem much
3050 * worse --- a steady stream of share-lockers can easily block an exclusive
3051 * locker forever. To provide more reliable semantics about who gets a
3052 * tuple-level lock first, we use the standard lock manager. The protocol
3053 * for waiting for a tuple-level lock is really
3054 * LockTuple()
3055 * XactLockTableWait()
3056 * mark tuple as locked by me
3057 * UnlockTuple()
3058 * When there are multiple waiters, arbitration of who is to get the lock next
3059 * is provided by LockTuple(). However, at most one tuple-level lock will
3060 * be held or awaited per backend at any time, so we don't risk overflow
3061 * of the lock table. Note that incoming share-lockers are required to
3062 * do LockTuple as well, if there is any conflict, to ensure that they don't
3063 * starve out waiting exclusive-lockers. However, if there is no active
3064 * conflict for a tuple, we don't incur any extra overhead.
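/*
 * For orientation, a rough summary of how the lock state described above is
 * encoded in the tuple header (based on the infomask bits manipulated below;
 * the bit definitions live in htup.h):
 *
 *		exclusive lock, one locker:	 xmax = locker's XID,
 *									 infomask |= HEAP_XMAX_EXCL_LOCK
 *		shared lock, one locker:	 xmax = locker's XID,
 *									 infomask |= HEAP_XMAX_SHARED_LOCK
 *		shared lock, many lockers:	 xmax = MultiXactId of the lockers,
 *									 infomask |= HEAP_XMAX_SHARED_LOCK
 *												 | HEAP_XMAX_IS_MULTI
 */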
3067 heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer,
3068 ItemPointer ctid, TransactionId *update_xmax,
3069 CommandId cid, LockTupleMode mode, bool nowait)
3072 ItemPointer tid = &(tuple->t_self);
3077 uint16 old_infomask;
3078 uint16 new_infomask;
3079 LOCKMODE tuple_lock_type;
3080 bool have_tuple_lock = false;
3082 tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;
3084 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3085 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
3087 page = BufferGetPage(*buffer);
3088 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3089 Assert(ItemIdIsNormal(lp));
3091 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
3092 tuple->t_len = ItemIdGetLength(lp);
3093 tuple->t_tableOid = RelationGetRelid(relation);
3096 result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer);
3098 if (result == HeapTupleInvisible)
3100 UnlockReleaseBuffer(*buffer);
3101 elog(ERROR, "attempted to lock invisible tuple");
3103 else if (result == HeapTupleBeingUpdated)
3105 TransactionId xwait;
3108 /* must copy state data before unlocking buffer */
3109 xwait = HeapTupleHeaderGetXmax(tuple->t_data);
3110 infomask = tuple->t_data->t_infomask;
3112 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
3115 * If we wish to acquire share lock, and the tuple is already
3116 * share-locked by a multixact that includes any subtransaction of the
3117 * current top transaction, then we effectively hold the desired lock
3118 * already. We *must* succeed without trying to take the tuple lock,
3119 * else we will deadlock against anyone waiting to acquire exclusive
3120 * lock. We don't need to make any state changes in this case.
3122 if (mode == LockTupleShared &&
3123 (infomask & HEAP_XMAX_IS_MULTI) &&
3124 MultiXactIdIsCurrent((MultiXactId) xwait))
3126 Assert(infomask & HEAP_XMAX_SHARED_LOCK);
3127 /* Probably can't hold tuple lock here, but may as well check */
3128 if (have_tuple_lock)
3129 UnlockTuple(relation, tid, tuple_lock_type);
3130 return HeapTupleMayBeUpdated;
3134 * Acquire tuple lock to establish our priority for the tuple.
3135 * LockTuple will release us when we are next-in-line for the tuple.
3136 * We must do this even if we are share-locking.
3138 * If we are forced to "start over" below, we keep the tuple lock;
3139 * this arranges that we stay at the head of the line while rechecking tuple state.
3142 if (!have_tuple_lock)
3146 if (!ConditionalLockTuple(relation, tid, tuple_lock_type))
3148 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
3149 errmsg("could not obtain lock on row in relation \"%s\"",
3150 RelationGetRelationName(relation))));
3153 LockTuple(relation, tid, tuple_lock_type);
3154 have_tuple_lock = true;
3157 if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK))
3160 * Acquiring sharelock when there's at least one sharelocker
3161 * already. We need not wait for him/them to complete.
3163 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
3166 * Make sure it's still a shared lock, else start over. (It's OK
3167 * if the ownership of the shared lock has changed, though.)
3169 if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
3172 else if (infomask & HEAP_XMAX_IS_MULTI)
3174 /* wait for multixact to end */
3177 if (!ConditionalMultiXactIdWait((MultiXactId) xwait))
3179 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
3180 errmsg("could not obtain lock on row in relation \"%s\"",
3181 RelationGetRelationName(relation))));
3184 MultiXactIdWait((MultiXactId) xwait);
3186 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
3189 * If xwait had just locked the tuple then some other xact could
3190 * update this tuple before we get to this point. Check for xmax
3191 * change, and start over if so.
3193 if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
3194 !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
3199 * You might think the multixact is necessarily done here, but not
3200 * so: it could have surviving members, namely our own xact or
3201 * other subxacts of this backend. It is legal for us to lock the
3202 * tuple in either case, however. We don't bother changing the
3203 * on-disk hint bits since we are about to overwrite the xmax altogether.
3209 /* wait for regular transaction to end */
3212 if (!ConditionalXactLockTableWait(xwait))
3214 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
3215 errmsg("could not obtain lock on row in relation \"%s\"",
3216 RelationGetRelationName(relation))));
3219 XactLockTableWait(xwait);
3221 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
3224 * xwait is done, but if xwait had just locked the tuple then some
3225 * other xact could update this tuple before we get to this point.
3226 * Check for xmax change, and start over if so.
3228 if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
3229 !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
3233 /* Otherwise check if it committed or aborted */
3234 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
3238 * We may lock if previous xmax aborted, or if it committed but only
3239 * locked the tuple without updating it. The case where we didn't
3240 * wait because we are joining an existing shared lock is correctly handled, too.
3243 if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
3245 result = HeapTupleMayBeUpdated;
3247 result = HeapTupleUpdated;
3250 if (result != HeapTupleMayBeUpdated)
3252 Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
3253 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
3254 *ctid = tuple->t_data->t_ctid;
3255 *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
3256 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
3257 if (have_tuple_lock)
3258 UnlockTuple(relation, tid, tuple_lock_type);
3263 * We might already hold the desired lock (or stronger), possibly under a
3264 * different subtransaction of the current top transaction. If so, there
3265 * is no need to change state or issue a WAL record. We already handled
3266 * the case where this is true for xmax being a MultiXactId, so now check
3267 * for cases where it is a plain TransactionId.
3269 * Note in particular that this covers the case where we already hold
3270 * exclusive lock on the tuple and the caller only wants shared lock. It
3271 * would certainly not do to give up the exclusive lock.
3273 xmax = HeapTupleHeaderGetXmax(tuple->t_data);
3274 old_infomask = tuple->t_data->t_infomask;
3276 if (!(old_infomask & (HEAP_XMAX_INVALID |
3277 HEAP_XMAX_COMMITTED |
3278 HEAP_XMAX_IS_MULTI)) &&
3279 (mode == LockTupleShared ?
3280 (old_infomask & HEAP_IS_LOCKED) :
3281 (old_infomask & HEAP_XMAX_EXCL_LOCK)) &&
3282 TransactionIdIsCurrentTransactionId(xmax))
3284 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
3285 /* Probably can't hold tuple lock here, but may as well check */
3286 if (have_tuple_lock)
3287 UnlockTuple(relation, tid, tuple_lock_type);
3288 return HeapTupleMayBeUpdated;
3292 * Compute the new xmax and infomask to store into the tuple. Note we do
3293 * not modify the tuple just yet, because that would leave it in the wrong
3294 * state if multixact.c elogs.
3296 xid = GetCurrentTransactionId();
3298 new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED |
3300 HEAP_XMAX_IS_MULTI |
3304 if (mode == LockTupleShared)
3307 * If this is the first acquisition of a shared lock in the current
3308 * transaction, set my per-backend OldestMemberMXactId setting. We can
3309 * be certain that the transaction will never become a member of any
3310 * older MultiXactIds than that. (We have to do this even if we end
3311 * up just using our own TransactionId below, since some other backend
3312 * could incorporate our XID into a MultiXact immediately afterwards.)
3314 MultiXactIdSetOldestMember();
3316 new_infomask |= HEAP_XMAX_SHARED_LOCK;
3319 * Check to see if we need a MultiXactId because there are multiple lockers.
3322 * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if
3323 * the xmax was a MultiXactId but it was not running anymore. There is
3324 * a race condition, which is that the MultiXactId may have finished
3325 * since then, but that uncommon case is handled within
3326 * MultiXactIdExpand.
3328 * There is a similar race condition possible when the old xmax was a
3329 * regular TransactionId. We test TransactionIdIsInProgress again
3330 * just to narrow the window, but it's still possible to end up
3331 * creating an unnecessary MultiXactId. Fortunately this is harmless.
3333 if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED)))
3335 if (old_infomask & HEAP_XMAX_IS_MULTI)
3338 * If the XMAX is already a MultiXactId, then we need to
3339 * expand it to include our own TransactionId.
3341 xid = MultiXactIdExpand((MultiXactId) xmax, xid);
3342 new_infomask |= HEAP_XMAX_IS_MULTI;
3344 else if (TransactionIdIsInProgress(xmax))
3347 * If the XMAX is a valid TransactionId, then we need to
3348 * create a new MultiXactId that includes both the old locker
3349 * and our own TransactionId.
3351 xid = MultiXactIdCreate(xmax, xid);
3352 new_infomask |= HEAP_XMAX_IS_MULTI;
3357 * Can get here iff HeapTupleSatisfiesUpdate saw the old xmax
3358 * as running, but it finished before
3359 * TransactionIdIsInProgress() got to run. Treat it like
3360 * there's no locker in the tuple.
3367 * There was no previous locker, so just insert our own TransactionId.
3374 /* We want an exclusive lock on the tuple */
3375 new_infomask |= HEAP_XMAX_EXCL_LOCK;
3378 START_CRIT_SECTION();
3381 * Store transaction information of xact locking the tuple.
3383 * Note: Cmax is meaningless in this context, so don't set it; this avoids
3384 * possibly generating a useless combo CID.
3386 tuple->t_data->t_infomask = new_infomask;
3387 HeapTupleHeaderClearHotUpdated(tuple->t_data);
3388 HeapTupleHeaderSetXmax(tuple->t_data, xid);
3389 /* Make sure there is no forward chain link in t_ctid */
3390 tuple->t_data->t_ctid = *tid;
3392 MarkBufferDirty(*buffer);
3395 * XLOG stuff. You might think that we don't need an XLOG record because
3396 * there is no state change worth restoring after a crash. You would be
3397 * wrong however: we have just written either a TransactionId or a
3398 * MultiXactId that may never have been seen on disk before, and we need
3399 * to make sure that there are XLOG entries covering those ID numbers.
3400 * Else the same IDs might be re-used after a crash, which would be
3401 * disastrous if this page made it to disk before the crash. Essentially
3402 * we have to enforce the WAL log-before-data rule even in this case.
3403 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
3404 * entries for everything anyway.)
3406 if (!relation->rd_istemp)
3410 XLogRecData rdata[2];
3412 xlrec.target.node = relation->rd_node;
3413 xlrec.target.tid = tuple->t_self;
3414 xlrec.locking_xid = xid;
3415 xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0);
3416 xlrec.shared_lock = (mode == LockTupleShared);
3417 rdata[0].data = (char *) &xlrec;
3418 rdata[0].len = SizeOfHeapLock;
3419 rdata[0].buffer = InvalidBuffer;
3420 rdata[0].next = &(rdata[1]);
3422 rdata[1].data = NULL;
3424 rdata[1].buffer = *buffer;
3425 rdata[1].buffer_std = true;
3426 rdata[1].next = NULL;
3428 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK, rdata);
3430 PageSetLSN(page, recptr);
3431 PageSetTLI(page, ThisTimeLineID);
3436 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
3439 * Don't update the visibility map here. Locking a tuple doesn't change visibility info.
3444 * Now that we have successfully marked the tuple as locked, we can
3445 * release the lmgr tuple lock, if we had it.
3447 if (have_tuple_lock)
3448 UnlockTuple(relation, tid, tuple_lock_type);
3450 return HeapTupleMayBeUpdated;
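/*
 * Minimal sketch of a caller (hypothetical; real call sites live elsewhere
 * in the backend): only t_self needs to be valid on entry, and the pin on
 * the returned buffer must be released by the caller.
 *
 *		HeapTupleData	tuple;
 *		Buffer			buf;
 *		ItemPointerData	update_ctid;
 *		TransactionId	update_xmax;
 *
 *		tuple.t_self = *target_tid;
 *		if (heap_lock_tuple(rel, &tuple, &buf, &update_ctid, &update_xmax,
 *							GetCurrentCommandId(true), LockTupleExclusive,
 *							false) == HeapTupleMayBeUpdated)
 *			... row is locked; tuple and tuple.t_data are now filled in ...
 *		ReleaseBuffer(buf);
 */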
3455 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
3457 * Overwriting violates both MVCC and transactional safety, so the uses
3458 * of this function in Postgres are extremely limited. Nonetheless we
3459 * find some places to use it.
3461 * The tuple cannot change size, and therefore it's reasonable to assume
3462 * that its null bitmap (if any) doesn't change either. So we just
3463 * overwrite the data portion of the tuple without touching the null
3464 * bitmap or any of the header fields.
3466 * tuple is an in-memory tuple structure containing the data to be written
3467 * over the target tuple. Also, tuple->t_self identifies the target tuple.
3470 heap_inplace_update(Relation relation, HeapTuple tuple)
3474 OffsetNumber offnum;
3476 HeapTupleHeader htup;
3480 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
3481 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3482 page = (Page) BufferGetPage(buffer);
3484 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
3485 if (PageGetMaxOffsetNumber(page) >= offnum)
3486 lp = PageGetItemId(page, offnum);
3488 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
3489 elog(ERROR, "heap_inplace_update: invalid lp");
3491 htup = (HeapTupleHeader) PageGetItem(page, lp);
3493 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
3494 newlen = tuple->t_len - tuple->t_data->t_hoff;
3495 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
3496 elog(ERROR, "heap_inplace_update: wrong tuple length");
3498 /* NO EREPORT(ERROR) from here till changes are logged */
3499 START_CRIT_SECTION();
3501 memcpy((char *) htup + htup->t_hoff,
3502 (char *) tuple->t_data + tuple->t_data->t_hoff,
3505 MarkBufferDirty(buffer);
3508 if (!relation->rd_istemp)
3510 xl_heap_inplace xlrec;
3512 XLogRecData rdata[2];
3514 xlrec.target.node = relation->rd_node;
3515 xlrec.target.tid = tuple->t_self;
3517 rdata[0].data = (char *) &xlrec;
3518 rdata[0].len = SizeOfHeapInplace;
3519 rdata[0].buffer = InvalidBuffer;
3520 rdata[0].next = &(rdata[1]);
3522 rdata[1].data = (char *) htup + htup->t_hoff;
3523 rdata[1].len = newlen;
3524 rdata[1].buffer = buffer;
3525 rdata[1].buffer_std = true;
3526 rdata[1].next = NULL;
3528 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE, rdata);
3530 PageSetLSN(page, recptr);
3531 PageSetTLI(page, ThisTimeLineID);
3536 UnlockReleaseBuffer(buffer);
3538 /* Send out shared cache inval if necessary */
3539 if (!IsBootstrapProcessingMode())
3540 CacheInvalidateHeapTuple(relation, tuple);
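/*
 * Sketch of the typical use (hypothetical code, not from this file):
 * overwrite a few fixed-width fields of an existing catalog tuple, such as
 * statistics counters, without changing the tuple's length.
 *
 *		HeapTuple	ctup = ... fetch a copy of the target pg_class tuple ...;
 *		Form_pg_class pgcform = (Form_pg_class) GETSTRUCT(ctup);
 *
 *		pgcform->relpages = num_pages;
 *		pgcform->reltuples = num_tuples;
 *		heap_inplace_update(pg_class_rel, ctup);
 */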
3547 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
3548 * are older than the specified cutoff XID. If so, replace them with
3549 * FrozenTransactionId or InvalidTransactionId as appropriate, and return
3550 * TRUE. Return FALSE if nothing was changed.
3552 * It is assumed that the caller has checked the tuple with
3553 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
3554 * (else we should be removing the tuple, not freezing it).
3556 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
3557 * XID older than it could neither be running nor seen as running by any
3558 * open transaction. This ensures that the replacement will not change
3559 * anyone's idea of the tuple state. Also, since we assume the tuple is
3560 * not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us
3561 * to assume that it is either committed good or aborted, as appropriate;
3562 * so we need no external state checks to decide what to do. (This is good
3563 * because this function is applied during WAL recovery, when we don't have
3564 * access to any such state, and can't depend on the hint bits to be set.)
3566 * In lazy VACUUM, we call this while initially holding only a shared lock
3567 * on the tuple's buffer. If any change is needed, we trade that in for an
3568 * exclusive lock before making the change. Caller should pass the buffer ID
3569 * if shared lock is held, InvalidBuffer if exclusive lock is already held.
3571 * Note: it might seem we could make the changes without exclusive lock, since
3572 * TransactionId read/write is assumed atomic anyway. However there is a race
3573 * condition: someone who just fetched an old XID that we overwrite here could
3574 * conceivably not finish checking the XID against pg_clog before we finish
3575 * the VACUUM and perhaps truncate off the part of pg_clog he needs. Getting
3576 * exclusive lock ensures no other backend is in process of checking the
3577 * tuple status. Also, getting exclusive lock makes it safe to adjust the infomask bits.
3581 heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
3584 bool changed = false;
3587 xid = HeapTupleHeaderGetXmin(tuple);
3588 if (TransactionIdIsNormal(xid) &&
3589 TransactionIdPrecedes(xid, cutoff_xid))
3591 if (buf != InvalidBuffer)
3593 /* trade in share lock for exclusive lock */
3594 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
3595 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
3596 buf = InvalidBuffer;
3598 HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
3601 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
3602 * already be set here, but there's a small chance not.
3604 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
3605 tuple->t_infomask |= HEAP_XMIN_COMMITTED;
3610 * When we release shared lock, it's possible for someone else to change
3611 * xmax before we get the lock back, so repeat the check after acquiring
3612 * exclusive lock. (We don't need this pushup for xmin, because only
3613 * VACUUM could be interested in changing an existing tuple's xmin, and
3614 * there's only one VACUUM allowed on a table at a time.)
3617 if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
3619 xid = HeapTupleHeaderGetXmax(tuple);
3620 if (TransactionIdIsNormal(xid) &&
3621 TransactionIdPrecedes(xid, cutoff_xid))
3623 if (buf != InvalidBuffer)
3625 /* trade in share lock for exclusive lock */
3626 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
3627 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
3628 buf = InvalidBuffer;
3629 goto recheck_xmax; /* see comment above */
3631 HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
3634 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED
3635 * + LOCKED. Normalize to INVALID just to be sure no one gets confused.
3638 tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
3639 tuple->t_infomask |= HEAP_XMAX_INVALID;
3640 HeapTupleHeaderClearHotUpdated(tuple);
3647 * XXX perhaps someday we should zero out very old MultiXactIds here?
3649 * The only way a stale MultiXactId could pose a problem is if a
3650 * tuple, having once been multiply-share-locked, is not touched by
3651 * any vacuum or attempted lock or deletion for just over 4G MultiXact
3652 * creations, and then in the probably-narrow window where its xmax
3653 * is again a live MultiXactId, someone tries to lock or delete it.
3654 * Even then, another share-lock attempt would work fine. An
3655 * exclusive-lock or delete attempt would face unexpected delay, or
3656 * in the very worst case get a deadlock error. This seems an
3657 * extremely low-probability scenario with minimal downside even if
3658 * it does happen, so for now we don't do the extra bookkeeping that
3659 * would be needed to clean out MultiXactIds.
3665 * Although xvac per se could only be set by old-style VACUUM FULL, it
3666 * shares physical storage space with cmax, and so could be wiped out by
3667 * someone setting xmax. Hence recheck after changing lock, same as for xmax itself.
3670 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
3671 * we support having MOVED_OFF/MOVED_IN tuples in the database.
3674 if (tuple->t_infomask & HEAP_MOVED)
3676 xid = HeapTupleHeaderGetXvac(tuple);
3677 if (TransactionIdIsNormal(xid) &&
3678 TransactionIdPrecedes(xid, cutoff_xid))
3680 if (buf != InvalidBuffer)
3682 /* trade in share lock for exclusive lock */
3683 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
3684 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
3685 buf = InvalidBuffer;
3686 goto recheck_xvac; /* see comment above */
3690 * If a MOVED_OFF tuple is not dead, the xvac transaction must
3691 * have failed; whereas a non-dead MOVED_IN tuple must mean the
3692 * xvac transaction succeeded.
3694 if (tuple->t_infomask & HEAP_MOVED_OFF)
3695 HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
3697 HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
3700 * Might as well fix the hint bits too; usually XMIN_COMMITTED
3701 * will already be set here, but there's a small chance not.
3703 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
3704 tuple->t_infomask |= HEAP_XMIN_COMMITTED;
3714 * heap_markpos - mark scan position
3718 heap_markpos(HeapScanDesc scan)
3720 /* Note: no locking manipulations needed */
3722 if (scan->rs_ctup.t_data != NULL)
3724 scan->rs_mctid = scan->rs_ctup.t_self;
3725 if (scan->rs_pageatatime)
3726 scan->rs_mindex = scan->rs_cindex;
3729 ItemPointerSetInvalid(&scan->rs_mctid);
3733 * heap_restrpos - restore position to marked location
3737 heap_restrpos(HeapScanDesc scan)
3739 /* XXX no amrestrpos checking that ammarkpos called */
3741 if (!ItemPointerIsValid(&scan->rs_mctid))
3743 scan->rs_ctup.t_data = NULL;
3746 * unpin scan buffers
3748 if (BufferIsValid(scan->rs_cbuf))
3749 ReleaseBuffer(scan->rs_cbuf);
3750 scan->rs_cbuf = InvalidBuffer;
3751 scan->rs_cblock = InvalidBlockNumber;
3752 scan->rs_inited = false;
3757 * If we reached end of scan, rs_inited will now be false. We must
3758 * reset it to true to keep heapgettup from doing the wrong thing.
3760 scan->rs_inited = true;
3761 scan->rs_ctup.t_self = scan->rs_mctid;
3762 if (scan->rs_pageatatime)
3764 scan->rs_cindex = scan->rs_mindex;
3765 heapgettup_pagemode(scan,
3766 NoMovementScanDirection,
3767 0, /* needn't recheck scan keys */
3772 NoMovementScanDirection,
3773 0, /* needn't recheck scan keys */
3779 * If 'tuple' contains any visible XID greater than latestRemovedXid,
3780 * ratchet forwards latestRemovedXid to the greatest one found.
3781 * This is used as the basis for generating Hot Standby conflicts, so
3782 * if a tuple was never visible then removing it should not conflict with queries.
3786 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
3787 TransactionId *latestRemovedXid)
3789 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
3790 TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
3791 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
3793 if (tuple->t_infomask & HEAP_MOVED)
3795 if (TransactionIdPrecedes(*latestRemovedXid, xvac))
3796 *latestRemovedXid = xvac;
3800 * Ignore tuples inserted by an aborted transaction or
3801 * if the tuple was updated/deleted by the inserting transaction.
3803 * Look for a committed hint bit, or if no xmin bit is set, check clog.
3804 * This needs to work on both master and standby, where it is used
3805 * to assess btree delete records.
3807 if ((tuple->t_infomask & HEAP_XMIN_COMMITTED) ||
3808 (!(tuple->t_infomask & HEAP_XMIN_COMMITTED) &&
3809 !(tuple->t_infomask & HEAP_XMIN_INVALID) &&
3810 TransactionIdDidCommit(xmin)))
3813 TransactionIdFollows(xmax, *latestRemovedXid))
3814 *latestRemovedXid = xmax;
3817 /* *latestRemovedXid may still be invalid at end */
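/*
 * Callers typically ratchet the value across every tuple they are about to
 * remove, roughly like this (hypothetical sketch):
 *
 *		TransactionId latestRemovedXid = InvalidTransactionId;
 *
 *		... for each victim tuple ...
 *			HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
 *
 * and then include latestRemovedXid in the removal's WAL record, so that
 * recovery can resolve Hot Standby conflicts before replaying it.
 */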
3821 * Perform XLogInsert to register a heap cleanup info message. These
3822 * messages are sent once per VACUUM and are required because
3823 * of the phasing of removal operations during a lazy VACUUM.
3824 * See the comments for vacuum_log_cleanup_info().
3827 log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
3829 xl_heap_cleanup_info xlrec;
3834 xlrec.latestRemovedXid = latestRemovedXid;
3836 rdata.data = (char *) &xlrec;
3837 rdata.len = SizeOfHeapCleanupInfo;
3838 rdata.buffer = InvalidBuffer;
3841 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata);
3847 * Perform XLogInsert for a heap-clean operation. Caller must already
3848 * have modified the buffer and marked it dirty.
3850 * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
3851 * zero-based tuple indexes. Now they are one-based like other uses of OffsetNumber.
3854 * We also include latestRemovedXid, which is the greatest XID present in
3855 * the removed tuples. That allows recovery processing to cancel or wait
3856 * for long standby queries that can still see these tuples.
3859 log_heap_clean(Relation reln, Buffer buffer,
3860 OffsetNumber *redirected, int nredirected,
3861 OffsetNumber *nowdead, int ndead,
3862 OffsetNumber *nowunused, int nunused,
3863 TransactionId latestRemovedXid)
3865 xl_heap_clean xlrec;
3868 XLogRecData rdata[4];
3870 /* Caller should not call me on a temp relation */
3871 Assert(!reln->rd_istemp);
3873 xlrec.node = reln->rd_node;
3874 xlrec.block = BufferGetBlockNumber(buffer);
3875 xlrec.latestRemovedXid = latestRemovedXid;
3876 xlrec.nredirected = nredirected;
3877 xlrec.ndead = ndead;
3879 rdata[0].data = (char *) &xlrec;
3880 rdata[0].len = SizeOfHeapClean;
3881 rdata[0].buffer = InvalidBuffer;
3882 rdata[0].next = &(rdata[1]);
3885 * The OffsetNumber arrays are not actually in the buffer, but we pretend
3886 * that they are. When XLogInsert stores the whole buffer, the offset
3887 * arrays need not be stored too. Note that even if all three arrays are
3888 * empty, we want to expose the buffer as a candidate for whole-page
3889 * storage, since this record type implies a defragmentation operation
3890 * even if no item pointers changed state.
3892 if (nredirected > 0)
3894 rdata[1].data = (char *) redirected;
3895 rdata[1].len = nredirected * sizeof(OffsetNumber) * 2;
3899 rdata[1].data = NULL;
3902 rdata[1].buffer = buffer;
3903 rdata[1].buffer_std = true;
3904 rdata[1].next = &(rdata[2]);
3908 rdata[2].data = (char *) nowdead;
3909 rdata[2].len = ndead * sizeof(OffsetNumber);
3913 rdata[2].data = NULL;
3916 rdata[2].buffer = buffer;
3917 rdata[2].buffer_std = true;
3918 rdata[2].next = &(rdata[3]);
3922 rdata[3].data = (char *) nowunused;
3923 rdata[3].len = nunused * sizeof(OffsetNumber);
3927 rdata[3].data = NULL;
3930 rdata[3].buffer = buffer;
3931 rdata[3].buffer_std = true;
3932 rdata[3].next = NULL;
3934 info = XLOG_HEAP2_CLEAN;
3935 recptr = XLogInsert(RM_HEAP2_ID, info, rdata);
3941 * Perform XLogInsert for a heap-freeze operation. Caller must already
3942 * have modified the buffer and marked it dirty.
3945 log_heap_freeze(Relation reln, Buffer buffer,
3946 TransactionId cutoff_xid,
3947 OffsetNumber *offsets, int offcnt)
3949 xl_heap_freeze xlrec;
3951 XLogRecData rdata[2];
3953 /* Caller should not call me on a temp relation */
3954 Assert(!reln->rd_istemp);
3955 /* nor when there are no tuples to freeze */
3958 xlrec.node = reln->rd_node;
3959 xlrec.block = BufferGetBlockNumber(buffer);
3960 xlrec.cutoff_xid = cutoff_xid;
3962 rdata[0].data = (char *) &xlrec;
3963 rdata[0].len = SizeOfHeapFreeze;
3964 rdata[0].buffer = InvalidBuffer;
3965 rdata[0].next = &(rdata[1]);
3968 * The tuple-offsets array is not actually in the buffer, but pretend that
3969 * it is. When XLogInsert stores the whole buffer, the offsets array need
3970 * not be stored too.
3972 rdata[1].data = (char *) offsets;
3973 rdata[1].len = offcnt * sizeof(OffsetNumber);
3974 rdata[1].buffer = buffer;
3975 rdata[1].buffer_std = true;
3976 rdata[1].next = NULL;
3978 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata);
3984 * Perform XLogInsert for a heap-update operation. Caller must already
3985 * have modified the buffer(s) and marked them dirty.
3988 log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
3989 Buffer newbuf, HeapTuple newtup,
3990 bool all_visible_cleared, bool new_all_visible_cleared)
3992 xl_heap_update xlrec;
3993 xl_heap_header xlhdr;
3996 XLogRecData rdata[4];
3997 Page page = BufferGetPage(newbuf);
3999 /* Caller should not call me on a temp relation */
4000 Assert(!reln->rd_istemp);
4002 if (HeapTupleIsHeapOnly(newtup))
4003 info = XLOG_HEAP_HOT_UPDATE;
4005 info = XLOG_HEAP_UPDATE;
4007 xlrec.target.node = reln->rd_node;
4008 xlrec.target.tid = from;
4009 xlrec.all_visible_cleared = all_visible_cleared;
4010 xlrec.newtid = newtup->t_self;
4011 xlrec.new_all_visible_cleared = new_all_visible_cleared;
4013 rdata[0].data = (char *) &xlrec;
4014 rdata[0].len = SizeOfHeapUpdate;
4015 rdata[0].buffer = InvalidBuffer;
4016 rdata[0].next = &(rdata[1]);
4018 rdata[1].data = NULL;
4020 rdata[1].buffer = oldbuf;
4021 rdata[1].buffer_std = true;
4022 rdata[1].next = &(rdata[2]);
4024 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
4025 xlhdr.t_infomask = newtup->t_data->t_infomask;
4026 xlhdr.t_hoff = newtup->t_data->t_hoff;
4029 * As with insert records, we need not store the rdata[2] segment if we
4030 * decide to store the whole buffer instead.
4032 rdata[2].data = (char *) &xlhdr;
4033 rdata[2].len = SizeOfHeapHeader;
4034 rdata[2].buffer = newbuf;
4035 rdata[2].buffer_std = true;
4036 rdata[2].next = &(rdata[3]);
4038 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
4039 rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
4040 rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
4041 rdata[3].buffer = newbuf;
4042 rdata[3].buffer_std = true;
4043 rdata[3].next = NULL;
4045 /* If the new tuple is the first and only tuple on its page ... */
4046 if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
4047 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
4049 info |= XLOG_HEAP_INIT_PAGE;
4050 rdata[2].buffer = rdata[3].buffer = InvalidBuffer;
4053 recptr = XLogInsert(RM_HEAP_ID, info, rdata);
4059 * Perform XLogInsert of a HEAP_NEWPAGE record to WAL. Caller is responsible
4060 * for writing the page to disk after calling this routine.
4062 * Note: all current callers build pages in private memory and write them
4063 * directly to smgr, rather than using bufmgr. Therefore there is no need
4064 * to pass a buffer ID to XLogInsert, nor to perform MarkBufferDirty within
4065 * the critical section.
4067 * Note: the NEWPAGE log record is used for both heaps and indexes, so do
4068 * not do anything that assumes we are touching a heap.
4071 log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
4074 xl_heap_newpage xlrec;
4076 XLogRecData rdata[2];
4078 /* NO ELOG(ERROR) from here till newpage op is logged */
4079 START_CRIT_SECTION();
4081 xlrec.node = *rnode;
4082 xlrec.forknum = forkNum;
4083 xlrec.blkno = blkno;
4085 rdata[0].data = (char *) &xlrec;
4086 rdata[0].len = SizeOfHeapNewpage;
4087 rdata[0].buffer = InvalidBuffer;
4088 rdata[0].next = &(rdata[1]);
4090 rdata[1].data = (char *) page;
4091 rdata[1].len = BLCKSZ;
4092 rdata[1].buffer = InvalidBuffer;
4093 rdata[1].next = NULL;
4095 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_NEWPAGE, rdata);
4098 * The page may be uninitialized. If so, we can't set the LSN
4099 * and TLI because that would corrupt the page.
4101 if (!PageIsNew(page))
4103 PageSetLSN(page, recptr);
4104 PageSetTLI(page, ThisTimeLineID);
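/*
 * Hypothetical sketch of the call pattern described above (not code from
 * this file): the caller builds the page in local memory, logs it, and only
 * then hands it to smgr; the LSN/TLI have already been set by log_newpage.
 *
 *		recptr = log_newpage(&rel->rd_node, MAIN_FORKNUM, blkno, page);
 *		... then write the page out via smgrextend() or smgrwrite() ...
 */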
4113 * Handles CLEANUP_INFO
4116 heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record)
4118 xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
4121 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
4124 * Actual operation is a no-op. Record type exists to provide a means for
4125 * conflict processing to occur before we begin index vacuum actions; see
4126 * vacuumlazy.c and also the comments in btvacuumpage().
/*
 * Handles HEAP2_CLEAN record type
 */
static void
heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	OffsetNumber *end;
	OffsetNumber *redirected;
	OffsetNumber *nowdead;
	OffsetNumber *nowunused;
	int			nredirected;
	int			ndead;
	int			nunused;
	Size		freespace;

	/*
	 * We're about to remove tuples. In Hot Standby mode, ensure that there
	 * are no queries running for which the removed tuples are still visible.
	 *
	 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want
	 * to conflict on the records that cause MVCC failures for user queries.
	 * If latestRemovedXid is invalid, skip conflict processing.
	 */
	if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
											xlrec->node);

	RestoreBkpBlocks(lsn, record, true);

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;

	buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
	if (!BufferIsValid(buffer))
		return;
	LockBufferForCleanup(buffer);
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}
4177 nredirected = xlrec->nredirected;
4178 ndead = xlrec->ndead;
4179 end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
4180 redirected = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
4181 nowdead = redirected + (nredirected * 2);
4182 nowunused = nowdead + ndead;
4183 nunused = (end - nowunused);
4184 Assert(nunused >= 0);
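
	/*
	 * The variable-length payload of a HEAP2_CLEAN record, as decoded above,
	 * therefore looks like this: the fixed xl_heap_clean header, followed by
	 * nredirected pairs of OffsetNumbers (the from/to of each redirect),
	 * then ndead OffsetNumbers of line pointers to mark dead, and finally
	 * the remaining OffsetNumbers up to xl_len, which are to be marked
	 * unused.
	 */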
4186 /* Update all item pointers per the record, and repair fragmentation */
	heap_page_prune_execute(buffer,
							redirected, nredirected,
							nowdead, ndead,
							nowunused, nunused);
4192 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
	/*
	 * Note: we don't worry about updating the page's prunability hints. At
	 * worst this will cause an extra prune cycle to occur soon.
	 */
4199 PageSetLSN(page, lsn);
4200 PageSetTLI(page, ThisTimeLineID);
4201 MarkBufferDirty(buffer);
4202 UnlockReleaseBuffer(buffer);
	/*
	 * Update the FSM as well.
	 *
	 * XXX: We don't get here if the page was restored from full page image.
	 * We don't bother to update the FSM in that case, it doesn't need to be
	 * totally accurate anyway.
	 */
	XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace);
}
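
/*
 * The redo routines above and below share a common shape: first resolve any
 * Hot Standby conflicts implied by the record, then restore any backup
 * blocks, then bail out early if a full-page image already covered the
 * target page or the page's LSN shows the change has already been applied,
 * and only then re-apply the logged change and dirty the buffer.
 */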
static void
heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
	TransactionId cutoff_xid = xlrec->cutoff_xid;
	Buffer		buffer;
	Page		page;

	/*
	 * In Hot Standby mode, ensure that there are no queries running which
	 * still consider the frozen xids as running.
	 */
	if (InHotStandby)
		ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node);
	RestoreBkpBlocks(lsn, record, false);

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;

	buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
	if (!BufferIsValid(buffer))
		return;
	LockBufferForCleanup(buffer);
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}
	if (record->xl_len > SizeOfHeapFreeze)
	{
		OffsetNumber *offsets;
		OffsetNumber *offsets_end;

		offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapFreeze);
		offsets_end = (OffsetNumber *) ((char *) xlrec + record->xl_len);

		while (offsets < offsets_end)
		{
			/* offsets[] entries are one-based */
			ItemId		lp = PageGetItemId(page, *offsets);
			HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);

			(void) heap_freeze_tuple(tuple, cutoff_xid, InvalidBuffer);
			offsets++;
		}
	}

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
static void
heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/*
	 * Note: the NEWPAGE log record is used for both heaps and indexes, so do
	 * not do anything that assumes we are touching a heap.
	 */
	buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
									RBM_ZERO);
4284 Assert(BufferIsValid(buffer));
4285 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4286 page = (Page) BufferGetPage(buffer);
4288 Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
4289 memcpy(page, (char *) xlrec + SizeOfHeapNewpage, BLCKSZ);
	/*
	 * The page may be uninitialized. If so, we can't set the LSN and TLI
	 * because that would corrupt the page.
	 */
	if (!PageIsNew(page))
	{
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
	}

	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
static void
heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
	BlockNumber blkno;

	blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
	/*
	 * The visibility map may need to be fixed even if the heap page is
	 * already up-to-date.
	 */
	if (xlrec->all_visible_cleared)
	{
		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);

		visibilitymap_clear(reln, blkno);
		FreeFakeRelcacheEntry(reln);
	}

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;
	buffer = XLogReadBuffer(xlrec->target.node, blkno, false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
	{
		UnlockReleaseBuffer(buffer);
		return;
	}
4344 offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
4345 if (PageGetMaxOffsetNumber(page) >= offnum)
4346 lp = PageGetItemId(page, offnum);
4348 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4349 elog(PANIC, "heap_delete_redo: invalid lp");
4351 htup = (HeapTupleHeader) PageGetItem(page, lp);
	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
						  HEAP_XMAX_INVALID |
						  HEAP_XMAX_IS_MULTI |
						  HEAP_IS_LOCKED |
						  HEAP_MOVED);
	HeapTupleHeaderClearHotUpdated(htup);
4359 HeapTupleHeaderSetXmax(htup, record->xl_xid);
4360 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
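
	/*
	 * Note that redo reconstructs the deleting transaction's effect from the
	 * WAL record alone: the old xmax-related bits are cleared, xmax is set
	 * to the record's xl_xid, and cmax is simply set to FirstCommandId,
	 * since command ids only matter to the originating backend's own
	 * snapshots and are not needed after a crash.
	 */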
4362 /* Mark the page as a candidate for pruning */
4363 PageSetPrunable(page, record->xl_xid);
4365 if (xlrec->all_visible_cleared)
4366 PageClearAllVisible(page);
4368 /* Make sure there is no forward chain link in t_ctid */
4369 htup->t_ctid = xlrec->target.tid;
4370 PageSetLSN(page, lsn);
4371 PageSetTLI(page, ThisTimeLineID);
4372 MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
static void
heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	struct
	{
		HeapTupleHeaderData hdr;
		char		data[MaxHeapTupleSize];
	}			tbuf;
	HeapTupleHeader htup;
	xl_heap_header xlhdr;
	uint32		newlen;
	Size		freespace;
	BlockNumber blkno;
4394 blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
	/*
	 * The visibility map may need to be fixed even if the heap page is
	 * already up-to-date.
	 */
	if (xlrec->all_visible_cleared)
	{
		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);

		visibilitymap_clear(reln, blkno);
		FreeFakeRelcacheEntry(reln);
	}

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;
	if (record->xl_info & XLOG_HEAP_INIT_PAGE)
	{
		buffer = XLogReadBuffer(xlrec->target.node, blkno, true);
		Assert(BufferIsValid(buffer));
		page = (Page) BufferGetPage(buffer);

		PageInit(page, BufferGetPageSize(buffer), 0);
	}
	else
	{
		buffer = XLogReadBuffer(xlrec->target.node, blkno, false);
		if (!BufferIsValid(buffer))
			return;
		page = (Page) BufferGetPage(buffer);

		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			UnlockReleaseBuffer(buffer);
			return;
		}
	}
4433 offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
4434 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
4435 elog(PANIC, "heap_insert_redo: invalid max offset number");
4437 newlen = record->xl_len - SizeOfHeapInsert - SizeOfHeapHeader;
4438 Assert(newlen <= MaxHeapTupleSize);
	memcpy((char *) &xlhdr,
		   (char *) xlrec + SizeOfHeapInsert,
		   SizeOfHeapHeader);
	htup = &tbuf.hdr;
	MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
	/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
	memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
		   (char *) xlrec + SizeOfHeapInsert + SizeOfHeapHeader,
		   newlen);
	newlen += offsetof(HeapTupleHeaderData, t_bits);
4449 htup->t_infomask2 = xlhdr.t_infomask2;
4450 htup->t_infomask = xlhdr.t_infomask;
4451 htup->t_hoff = xlhdr.t_hoff;
4452 HeapTupleHeaderSetXmin(htup, record->xl_xid);
4453 HeapTupleHeaderSetCmin(htup, FirstCommandId);
4454 htup->t_ctid = xlrec->target.tid;
4456 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
4457 if (offnum == InvalidOffsetNumber)
4458 elog(PANIC, "heap_insert_redo: failed to add tuple");
4460 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
4462 PageSetLSN(page, lsn);
4463 PageSetTLI(page, ThisTimeLineID);
4465 if (xlrec->all_visible_cleared)
4466 PageClearAllVisible(page);
4468 MarkBufferDirty(buffer);
4469 UnlockReleaseBuffer(buffer);
	/*
	 * If the page is running low on free space, update the FSM as well.
	 * Arbitrarily, our definition of "low" is less than 20%. We can't do
	 * much better than that without knowing the fill-factor for the table.
	 *
	 * XXX: We don't get here if the page was restored from full page image.
	 * We don't bother to update the FSM in that case, it doesn't need to be
	 * totally accurate anyway.
	 */
	if (freespace < BLCKSZ / 5)
		XLogRecordPageWithFreeSpace(xlrec->target.node, blkno, freespace);
}
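
/*
 * For reference: with the default 8192-byte BLCKSZ, the "low on free space"
 * threshold of BLCKSZ / 5 used in heap_xlog_insert above (and again in
 * heap_xlog_update below) works out to 1638 bytes, i.e. roughly 20% of the
 * page.
 */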
/*
 * Handles UPDATE and HOT_UPDATE
 */
static void
heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
{
	xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
	Buffer		buffer;
	bool		samepage = (ItemPointerGetBlockNumber(&(xlrec->newtid)) ==
						ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
	struct
	{
		HeapTupleHeaderData hdr;
		char		data[MaxHeapTupleSize];
	}			tbuf;
	xl_heap_header xlhdr;
	int			hsize;
	uint32		newlen;
	Size		freespace;
	/*
	 * The visibility map may need to be fixed even if the heap page is
	 * already up-to-date.
	 */
	if (xlrec->all_visible_cleared)
	{
		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);

		visibilitymap_clear(reln,
							ItemPointerGetBlockNumber(&xlrec->target.tid));
		FreeFakeRelcacheEntry(reln);
	}

	if (record->xl_info & XLR_BKP_BLOCK_1)
	{
		if (samepage)
			return;				/* backup block covered both changes */
		goto newt;
	}
4528 /* Deal with old tuple version */
	buffer = XLogReadBuffer(xlrec->target.node,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)),
							false);
	if (!BufferIsValid(buffer))
		goto newt;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
	{
		UnlockReleaseBuffer(buffer);
		if (samepage)
			return;
		goto newt;
	}
4545 offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
4546 if (PageGetMaxOffsetNumber(page) >= offnum)
4547 lp = PageGetItemId(page, offnum);
4549 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4550 elog(PANIC, "heap_update_redo: invalid lp");
4552 htup = (HeapTupleHeader) PageGetItem(page, lp);
	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
						  HEAP_XMAX_INVALID |
						  HEAP_XMAX_IS_MULTI |
						  HEAP_IS_LOCKED |
						  HEAP_MOVED);
	if (hot_update)
		HeapTupleHeaderSetHotUpdated(htup);
	else
		HeapTupleHeaderClearHotUpdated(htup);
4563 HeapTupleHeaderSetXmax(htup, record->xl_xid);
4564 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
4565 /* Set forward chain link in t_ctid */
4566 htup->t_ctid = xlrec->newtid;
4568 /* Mark the page as a candidate for pruning */
4569 PageSetPrunable(page, record->xl_xid);
4571 if (xlrec->all_visible_cleared)
4572 PageClearAllVisible(page);
	/*
	 * this test is ugly, but necessary to avoid thinking that insert change
	 * is already applied
	 */
	if (samepage)
		goto newsame;
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	/* Deal with new tuple */

newt:;
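
	/*
	 * Control flow note: "newt" is the entry point for applying only the
	 * new-tuple half of the update (used when the old page was covered by a
	 * backup block, could not be read, or was already up-to-date), while
	 * "newsame" skips re-reading the new page when the old and new tuples
	 * live on the same page, since that buffer is already locked above.
	 */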
	/*
	 * The visibility map may need to be fixed even if the heap page is
	 * already up-to-date.
	 */
	if (xlrec->new_all_visible_cleared)
	{
		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);

		visibilitymap_clear(reln, ItemPointerGetBlockNumber(&xlrec->newtid));
		FreeFakeRelcacheEntry(reln);
	}

	if (record->xl_info & XLR_BKP_BLOCK_2)
		return;

	if (record->xl_info & XLOG_HEAP_INIT_PAGE)
	{
		buffer = XLogReadBuffer(xlrec->target.node,
								ItemPointerGetBlockNumber(&(xlrec->newtid)),
								true);
		Assert(BufferIsValid(buffer));
		page = (Page) BufferGetPage(buffer);

		PageInit(page, BufferGetPageSize(buffer), 0);
	}
	else
	{
		buffer = XLogReadBuffer(xlrec->target.node,
								ItemPointerGetBlockNumber(&(xlrec->newtid)),
								false);
		if (!BufferIsValid(buffer))
			return;
		page = (Page) BufferGetPage(buffer);

		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			UnlockReleaseBuffer(buffer);
			return;
		}
	}

newsame:;
4632 offnum = ItemPointerGetOffsetNumber(&(xlrec->newtid));
4633 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
4634 elog(PANIC, "heap_update_redo: invalid max offset number");
4636 hsize = SizeOfHeapUpdate + SizeOfHeapHeader;
4638 newlen = record->xl_len - hsize;
4639 Assert(newlen <= MaxHeapTupleSize);
	memcpy((char *) &xlhdr,
		   (char *) xlrec + SizeOfHeapUpdate,
		   SizeOfHeapHeader);
	htup = &tbuf.hdr;
	MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
	/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
	memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
		   (char *) xlrec + hsize,
		   newlen);
	newlen += offsetof(HeapTupleHeaderData, t_bits);
4650 htup->t_infomask2 = xlhdr.t_infomask2;
4651 htup->t_infomask = xlhdr.t_infomask;
4652 htup->t_hoff = xlhdr.t_hoff;
4654 HeapTupleHeaderSetXmin(htup, record->xl_xid);
4655 HeapTupleHeaderSetCmin(htup, FirstCommandId);
4656 /* Make sure there is no forward chain link in t_ctid */
4657 htup->t_ctid = xlrec->newtid;
4659 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
4660 if (offnum == InvalidOffsetNumber)
4661 elog(PANIC, "heap_update_redo: failed to add tuple");
4663 if (xlrec->new_all_visible_cleared)
4664 PageClearAllVisible(page);
4666 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
4668 PageSetLSN(page, lsn);
4669 PageSetTLI(page, ThisTimeLineID);
4670 MarkBufferDirty(buffer);
4671 UnlockReleaseBuffer(buffer);
	/*
	 * If the page is running low on free space, update the FSM as well.
	 * Arbitrarily, our definition of "low" is less than 20%. We can't do
	 * much better than that without knowing the fill-factor for the table.
	 *
	 * However, don't update the FSM on HOT updates, because after crash
	 * recovery, either the old or the new tuple will certainly be dead and
	 * prunable. After pruning, the page will have roughly as much free space
	 * as it did before the update, assuming the new tuple is about the same
	 * size as the old one.
	 *
	 * XXX: We don't get here if the page was restored from full page image.
	 * We don't bother to update the FSM in that case, it doesn't need to be
	 * totally accurate anyway.
	 */
	if (!hot_update && freespace < BLCKSZ / 5)
		XLogRecordPageWithFreeSpace(xlrec->target.node,
							ItemPointerGetBlockNumber(&(xlrec->newtid)),
									freespace);
}
static void
heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;
	buffer = XLogReadBuffer(xlrec->target.node,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)),
							false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
	{
		UnlockReleaseBuffer(buffer);
		return;
	}
4719 offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
4720 if (PageGetMaxOffsetNumber(page) >= offnum)
4721 lp = PageGetItemId(page, offnum);
4723 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4724 elog(PANIC, "heap_lock_redo: invalid lp");
4726 htup = (HeapTupleHeader) PageGetItem(page, lp);
	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
						  HEAP_XMAX_INVALID |
						  HEAP_XMAX_IS_MULTI |
						  HEAP_IS_LOCKED |
						  HEAP_MOVED);
	if (xlrec->xid_is_mxact)
		htup->t_infomask |= HEAP_XMAX_IS_MULTI;
	if (xlrec->shared_lock)
		htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
	else
		htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
4739 HeapTupleHeaderClearHotUpdated(htup);
4740 HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
4741 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
4742 /* Make sure there is no forward chain link in t_ctid */
4743 htup->t_ctid = xlrec->target.tid;
4744 PageSetLSN(page, lsn);
4745 PageSetTLI(page, ThisTimeLineID);
4746 MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
static void
heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
	uint32		oldlen;
	uint32		newlen;

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;
	buffer = XLogReadBuffer(xlrec->target.node,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)),
							false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
	{
		UnlockReleaseBuffer(buffer);
		return;
	}
4778 offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
4779 if (PageGetMaxOffsetNumber(page) >= offnum)
4780 lp = PageGetItemId(page, offnum);
4782 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4783 elog(PANIC, "heap_inplace_redo: invalid lp");
4785 htup = (HeapTupleHeader) PageGetItem(page, lp);
4787 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
4788 newlen = record->xl_len - SizeOfHeapInplace;
4789 if (oldlen != newlen)
4790 elog(PANIC, "heap_inplace_redo: wrong tuple length");
	memcpy((char *) htup + htup->t_hoff,
		   (char *) xlrec + SizeOfHeapInplace,
		   newlen);

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
void
heap_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/*
	 * These operations don't overwrite MVCC data so no conflict processing
	 * is required. The ones in heap2 rmgr do.
	 */
	RestoreBkpBlocks(lsn, record, false);

	switch (info & XLOG_HEAP_OPMASK)
	{
		case XLOG_HEAP_INSERT:
			heap_xlog_insert(lsn, record);
			break;
		case XLOG_HEAP_DELETE:
			heap_xlog_delete(lsn, record);
			break;
		case XLOG_HEAP_UPDATE:
			heap_xlog_update(lsn, record, false);
			break;
		case XLOG_HEAP_HOT_UPDATE:
			heap_xlog_update(lsn, record, true);
			break;
		case XLOG_HEAP_NEWPAGE:
			heap_xlog_newpage(lsn, record);
			break;
		case XLOG_HEAP_LOCK:
			heap_xlog_lock(lsn, record);
			break;
		case XLOG_HEAP_INPLACE:
			heap_xlog_inplace(lsn, record);
			break;
		default:
			elog(PANIC, "heap_redo: unknown op code %u", info);
	}
}
void
heap2_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/*
	 * Note that RestoreBkpBlocks() is called after conflict processing
	 * within each record type handling function.
	 */
	switch (info & XLOG_HEAP_OPMASK)
	{
		case XLOG_HEAP2_FREEZE:
			heap_xlog_freeze(lsn, record);
			break;
		case XLOG_HEAP2_CLEAN:
			heap_xlog_clean(lsn, record);
			break;
		case XLOG_HEAP2_CLEANUP_INFO:
			heap_xlog_cleanup_info(lsn, record);
			break;
		default:
			elog(PANIC, "heap2_redo: unknown op code %u", info);
	}
}
static void
out_target(StringInfo buf, xl_heaptid *target)
{
	appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
			 target->node.spcNode, target->node.dbNode, target->node.relNode,
					 ItemPointerGetBlockNumber(&(target->tid)),
					 ItemPointerGetOffsetNumber(&(target->tid)));
}
void
heap_desc(StringInfo buf, uint8 xl_info, char *rec)
{
	uint8		info = xl_info & ~XLR_INFO_MASK;

4882 info &= XLOG_HEAP_OPMASK;
	if (info == XLOG_HEAP_INSERT)
	{
		xl_heap_insert *xlrec = (xl_heap_insert *) rec;

		if (xl_info & XLOG_HEAP_INIT_PAGE)
			appendStringInfo(buf, "insert(init): ");
		else
			appendStringInfo(buf, "insert: ");
		out_target(buf, &(xlrec->target));
	}
	else if (info == XLOG_HEAP_DELETE)
	{
		xl_heap_delete *xlrec = (xl_heap_delete *) rec;

		appendStringInfo(buf, "delete: ");
		out_target(buf, &(xlrec->target));
	}
	else if (info == XLOG_HEAP_UPDATE)
	{
		xl_heap_update *xlrec = (xl_heap_update *) rec;

		if (xl_info & XLOG_HEAP_INIT_PAGE)
			appendStringInfo(buf, "update(init): ");
		else
			appendStringInfo(buf, "update: ");
		out_target(buf, &(xlrec->target));
		appendStringInfo(buf, "; new %u/%u",
						 ItemPointerGetBlockNumber(&(xlrec->newtid)),
						 ItemPointerGetOffsetNumber(&(xlrec->newtid)));
	}
	else if (info == XLOG_HEAP_HOT_UPDATE)
	{
		xl_heap_update *xlrec = (xl_heap_update *) rec;

		if (xl_info & XLOG_HEAP_INIT_PAGE)		/* can this case happen? */
			appendStringInfo(buf, "hot_update(init): ");
		else
			appendStringInfo(buf, "hot_update: ");
		out_target(buf, &(xlrec->target));
		appendStringInfo(buf, "; new %u/%u",
						 ItemPointerGetBlockNumber(&(xlrec->newtid)),
						 ItemPointerGetOffsetNumber(&(xlrec->newtid)));
	}
	else if (info == XLOG_HEAP_NEWPAGE)
	{
		xl_heap_newpage *xlrec = (xl_heap_newpage *) rec;

		appendStringInfo(buf, "newpage: rel %u/%u/%u; fork %u, blk %u",
						 xlrec->node.spcNode, xlrec->node.dbNode,
						 xlrec->node.relNode, xlrec->forknum,
						 xlrec->blkno);
	}
	else if (info == XLOG_HEAP_LOCK)
	{
		xl_heap_lock *xlrec = (xl_heap_lock *) rec;

		if (xlrec->shared_lock)
			appendStringInfo(buf, "shared_lock: ");
		else
			appendStringInfo(buf, "exclusive_lock: ");
		if (xlrec->xid_is_mxact)
			appendStringInfo(buf, "mxid ");
		else
			appendStringInfo(buf, "xid ");
		appendStringInfo(buf, "%u ", xlrec->locking_xid);
		out_target(buf, &(xlrec->target));
	}
	else if (info == XLOG_HEAP_INPLACE)
	{
		xl_heap_inplace *xlrec = (xl_heap_inplace *) rec;

		appendStringInfo(buf, "inplace: ");
		out_target(buf, &(xlrec->target));
	}
	else
		appendStringInfo(buf, "UNKNOWN");
}
void
heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
{
	uint8		info = xl_info & ~XLR_INFO_MASK;

	info &= XLOG_HEAP_OPMASK;
	if (info == XLOG_HEAP2_FREEZE)
	{
		xl_heap_freeze *xlrec = (xl_heap_freeze *) rec;

		appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u",
						 xlrec->node.spcNode, xlrec->node.dbNode,
						 xlrec->node.relNode, xlrec->block,
						 xlrec->cutoff_xid);
	}
	else if (info == XLOG_HEAP2_CLEAN)
	{
		xl_heap_clean *xlrec = (xl_heap_clean *) rec;

		appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
						 xlrec->node.spcNode, xlrec->node.dbNode,
						 xlrec->node.relNode, xlrec->block,
						 xlrec->latestRemovedXid);
	}
	else if (info == XLOG_HEAP2_CLEANUP_INFO)
	{
		xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;

		appendStringInfo(buf, "cleanup info: remxid %u",
						 xlrec->latestRemovedXid);
	}
	else
		appendStringInfo(buf, "UNKNOWN");
}
/*
 *	heap_sync		- sync a heap, for use when no WAL has been written
 *
 * This forces the heap contents (including TOAST heap if any) down to disk.
 * If we skipped using WAL, and it's not a temp relation, we must force the
 * relation down to disk before it's safe to commit the transaction.  This
 * requires writing out any dirty buffers and then doing a forced fsync.
 *
 * Indexes are not touched.  (Currently, index operations associated with
 * the commands that use this are WAL-logged and so do not need fsync.
 * That behavior might change someday, but in any case it's likely that
 * any fsync decisions required would be per-index and hence not appropriate
 * to be done here.)
 */
void
heap_sync(Relation rel)
{
	/* temp tables never need fsync */
	if (rel->rd_istemp)
		return;

	/* main heap */
	FlushRelationBuffers(rel);
	/* FlushRelationBuffers will have opened rd_smgr */
	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);

	/* FSM is not critical, don't bother syncing it */

	/* toast heap, if any */
	if (OidIsValid(rel->rd_rel->reltoastrelid))
	{
		Relation	toastrel;

		toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
		FlushRelationBuffers(toastrel);
		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
		heap_close(toastrel, AccessShareLock);
	}
}
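
/*
 * Typical callers of heap_sync (for example COPY FROM into a table created
 * or truncated in the same transaction, and table-rewrite operations such as
 * CLUSTER) skip WAL-logging of individual tuples when that is safe, and then
 * invoke heap_sync() once at the end of the command so the data is safely on
 * disk before the transaction commits.
 */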