1 /*-------------------------------------------------------------------------
4 * Misc functions used in Hot Standby mode.
6 * All functions for handling RM_STANDBY_ID, which relate to
7 * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8 * Plus conflict recovery processing.
10 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
11 * Portions Copyright (c) 1994, Regents of the University of California
14 * $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.27.2.2 2010/08/19 22:55:10 tgl Exp $
16 *-------------------------------------------------------------------------
19 #include "access/transam.h"
20 #include "access/twophase.h"
21 #include "access/xact.h"
22 #include "access/xlog.h"
23 #include "miscadmin.h"
25 #include "storage/bufmgr.h"
26 #include "storage/lmgr.h"
27 #include "storage/proc.h"
28 #include "storage/procarray.h"
29 #include "storage/sinvaladt.h"
30 #include "storage/standby.h"
31 #include "utils/ps_status.h"
33 /* User-settable GUC parameters */
34 int vacuum_defer_cleanup_age;
35 int max_standby_archive_delay = 30 * 1000;
36 int max_standby_streaming_delay = 30 * 1000;
38 static List *RecoveryLockList;
40 static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
41 ProcSignalReason reason);
42 static void ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid);
43 static void LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
44 static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
48 * InitRecoveryTransactionEnvironment
49 * Initialize tracking of in-progress transactions in master
51 * We need to issue shared invalidations and hold locks. Holding locks
52 * means others may want to wait on us, so we need to make a lock table
53 * vxact entry like a real transaction. We could create and delete
54 * lock table entries for each transaction but its simpler just to create
55 * one permanent entry and leave it there all the time. Locks are then
56 * acquired and released as needed. Yes, this means you can see the
57 * Startup process in pg_locks once we have run this.
60 InitRecoveryTransactionEnvironment(void)
62 VirtualTransactionId vxid;
65 * Initialize shared invalidation management for Startup process, being
66 * careful to register ourselves as a sendOnly process so we don't need to
67 * read messages, nor will we get signalled when the queue starts filling
70 SharedInvalBackendInit(true);
73 * Record the PID and PGPROC structure of the startup process.
75 PublishStartupProcessInformation();
78 * Lock a virtual transaction id for Startup process.
80 * We need to do GetNextLocalTransactionId() because
81 * SharedInvalBackendInit() leaves localTransactionid invalid and the lock
82 * manager doesn't like that at all.
84 * Note that we don't need to run XactLockTableInsert() because nobody
85 * needs to wait on xids. That sounds a little strange, but table locks
86 * are held by vxids and row level locks are held by xids. All queries
87 * hold AccessShareLocks so never block while we write or lock new rows.
89 vxid.backendId = MyBackendId;
90 vxid.localTransactionId = GetNextLocalTransactionId();
91 VirtualXactLockTableInsert(vxid);
93 standbyState = STANDBY_INITIALIZED;
/*
 * ShutdownRecoveryTransactionEnvironment
 *		Shut down transaction tracking
 *
 * Prepares for the switch from hot standby mode to normal operation by
 * tearing down the recovery-time transaction tracking state: first the
 * known-assigned xids, then the locks held on their behalf.
 */
void
ShutdownRecoveryTransactionEnvironment(void)
{
	/* Every transaction tracked as in-progress is now considered finished. */
	ExpireAllKnownAssignedTransactionIds();

	/* Drop all locks that were being held for those transactions. */
	StandbyReleaseAllLocks();
}
115 * -----------------------------------------------------
116 * Standby wait timers and backend cancel logic
117 * -----------------------------------------------------
121 * Determine the cutoff time at which we want to start canceling conflicting
122 * transactions. Returns zero (a time safely in the past) if we are willing
126 GetStandbyLimitTime(void)
132 * The cutoff time is the last WAL data receipt time plus the appropriate
133 * delay variable. Delay of -1 means wait forever.
135 GetXLogReceiptTime(&rtime, &fromStream);
138 if (max_standby_streaming_delay < 0)
139 return 0; /* wait forever */
140 return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
144 if (max_standby_archive_delay < 0)
145 return 0; /* wait forever */
146 return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
150 #define STANDBY_INITIAL_WAIT_US 1000
151 static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
154 * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
155 * We wait here for a while then return. If we decide we can't wait any
156 * more then we return true, if we can wait some more return false.
159 WaitExceedsMaxStandbyDelay(void)
163 /* Are we past the limit time? */
164 ltime = GetStandbyLimitTime();
165 if (ltime && GetCurrentTimestamp() >= ltime)
169 * Sleep a bit (this is essential to avoid busy-waiting).
171 pg_usleep(standbyWait_us);
174 * Progressively increase the sleep times, but not to more than 1s, since
175 * pg_usleep isn't interruptible on some platforms.
178 if (standbyWait_us > 1000000)
179 standbyWait_us = 1000000;
185 * This is the main executioner for any query backend that conflicts with
186 * recovery processing. Judgement has already been passed on it within
187 * a specific rmgr. Here we just issue the orders to the procs. The procs
188 * then throw the required error as instructed.
191 ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
192 ProcSignalReason reason)
194 while (VirtualTransactionIdIsValid(*waitlist))
196 TimestampTz waitStart;
199 pgstat_report_waiting(true);
201 waitStart = GetCurrentTimestamp();
202 new_status = NULL; /* we haven't changed the ps display */
204 /* reset standbyWait_us for each xact we wait for */
205 standbyWait_us = STANDBY_INITIAL_WAIT_US;
207 /* wait until the virtual xid is gone */
208 while (!ConditionalVirtualXactLockTableWait(*waitlist))
211 * Report via ps if we have been waiting for more than 500 msec
212 * (should that be configurable?)
214 if (update_process_title && new_status == NULL &&
215 TimestampDifferenceExceeds(waitStart, GetCurrentTimestamp(),
218 const char *old_status;
221 old_status = get_ps_display(&len);
222 new_status = (char *) palloc(len + 8 + 1);
223 memcpy(new_status, old_status, len);
224 strcpy(new_status + len, " waiting");
225 set_ps_display(new_status, false);
226 new_status[len] = '\0'; /* truncate off " waiting" */
229 /* Is it time to kill it? */
230 if (WaitExceedsMaxStandbyDelay())
235 * Now find out who to throw out of the balloon.
237 Assert(VirtualTransactionIdIsValid(*waitlist));
238 pid = CancelVirtualTransaction(*waitlist, reason);
241 * Wait a little bit for it to die so that we avoid flooding
242 * an unresponsive backend when system is heavily loaded.
249 /* Reset ps display if we changed it */
252 set_ps_display(new_status, false);
255 pgstat_report_waiting(false);
257 /* The virtual transaction is gone now, wait for the next one */
263 ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
265 VirtualTransactionId *backends;
268 * If we get passed InvalidTransactionId then we are a little surprised,
269 * but it is theoretically possible in normal running. It also happens
270 * when replaying already applied WAL records after a standby crash or
271 * restart. If latestRemovedXid is invalid then there is no conflict. That
272 * rule applies across all record types that suffer from this conflict.
274 if (!TransactionIdIsValid(latestRemovedXid))
277 backends = GetConflictingVirtualXIDs(latestRemovedXid,
280 ResolveRecoveryConflictWithVirtualXIDs(backends,
281 PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
285 ResolveRecoveryConflictWithTablespace(Oid tsid)
287 VirtualTransactionId *temp_file_users;
290 * Standby users may be currently using this tablespace for their
291 * temporary files. We only care about current users because
292 * temp_tablespace parameter will just ignore tablespaces that no longer
295 * Ask everybody to cancel their queries immediately so we can ensure no
296 * temp files remain and we can remove the tablespace. Nuke the entire
297 * site from orbit, it's the only way to be sure.
299 * XXX: We could work out the pids of active backends using this
300 * tablespace by examining the temp filenames in the directory. We would
301 * then convert the pids into VirtualXIDs before attempting to cancel
304 * We don't wait for commit because drop tablespace is non-transactional.
306 temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
308 ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
309 PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
313 ResolveRecoveryConflictWithDatabase(Oid dbid)
316 * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
317 * only waits for transactions and completely idle sessions would block
318 * us. This is rare enough that we do this as simply as possible: no wait,
319 * just force them off immediately.
321 * No locking is required here because we already acquired
322 * AccessExclusiveLock. Anybody trying to connect while we do this will
323 * block during InitPostgres() and then disconnect when they see the
324 * database has been removed.
326 while (CountDBBackends(dbid) > 0)
328 CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
331 * Wait awhile for them to die so that we avoid flooding an
332 * unresponsive backend when system is heavily loaded.
339 ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid)
341 VirtualTransactionId *backends;
342 bool report_memory_error = false;
343 bool lock_acquired = false;
344 int num_attempts = 0;
347 SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
350 * If blowing away everybody with conflicting locks doesn't work, after
351 * the first two attempts then we just start blowing everybody away until
352 * it does work. We do this because it's likely that we either have too
353 * many locks and we just can't get one at all, or that there are many
354 * people crowding for the same table. Recovery must win; the end
355 * justifies the means.
357 while (!lock_acquired)
359 if (++num_attempts < 3)
360 backends = GetLockConflicts(&locktag, AccessExclusiveLock);
363 backends = GetConflictingVirtualXIDs(InvalidTransactionId,
365 report_memory_error = true;
368 ResolveRecoveryConflictWithVirtualXIDs(backends,
369 PROCSIG_RECOVERY_CONFLICT_LOCK);
371 if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
372 != LOCKACQUIRE_NOT_AVAIL)
373 lock_acquired = true;
378 * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
379 * to resolve conflicts with other backends holding buffer pins.
381 * We either resolve conflicts immediately or set a SIGALRM to wake us at
382 * the limit of our patience. The sleep in LockBufferForCleanup() is
383 * performed here, for code clarity.
385 * Resolve conflicts by sending a PROCSIG signal to all backends to check if
386 * they hold one of the buffer pins that is blocking Startup process. If so,
387 * backends will take an appropriate error action, ERROR or FATAL.
389 * We also must check for deadlocks. Deadlocks occur because if queries
390 * wait on a lock, that must be behind an AccessExclusiveLock, which can only
391 * be cleared if the Startup process replays a transaction completion record.
392 * If Startup process is also waiting then that is a deadlock. The deadlock
393 * can occur if the query is waiting and then the Startup sleeps, or if
394 * Startup is sleeping and the query waits on a lock. We protect against
395 * only the former sequence here, the latter sequence is checked prior to
396 * the query sleeping, in CheckRecoveryConflictDeadlock().
398 * Deadlocks are extremely rare, and relatively expensive to check for,
399 * so we don't do a deadlock check right away ... only if we have had to wait
400 * at least deadlock_timeout. Most of the logic about that is in proc.c.
403 ResolveRecoveryConflictWithBufferPin(void)
405 bool sig_alarm_enabled = false;
409 Assert(InHotStandby);
411 ltime = GetStandbyLimitTime();
412 now = GetCurrentTimestamp();
417 * We're willing to wait forever for conflicts, so set timeout for
418 * deadlock check (only)
420 if (enable_standby_sig_alarm(now, now, true))
421 sig_alarm_enabled = true;
423 elog(FATAL, "could not set timer for process wakeup");
425 else if (now >= ltime)
428 * We're already behind, so clear a path as quickly as possible.
430 SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
435 * Wake up at ltime, and check for deadlocks as well if we will be
436 * waiting longer than deadlock_timeout
438 if (enable_standby_sig_alarm(now, ltime, false))
439 sig_alarm_enabled = true;
441 elog(FATAL, "could not set timer for process wakeup");
444 /* Wait to be signaled by UnpinBuffer() */
447 if (sig_alarm_enabled)
449 if (!disable_standby_sig_alarm())
450 elog(FATAL, "could not disable timer for process wakeup");
455 SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
457 Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
458 reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
461 * We send signal to all backends to ask them if they are holding the
462 * buffer pin which is delaying the Startup process. We must not set the
463 * conflict flag yet, since most backends will be innocent. Let the
464 * SIGUSR1 handling in each backend decide their own fate.
466 CancelDBBackends(InvalidOid, reason, false);
470 * In Hot Standby perform early deadlock detection. We abort the lock
471 * wait if we are about to sleep while holding the buffer pin that Startup
472 * process is waiting for. The deadlock occurs because we can only be
473 * waiting behind an AccessExclusiveLock, which can only clear when a
474 * transaction completion record is replayed, which can only occur when
475 * Startup process is not waiting. So if Startup process is waiting we
476 * never will clear that lock, so if we wait we cause deadlock. If we
477 * are the Startup process then no need to check for deadlocks.
480 CheckRecoveryConflictDeadlock(LWLockId partitionLock)
484 if (!HoldingBufferPinThatDelaysRecovery())
487 LWLockRelease(partitionLock);
490 * Error message should match ProcessInterrupts() but we avoid calling
491 * that because we aren't handling an interrupt at this point. Note that
492 * we only cancel the current transaction here, so if we are in a
493 * subtransaction and the pin is held by a parent, then the Startup
494 * process will continue to wait even though we have avoided deadlock.
497 (errcode(ERRCODE_QUERY_CANCELED),
498 errmsg("canceling statement due to conflict with recovery"),
499 errdetail("User transaction caused buffer deadlock with recovery.")));
503 * -----------------------------------------------------
504 * Locking in Recovery Mode
505 * -----------------------------------------------------
507 * All locks are held by the Startup process using a single virtual
508 * transaction. This implementation is both simpler and in some senses,
509 * more correct. The locks held mean "some original transaction held
510 * this lock, so query access is not allowed at this time". So the Startup
511 * process is the proxy by which the original locks are implemented.
513 * We only keep track of AccessExclusiveLocks, which are only ever held by
514 * one transaction on one relation, and don't worry about lock queuing.
516 * We keep a single dynamically expandable list of locks in local memory,
517 * RecoveryLockList, so we can keep track of the various entries made by
518 * the Startup process's virtual xid in the shared lock table.
520 * List elements use type xl_standby_lock, since the WAL record type exactly
521 * matches the information that we need to keep track of.
523 * We use session locks rather than normal locks so we don't need
529 StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
531 xl_standby_lock *newlock;
534 /* Already processed? */
535 if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
538 elog(trace_recovery(DEBUG4),
539 "adding recovery lock: db %u rel %u", dbOid, relOid);
541 /* dbOid is InvalidOid when we are locking a shared relation. */
542 Assert(OidIsValid(relOid));
544 newlock = palloc(sizeof(xl_standby_lock));
546 newlock->dbOid = dbOid;
547 newlock->relOid = relOid;
548 RecoveryLockList = lappend(RecoveryLockList, newlock);
551 * Attempt to acquire the lock as requested, if not resolve conflict
553 SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
555 if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
556 == LOCKACQUIRE_NOT_AVAIL)
557 ResolveRecoveryConflictWithLock(newlock->dbOid, newlock->relOid);
561 StandbyReleaseLocks(TransactionId xid)
568 * Release all matching locks and remove them from list
571 for (cell = list_head(RecoveryLockList); cell; cell = next)
573 xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
577 if (!TransactionIdIsValid(xid) || lock->xid == xid)
581 elog(trace_recovery(DEBUG4),
582 "releasing recovery lock: xid %u db %u rel %u",
583 lock->xid, lock->dbOid, lock->relOid);
584 SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
585 if (!LockRelease(&locktag, AccessExclusiveLock, true))
587 "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
588 lock->xid, lock->dbOid, lock->relOid);
590 RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
599 * Release locks for a transaction tree, starting at xid down, from
602 * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
603 * to remove any AccessExclusiveLocks requested by a transaction.
606 StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
610 StandbyReleaseLocks(xid);
612 for (i = 0; i < nsubxids; i++)
613 StandbyReleaseLocks(subxids[i]);
617 * StandbyReleaseLocksMany
618 * Release standby locks held by XIDs < removeXid
620 * If keepPreparedXacts is true, keep prepared transactions even if
621 * they're older than removeXid
624 StandbyReleaseLocksMany(TransactionId removeXid, bool keepPreparedXacts)
632 * Release all matching locks.
635 for (cell = list_head(RecoveryLockList); cell; cell = next)
637 xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
641 if (!TransactionIdIsValid(removeXid) || TransactionIdPrecedes(lock->xid, removeXid))
643 if (keepPreparedXacts && StandbyTransactionIdIsPrepared(lock->xid))
645 elog(trace_recovery(DEBUG4),
646 "releasing recovery lock: xid %u db %u rel %u",
647 lock->xid, lock->dbOid, lock->relOid);
648 SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
649 if (!LockRelease(&locktag, AccessExclusiveLock, true))
651 "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
652 lock->xid, lock->dbOid, lock->relOid);
653 RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
662 * Called at end of recovery and when we see a shutdown checkpoint.
665 StandbyReleaseAllLocks(void)
667 elog(trace_recovery(DEBUG2), "release all standby locks");
668 StandbyReleaseLocksMany(InvalidTransactionId, false);
672 * StandbyReleaseOldLocks
673 * Release standby locks held by XIDs < removeXid, as long
674 * as their not prepared transactions.
677 StandbyReleaseOldLocks(TransactionId removeXid)
679 StandbyReleaseLocksMany(removeXid, true);
683 * --------------------------------------------------------------------
684 * Recovery handling for Rmgr RM_STANDBY_ID
686 * These record types will only be created if XLogStandbyInfoActive()
687 * --------------------------------------------------------------------
691 standby_redo(XLogRecPtr lsn, XLogRecord *record)
693 uint8 info = record->xl_info & ~XLR_INFO_MASK;
695 /* Do nothing if we're not in hot standby mode */
696 if (standbyState == STANDBY_DISABLED)
699 if (info == XLOG_STANDBY_LOCK)
701 xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
704 for (i = 0; i < xlrec->nlocks; i++)
705 StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
706 xlrec->locks[i].dbOid,
707 xlrec->locks[i].relOid);
709 else if (info == XLOG_RUNNING_XACTS)
711 xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
712 RunningTransactionsData running;
714 running.xcnt = xlrec->xcnt;
715 running.subxid_overflow = xlrec->subxid_overflow;
716 running.nextXid = xlrec->nextXid;
717 running.latestCompletedXid = xlrec->latestCompletedXid;
718 running.oldestRunningXid = xlrec->oldestRunningXid;
719 running.xids = xlrec->xids;
721 ProcArrayApplyRecoveryInfo(&running);
724 elog(PANIC, "relation_redo: unknown op code %u", info);
728 standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec)
732 appendStringInfo(buf, " nextXid %u latestCompletedXid %u oldestRunningXid %u",
734 xlrec->latestCompletedXid,
735 xlrec->oldestRunningXid);
738 appendStringInfo(buf, "; %d xacts:", xlrec->xcnt);
739 for (i = 0; i < xlrec->xcnt; i++)
740 appendStringInfo(buf, " %u", xlrec->xids[i]);
743 if (xlrec->subxid_overflow)
744 appendStringInfo(buf, "; subxid ovf");
748 standby_desc(StringInfo buf, uint8 xl_info, char *rec)
750 uint8 info = xl_info & ~XLR_INFO_MASK;
752 if (info == XLOG_STANDBY_LOCK)
754 xl_standby_locks *xlrec = (xl_standby_locks *) rec;
757 appendStringInfo(buf, "AccessExclusive locks:");
759 for (i = 0; i < xlrec->nlocks; i++)
760 appendStringInfo(buf, " xid %u db %u rel %u",
761 xlrec->locks[i].xid, xlrec->locks[i].dbOid,
762 xlrec->locks[i].relOid);
764 else if (info == XLOG_RUNNING_XACTS)
766 xl_running_xacts *xlrec = (xl_running_xacts *) rec;
768 appendStringInfo(buf, " running xacts:");
769 standby_desc_running_xacts(buf, xlrec);
772 appendStringInfo(buf, "UNKNOWN");
776 * Log details of the current snapshot to WAL. This allows the snapshot state
777 * to be reconstructed on the standby.
779 * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
780 * start from a shutdown checkpoint because we know nothing was running
781 * at that time and our recovery snapshot is known empty. In the more
782 * typical case of an online checkpoint we need to jump through a few
783 * hoops to get a correct recovery snapshot and this requires a two or
784 * sometimes a three stage process.
786 * The initial snapshot must contain all running xids and all current
787 * AccessExclusiveLocks at a point in time on the standby. Assembling
788 * that information while the server is running requires many and
789 * various LWLocks, so we choose to derive that information piece by
790 * piece and then re-assemble that info on the standby. When that
791 * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
793 * Since locking on the primary when we derive the information is not
794 * strict, we note that there is a time window between the derivation and
795 * writing to WAL of the derived information. That allows race conditions
796 * that we must resolve, since xids and locks may enter or leave the
797 * snapshot during that window. This creates the issue that an xid or
798 * lock may start *after* the snapshot has been derived yet *before* the
799 * snapshot is logged in the running xacts WAL record. We resolve this by
800 * starting to accumulate changes at a point just prior to when we derive
801 * the snapshot on the primary, then ignore duplicates when we later apply
802 * the snapshot from the running xacts record. This is implemented during
803 * CreateCheckpoint() where we use the logical checkpoint location as
804 * our starting point and then write the running xacts record immediately
805 * before writing the main checkpoint WAL record. Since we always start
806 * up from a checkpoint and are immediately at our starting point, we
807 * unconditionally move to STANDBY_INITIALIZED. After this point we
809 * * move shared nextXid forwards as we see new xids
810 * * extend the clog and subtrans with each new xid
811 * * keep track of uncommitted known assigned xids
812 * * keep track of uncommitted AccessExclusiveLocks
814 * When we see a commit/abort we must remove known assigned xids and locks
815 * from the completing transaction. Attempted removals that cannot locate
816 * an entry are expected and must not cause an error when we are in state
817 * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
818 * KnownAssignedXidsRemove().
820 * Later, when we apply the running xact data we must be careful to ignore
821 * transactions already committed, since those commits raced ahead when
822 * making WAL entries.
825 LogStandbySnapshot(TransactionId *oldestActiveXid, TransactionId *nextXid)
827 RunningTransactions running;
828 xl_standby_lock *locks;
831 Assert(XLogStandbyInfoActive());
834 * Get details of any AccessExclusiveLocks being held at the moment.
836 * XXX GetRunningTransactionLocks() currently holds a lock on all
837 * partitions though it is possible to further optimise the locking. By
838 * reference counting locks and storing the value on the ProcArray entry
839 * for each backend we can easily tell if any locks need recording without
840 * trying to acquire the partition locks and scanning the lock table.
842 locks = GetRunningTransactionLocks(&nlocks);
844 LogAccessExclusiveLocks(nlocks, locks);
847 * Log details of all in-progress transactions. This should be the last
848 * record we write, because standby will open up when it sees this.
850 running = GetRunningTransactionData();
853 * The gap between GetRunningTransactionData() and
854 * LogCurrentRunningXacts() is what most of the fuss is about here, so
855 * artificially extending this interval is a great way to test the little
856 * used parts of the code.
858 LogCurrentRunningXacts(running);
860 *oldestActiveXid = running->oldestRunningXid;
861 *nextXid = running->nextXid;
865 * Record an enhanced snapshot of running transactions into WAL.
867 * The definitions of RunningTransactionsData and xl_running_xacts
868 * are similar. We keep them separate because xl_running_xacts
869 * is a contiguous chunk of memory and never exists fully until it is
873 LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
875 xl_running_xacts xlrec;
876 XLogRecData rdata[2];
880 xlrec.xcnt = CurrRunningXacts->xcnt;
881 xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
882 xlrec.nextXid = CurrRunningXacts->nextXid;
883 xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
884 xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
887 rdata[0].data = (char *) (&xlrec);
888 rdata[0].len = MinSizeOfXactRunningXacts;
889 rdata[0].buffer = InvalidBuffer;
891 /* array of TransactionIds */
894 rdata[0].next = &(rdata[1]);
895 rdata[1].data = (char *) CurrRunningXacts->xids;
896 rdata[1].len = xlrec.xcnt * sizeof(TransactionId);
897 rdata[1].buffer = InvalidBuffer;
901 rdata[lastrdata].next = NULL;
903 recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS, rdata);
905 if (CurrRunningXacts->subxid_overflow)
906 elog(trace_recovery(DEBUG2),
907 "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
908 CurrRunningXacts->xcnt,
909 recptr.xlogid, recptr.xrecoff,
910 CurrRunningXacts->oldestRunningXid,
911 CurrRunningXacts->latestCompletedXid,
912 CurrRunningXacts->nextXid);
914 elog(trace_recovery(DEBUG2),
915 "snapshot of %u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
916 CurrRunningXacts->xcnt,
917 recptr.xlogid, recptr.xrecoff,
918 CurrRunningXacts->oldestRunningXid,
919 CurrRunningXacts->latestCompletedXid,
920 CurrRunningXacts->nextXid);
924 * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
925 * logged, as described in backend/storage/lmgr/README.
928 LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
930 XLogRecData rdata[2];
931 xl_standby_locks xlrec;
933 xlrec.nlocks = nlocks;
935 rdata[0].data = (char *) &xlrec;
936 rdata[0].len = offsetof(xl_standby_locks, locks);
937 rdata[0].buffer = InvalidBuffer;
938 rdata[0].next = &rdata[1];
940 rdata[1].data = (char *) locks;
941 rdata[1].len = nlocks * sizeof(xl_standby_lock);
942 rdata[1].buffer = InvalidBuffer;
943 rdata[1].next = NULL;
945 (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK, rdata);
949 * Individual logging of AccessExclusiveLocks for use during LockAcquire()
952 LogAccessExclusiveLock(Oid dbOid, Oid relOid)
954 xl_standby_lock xlrec;
956 xlrec.xid = GetTopTransactionId();
959 * Decode the locktag back to the original values, to avoid sending lots
960 * of empty bytes with every message. See lock.h to check how a locktag
961 * is defined for LOCKTAG_RELATION
964 xlrec.relOid = relOid;
966 LogAccessExclusiveLocks(1, &xlrec);
/*
 * LogAccessExclusiveLockPrepare
 *		Prepare to log an AccessExclusiveLock, for use during LockAcquire()
 */
void
LogAccessExclusiveLockPrepare(void)
{
	/*
	 * Make sure this transaction has a TransactionId.  There are two
	 * reasons, both to do with lock release on the standby.
	 *
	 * First, with an xid assigned, RecordTransactionCommit() and
	 * RecordTransactionAbort() will not optimise away the transaction
	 * completion record, which recovery relies upon to release locks.  It's
	 * a hack, but for a corner case not worth adding code for into the main
	 * commit path.
	 *
	 * Second, we must assign an xid before the lock is recorded in shared
	 * memory; otherwise a concurrently executing
	 * GetRunningTransactionLocks() might see a lock associated with an
	 * InvalidTransactionId, which we later assert cannot happen.
	 */
	(void) GetTopTransactionId();
}