src/backend/access/nbtree/nbtpage.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtpage.c
   4  *        BTree-specific page management code for the Postgres btree access
   5  *        method.
   6  *
   7  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  *
  11  * IDENTIFICATION
  12  *        $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.117 2010/02/01 13:40:28 sriggs Exp $
  13  *
  14  *      NOTES
  15  *         Postgres btree pages look like ordinary relation pages.      The opaque
  16  *         data at high addresses includes pointers to left and right siblings
  17  *         and flag data describing page state.  The first page in a btree, page
  18  *         zero, is special -- it stores meta-information describing the tree.
  19  *         Pages one and higher store the actual tree data.
  20  *
  21  *-------------------------------------------------------------------------
  22  */
  23 #include "postgres.h"
  24
  25 #include "access/nbtree.h"
  26 #include "access/transam.h"
  27 #include "miscadmin.h"
  28 #include "storage/bufmgr.h"
  29 #include "storage/freespace.h"
  30 #include "storage/indexfsm.h"
  31 #include "storage/lmgr.h"
  32 #include "utils/inval.h"
  33 #include "utils/snapmgr.h"
  34
  35
  36 /*
  37  *      _bt_initmetapage() -- Fill a page buffer with a correct metapage image
  38  */
  39 void
  40 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
  41 {
  42         BTMetaPageData *metad;
  43         BTPageOpaque metaopaque;
  44
  45         _bt_pageinit(page, BLCKSZ);
  46
  47         metad = BTPageGetMeta(page);
  48         metad->btm_magic = BTREE_MAGIC;
  49         metad->btm_version = BTREE_VERSION;
  50         metad->btm_root = rootbknum;
  51         metad->btm_level = level;
  52         metad->btm_fastroot = rootbknum;
  53         metad->btm_fastlevel = level;
  54
  55         metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
  56         metaopaque->btpo_flags = BTP_META;
  57
  58         /*
  59          * Set pd_lower just past the end of the metadata.      This is not essential
  60          * but it makes the page look compressible to xlog.c.
  61          */
  62         ((PageHeader) page)->pd_lower =
  63                 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
  64 }
  65
  66 /*
  67  *      _bt_getroot() -- Get the root page of the btree.
  68  *
  69  *              Since the root page can move around the btree file, we have to read
  70  *              its location from the metadata page, and then read the root page
  71  *              itself.  If no root page exists yet, we have to create one.  The
  72  *              standard class of race conditions exists here; I think I covered
  73  *              them all in the Hopi Indian rain dance of lock requests below.
  74  *
  75  *              The access type parameter (BT_READ or BT_WRITE) controls whether
  76  *              a new root page will be created or not.  If access = BT_READ,
  77  *              and no root page exists, we just return InvalidBuffer.  For
  78  *              BT_WRITE, we try to create the root page if it doesn't exist.
  79  *              NOTE that the returned root page will have only a read lock set
  80  *              on it even if access = BT_WRITE!
  81  *
  82  *              The returned page is not necessarily the true root --- it could be
  83  *              a "fast root" (a page that is alone in its level due to deletions).
  84  *              Also, if the root page is split while we are "in flight" to it,
  85  *              what we will return is the old root, which is now just the leftmost
  86  *              page on a probably-not-very-wide level.  For most purposes this is
  87  *              as good as or better than the true root, so we do not bother to
  88  *              insist on finding the true root.  We do, however, guarantee to
  89  *              return a live (not deleted or half-dead) page.
  90  *
  91  *              On successful return, the root page is pinned and read-locked.
  92  *              The metadata page is not locked or pinned on exit.
  93  */
  94 Buffer
  95 _bt_getroot(Relation rel, int access)
  96 {
  97         Buffer          metabuf;
  98         Page            metapg;
  99         BTPageOpaque metaopaque;
 100         Buffer          rootbuf;
 101         Page            rootpage;
 102         BTPageOpaque rootopaque;
 103         BlockNumber rootblkno;
 104         uint32          rootlevel;
 105         BTMetaPageData *metad;
 106
 107         /*
 108          * Try to use previously-cached metapage data to find the root.  This
 109          * normally saves one buffer access per index search, which is a very
 110          * helpful savings in bufmgr traffic and hence contention.
 111          */
 112         if (rel->rd_amcache != NULL)
 113         {
 114                 metad = (BTMetaPageData *) rel->rd_amcache;
 115                 /* We shouldn't have cached it if any of these fail */
 116                 Assert(metad->btm_magic == BTREE_MAGIC);
 117                 Assert(metad->btm_version == BTREE_VERSION);
 118                 Assert(metad->btm_root != P_NONE);
 119
 120                 rootblkno = metad->btm_fastroot;
 121                 Assert(rootblkno != P_NONE);
 122                 rootlevel = metad->btm_fastlevel;
 123
 124                 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
 125                 rootpage = BufferGetPage(rootbuf);
 126                 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 127
 128                 /*
 129                  * Since the cache might be stale, we check the page more carefully
 130                  * here than normal.  We *must* check that it's not deleted. If it's
 131                  * not alone on its level, then we reject too --- this may be overly
 132                  * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
 133                  * because that's not set in a "fast root".
 134                  */
 135                 if (!P_IGNORE(rootopaque) &&
 136                         rootopaque->btpo.level == rootlevel &&
 137                         P_LEFTMOST(rootopaque) &&
 138                         P_RIGHTMOST(rootopaque))
 139                 {
 140                         /* OK, accept cached page as the root */
 141                         return rootbuf;
 142                 }
 143                 _bt_relbuf(rel, rootbuf);
 144                 /* Cache is stale, throw it away */
 145                 if (rel->rd_amcache)
 146                         pfree(rel->rd_amcache);
 147                 rel->rd_amcache = NULL;
 148         }
 149
 150         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 151         metapg = BufferGetPage(metabuf);
 152         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
 153         metad = BTPageGetMeta(metapg);
 154
 155         /* sanity-check the metapage */
 156         if (!(metaopaque->btpo_flags & BTP_META) ||
 157                 metad->btm_magic != BTREE_MAGIC)
 158                 ereport(ERROR,
 159                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 160                                  errmsg("index \"%s\" is not a btree",
 161                                                 RelationGetRelationName(rel))));
 162
 163         if (metad->btm_version != BTREE_VERSION)
 164                 ereport(ERROR,
 165                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 166                                  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
 167                                                 RelationGetRelationName(rel),
 168                                                 metad->btm_version, BTREE_VERSION)));
 169
 170         /* if no root page initialized yet, do it */
 171         if (metad->btm_root == P_NONE)
 172         {
 173                 /* If access = BT_READ, caller doesn't want us to create root yet */
 174                 if (access == BT_READ)
 175                 {
 176                         _bt_relbuf(rel, metabuf);
 177                         return InvalidBuffer;
 178                 }
 179
 180                 /* trade in our read lock for a write lock */
 181                 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
 182                 LockBuffer(metabuf, BT_WRITE);
 183
 184                 /*
 185                  * Race condition:      if someone else initialized the metadata between
 186                  * the time we released the read lock and acquired the write lock, we
 187                  * must avoid doing it again.
 188                  */
 189                 if (metad->btm_root != P_NONE)
 190                 {
 191                         /*
 192                          * Metadata initialized by someone else.  In order to guarantee no
 193                          * deadlocks, we have to release the metadata page and start all
 194                          * over again.  (Is that really true? But it's hardly worth trying
 195                          * to optimize this case.)
 196                          */
 197                         _bt_relbuf(rel, metabuf);
 198                         return _bt_getroot(rel, access);
 199                 }
 200
 201                 /*
 202                  * Get, initialize, write, and leave a lock of the appropriate type on
 203                  * the new root page.  Since this is the first page in the tree, it's
 204                  * a leaf as well as the root.
 205                  */
 206                 rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 207                 rootblkno = BufferGetBlockNumber(rootbuf);
 208                 rootpage = BufferGetPage(rootbuf);
 209                 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 210                 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
 211                 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
 212                 rootopaque->btpo.level = 0;
 213                 rootopaque->btpo_cycleid = 0;
 214
 215                 /* NO ELOG(ERROR) till meta is updated */
 216                 START_CRIT_SECTION();
 217
 218                 metad->btm_root = rootblkno;
 219                 metad->btm_level = 0;
 220                 metad->btm_fastroot = rootblkno;
 221                 metad->btm_fastlevel = 0;
 222
 223                 MarkBufferDirty(rootbuf);
 224                 MarkBufferDirty(metabuf);
 225
 226                 /* XLOG stuff */
 227                 if (!rel->rd_istemp)
 228                 {
 229                         xl_btree_newroot xlrec;
 230                         XLogRecPtr      recptr;
 231                         XLogRecData rdata;
 232
 233                         xlrec.node = rel->rd_node;
 234                         xlrec.rootblk = rootblkno;
 235                         xlrec.level = 0;
 236
 237                         rdata.data = (char *) &xlrec;
 238                         rdata.len = SizeOfBtreeNewroot;
 239                         rdata.buffer = InvalidBuffer;
 240                         rdata.next = NULL;
 241
 242                         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
 243
 244                         PageSetLSN(rootpage, recptr);
 245                         PageSetTLI(rootpage, ThisTimeLineID);
 246                         PageSetLSN(metapg, recptr);
 247                         PageSetTLI(metapg, ThisTimeLineID);
 248                 }
 249
 250                 END_CRIT_SECTION();
 251
 252                 /*
 253                  * Send out relcache inval for metapage change (probably unnecessary
 254                  * here, but let's be safe).
 255                  */
 256                 CacheInvalidateRelcache(rel);
 257
 258                 /*
 259                  * swap root write lock for read lock.  There is no danger of anyone
 260                  * else accessing the new root page while it's unlocked, since no one
 261                  * else knows where it is yet.
 262                  */
 263                 LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
 264                 LockBuffer(rootbuf, BT_READ);
 265
 266                 /* okay, metadata is correct, release lock on it */
 267                 _bt_relbuf(rel, metabuf);
 268         }
 269         else
 270         {
 271                 rootblkno = metad->btm_fastroot;
 272                 Assert(rootblkno != P_NONE);
 273                 rootlevel = metad->btm_fastlevel;
 274
 275                 /*
 276                  * Cache the metapage data for next time
 277                  */
 278                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 279                                                                                          sizeof(BTMetaPageData));
 280                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
 281
 282                 /*
 283                  * We are done with the metapage; arrange to release it via first
 284                  * _bt_relandgetbuf call
 285                  */
 286                 rootbuf = metabuf;
 287
 288                 for (;;)
 289                 {
 290                         rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
 291                         rootpage = BufferGetPage(rootbuf);
 292                         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 293
 294                         if (!P_IGNORE(rootopaque))
 295                                 break;
 296
 297                         /* it's dead, Jim.  step right one page */
 298                         if (P_RIGHTMOST(rootopaque))
 299                                 elog(ERROR, "no live root page found in index \"%s\"",
 300                                          RelationGetRelationName(rel));
 301                         rootblkno = rootopaque->btpo_next;
 302                 }
 303
 304                 /* Note: can't check btpo.level on deleted pages */
 305                 if (rootopaque->btpo.level != rootlevel)
 306                         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
 307                                  rootblkno, RelationGetRelationName(rel),
 308                                  rootopaque->btpo.level, rootlevel);
 309         }
 310
 311         /*
 312          * By here, we have a pin and read lock on the root page, and no lock set
 313          * on the metadata page.  Return the root page's buffer.
 314          */
 315         return rootbuf;
 316 }
 317
 318 /*
 319  *      _bt_gettrueroot() -- Get the true root page of the btree.
 320  *
 321  *              This is the same as the BT_READ case of _bt_getroot(), except
 322  *              we follow the true-root link not the fast-root link.
 323  *
 324  * By the time we acquire lock on the root page, it might have been split and
 325  * not be the true root anymore.  This is okay for the present uses of this
 326  * routine; we only really need to be able to move up at least one tree level
 327  * from whatever non-root page we were at.      If we ever do need to lock the
 328  * one true root page, we could loop here, re-reading the metapage on each
 329  * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 330  * moving to the root --- that'd deadlock against any concurrent root split.)
 331  */
 332 Buffer
 333 _bt_gettrueroot(Relation rel)
 334 {
 335         Buffer          metabuf;
 336         Page            metapg;
 337         BTPageOpaque metaopaque;
 338         Buffer          rootbuf;
 339         Page            rootpage;
 340         BTPageOpaque rootopaque;
 341         BlockNumber rootblkno;
 342         uint32          rootlevel;
 343         BTMetaPageData *metad;
 344
 345         /*
 346          * We don't try to use cached metapage data here, since (a) this path is
 347          * not performance-critical, and (b) if we are here it suggests our cache
 348          * is out-of-date anyway.  In light of point (b), it's probably safest to
 349          * actively flush any cached metapage info.
 350          */
 351         if (rel->rd_amcache)
 352                 pfree(rel->rd_amcache);
 353         rel->rd_amcache = NULL;
 354
 355         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 356         metapg = BufferGetPage(metabuf);
 357         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
 358         metad = BTPageGetMeta(metapg);
 359
 360         if (!(metaopaque->btpo_flags & BTP_META) ||
 361                 metad->btm_magic != BTREE_MAGIC)
 362                 ereport(ERROR,
 363                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 364                                  errmsg("index \"%s\" is not a btree",
 365                                                 RelationGetRelationName(rel))));
 366
 367         if (metad->btm_version != BTREE_VERSION)
 368                 ereport(ERROR,
 369                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 370                                  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
 371                                                 RelationGetRelationName(rel),
 372                                                 metad->btm_version, BTREE_VERSION)));
 373
 374         /* if no root page initialized yet, fail */
 375         if (metad->btm_root == P_NONE)
 376         {
 377                 _bt_relbuf(rel, metabuf);
 378                 return InvalidBuffer;
 379         }
 380
 381         rootblkno = metad->btm_root;
 382         rootlevel = metad->btm_level;
 383
 384         /*
 385          * We are done with the metapage; arrange to release it via first
 386          * _bt_relandgetbuf call
 387          */
 388         rootbuf = metabuf;
 389
 390         for (;;)
 391         {
 392                 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
 393                 rootpage = BufferGetPage(rootbuf);
 394                 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 395
 396                 if (!P_IGNORE(rootopaque))
 397                         break;
 398
 399                 /* it's dead, Jim.  step right one page */
 400                 if (P_RIGHTMOST(rootopaque))
 401                         elog(ERROR, "no live root page found in index \"%s\"",
 402                                  RelationGetRelationName(rel));
 403                 rootblkno = rootopaque->btpo_next;
 404         }
 405
 406         /* Note: can't check btpo.level on deleted pages */
 407         if (rootopaque->btpo.level != rootlevel)
 408                 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
 409                          rootblkno, RelationGetRelationName(rel),
 410                          rootopaque->btpo.level, rootlevel);
 411
 412         return rootbuf;
 413 }
 414
 415 /*
 416  *      _bt_checkpage() -- Verify that a freshly-read page looks sane.
 417  */
 418 void
 419 _bt_checkpage(Relation rel, Buffer buf)
 420 {
 421         Page            page = BufferGetPage(buf);
 422
 423         /*
 424          * ReadBuffer verifies that every newly-read page passes
 425          * PageHeaderIsValid, which means it either contains a reasonably sane
 426          * page header or is all-zero.  We have to defend against the all-zero
 427          * case, however.
 428          */
 429         if (PageIsNew(page))
 430                 ereport(ERROR,
 431                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 432                          errmsg("index \"%s\" contains unexpected zero page at block %u",
 433                                         RelationGetRelationName(rel),
 434                                         BufferGetBlockNumber(buf)),
 435                                  errhint("Please REINDEX it.")));
 436
 437         /*
 438          * Additionally check that the special area looks sane.
 439          */
 440         if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
 441                 ereport(ERROR,
 442                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 443                                  errmsg("index \"%s\" contains corrupted page at block %u",
 444                                                 RelationGetRelationName(rel),
 445                                                 BufferGetBlockNumber(buf)),
 446                                  errhint("Please REINDEX it.")));
 447 }
 448
 449 /*
 450  *      _bt_getbuf() -- Get a buffer by block number for read or write.
 451  *
 452  *              blkno == P_NEW means to get an unallocated index page.  The page
 453  *              will be initialized before returning it.
 454  *
 455  *              When this routine returns, the appropriate lock is set on the
 456  *              requested buffer and its reference count has been incremented
 457  *              (ie, the buffer is "locked and pinned").  Also, we apply
 458  *              _bt_checkpage to sanity-check the page (except in P_NEW case).
 459  */
 460 Buffer
 461 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 462 {
 463         Buffer          buf;
 464
 465         if (blkno != P_NEW)
 466         {
 467                 /* Read an existing block of the relation */
 468                 buf = ReadBuffer(rel, blkno);
 469                 LockBuffer(buf, access);
 470                 _bt_checkpage(rel, buf);
 471         }
 472         else
 473         {
 474                 bool            needLock;
 475                 Page            page;
 476
 477                 Assert(access == BT_WRITE);
 478
 479                 /*
 480                  * First see if the FSM knows of any free pages.
 481                  *
 482                  * We can't trust the FSM's report unreservedly; we have to check that
 483                  * the page is still free.      (For example, an already-free page could
 484                  * have been re-used between the time the last VACUUM scanned it and
 485                  * the time the VACUUM made its FSM updates.)
 486                  *
 487                  * In fact, it's worse than that: we can't even assume that it's safe
 488                  * to take a lock on the reported page.  If somebody else has a lock
 489                  * on it, or even worse our own caller does, we could deadlock.  (The
 490                  * own-caller scenario is actually not improbable. Consider an index
 491                  * on a serial or timestamp column.  Nearly all splits will be at the
 492                  * rightmost page, so it's entirely likely that _bt_split will call us
 493                  * while holding a lock on the page most recently acquired from FSM. A
 494                  * VACUUM running concurrently with the previous split could well have
 495                  * placed that page back in FSM.)
 496                  *
 497                  * To get around that, we ask for only a conditional lock on the
 498                  * reported page.  If we fail, then someone else is using the page,
 499                  * and we may reasonably assume it's not free.  (If we happen to be
 500                  * wrong, the worst consequence is the page will be lost to use till
 501                  * the next VACUUM, which is no big problem.)
 502                  */
 503                 for (;;)
 504                 {
 505                         blkno = GetFreeIndexPage(rel);
 506                         if (blkno == InvalidBlockNumber)
 507                                 break;
 508                         buf = ReadBuffer(rel, blkno);
 509                         if (ConditionalLockBuffer(buf))
 510                         {
 511                                 page = BufferGetPage(buf);
 512                                 if (_bt_page_recyclable(page))
 513                                 {
 514                                         /* Okay to use page.  Re-initialize and return it */
 515                                         _bt_pageinit(page, BufferGetPageSize(buf));
 516                                         return buf;
 517                                 }
 518                                 elog(DEBUG2, "FSM returned nonrecyclable page");
 519                                 _bt_relbuf(rel, buf);
 520                         }
 521                         else
 522                         {
 523                                 elog(DEBUG2, "FSM returned nonlockable page");
 524                                 /* couldn't get lock, so just drop pin */
 525                                 ReleaseBuffer(buf);
 526                         }
 527                 }
 528
 529                 /*
 530                  * Extend the relation by one page.
 531                  *
 532                  * We have to use a lock to ensure no one else is extending the rel at
 533                  * the same time, else we will both try to initialize the same new
 534                  * page.  We can skip locking for new or temp relations, however,
 535                  * since no one else could be accessing them.
 536                  */
 537                 needLock = !RELATION_IS_LOCAL(rel);
 538
 539                 if (needLock)
 540                         LockRelationForExtension(rel, ExclusiveLock);
 541
 542                 buf = ReadBuffer(rel, P_NEW);
 543
 544                 /* Acquire buffer lock on new page */
 545                 LockBuffer(buf, BT_WRITE);
 546
 547                 /*
 548                  * Release the file-extension lock; it's now OK for someone else to
 549                  * extend the relation some more.  Note that we cannot release this
 550                  * lock before we have buffer lock on the new page, or we risk a race
 551                  * condition against btvacuumscan --- see comments therein.
 552                  */
 553                 if (needLock)
 554                         UnlockRelationForExtension(rel, ExclusiveLock);
 555
 556                 /* Initialize the new page before returning it */
 557                 page = BufferGetPage(buf);
 558                 Assert(PageIsNew(page));
 559                 _bt_pageinit(page, BufferGetPageSize(buf));
 560         }
 561
 562         /* ref count and lock type are correct */
 563         return buf;
 564 }
 565
 566 /*
 567  *      _bt_relandgetbuf() -- release a locked buffer and get another one.
 568  *
 569  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 570  * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 571  * then it reduces to just _bt_getbuf; allowing this case simplifies some
 572  * callers.
 573  *
 574  * The original motivation for using this was to avoid two entries to the
 575  * bufmgr when one would do.  However, now it's mainly just a notational
 576  * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 577  * is when the target page is the same one already in the buffer.
 578  */
 579 Buffer
 580 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 581 {
 582         Buffer          buf;
 583
 584         Assert(blkno != P_NEW);
 585         if (BufferIsValid(obuf))
 586                 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
 587         buf = ReleaseAndReadBuffer(obuf, rel, blkno);
 588         LockBuffer(buf, access);
 589         _bt_checkpage(rel, buf);
 590         return buf;
 591 }
 592
 593 /*
 594  *      _bt_relbuf() -- release a locked buffer.
 595  *
 596  * Lock and pin (refcount) are both dropped.
 597  */
 598 void
 599 _bt_relbuf(Relation rel, Buffer buf)
 600 {
 601         UnlockReleaseBuffer(buf);
 602 }
 603
 604 /*
 605  *      _bt_pageinit() -- Initialize a new page.
 606  *
 607  * On return, the page header is initialized; data space is empty;
 608  * special space is zeroed out.
 609  */
 610 void
 611 _bt_pageinit(Page page, Size size)
 612 {
 613         PageInit(page, size, sizeof(BTPageOpaqueData));
 614 }
 615
 616 /*
 617  *      _bt_page_recyclable() -- Is an existing page recyclable?
 618  *
 619  * This exists to make sure _bt_getbuf and btvacuumscan have the same
 620  * policy about whether a page is safe to re-use.
 621  */
 622 bool
 623 _bt_page_recyclable(Page page)
 624 {
 625         BTPageOpaque opaque;
 626
 627         /*
 628          * It's possible to find an all-zeroes page in an index --- for example, a
 629          * backend might successfully extend the relation one page and then crash
 630          * before it is able to make a WAL entry for adding the page. If we find a
 631          * zeroed page then reclaim it.
 632          */
 633         if (PageIsNew(page))
 634                 return true;
 635
 636         /*
 637          * Otherwise, recycle if deleted and too old to have any processes
 638          * interested in it.
 639          */
 640         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 641         if (P_ISDELETED(opaque) &&
 642                 TransactionIdPrecedesOrEquals(opaque->btpo.xact, RecentXmin))
 643                 return true;
 644         return false;
 645 }
 646
 647 /*
 648  * Delete item(s) from a btree page.
 649  *
 650  * This must only be used for deleting leaf items.      Deleting an item on a
 651  * non-leaf page has to be done as part of an atomic action that includes
 652  * deleting the page it points to.
 653  *
 654  * This routine assumes that the caller has pinned and locked the buffer.
 655  * Also, the given itemnos *must* appear in increasing order in the array.
 656  *
 657  * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby
 658  * we need to be able to pin all of the blocks in the btree in physical
 659  * order when replaying the effects of a VACUUM, just as we do for the
 660  * original VACUUM itself. lastBlockVacuumed allows us to tell whether an
 661  * intermediate range of blocks has had no changes at all by VACUUM,
 662  * and so must be scanned anyway during replay. We always write a WAL record
 663  * for the last block in the index, whether or not it contained any items
 664  * to be removed. This allows us to scan right up to end of index to
 665  * ensure correct locking.
 666  */
 667 void
 668 _bt_delitems(Relation rel, Buffer buf,
 669                          OffsetNumber *itemnos, int nitems, bool isVacuum,
 670                          BlockNumber lastBlockVacuumed)
 671 {
 672         Page            page = BufferGetPage(buf);
 673         BTPageOpaque opaque;
 674
 675         Assert(isVacuum || lastBlockVacuumed == 0);
 676
 677         /* No ereport(ERROR) until changes are logged */
 678         START_CRIT_SECTION();
 679
 680         /* Fix the page */
 681         if (nitems > 0)
 682                 PageIndexMultiDelete(page, itemnos, nitems);
 683
 684         /*
 685          * We can clear the vacuum cycle ID since this page has certainly been
 686          * processed by the current vacuum scan.
 687          */
 688         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 689         opaque->btpo_cycleid = 0;
 690
 691         /*
 692          * Mark the page as not containing any LP_DEAD items.  This is not
 693          * certainly true (there might be some that have recently been marked, but
 694          * weren't included in our target-item list), but it will almost always be
 695          * true and it doesn't seem worth an additional page scan to check it.
 696          * Remember that BTP_HAS_GARBAGE is only a hint anyway.
 697          */
 698         opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
 699
 700         MarkBufferDirty(buf);
 701
 702         /* XLOG stuff */
 703         if (!rel->rd_istemp)
 704         {
 705                 XLogRecPtr      recptr;
 706                 XLogRecData rdata[2];
 707
 708                 if (isVacuum)
 709                 {
 710                         xl_btree_vacuum xlrec_vacuum;
 711                         xlrec_vacuum.node = rel->rd_node;
 712                         xlrec_vacuum.block = BufferGetBlockNumber(buf);
 713
 714                         xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
 715                         rdata[0].data = (char *) &xlrec_vacuum;
 716                         rdata[0].len = SizeOfBtreeVacuum;
 717                 }
 718                 else
 719                 {
 720                         xl_btree_delete xlrec_delete;
 721                         xlrec_delete.node = rel->rd_node;
 722                         xlrec_delete.block = BufferGetBlockNumber(buf);
 723
 724                         /*
 725                          * XXX: We would like to set an accurate latestRemovedXid, but
 726                          * there is no easy way of obtaining a useful value. So we punt
 727                          * and store InvalidTransactionId, which forces the standby to
 728                          * wait for/cancel all currently running transactions.
 729                          */
 730                         xlrec_delete.latestRemovedXid = InvalidTransactionId;
 731                         rdata[0].data = (char *) &xlrec_delete;
 732                         rdata[0].len = SizeOfBtreeDelete;
 733                 }
 734
 735                 rdata[0].buffer = InvalidBuffer;
 736                 rdata[0].next = &(rdata[1]);
 737
 738                 /*
 739                  * The target-offsets array is not in the buffer, but pretend that it
 740                  * is.  When XLogInsert stores the whole buffer, the offsets array
 741                  * need not be stored too.
 742                  */
 743                 if (nitems > 0)
 744                 {
 745                         rdata[1].data = (char *) itemnos;
 746                         rdata[1].len = nitems * sizeof(OffsetNumber);
 747                 }
 748                 else
 749                 {
 750                         rdata[1].data = NULL;
 751                         rdata[1].len = 0;
 752                 }
 753                 rdata[1].buffer = buf;
 754                 rdata[1].buffer_std = true;
 755                 rdata[1].next = NULL;
 756
 757                 if (isVacuum)
 758                         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
 759                 else
 760                         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
 761
 762                 PageSetLSN(page, recptr);
 763                 PageSetTLI(page, ThisTimeLineID);
 764         }
 765
 766         END_CRIT_SECTION();
 767 }
 768
 769 /*
 770  * Subroutine to pre-check whether a page deletion is safe, that is, its
 771  * parent page would be left in a valid or deletable state.
 772  *
 773  * "target" is the page we wish to delete, and "stack" is a search stack
 774  * leading to it (approximately).  Note that we will update the stack
 775  * entry(s) to reflect current downlink positions --- this is harmless and
 776  * indeed saves later search effort in _bt_pagedel.
 777  *
 778  * Note: it's OK to release page locks after checking, because a safe
 779  * deletion can't become unsafe due to concurrent activity.  A non-rightmost
 780  * page cannot become rightmost unless there's a concurrent page deletion,
 781  * but only VACUUM does page deletion and we only allow one VACUUM on an index
 782  * at a time.  An only child could acquire a sibling (of the same parent) only
 783  * by being split ... but that would make it a non-rightmost child so the
 784  * deletion is still safe.
 785  */
 786 static bool
 787 _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
 788 {
 789         BlockNumber parent;
 790         OffsetNumber poffset,
 791                                 maxoff;
 792         Buffer          pbuf;
 793         Page            page;
 794         BTPageOpaque opaque;
 795
 796         /*
 797          * In recovery mode, assume the deletion being replayed is valid.  We
 798          * can't always check it because we won't have a full search stack, and we
 799          * should complain if there's a problem, anyway.
 800          */
 801         if (InRecovery)
 802                 return true;
 803
 804         /* Locate the parent's downlink (updating the stack entry if needed) */
 805         ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
 806         pbuf = _bt_getstackbuf(rel, stack, BT_READ);
 807         if (pbuf == InvalidBuffer)
 808                 elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
 809                          RelationGetRelationName(rel), target);
 810         parent = stack->bts_blkno;
 811         poffset = stack->bts_offset;
 812
 813         page = BufferGetPage(pbuf);
 814         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 815         maxoff = PageGetMaxOffsetNumber(page);
 816
 817         /*
 818          * If the target is the rightmost child of its parent, then we can't
 819          * delete, unless it's also the only child.
 820          */
 821         if (poffset >= maxoff)
 822         {
 823                 /* It's rightmost child... */
 824                 if (poffset == P_FIRSTDATAKEY(opaque))
 825                 {
 826                         /*
 827                          * It's only child, so safe if parent would itself be removable.
 828                          * We have to check the parent itself, and then recurse to test
 829                          * the conditions at the parent's parent.
 830                          */
 831                         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
 832                         {
 833                                 _bt_relbuf(rel, pbuf);
 834                                 return false;
 835                         }
 836
 837                         _bt_relbuf(rel, pbuf);
 838                         return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
 839                 }
 840                 else
 841                 {
 842                         /* Unsafe to delete */
 843                         _bt_relbuf(rel, pbuf);
 844                         return false;
 845                 }
 846         }
 847         else
 848         {
 849                 /* Not rightmost child, so safe to delete */
 850                 _bt_relbuf(rel, pbuf);
 851                 return true;
 852         }
 853 }
 854
 855 /*
 856  * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 857  *
 858  * This action unlinks the page from the b-tree structure, removing all
 859  * pointers leading to it --- but not touching its own left and right links.
 860  * The page cannot be physically reclaimed right away, since other processes
 861  * may currently be trying to follow links leading to the page; they have to
 862  * be allowed to use its right-link to recover.  See nbtree/README.
 863  *
 864  * On entry, the target buffer must be pinned and locked (either read or write
 865  * lock is OK).  This lock and pin will be dropped before exiting.
 866  *
 867  * The "stack" argument can be a search stack leading (approximately) to the
 868  * target page, or NULL --- outside callers typically pass NULL since they
 869  * have not done such a search, but internal recursion cases pass the stack
 870  * to avoid duplicated search effort.
 871  *
 872  * Returns the number of pages successfully deleted (zero if page cannot
 873  * be deleted now; could be more than one if parent pages were deleted too).
 874  *
 875  * NOTE: this leaks memory.  Rather than trying to clean up everything
 876  * carefully, it's better to run it in a temp context that can be reset
 877  * frequently.
 878  */
 879 int
 880 _bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full)
 881 {
 882         int                     result;
 883         BlockNumber target,
 884                                 leftsib,
 885                                 rightsib,
 886                                 parent;
 887         OffsetNumber poffset,
 888                                 maxoff;
 889         uint32          targetlevel,
 890                                 ilevel;
 891         ItemId          itemid;
 892         IndexTuple      targetkey,
 893                                 itup;
 894         ScanKey         itup_scankey;
 895         Buffer          lbuf,
 896                                 rbuf,
 897                                 pbuf;
 898         bool            parent_half_dead;
 899         bool            parent_one_child;
 900         bool            rightsib_empty;
 901         Buffer          metabuf = InvalidBuffer;
 902         Page            metapg = NULL;
 903         BTMetaPageData *metad = NULL;
 904         Page            page;
 905         BTPageOpaque opaque;
 906
 907         /*
 908          * We can never delete rightmost pages nor root pages.  While at it, check
 909          * that page is not already deleted and is empty.
 910          */
 911         page = BufferGetPage(buf);
 912         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 913         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
 914                 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
 915         {
 916                 /* Should never fail to delete a half-dead page */
 917                 Assert(!P_ISHALFDEAD(opaque));
 918
 919                 _bt_relbuf(rel, buf);
 920                 return 0;
 921         }
 922
 923         /*
 924          * Save info about page, including a copy of its high key (it must have
 925          * one, being non-rightmost).
 926          */
 927         target = BufferGetBlockNumber(buf);
 928         targetlevel = opaque->btpo.level;
 929         leftsib = opaque->btpo_prev;
 930         itemid = PageGetItemId(page, P_HIKEY);
 931         targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
 932
 933         /*
 934          * To avoid deadlocks, we'd better drop the target page lock before going
 935          * further.
 936          */
 937         _bt_relbuf(rel, buf);
 938
 939         /*
 940          * We need an approximate pointer to the page's parent page.  We use the
 941          * standard search mechanism to search for the page's high key; this will
 942          * give us a link to either the current parent or someplace to its left
 943          * (if there are multiple equal high keys).  In recursion cases, the
 944          * caller already generated a search stack and we can just re-use that
 945          * work.
 946          */
 947         if (stack == NULL)
 948         {
 949                 if (!InRecovery)
 950                 {
 951                         /* we need an insertion scan key to do our search, so build one */
 952                         itup_scankey = _bt_mkscankey(rel, targetkey);
 953                         /* find the leftmost leaf page containing this key */
 954                         stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
 955                                                            &lbuf, BT_READ);
 956                         /* don't need a pin on that either */
 957                         _bt_relbuf(rel, lbuf);
 958
 959                         /*
 960                          * If we are trying to delete an interior page, _bt_search did
 961                          * more than we needed.  Locate the stack item pointing to our
 962                          * parent level.
 963                          */
 964                         ilevel = 0;
 965                         for (;;)
 966                         {
 967                                 if (stack == NULL)
 968                                         elog(ERROR, "not enough stack items");
 969                                 if (ilevel == targetlevel)
 970                                         break;
 971                                 stack = stack->bts_parent;
 972                                 ilevel++;
 973                         }
 974                 }
 975                 else
 976                 {
 977                         /*
 978                          * During WAL recovery, we can't use _bt_search (for one reason,
 979                          * it might invoke user-defined comparison functions that expect
 980                          * facilities not available in recovery mode).  Instead, just set
 981                          * up a dummy stack pointing to the left end of the parent tree
 982                          * level, from which _bt_getstackbuf will walk right to the parent
 983                          * page.  Painful, but we don't care too much about performance in
 984                          * this scenario.
 985                          */
 986                         pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
 987                         stack = (BTStack) palloc(sizeof(BTStackData));
 988                         stack->bts_blkno = BufferGetBlockNumber(pbuf);
 989                         stack->bts_offset = InvalidOffsetNumber;
 990                         /* bts_btentry will be initialized below */
 991                         stack->bts_parent = NULL;
 992                         _bt_relbuf(rel, pbuf);
 993                 }
 994         }
 995
 996         /*
 997          * We cannot delete a page that is the rightmost child of its immediate
 998          * parent, unless it is the only child --- in which case the parent has to
 999          * be deleted too, and the same condition applies recursively to it. We
1000          * have to check this condition all the way up before trying to delete. We
1001          * don't need to re-test when deleting a non-leaf page, though.
1002          */
1003         if (targetlevel == 0 &&
1004                 !_bt_parent_deletion_safe(rel, target, stack))
1005                 return 0;
1006
1007         /*
1008          * We have to lock the pages we need to modify in the standard order:
1009          * moving right, then up.  Else we will deadlock against other writers.
1010          *
1011          * So, we need to find and write-lock the current left sibling of the
1012          * target page.  The sibling that was current a moment ago could have
1013          * split, so we may have to move right.  This search could fail if either
1014          * the sibling or the target page was deleted by someone else meanwhile;
1015          * if so, give up.      (Right now, that should never happen, since page
1016          * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
1017          * concurrently on the same table.)
1018          */
1019         if (leftsib != P_NONE)
1020         {
1021                 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
1022                 page = BufferGetPage(lbuf);
1023                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1024                 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
1025                 {
1026                         /* step right one page */
1027                         leftsib = opaque->btpo_next;
1028                         _bt_relbuf(rel, lbuf);
1029                         if (leftsib == P_NONE)
1030                         {
1031                                 elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
1032                                          RelationGetRelationName(rel));
1033                                 return 0;
1034                         }
1035                         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
1036                         page = BufferGetPage(lbuf);
1037                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1038                 }
1039         }
1040         else
1041                 lbuf = InvalidBuffer;
1042
1043         /*
1044          * Next write-lock the target page itself.      It should be okay to take just
1045          * a write lock not a superexclusive lock, since no scans would stop on an
1046          * empty page.
1047          */
1048         buf = _bt_getbuf(rel, target, BT_WRITE);
1049         page = BufferGetPage(buf);
1050         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1051
1052         /*
1053          * Check page is still empty etc, else abandon deletion.  The empty check
1054          * is necessary since someone else might have inserted into it while we
1055          * didn't have it locked; the others are just for paranoia's sake.
1056          */
1057         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
1058                 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
1059         {
1060                 _bt_relbuf(rel, buf);
1061                 if (BufferIsValid(lbuf))
1062                         _bt_relbuf(rel, lbuf);
1063                 return 0;
1064         }
1065         if (opaque->btpo_prev != leftsib)
1066                 elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
1067                          target, RelationGetRelationName(rel));
1068
1069         /*
1070          * And next write-lock the (current) right sibling.
1071          */
1072         rightsib = opaque->btpo_next;
1073         rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
1074
1075         /*
1076          * Next find and write-lock the current parent of the target page. This is
1077          * essentially the same as the corresponding step of splitting.
1078          */
1079         ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
1080         pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1081         if (pbuf == InvalidBuffer)
1082                 elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
1083                          RelationGetRelationName(rel), target);
1084         parent = stack->bts_blkno;
1085         poffset = stack->bts_offset;
1086
1087         /*
1088          * If the target is the rightmost child of its parent, then we can't
1089          * delete, unless it's also the only child --- in which case the parent
1090          * changes to half-dead status.  The "can't delete" case should have been
1091          * detected by _bt_parent_deletion_safe, so complain if we see it now.
1092          */
1093         page = BufferGetPage(pbuf);
1094         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1095         maxoff = PageGetMaxOffsetNumber(page);
1096         parent_half_dead = false;
1097         parent_one_child = false;
1098         if (poffset >= maxoff)
1099         {
1100                 if (poffset == P_FIRSTDATAKEY(opaque))
1101                         parent_half_dead = true;
1102                 else
1103                         elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
1104                                  target, parent, RelationGetRelationName(rel));
1105         }
1106         else
1107         {
1108                 /* Will there be exactly one child left in this parent? */
1109                 if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
1110                         parent_one_child = true;
1111         }
1112
1113         /*
1114          * If we are deleting the next-to-last page on the target's level, then
1115          * the rightsib is a candidate to become the new fast root. (In theory, it
1116          * might be possible to push the fast root even further down, but the odds
1117          * of doing so are slim, and the locking considerations daunting.)
1118          *
1119          * We don't support handling this in the case where the parent is becoming
1120          * half-dead, even though it theoretically could occur.
1121          *
1122          * We can safely acquire a lock on the metapage here --- see comments for
1123          * _bt_newroot().
1124          */
1125         if (leftsib == P_NONE && !parent_half_dead)
1126         {
1127                 page = BufferGetPage(rbuf);
1128                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1129                 Assert(opaque->btpo.level == targetlevel);
1130                 if (P_RIGHTMOST(opaque))
1131                 {
1132                         /* rightsib will be the only one left on the level */
1133                         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1134                         metapg = BufferGetPage(metabuf);
1135                         metad = BTPageGetMeta(metapg);
1136
1137                         /*
1138                          * The expected case here is btm_fastlevel == targetlevel+1; if
1139                          * the fastlevel is <= targetlevel, something is wrong, and we
1140                          * choose to overwrite it to fix it.
1141                          */
1142                         if (metad->btm_fastlevel > targetlevel + 1)
1143                         {
1144                                 /* no update wanted */
1145                                 _bt_relbuf(rel, metabuf);
1146                                 metabuf = InvalidBuffer;
1147                         }
1148                 }
1149         }
1150
1151         /*
1152          * Here we begin doing the deletion.
1153          */
1154
1155         /* No ereport(ERROR) until changes are logged */
1156         START_CRIT_SECTION();
1157
1158         /*
1159          * Update parent.  The normal case is a tad tricky because we want to
1160          * delete the target's downlink and the *following* key.  Easiest way is
1161          * to copy the right sibling's downlink over the target downlink, and then
1162          * delete the following item.
1163          */
1164         page = BufferGetPage(pbuf);
1165         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1166         if (parent_half_dead)
1167         {
1168                 PageIndexTupleDelete(page, poffset);
1169                 opaque->btpo_flags |= BTP_HALF_DEAD;
1170         }
1171         else
1172         {
1173                 OffsetNumber nextoffset;
1174
1175                 itemid = PageGetItemId(page, poffset);
1176                 itup = (IndexTuple) PageGetItem(page, itemid);
1177                 Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
1178                 ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
1179
1180                 nextoffset = OffsetNumberNext(poffset);
1181                 /* This part is just for double-checking */
1182                 itemid = PageGetItemId(page, nextoffset);
1183                 itup = (IndexTuple) PageGetItem(page, itemid);
1184                 if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
1185                         elog(PANIC, "right sibling %u of block %u is not next child of %u in index \"%s\"",
1186                                  rightsib, target, BufferGetBlockNumber(pbuf),
1187                                  RelationGetRelationName(rel));
1188                 PageIndexTupleDelete(page, nextoffset);
1189         }
1190
1191         /*
1192          * Update siblings' side-links.  Note the target page's side-links will
1193          * continue to point to the siblings.
1194          */
1195         if (BufferIsValid(lbuf))
1196         {
1197                 page = BufferGetPage(lbuf);
1198                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1199                 Assert(opaque->btpo_next == target);
1200                 opaque->btpo_next = rightsib;
1201         }
1202         page = BufferGetPage(rbuf);
1203         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1204         Assert(opaque->btpo_prev == target);
1205         opaque->btpo_prev = leftsib;
1206         rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1207
1208         /*
1209          * Mark the page itself deleted.  It can be recycled when all current
1210          * transactions are gone; or immediately if we're doing VACUUM FULL.
1211          */
1212         page = BufferGetPage(buf);
1213         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1214         opaque->btpo_flags &= ~BTP_HALF_DEAD;
1215         opaque->btpo_flags |= BTP_DELETED;
1216         opaque->btpo.xact =
1217                 vacuum_full ? FrozenTransactionId : ReadNewTransactionId();
1218
1219         /* And update the metapage, if needed */
1220         if (BufferIsValid(metabuf))
1221         {
1222                 metad->btm_fastroot = rightsib;
1223                 metad->btm_fastlevel = targetlevel;
1224                 MarkBufferDirty(metabuf);
1225         }
1226
1227         /* Must mark buffers dirty before XLogInsert */
1228         MarkBufferDirty(pbuf);
1229         MarkBufferDirty(rbuf);
1230         MarkBufferDirty(buf);
1231         if (BufferIsValid(lbuf))
1232                 MarkBufferDirty(lbuf);
1233
1234         /* XLOG stuff */
1235         if (!rel->rd_istemp)
1236         {
1237                 xl_btree_delete_page xlrec;
1238                 xl_btree_metadata xlmeta;
1239                 uint8           xlinfo;
1240                 XLogRecPtr      recptr;
1241                 XLogRecData rdata[5];
1242                 XLogRecData *nextrdata;
1243
1244                 xlrec.target.node = rel->rd_node;
1245                 ItemPointerSet(&(xlrec.target.tid), parent, poffset);
1246                 xlrec.deadblk = target;
1247                 xlrec.leftblk = leftsib;
1248                 xlrec.rightblk = rightsib;
1249
1250                 rdata[0].data = (char *) &xlrec;
1251                 rdata[0].len = SizeOfBtreeDeletePage;
1252                 rdata[0].buffer = InvalidBuffer;
1253                 rdata[0].next = nextrdata = &(rdata[1]);
1254
1255                 if (BufferIsValid(metabuf))
1256                 {
1257                         xlmeta.root = metad->btm_root;
1258                         xlmeta.level = metad->btm_level;
1259                         xlmeta.fastroot = metad->btm_fastroot;
1260                         xlmeta.fastlevel = metad->btm_fastlevel;
1261
1262                         nextrdata->data = (char *) &xlmeta;
1263                         nextrdata->len = sizeof(xl_btree_metadata);
1264                         nextrdata->buffer = InvalidBuffer;
1265                         nextrdata->next = nextrdata + 1;
1266                         nextrdata++;
1267                         xlinfo = XLOG_BTREE_DELETE_PAGE_META;
1268                 }
1269                 else if (parent_half_dead)
1270                         xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
1271                 else
1272                         xlinfo = XLOG_BTREE_DELETE_PAGE;
1273
1274                 nextrdata->data = NULL;
1275                 nextrdata->len = 0;
1276                 nextrdata->next = nextrdata + 1;
1277                 nextrdata->buffer = pbuf;
1278                 nextrdata->buffer_std = true;
1279                 nextrdata++;
1280
1281                 nextrdata->data = NULL;
1282                 nextrdata->len = 0;
1283                 nextrdata->buffer = rbuf;
1284                 nextrdata->buffer_std = true;
1285                 nextrdata->next = NULL;
1286
1287                 if (BufferIsValid(lbuf))
1288                 {
1289                         nextrdata->next = nextrdata + 1;
1290                         nextrdata++;
1291                         nextrdata->data = NULL;
1292                         nextrdata->len = 0;
1293                         nextrdata->buffer = lbuf;
1294                         nextrdata->buffer_std = true;
1295                         nextrdata->next = NULL;
1296                 }
1297
1298                 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
1299
1300                 if (BufferIsValid(metabuf))
1301                 {
1302                         PageSetLSN(metapg, recptr);
1303                         PageSetTLI(metapg, ThisTimeLineID);
1304                 }
1305                 page = BufferGetPage(pbuf);
1306                 PageSetLSN(page, recptr);
1307                 PageSetTLI(page, ThisTimeLineID);
1308                 page = BufferGetPage(rbuf);
1309                 PageSetLSN(page, recptr);
1310                 PageSetTLI(page, ThisTimeLineID);
1311                 page = BufferGetPage(buf);
1312                 PageSetLSN(page, recptr);
1313                 PageSetTLI(page, ThisTimeLineID);
1314                 if (BufferIsValid(lbuf))
1315                 {
1316                         page = BufferGetPage(lbuf);
1317                         PageSetLSN(page, recptr);
1318                         PageSetTLI(page, ThisTimeLineID);
1319                 }
1320         }
1321
1322         END_CRIT_SECTION();
1323
1324         /* release metapage; send out relcache inval if metapage changed */
1325         if (BufferIsValid(metabuf))
1326         {
1327                 CacheInvalidateRelcache(rel);
1328                 _bt_relbuf(rel, metabuf);
1329         }
1330         /* can always release leftsib immediately */
1331         if (BufferIsValid(lbuf))
1332                 _bt_relbuf(rel, lbuf);
1333
1334         /*
1335          * If parent became half dead, recurse to delete it. Otherwise, if right
1336          * sibling is empty and is now the last child of the parent, recurse to
1337          * try to delete it.  (These cases cannot apply at the same time, though
1338          * the second case might itself recurse to the first.)
1339          *
1340          * When recursing to parent, we hold the lock on the target page until
1341          * done.  This delays any insertions into the keyspace that was just
1342          * effectively reassigned to the parent's right sibling.  If we allowed
1343          * that, and there were enough such insertions before we finish deleting
1344          * the parent, page splits within that keyspace could lead to inserting
1345          * out-of-order keys into the grandparent level.  It is thought that that
1346          * wouldn't have any serious consequences, but it still seems like a
1347          * pretty bad idea.
1348          */
1349         if (parent_half_dead)
1350         {
1351                 /* recursive call will release pbuf */
1352                 _bt_relbuf(rel, rbuf);
1353                 result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1;
1354                 _bt_relbuf(rel, buf);
1355         }
1356         else if (parent_one_child && rightsib_empty)
1357         {
1358                 _bt_relbuf(rel, pbuf);
1359                 _bt_relbuf(rel, buf);
1360                 /* recursive call will release rbuf */
1361                 result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1;
1362         }
1363         else
1364         {
1365                 _bt_relbuf(rel, pbuf);
1366                 _bt_relbuf(rel, buf);
1367                 _bt_relbuf(rel, rbuf);
1368                 result = 1;
1369         }
1370
1371         return result;
1372 }