src/backend/access/nbtree/nbtpage.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtpage.c
   4  *        BTree-specific page management code for the Postgres btree access
   5  *        method.
   6  *
   7  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  *
  11  * IDENTIFICATION
  12  *        $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.53 2001/07/15 22:48:16 tgl Exp $
  13  *
  14  *      NOTES
  15  *         Postgres btree pages look like ordinary relation pages.      The opaque
  16  *         data at high addresses includes pointers to left and right siblings
  17  *         and flag data describing page state.  The first page in a btree, page
  18  *         zero, is special -- it stores meta-information describing the tree.
  19  *         Pages one and higher store the actual tree data.
  20  *
  21  *-------------------------------------------------------------------------
  22  */
  23 #include "postgres.h"
  24
  25 #include <time.h>
  26
  27 #include "access/nbtree.h"
  28 #include "miscadmin.h"
  29 #include "storage/lmgr.h"
  30
  31 extern bool FixBTree;                   /* comments in nbtree.c */
  32 extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
  33
  34 /*
  35  *      We use high-concurrency locking on btrees.      There are two cases in
  36  *      which we don't do locking.  One is when we're building the btree.
  37  *      Since the creating transaction has not committed, no one can see
  38  *      the index, and there's no reason to share locks.  The second case
  39  *      is when we're just starting up the database system.  We use some
  40  *      special-purpose initialization code in the relation cache manager
  41  *      (see utils/cache/relcache.c) to allow us to do indexed scans on
  42  *      the system catalogs before we'd normally be able to.  This happens
  43  *      before the lock table is fully initialized, so we can't use it.
  44  *      Strictly speaking, this violates 2pl, but we don't do 2pl on the
  45  *      system catalogs anyway, so I declare this to be okay.
  46  */
  47
  48 #define USELOCKING              (!BuildingBtree && !IsInitProcessingMode())
  49
  50 /*
  51  *      _bt_metapinit() -- Initialize the metadata page of a btree.
  52  */
  53 void
  54 _bt_metapinit(Relation rel)
  55 {
  56         Buffer          buf;
  57         Page            pg;
  58         BTMetaPageData metad;
  59         BTPageOpaque op;
  60
  61         /* can't be sharing this with anyone, now... */
  62         if (USELOCKING)
  63                 LockRelation(rel, AccessExclusiveLock);
  64
  65         if (RelationGetNumberOfBlocks(rel) != 0)
  66                 elog(ERROR, "Cannot initialize non-empty btree %s",
  67                          RelationGetRelationName(rel));
  68
  69         buf = ReadBuffer(rel, P_NEW);
  70         pg = BufferGetPage(buf);
  71         _bt_pageinit(pg, BufferGetPageSize(buf));
  72
  73         metad.btm_magic = BTREE_MAGIC;
  74         metad.btm_version = BTREE_VERSION;
  75         metad.btm_root = P_NONE;
  76         metad.btm_level = 0;
  77         memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
  78
  79         op = (BTPageOpaque) PageGetSpecialPointer(pg);
  80         op->btpo_flags = BTP_META;
  81
  82         WriteBuffer(buf);
  83
  84         /* all done */
  85         if (USELOCKING)
  86                 UnlockRelation(rel, AccessExclusiveLock);
  87 }
  88
  89 /*
  90  *      _bt_getroot() -- Get the root page of the btree.
  91  *
  92  *              Since the root page can move around the btree file, we have to read
  93  *              its location from the metadata page, and then read the root page
  94  *              itself.  If no root page exists yet, we have to create one.  The
  95  *              standard class of race conditions exists here; I think I covered
  96  *              them all in the Hopi Indian rain dance of lock requests below.
  97  *
  98  *              The access type parameter (BT_READ or BT_WRITE) controls whether
  99  *              a new root page will be created or not.  If access = BT_READ,
 100  *              and no root page exists, we just return InvalidBuffer.  For
 101  *              BT_WRITE, we try to create the root page if it doesn't exist.
 102  *              NOTE that the returned root page will have only a read lock set
 103  *              on it even if access = BT_WRITE!
 104  *
 105  *              On successful return, the root page is pinned and read-locked.
 106  *              The metadata page is not locked or pinned on exit.
 107  */
 108 Buffer
 109 _bt_getroot(Relation rel, int access)
 110 {
 111         Buffer          metabuf;
 112         Page            metapg;
 113         BTPageOpaque metaopaque;
 114         Buffer          rootbuf;
 115         Page            rootpage;
 116         BTPageOpaque rootopaque;
 117         BlockNumber rootblkno;
 118         BTMetaPageData *metad;
 119
 120         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 121         metapg = BufferGetPage(metabuf);
 122         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
 123         metad = BTPageGetMeta(metapg);
 124
 125         if (!(metaopaque->btpo_flags & BTP_META) ||
 126                 metad->btm_magic != BTREE_MAGIC)
 127                 elog(ERROR, "Index %s is not a btree",
 128                          RelationGetRelationName(rel));
 129
 130         if (metad->btm_version != BTREE_VERSION)
 131                 elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
 132                          RelationGetRelationName(rel),
 133                          metad->btm_version, BTREE_VERSION);
 134
 135         /* if no root page initialized yet, do it */
 136         if (metad->btm_root == P_NONE)
 137         {
 138                 /* If access = BT_READ, caller doesn't want us to create root yet */
 139                 if (access == BT_READ)
 140                 {
 141                         _bt_relbuf(rel, metabuf);
 142                         return InvalidBuffer;
 143                 }
 144
 145                 /* trade in our read lock for a write lock */
 146                 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
 147                 LockBuffer(metabuf, BT_WRITE);
 148
 149                 /*
 150                  * Race condition:      if someone else initialized the metadata
 151                  * between the time we released the read lock and acquired the
 152                  * write lock, above, we must avoid doing it again.
 153                  */
 154                 if (metad->btm_root == P_NONE)
 155                 {
 156
 157                         /*
 158                          * Get, initialize, write, and leave a lock of the appropriate
 159                          * type on the new root page.  Since this is the first page in
 160                          * the tree, it's a leaf as well as the root.
 161                          */
 162                         rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 163                         rootblkno = BufferGetBlockNumber(rootbuf);
 164                         rootpage = BufferGetPage(rootbuf);
 165
 166                         /* NO ELOG(ERROR) till meta is updated */
 167                         START_CRIT_SECTION();
 168
 169                         metad->btm_root = rootblkno;
 170                         metad->btm_level = 1;
 171
 172                         _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
 173                         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 174                         rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
 175
 176                         /* XLOG stuff */
 177                         {
 178                                 xl_btree_newroot xlrec;
 179                                 XLogRecPtr      recptr;
 180                                 XLogRecData rdata;
 181
 182                                 xlrec.node = rel->rd_node;
 183                                 xlrec.level = 1;
 184                                 BlockIdSet(&(xlrec.rootblk), rootblkno);
 185                                 rdata.buffer = InvalidBuffer;
 186                                 rdata.data = (char *) &xlrec;
 187                                 rdata.len = SizeOfBtreeNewroot;
 188                                 rdata.next = NULL;
 189
 190                                 recptr = XLogInsert(RM_BTREE_ID,
 191                                                    XLOG_BTREE_NEWROOT | XLOG_BTREE_LEAF, &rdata);
 192
 193                                 PageSetLSN(rootpage, recptr);
 194                                 PageSetSUI(rootpage, ThisStartUpID);
 195                                 PageSetLSN(metapg, recptr);
 196                                 PageSetSUI(metapg, ThisStartUpID);
 197                         }
 198
 199                         END_CRIT_SECTION();
 200
 201                         _bt_wrtnorelbuf(rel, rootbuf);
 202
 203                         /* swap write lock for read lock */
 204                         LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
 205                         LockBuffer(rootbuf, BT_READ);
 206
 207                         /* okay, metadata is correct, write and release it */
 208                         _bt_wrtbuf(rel, metabuf);
 209                 }
 210                 else
 211                 {
 212
 213                         /*
 214                          * Metadata initialized by someone else.  In order to
 215                          * guarantee no deadlocks, we have to release the metadata
 216                          * page and start all over again.
 217                          */
 218                         _bt_relbuf(rel, metabuf);
 219                         return _bt_getroot(rel, access);
 220                 }
 221         }
 222         else
 223         {
 224                 rootblkno = metad->btm_root;
 225                 _bt_relbuf(rel, metabuf);               /* done with the meta page */
 226
 227                 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
 228         }
 229
 230         /*
 231          * Race condition:      If the root page split between the time we looked
 232          * at the metadata page and got the root buffer, then we got the wrong
 233          * buffer.      Release it and try again.
 234          */
 235         rootpage = BufferGetPage(rootbuf);
 236         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 237
 238         if (!P_ISROOT(rootopaque))
 239         {
 240
 241                 /*
 242                  * It happened, but if root page splitter failed to create new
 243                  * root page then we'll go in loop trying to call _bt_getroot
 244                  * again and again.
 245                  */
 246                 if (FixBTree)
 247                 {
 248                         Buffer          newrootbuf;
 249
 250         check_parent:;
 251                         if (BTreeInvalidParent(rootopaque)) /* unupdated! */
 252                         {
 253                                 LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
 254                                 LockBuffer(rootbuf, BT_WRITE);
 255
 256                                 /* handle concurrent fix of root page */
 257                                 if (BTreeInvalidParent(rootopaque))             /* unupdated! */
 258                                 {
 259                                         elog(NOTICE, "bt_getroot[%s]: fixing root page", RelationGetRelationName(rel));
 260                                         newrootbuf = _bt_fixroot(rel, rootbuf, true);
 261                                         LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK);
 262                                         LockBuffer(newrootbuf, BT_READ);
 263                                         rootbuf = newrootbuf;
 264                                         rootpage = BufferGetPage(rootbuf);
 265                                         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 266                                         /* New root might be splitted while changing lock */
 267                                         if (P_ISROOT(rootopaque))
 268                                                 return (rootbuf);
 269                                         /* rootbuf is read locked */
 270                                         goto check_parent;
 271                                 }
 272                                 else
 273                                 {
 274                                         /* someone else already fixed root */
 275                                         LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
 276                                         LockBuffer(rootbuf, BT_READ);
 277                                 }
 278                         }
 279
 280                         /*
 281                          * Ok, here we have old root page with btpo_parent pointing to
 282                          * upper level - check parent page because of there is good
 283                          * chance that parent is root page.
 284                          */
 285                         newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
 286                         _bt_relbuf(rel, rootbuf);
 287                         rootbuf = newrootbuf;
 288                         rootpage = BufferGetPage(rootbuf);
 289                         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 290                         if (P_ISROOT(rootopaque))
 291                                 return (rootbuf);
 292                         /* no luck -:( */
 293                 }
 294
 295                 /* try again */
 296                 _bt_relbuf(rel, rootbuf);
 297                 return _bt_getroot(rel, access);
 298         }
 299
 300         /*
 301          * By here, we have a correct lock on the root block, its reference
 302          * count is correct, and we have no lock set on the metadata page.
 303          * Return the root block.
 304          */
 305         return rootbuf;
 306 }
 307
 308 /*
 309  *      _bt_getbuf() -- Get a buffer by block number for read or write.
 310  *
 311  *              When this routine returns, the appropriate lock is set on the
 312  *              requested buffer and its reference count has been incremented
 313  *              (ie, the buffer is "locked and pinned").
 314  */
 315 Buffer
 316 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 317 {
 318         Buffer          buf;
 319
 320         if (blkno != P_NEW)
 321         {
 322                 /* Read an existing block of the relation */
 323                 buf = ReadBuffer(rel, blkno);
 324                 LockBuffer(buf, access);
 325         }
 326         else
 327         {
 328                 Page            page;
 329
 330                 /*
 331                  * Extend the relation by one page.
 332                  *
 333                  * Extend bufmgr code is unclean and so we have to use extra locking
 334                  * here.
 335                  */
 336                 LockPage(rel, 0, ExclusiveLock);
 337                 buf = ReadBuffer(rel, blkno);
 338                 LockBuffer(buf, access);
 339                 UnlockPage(rel, 0, ExclusiveLock);
 340
 341                 /* Initialize the new page before returning it */
 342                 page = BufferGetPage(buf);
 343                 _bt_pageinit(page, BufferGetPageSize(buf));
 344         }
 345
 346         /* ref count and lock type are correct */
 347         return buf;
 348 }
 349
 350 /*
 351  *      _bt_relbuf() -- release a locked buffer.
 352  *
 353  * Lock and pin (refcount) are both dropped.  Note that either read or
 354  * write lock can be dropped this way, but if we modified the buffer,
 355  * this is NOT the right way to release a write lock.
 356  */
 357 void
 358 _bt_relbuf(Relation rel, Buffer buf)
 359 {
 360         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 361         ReleaseBuffer(buf);
 362 }
 363
 364 /*
 365  *      _bt_wrtbuf() -- write a btree page to disk.
 366  *
 367  *              This routine releases the lock held on the buffer and our refcount
 368  *              for it.  It is an error to call _bt_wrtbuf() without a write lock
 369  *              and a pin on the buffer.
 370  *
 371  * NOTE: actually, the buffer manager just marks the shared buffer page
 372  * dirty here, the real I/O happens later.      Since we can't persuade the
 373  * Unix kernel to schedule disk writes in a particular order, there's not
 374  * much point in worrying about this.  The most we can say is that all the
 375  * writes will occur before commit.
 376  */
 377 void
 378 _bt_wrtbuf(Relation rel, Buffer buf)
 379 {
 380         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 381         WriteBuffer(buf);
 382 }
 383
 384 /*
 385  *      _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
 386  *                                               our reference or lock.
 387  *
 388  *              It is an error to call _bt_wrtnorelbuf() without a write lock
 389  *              and a pin on the buffer.
 390  *
 391  * See above NOTE.
 392  */
 393 void
 394 _bt_wrtnorelbuf(Relation rel, Buffer buf)
 395 {
 396         WriteNoReleaseBuffer(buf);
 397 }
 398
 399 /*
 400  *      _bt_pageinit() -- Initialize a new page.
 401  */
 402 void
 403 _bt_pageinit(Page page, Size size)
 404 {
 405
 406         /*
 407          * Cargo_cult programming -- don't really need this to be zero, but
 408          * creating new pages is an infrequent occurrence and it makes me feel
 409          * good when I know they're empty.
 410          */
 411
 412         MemSet(page, 0, size);
 413
 414         PageInit(page, size, sizeof(BTPageOpaqueData));
 415         ((BTPageOpaque) PageGetSpecialPointer(page))->btpo_parent =
 416                 InvalidBlockNumber;
 417 }
 418
 419 /*
 420  *      _bt_metaproot() -- Change the root page of the btree.
 421  *
 422  *              Lehman and Yao require that the root page move around in order to
 423  *              guarantee deadlock-free short-term, fine-granularity locking.  When
 424  *              we split the root page, we record the new parent in the metadata page
 425  *              for the relation.  This routine does the work.
 426  *
 427  *              No direct preconditions, but if you don't have the write lock on
 428  *              at least the old root page when you call this, you're making a big
 429  *              mistake.  On exit, metapage data is correct and we no longer have
 430  *              a pin or lock on the metapage.
 431  */
 432 void
 433 _bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
 434 {
 435         Buffer          metabuf;
 436         Page            metap;
 437         BTPageOpaque metaopaque;
 438         BTMetaPageData *metad;
 439
 440         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
 441         metap = BufferGetPage(metabuf);
 442         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
 443         Assert(metaopaque->btpo_flags & BTP_META);
 444         metad = BTPageGetMeta(metap);
 445         metad->btm_root = rootbknum;
 446         if (level == 0)                         /* called from _do_insert */
 447                 metad->btm_level += 1;
 448         else
 449                 metad->btm_level = level;               /* called from btsort */
 450         _bt_wrtbuf(rel, metabuf);
 451 }
 452
 453 /*
 454  * Delete an item from a btree page.
 455  *
 456  * This routine assumes that the caller has pinned and locked the buffer,
 457  * and will write the buffer afterwards.
 458  */
 459 void
 460 _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
 461 {
 462         Page            page = BufferGetPage(buf);
 463         OffsetNumber offno;
 464
 465         offno = ItemPointerGetOffsetNumber(tid);
 466
 467         START_CRIT_SECTION();
 468
 469         PageIndexTupleDelete(page, offno);
 470
 471         /* XLOG stuff */
 472         {
 473                 xl_btree_delete xlrec;
 474                 XLogRecPtr      recptr;
 475                 XLogRecData rdata[2];
 476
 477                 xlrec.target.node = rel->rd_node;
 478                 xlrec.target.tid = *tid;
 479                 rdata[0].buffer = InvalidBuffer;
 480                 rdata[0].data = (char *) &xlrec;
 481                 rdata[0].len = SizeOfBtreeDelete;
 482                 rdata[0].next = &(rdata[1]);
 483
 484                 rdata[1].buffer = buf;
 485                 rdata[1].data = NULL;
 486                 rdata[1].len = 0;
 487                 rdata[1].next = NULL;
 488
 489                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
 490
 491                 PageSetLSN(page, recptr);
 492                 PageSetSUI(page, ThisStartUpID);
 493         }
 494
 495         END_CRIT_SECTION();
 496 }