1 /*-------------------------------------------------------------------------
4 * header file for postgres btree access method implementation.
7 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.105 2006/10/04 00:30:07 momjian Exp $
12 *-------------------------------------------------------------------------
17 #include "access/itup.h"
18 #include "access/relscan.h"
19 #include "access/sdir.h"
20 #include "access/xlogutils.h"
23 /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
24 typedef uint16 BTCycleId;
27 * BTPageOpaqueData -- At the end of every page, we store a pointer
28 * to both siblings in the tree. This is used to do forward/backward
29 * index scans. The next-page link is also critical for recovery when
30 * a search has navigated to the wrong page due to concurrent page splits
31 * or deletions; see src/backend/access/nbtree/README for more info.
33 * In addition, we store the page's btree level (counting upwards from
34 * zero at a leaf page) as well as some flag bits indicating the page type
35 * and status. If the page is deleted, we replace the level with the
36 * next-transaction-ID value indicating when it is safe to reclaim the page.
38 * We also store a "vacuum cycle ID". When a page is split while VACUUM is
39 * processing the index, a nonzero value associated with the VACUUM run is
40 * stored into both halves of the split page. (If VACUUM is not running,
41 * both pages receive zero cycleids.) This allows VACUUM to detect whether
42 * a page was split since it started, with a small probability of false match
43 * if the page was last split some exact multiple of 65536 VACUUMs ago.
44 * Also, during a split, the BTP_SPLIT_END flag is cleared in the left
45 * (original) page, and set in the right page, but only if the next page
46 * to its right has a different cycleid.
48 * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
52 typedef struct BTPageOpaqueData
54 BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
55 BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
58 uint32 level; /* tree level --- zero for leaf pages */
59 TransactionId xact; /* next transaction ID, if deleted */
61 uint16 btpo_flags; /* flag bits, see below */
62 BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */
65 typedef BTPageOpaqueData *BTPageOpaque;
67 /* Bits defined in btpo_flags */
68 #define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
69 #define BTP_ROOT (1 << 1) /* root page (has no parent) */
70 #define BTP_DELETED (1 << 2) /* page has been deleted from tree */
71 #define BTP_META (1 << 3) /* meta-page */
72 #define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
73 #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
74 #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DELETEd tuples */
78 * The Meta page is always the first page in the btree index.
79 * Its primary purpose is to point to the location of the btree root page.
80 * We also point to the "fast" root, which is the current effective root;
81 * see README for discussion.
84 typedef struct BTMetaPageData
86 uint32 btm_magic; /* should contain BTREE_MAGIC */
87 uint32 btm_version; /* should contain BTREE_VERSION */
88 BlockNumber btm_root; /* current root location */
89 uint32 btm_level; /* tree level of the root page */
90 BlockNumber btm_fastroot; /* current "fast" root location */
91 uint32 btm_fastlevel; /* tree level of the "fast" root page */
94 #define BTPageGetMeta(p) \
95 ((BTMetaPageData *) PageGetContents(p))
97 #define BTREE_METAPAGE 0 /* first page is meta */
98 #define BTREE_MAGIC 0x053162 /* magic number of btree pages */
99 #define BTREE_VERSION 2 /* current version number */
102 * We actually need to be able to fit three items on every page,
103 * so restrict any one item to 1/3 the per-page available space.
105 #define BTMaxItemSize(page) \
106 ((PageGetPageSize(page) - \
107 sizeof(PageHeaderData) - \
108 MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
111 * The leaf-page fillfactor defaults to 90% but is user-adjustable.
112 * For pages above the leaf level, we use a fixed 70% fillfactor.
113 * The fillfactor is applied during index build and when splitting
114 * a rightmost page; when splitting non-rightmost pages we try to
115 * divide the data equally.
117 #define BTREE_MIN_FILLFACTOR 10
118 #define BTREE_DEFAULT_FILLFACTOR 90
119 #define BTREE_NONLEAF_FILLFACTOR 70
122 * Test whether two btree entries are "the same".
125 * In addition, we must guarantee that all tuples in the index are unique,
126 * in order to satisfy some assumptions in Lehman and Yao. The way that we
127 * do this is by generating a new OID for every insertion that we do in the
128 * tree. This adds eight bytes to the size of btree index tuples. Note
129 * that we do not use the OID as part of a composite key; the OID only
130 * serves as a unique identifier for a given index tuple (logical position
134 * actually, we must guarantee that all tuples in A LEVEL
135 * are unique, not in ALL INDEX. So, we can use the t_tid
136 * as unique identifier for a given index tuple (logical position
137 * within a level). - vadim 04/09/97
139 #define BTTidSame(i1, i2) \
140 ( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
141 (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
142 (i1).ip_posid == (i2).ip_posid )
143 #define BTEntrySame(i1, i2) \
144 BTTidSame((i1)->t_tid, (i2)->t_tid)
148 * In general, the btree code tries to localize its knowledge about
149 * page layout to a couple of routines. However, we need a special
150 * value to indicate "no page number" in those places where we expect
151 * page numbers. We can use zero for this because we never need to
152 * make a pointer to the metadata page.
158 * Macros to test whether a page is leftmost or rightmost on its tree level,
159 * as well as other state info kept in the opaque data.
161 #define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
162 #define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
163 #define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
164 #define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
165 #define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
166 #define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
167 #define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
170 * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
171 * page. The high key is not a data key, but gives info about what range of
172 * keys is supposed to be on this page. The high key on a page is required
173 * to be greater than or equal to any data key that appears on the page.
174 * If we find ourselves trying to insert a key > high key, we know we need
175 * to move right (this should only happen if the page was split since we
176 * examined the parent page).
178 * Our insertion algorithm guarantees that we can use the initial least key
179 * on our right sibling as the high key. Once a page is created, its high
180 * key changes only if the page is split.
182 * On a non-rightmost page, the high key lives in item 1 and data items
183 * start in item 2. Rightmost pages have no high key, so we store data
184 * items beginning in item 1.
187 #define P_HIKEY ((OffsetNumber) 1)
188 #define P_FIRSTKEY ((OffsetNumber) 2)
189 #define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
192 * XLOG records for btree operations
194 * XLOG allows to store some information in high 4 bits of log
195 * record xl_info field
197 #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */
198 #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
199 #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
200 #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
201 #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
202 #define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */
203 #define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
204 #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuple */
205 #define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
206 #define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */
207 #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
210 * All that we need to find changed index tuple
212 typedef struct xl_btreetid
215 ItemPointerData tid; /* changed tuple id */
219 * All that we need to regenerate the meta-data page
221 typedef struct xl_btree_metadata
225 BlockNumber fastroot;
230 * This is what we need to know about simple (without split) insert.
232 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
233 * Note that INSERT_META implies it's not a leaf page.
235 typedef struct xl_btree_insert
237 xl_btreetid target; /* inserted tuple id */
238 /* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
239 /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
240 /* INDEX TUPLE FOLLOWS AT END OF STRUCT */
243 #define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
246 * On insert with split we save items of both left and right siblings
247 * and restore content of both pages from log record. This way takes less
248 * xlog space than the normal approach, because if we did it standardly,
249 * XLogInsert would almost always think the right page is new and store its
252 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
253 * The _L and _R variants indicate whether the inserted tuple went into the
254 * left or right split page (and thus, whether otherblk is the right or left
255 * page of the split pair). The _ROOT variants indicate that we are splitting
256 * the root page, and thus that a newroot record rather than an insert or
257 * split record should follow. Note that a split record never carries a
258 * metapage update --- we'll do that in the parent-level update.
260 typedef struct xl_btree_split
262 xl_btreetid target; /* inserted tuple id */
263 BlockNumber otherblk; /* second block participated in split: */
264 /* first one is stored in target' tid */
265 BlockNumber leftblk; /* prev/left block */
266 BlockNumber rightblk; /* next/right block */
267 uint32 level; /* tree level of page being split */
268 uint16 leftlen; /* len of left page items below */
269 /* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
272 #define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
275 * This is what we need to know about delete of individual leaf index tuples.
276 * The WAL record can represent deletion of any number of index tuples on a
279 typedef struct xl_btree_delete
283 /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
286 #define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
289 * This is what we need to know about deletion of a btree page. The target
290 * identifies the tuple removed from the parent page (note that we remove
291 * this tuple's downlink and the *following* tuple's key). Note we do not
292 * store any content for the deleted page --- it is just rewritten as empty
295 typedef struct xl_btree_delete_page
297 xl_btreetid target; /* deleted tuple id in parent page */
298 BlockNumber deadblk; /* child block being deleted */
299 BlockNumber leftblk; /* child block's left sibling, if any */
300 BlockNumber rightblk; /* child block's right sibling */
301 /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
302 } xl_btree_delete_page;
304 #define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
307 * New root log record. There are zero tuples if this is to establish an
308 * empty root, or two if it is the result of splitting an old root.
310 * Note that although this implies rewriting the metadata page, we don't need
311 * an xl_btree_metadata record --- the rootblk and level are sufficient.
313 typedef struct xl_btree_newroot
316 BlockNumber rootblk; /* location of new root */
317 uint32 level; /* its tree level */
318 /* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */
321 #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
325 * Operator strategy numbers for B-tree have been moved to access/skey.h,
326 * because many places need to use them in ScanKeyInit() calls.
330 * When a new operator class is declared, we require that the user
331 * supply us with an amproc procedure for determining whether, for
332 * two keys a and b, a < b, a = b, or a > b. This routine must
333 * return < 0, 0, > 0, respectively, in these three cases. Since we
334 * only have one such proc in amproc, it's number 1.
337 #define BTORDER_PROC 1
340 * We need to be able to tell the difference between read and write
341 * requests for pages, in order to do locking correctly.
344 #define BT_READ BUFFER_LOCK_SHARE
345 #define BT_WRITE BUFFER_LOCK_EXCLUSIVE
348 * BTStackData -- As we descend a tree, we push the (location, downlink)
349 * pairs from internal pages onto a private stack. If we split a
350 * leaf, we use this stack to walk back up the tree and insert data
351 * into parent pages (and possibly to split them, too). Lehman and
352 * Yao's update algorithm guarantees that under no circumstances can
353 * our private stack give us an irredeemably bad picture up the tree.
354 * Again, see the paper for details.
357 typedef struct BTStackData
359 BlockNumber bts_blkno;
360 OffsetNumber bts_offset;
361 IndexTupleData bts_btentry;
362 struct BTStackData *bts_parent;
365 typedef BTStackData *BTStack;
368 * BTScanOpaqueData is the btree-private state needed for an indexscan.
369 * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
370 * details of the preprocessing), information about the current location
371 * of the scan, and information about the marked location, if any. (We use
372 * BTScanPosData to represent the data needed for each of current and marked
373 * locations.) In addition we can remember some known-killed index entries
374 * that must be marked before we can move off the current page.
376 * Index scans work a page at a time: we pin and read-lock the page, identify
377 * all the matching items on the page and save them in BTScanPosData, then
378 * release the read-lock while returning the items to the caller for
379 * processing. This approach minimizes lock/unlock traffic. Note that we
380 * keep the pin on the index page until the caller is done with all the items
381 * (this is needed for VACUUM synchronization, see nbtree/README). When we
382 * are ready to step to the next page, if the caller has told us any of the
383 * items were killed, we re-lock the page to mark them killed, then unlock.
384 * Finally we drop the pin and step to the next page in the appropriate
387 * NOTE: in this implementation, btree does not use or set the
388 * currentItemData and currentMarkData fields of IndexScanDesc at all.
391 typedef struct BTScanPosItem /* what we remember about each match */
393 ItemPointerData heapTid; /* TID of referenced heap item */
394 OffsetNumber indexOffset; /* index item's location within page */
397 typedef struct BTScanPosData
399 Buffer buf; /* if valid, the buffer is pinned */
401 BlockNumber nextPage; /* page's right link when we scanned it */
404 * moreLeft and moreRight track whether we think there may be matching
405 * index entries to the left and right of the current page, respectively.
406 * We can clear the appropriate one of these flags when _bt_checkkeys()
407 * returns continuescan = false.
413 * The items array is always ordered in index order (ie, increasing
414 * indexoffset). When scanning backwards it is convenient to fill the
415 * array back-to-front, so we start at the last slot and fill downwards.
416 * Hence we need both a first-valid-entry and a last-valid-entry counter.
417 * itemIndex is a cursor showing which entry was last returned to caller.
419 int firstItem; /* first valid index in items[] */
420 int lastItem; /* last valid index in items[] */
421 int itemIndex; /* current index in items[] */
423 BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
426 typedef BTScanPosData *BTScanPos;
428 #define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
430 typedef struct BTScanOpaqueData
432 /* these fields are set by _bt_preprocess_keys(): */
433 bool qual_ok; /* false if qual can never be satisfied */
434 int numberOfKeys; /* number of preprocessed scan keys */
435 ScanKey keyData; /* array of preprocessed scan keys */
437 /* info about killed items if any (killedItems is NULL if never used) */
438 int *killedItems; /* currPos.items indexes of killed items */
439 int numKilled; /* number of currently stored items */
442 * If the marked position is on the same page as current position, we
443 * don't use markPos, but just keep the marked itemIndex in markItemIndex
444 * (all the rest of currPos is valid for the mark position). Hence, to
445 * determine if there is a mark, first look at markItemIndex, then at
448 int markItemIndex; /* itemIndex, or -1 if not valid */
450 /* keep these last in struct for efficiency */
451 BTScanPosData currPos; /* current position data */
452 BTScanPosData markPos; /* marked position, if any */
455 typedef BTScanOpaqueData *BTScanOpaque;
458 * We use these private sk_flags bits in preprocessed scan keys
460 #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */
461 #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */
465 * prototypes for functions in nbtree.c (external entry points for btree)
467 extern Datum btbuild(PG_FUNCTION_ARGS);
468 extern Datum btinsert(PG_FUNCTION_ARGS);
469 extern Datum btbeginscan(PG_FUNCTION_ARGS);
470 extern Datum btgettuple(PG_FUNCTION_ARGS);
471 extern Datum btgetmulti(PG_FUNCTION_ARGS);
472 extern Datum btrescan(PG_FUNCTION_ARGS);
473 extern Datum btendscan(PG_FUNCTION_ARGS);
474 extern Datum btmarkpos(PG_FUNCTION_ARGS);
475 extern Datum btrestrpos(PG_FUNCTION_ARGS);
476 extern Datum btbulkdelete(PG_FUNCTION_ARGS);
477 extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
478 extern Datum btoptions(PG_FUNCTION_ARGS);
481 * prototypes for functions in nbtinsert.c
483 extern void _bt_doinsert(Relation rel, IndexTuple itup,
484 bool index_is_unique, Relation heapRel);
485 extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
486 extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
487 BTStack stack, bool is_root, bool is_only);
490 * prototypes for functions in nbtpage.c
492 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
493 extern Buffer _bt_getroot(Relation rel, int access);
494 extern Buffer _bt_gettrueroot(Relation rel);
495 extern void _bt_checkpage(Relation rel, Buffer buf);
496 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
497 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
498 BlockNumber blkno, int access);
499 extern void _bt_relbuf(Relation rel, Buffer buf);
500 extern void _bt_pageinit(Page page, Size size);
501 extern bool _bt_page_recyclable(Page page);
502 extern void _bt_delitems(Relation rel, Buffer buf,
503 OffsetNumber *itemnos, int nitems);
504 extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
507 * prototypes for functions in nbtsearch.c
509 extern BTStack _bt_search(Relation rel,
510 int keysz, ScanKey scankey, bool nextkey,
511 Buffer *bufP, int access);
512 extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
513 ScanKey scankey, bool nextkey, int access);
514 extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
515 ScanKey scankey, bool nextkey);
516 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
517 Page page, OffsetNumber offnum);
518 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
519 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
520 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
523 * prototypes for functions in nbtutils.c
525 extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
526 extern ScanKey _bt_mkscankey_nodata(Relation rel);
527 extern void _bt_freeskey(ScanKey skey);
528 extern void _bt_freestack(BTStack stack);
529 extern void _bt_preprocess_keys(IndexScanDesc scan);
530 extern bool _bt_checkkeys(IndexScanDesc scan,
531 Page page, OffsetNumber offnum,
532 ScanDirection dir, bool *continuescan);
533 extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
534 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
535 extern BTCycleId _bt_start_vacuum(Relation rel);
536 extern void _bt_end_vacuum(Relation rel);
537 extern Size BTreeShmemSize(void);
538 extern void BTreeShmemInit(void);
541 * prototypes for functions in nbtsort.c
543 typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
545 extern BTSpool *_bt_spoolinit(Relation index, bool isunique, bool isdead);
546 extern void _bt_spooldestroy(BTSpool *btspool);
547 extern void _bt_spool(IndexTuple itup, BTSpool *btspool);
548 extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
551 * prototypes for functions in nbtxlog.c
553 extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
554 extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
555 extern void btree_xlog_startup(void);
556 extern void btree_xlog_cleanup(void);
557 extern bool btree_safe_restartpoint(void);
559 #endif /* NBTREE_H */