/*-------------------------------------------------------------------------
 *
 * hashpage.c
 *	  Hash table page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
 *	  data at high addresses includes information about the page including
 *	  whether a page is an overflow page or a true bucket, the bucket
 *	  number, and the block numbers of the preceding and following pages
 *	  in the same bucket.
 *
 *	  The first page in a hash relation, page zero, is special -- it stores
 *	  information describing the hash table; it is referred to as the
 *	  "meta page."  Pages one and higher store the actual data.
 *
 *	  There are also bitmap pages, which are not manipulated here;
 *	  see hashovfl.c.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/hash.h"
#include "miscadmin.h"
#include "storage/lmgr.h"

/*
 *	We use high-concurrency locking on hash indices.  There are two cases in
 *	which we don't do locking.  One is when we're building the index.
 *	Since the creating transaction has not committed, no one can see
 *	the index, and there's no reason to share locks.  The second case
 *	is when we're just starting up the database system.  We use some
 *	special-purpose initialization code in the relation cache manager
 *	(see utils/cache/relcache.c) to allow us to do indexed scans on
 *	the system catalogs before we'd normally be able to.  This happens
 *	before the lock table is fully initialized, so we can't use it.
 *	Strictly speaking, this violates 2PL, but we don't do 2PL on the
 *	system catalogs anyway.
 *
 *	Note that our page locks are actual lockmanager locks, not buffer
 *	locks (as are used by btree, for example).  This is a good idea because
 *	the algorithms are not deadlock-free, and we'd better be able to detect
 *	and recover from deadlocks.
 *
 *	Another important difference from btree is that a hash indexscan
 *	retains both a lock and a buffer pin on the current index page
 *	between hashgettuple() calls (btree keeps only a buffer pin).
 *	Because of this, it's safe to do item deletions with only a regular
 *	write lock on a hash page --- there cannot be an indexscan stopped on
 *	the page being deleted, other than an indexscan of our own backend,
 *	which will be taken care of by _hash_adjscans.
 */

#define USELOCKING		(!BuildingHash && !IsInitProcessingMode())


static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
							  Bucket obucket, Bucket nbucket);

/*
 *	_hash_metapinit() -- Initialize the metadata page of a hash index,
 *				the two buckets that we begin with and the initial
 *				bitmap page.
 */
void
_hash_metapinit(Relation rel)
{
	HashMetaPage metap;
	HashPageOpaque pageopaque;
	Buffer		metabuf;
	Buffer		buf;
	Page		pg;
	uint16		i;

	/* can't be sharing this with anyone, now... */
	if (USELOCKING)
		LockRelation(rel, AccessExclusiveLock);

	if (RelationGetNumberOfBlocks(rel) != 0)
		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
			 RelationGetRelationName(rel));

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
	pg = BufferGetPage(metabuf);
	_hash_pageinit(pg, BufferGetPageSize(metabuf));

	pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
	pageopaque->hasho_oaddr = 0;
	pageopaque->hasho_prevblkno = InvalidBlockNumber;
	pageopaque->hasho_nextblkno = InvalidBlockNumber;
	pageopaque->hasho_flag = LH_META_PAGE;
	pageopaque->hasho_bucket = -1;

	metap = (HashMetaPage) pg;

	metap->hashm_magic = HASH_MAGIC;
	metap->hashm_version = HASH_VERSION;
	metap->hashm_ntuples = 0;
	metap->hashm_nmaps = 0;
	metap->hashm_ffactor = DEFAULT_FFACTOR;
	metap->hashm_bsize = BufferGetPageSize(metabuf);
	metap->hashm_bshift = _hash_log2(metap->hashm_bsize);
	/* page size must be a power of 2 */
	Assert(metap->hashm_bsize == (1 << metap->hashm_bshift));
	/* bitmap size is half of page size, to keep it also a power of 2 */
	metap->hashm_bmsize = (metap->hashm_bsize >> 1);
	Assert(metap->hashm_bsize >= metap->hashm_bmsize +
		   MAXALIGN(sizeof(PageHeaderData)) +
		   MAXALIGN(sizeof(HashPageOpaqueData)));
	Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
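
	/*
	 * Editor's worked example (not in the original): with the default
	 * 8192-byte page size, hashm_bsize = 8192, so hashm_bshift = 13 and
	 * hashm_bmsize = 4096.  A 4096-byte bitmap array holds 4096 * 8 =
	 * 32768 bits, so one bitmap page can track 32768 overflow pages; the
	 * asserts above check that both sizes are powers of 2 and that a
	 * bitmap of that size still fits on a page alongside the page header
	 * and the hash opaque data.
	 */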

	metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

	/*
	 * We initialize the index with two buckets, 0 and 1, occupying
	 * physical blocks 1 and 2.  The first freespace bitmap page is in
	 * block 3.
	 */
	metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
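
	/*
	 * Editor's note on how these masks drive bucket addressing (cf.
	 * _hash_call in hashutil.c; sketch, not in the original): a hash
	 * value h is first masked with hashm_highmask; if the result names a
	 * bucket beyond hashm_maxbucket (one that has not been created yet),
	 * it is masked again with hashm_lowmask.  With two buckets, h = 6
	 * gives 6 & 3 = 2, which exceeds maxbucket (1), so the tuple goes to
	 * bucket 6 & 1 = 0.
	 */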

	MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
	MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

	metap->hashm_spares[1] = 1; /* the first bitmap page is the only spare so far */
	metap->hashm_ovflpoint = 1;
	metap->hashm_firstfree = 0;

	/*
	 * initialize the first two buckets
	 */
	for (i = 0; i <= 1; i++)
	{
		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
		pg = BufferGetPage(buf);
		_hash_pageinit(pg, BufferGetPageSize(buf));
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
		pageopaque->hasho_oaddr = 0;
		pageopaque->hasho_prevblkno = InvalidBlockNumber;
		pageopaque->hasho_nextblkno = InvalidBlockNumber;
		pageopaque->hasho_flag = LH_BUCKET_PAGE;
		pageopaque->hasho_bucket = i;
		_hash_wrtbuf(rel, buf);
	}

	/*
	 * Initialize the first bitmap page.  Can't do this until we
	 * create the first two buckets, else smgr will complain.
	 */
	_hash_initbitmap(rel, metap, 3);

	/* all done */
	_hash_wrtbuf(rel, metabuf);

	if (USELOCKING)
		UnlockRelation(rel, AccessExclusiveLock);
}

/*
 *	_hash_getbuf() -- Get a buffer by block number for read or write.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count is correct.
 *
 *		XXX P_NEW is not used because, unlike the tree structures, we
 *		need the bucket blocks to be at certain block numbers.  We must
 *		depend on the caller to call _hash_pageinit on the block if it
 *		knows that this is a new block.
 */
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
	Buffer		buf;

	if (blkno == P_NEW)
		elog(ERROR, "hash AM does not use P_NEW");
	switch (access)
	{
		case HASH_WRITE:
		case HASH_READ:
			_hash_setpagelock(rel, blkno, access);
			break;
		default:
			elog(ERROR, "unrecognized hash access code: %d", access);
			break;
	}
	buf = ReadBuffer(rel, blkno);

	/* ref count and lock type are correct */
	return buf;
}
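
/*
 * Editor's usage note: a caller that knows the block is new follows the
 * pattern used in _hash_metapinit above:
 *
 *		buf = _hash_getbuf(rel, blkno, HASH_WRITE);
 *		pg = BufferGetPage(buf);
 *		_hash_pageinit(pg, BufferGetPageSize(buf));
 *
 * i.e. the page must be initialized by the caller, per the XXX comment.
 */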

/*
 *	_hash_relbuf() -- release a locked buffer.
 */
void
_hash_relbuf(Relation rel, Buffer buf, int access)
{
	BlockNumber blkno;

	blkno = BufferGetBlockNumber(buf);

	switch (access)
	{
		case HASH_WRITE:
		case HASH_READ:
			_hash_unsetpagelock(rel, blkno, access);
			break;
		default:
			elog(ERROR, "unrecognized hash access code: %d", access);
			break;
	}

	ReleaseBuffer(buf);
}

/*
 *	_hash_wrtbuf() -- write a hash page to disk.
 *
 *		This routine releases the lock held on the buffer and our reference
 *		to it.  It is an error to call _hash_wrtbuf() without a write lock
 *		or a reference to the buffer.
 */
void
_hash_wrtbuf(Relation rel, Buffer buf)
{
	BlockNumber blkno;

	blkno = BufferGetBlockNumber(buf);
	WriteBuffer(buf);
	_hash_unsetpagelock(rel, blkno, HASH_WRITE);
}

/*
 *	_hash_wrtnorelbuf() -- write a hash page to disk, but do not release
 *						   our reference or lock.
 *
 *		It is an error to call _hash_wrtnorelbuf() without a write lock
 *		or a reference to the buffer.
 */
void
_hash_wrtnorelbuf(Buffer buf)
{
	BlockNumber blkno;

	blkno = BufferGetBlockNumber(buf);
	WriteNoReleaseBuffer(buf);
}

/*
 *	_hash_chgbufaccess() -- Change from read to write access or vice versa.
 *
 * When changing from write to read, we assume the buffer is dirty and tell
 * bufmgr it must be written out.
 */
void
_hash_chgbufaccess(Relation rel,
				   Buffer buf,
				   int from_access,
				   int to_access)
{
	BlockNumber blkno;

	blkno = BufferGetBlockNumber(buf);

	if (from_access == HASH_WRITE)
		_hash_wrtnorelbuf(buf);

	_hash_unsetpagelock(rel, blkno, from_access);

	_hash_setpagelock(rel, blkno, to_access);
}
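
/*
 * Editor's usage note: the common idiom, used by _hash_expandtable
 * below, is to hold a read lock on the metapage and upgrade it only
 * around the actual update:
 *
 *		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE);
 *		... modify the metapage ...
 *		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ);
 */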

/*
 *	_hash_pageinit() -- Initialize a new page.
 */
void
_hash_pageinit(Page page, Size size)
{
	Assert(PageIsNew(page));
	PageInit(page, size, sizeof(HashPageOpaqueData));
}

/*
 *	_hash_setpagelock() -- Acquire the requested type of lock on a page.
 */
static void
_hash_setpagelock(Relation rel,
				  BlockNumber blkno,
				  int access)
{
	if (USELOCKING)
	{
		switch (access)
		{
			case HASH_WRITE:
				LockPage(rel, blkno, ExclusiveLock);
				break;
			case HASH_READ:
				LockPage(rel, blkno, ShareLock);
				break;
			default:
				elog(ERROR, "unrecognized hash access code: %d", access);
				break;
		}
	}
}

/*
 *	_hash_unsetpagelock() -- Release the specified type of lock on a page.
 */
static void
_hash_unsetpagelock(Relation rel,
					BlockNumber blkno,
					int access)
{
	if (USELOCKING)
	{
		switch (access)
		{
			case HASH_WRITE:
				UnlockPage(rel, blkno, ExclusiveLock);
				break;
			case HASH_READ:
				UnlockPage(rel, blkno, ShareLock);
				break;
			default:
				elog(ERROR, "unrecognized hash access code: %d", access);
				break;
		}
	}
}

/*
 *	Delete a hash index item.
 *
 *	It is safe to delete an item after acquiring a regular WRITE lock on
 *	the page, because no other backend can hold a READ lock on the page,
 *	and that means no other backend currently has an indexscan stopped on
 *	the item being deleted.  Our own backend might have such
 *	an indexscan (in fact *will*, since that's how VACUUM found the item
 *	in the first place), but _hash_adjscans will fix the scan position.
 */
void
_hash_pagedel(Relation rel, ItemPointer tid)
{
	Buffer		buf;
	Buffer		metabuf;
	Page		page;
	BlockNumber blkno;
	OffsetNumber offno;
	HashMetaPage metap;
	HashPageOpaque opaque;

	blkno = ItemPointerGetBlockNumber(tid);
	offno = ItemPointerGetOffsetNumber(tid);

	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
	page = BufferGetPage(buf);
	_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	PageIndexTupleDelete(page, offno);

	if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
		_hash_freeovflpage(rel, buf);
	else
		_hash_wrtbuf(rel, buf);

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage((Page) metap, LH_META_PAGE);
	metap->hashm_ntuples--;
	_hash_wrtbuf(rel, metabuf);
}

/*
 *	Expand the hash table by creating one new bucket.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
	HashMetaPage metap;
	Bucket		old_bucket, new_bucket;
	uint32		spare_ndx;

	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage((Page) metap, LH_META_PAGE);

	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE);

	new_bucket = ++metap->hashm_maxbucket;
	old_bucket = (new_bucket & metap->hashm_lowmask);

	if (new_bucket > metap->hashm_highmask)
	{
		/* Starting a new doubling */
		metap->hashm_lowmask = metap->hashm_highmask;
		metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
	}

	/*
	 * If the split point is increasing (hashm_maxbucket's log base 2
	 * increases), we need to adjust the hashm_spares[] array and
	 * hashm_ovflpoint so that future overflow pages will be created
	 * beyond this new batch of bucket pages.
	 *
	 * XXX should initialize new bucket pages to prevent out-of-order page
	 * creation.
	 */
	spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
	if (spare_ndx > metap->hashm_ovflpoint)
	{
		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
		metap->hashm_ovflpoint = spare_ndx;
	}
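
	/*
	 * Editor's worked example (not in the original): growing from 4
	 * buckets to 5 makes hashm_maxbucket 4, so spare_ndx = _hash_log2(5)
	 * = 3.  That exceeds the previous hashm_ovflpoint of 2, so
	 * hashm_spares[3] is seeded from hashm_spares[2] and hashm_ovflpoint
	 * advances to 3; overflow pages allocated from now on land beyond
	 * this doubling's batch of bucket pages.
	 */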

	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ);

	/* Relocate records to the new bucket */
	_hash_splitbucket(rel, metabuf, old_bucket, new_bucket);
}

/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket)
{
	Bucket		bucket;
	Buffer		obuf, nbuf, ovflbuf;
	BlockNumber oblkno, nblkno;
	BlockNumber start_oblkno;
	BlockNumber start_nblkno;
	bool		null;
	Datum		datum;
	HashItem	hitem;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;
	HashMetaPage metap;
	IndexTuple	itup;
	Size		itemsz;
	OffsetNumber ooffnum;
	OffsetNumber noffnum;
	OffsetNumber omaxoffnum;
	Page		opage, npage;
	TupleDesc	itupdesc = RelationGetDescr(rel);

	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage((Page) metap, LH_META_PAGE);

	/* get the buffers & pages */
	start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
	start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
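
	/*
	 * Editor's note (not in the original): BUCKET_TO_BLKNO biases the
	 * bucket number by 1 for the metapage and by the bitmap/overflow
	 * pages recorded in hashm_spares[].  Right after _hash_metapinit,
	 * bucket 0 maps to block 1 and bucket 1 to block 2, while bucket 2,
	 * once created, maps to block 4, skipping the bitmap page in block 3
	 * (hashm_spares[1] == 1).
	 */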
	oblkno = start_oblkno;
	nblkno = start_nblkno;
	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
	opage = BufferGetPage(obuf);
	npage = BufferGetPage(nbuf);

	/* initialize the new bucket page */
	_hash_pageinit(npage, BufferGetPageSize(nbuf));
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nopaque->hasho_prevblkno = InvalidBlockNumber;
	nopaque->hasho_nextblkno = InvalidBlockNumber;
	nopaque->hasho_flag = LH_BUCKET_PAGE;
	nopaque->hasho_oaddr = 0;
	nopaque->hasho_bucket = nbucket;
	_hash_wrtnorelbuf(nbuf);

	/*
	 * make sure the old bucket isn't empty.  advance 'opage' and friends
	 * through the overflow bucket chain until we find a non-empty page.
	 *
	 * XXX we should only need this once, if we are careful to preserve the
	 * invariant that overflow pages are never empty.
	 */
	_hash_checkpage(opage, LH_BUCKET_PAGE);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
	if (PageIsEmpty(opage))
	{
		oblkno = oopaque->hasho_nextblkno;
		_hash_relbuf(rel, obuf, HASH_WRITE);
		if (!BlockNumberIsValid(oblkno))
		{
			/*
			 * the old bucket is completely empty; of course, the new
			 * bucket will be as well, but since it's a base bucket page
			 * we don't care.
			 */
			_hash_relbuf(rel, nbuf, HASH_WRITE);
			return;
		}
		obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
		opage = BufferGetPage(obuf);
		_hash_checkpage(opage, LH_OVERFLOW_PAGE);
		if (PageIsEmpty(opage))
			elog(ERROR, "empty hash overflow page %u", oblkno);
		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
	}

	/*
	 * we are now guaranteed that 'opage' is not empty.  partition the
	 * tuples in the old bucket between the old bucket and the new bucket,
	 * advancing along their respective overflow bucket chains and adding
	 * overflow pages as needed.
	 */
	ooffnum = FirstOffsetNumber;
	omaxoffnum = PageGetMaxOffsetNumber(opage);
	for (;;)
	{
		/*
		 * at each iteration through this loop, each of these variables
		 * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
		 */

		/* check if we're at the end of the page */
		if (ooffnum > omaxoffnum)
		{
			/* at end of page, but check for an overflow page */
			oblkno = oopaque->hasho_nextblkno;
			if (BlockNumberIsValid(oblkno))
			{
				/*
				 * we ran out of tuples on this particular page, but we
				 * have more overflow pages; re-init values.
				 */
				_hash_wrtbuf(rel, obuf);
				obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
				opage = BufferGetPage(obuf);
				_hash_checkpage(opage, LH_OVERFLOW_PAGE);
				oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
				/* we're guaranteed that an ovfl page has at least 1 tuple */
				if (PageIsEmpty(opage))
					elog(ERROR, "empty hash overflow page %u", oblkno);
				ooffnum = FirstOffsetNumber;
				omaxoffnum = PageGetMaxOffsetNumber(opage);
			}
			else
			{
				/*
				 * We're at the end of the bucket chain, so now we're
				 * really done with everything.  Before quitting, call
				 * _hash_squeezebucket to ensure the tuples remaining in
				 * the old bucket (including the overflow pages) are
				 * packed as tightly as possible.  The new bucket is
				 * already tight.
				 */
				_hash_wrtbuf(rel, obuf);
				_hash_wrtbuf(rel, nbuf);
				_hash_squeezebucket(rel, obucket, start_oblkno);
				return;
			}
		}

		/* hash on the tuple */
		hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
		itup = &(hitem->hash_itup);
		datum = index_getattr(itup, 1, itupdesc, &null);
		Assert(!null);

		bucket = _hash_call(rel, metap, datum);
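
		/*
		 * Editor's note (not in the original): _hash_call() applies the
		 * index's hash procedure to the datum and masks the result as
		 * described near the top of this file.  During a split, every
		 * tuple in the old bucket must rehash either to obucket or to
		 * nbucket, since the two bucket numbers differ only in the single
		 * bit that the enlarged highmask exposes.
		 */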

		if (bucket == nbucket)
		{
			/*
			 * insert the tuple into the new bucket.  if it doesn't fit on
			 * the current page in the new bucket, we must allocate a new
			 * overflow page and place the tuple on that page instead.
			 */
			itemsz = IndexTupleDSize(hitem->hash_itup)
				+ (sizeof(HashItemData) - sizeof(IndexTupleData));

			itemsz = MAXALIGN(itemsz);

			if (PageGetFreeSpace(npage) < itemsz)
			{
				ovflbuf = _hash_addovflpage(rel, metabuf, nbuf);
				_hash_wrtbuf(rel, nbuf);
				nbuf = ovflbuf;
				npage = BufferGetPage(nbuf);
				_hash_checkpage(npage, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
			}

			noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
			if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));
			_hash_wrtnorelbuf(nbuf);

			/*
			 * now delete the tuple from the old bucket.  after this
			 * section of code, 'ooffnum' will actually point to the
			 * ItemId to which we would point if we had advanced it before
			 * the deletion (PageIndexTupleDelete repacks the ItemId
			 * array).  this also means that 'omaxoffnum' is exactly one
			 * less than it used to be, so we really can just decrement it
			 * instead of calling PageGetMaxOffsetNumber.
			 */
			PageIndexTupleDelete(opage, ooffnum);
			_hash_wrtnorelbuf(obuf);
			omaxoffnum = OffsetNumberPrev(omaxoffnum);
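
			/*
			 * Editor's worked example of the repacking trick: if the old
			 * page holds items at offsets 1..5 and we delete offset 3,
			 * PageIndexTupleDelete shifts items 4 and 5 down to offsets 3
			 * and 4.  'ooffnum' (still 3) now names the next unexamined
			 * tuple, and the page max has dropped from 5 to 4, hence the
			 * simple decrement of 'omaxoffnum'.
			 */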

			/*
			 * tidy up.  if the old page was an overflow page and it is
			 * now empty, we must free it (we want to preserve the
			 * invariant that overflow pages cannot be empty).
			 */
			if (PageIsEmpty(opage) &&
				(oopaque->hasho_flag & LH_OVERFLOW_PAGE))
			{
				oblkno = _hash_freeovflpage(rel, obuf);

				/* check that we're not through the bucket chain */
				if (!BlockNumberIsValid(oblkno))
				{
					_hash_wrtbuf(rel, nbuf);
					_hash_squeezebucket(rel, obucket, start_oblkno);
					return;
				}

				/*
				 * re-init.  again, we're guaranteed that an ovfl page has
				 * at least one tuple.
				 */
				obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
				opage = BufferGetPage(obuf);
				_hash_checkpage(opage, LH_OVERFLOW_PAGE);
				oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
				if (PageIsEmpty(opage))
					elog(ERROR, "empty hash overflow page %u", oblkno);
				ooffnum = FirstOffsetNumber;
				omaxoffnum = PageGetMaxOffsetNumber(opage);
			}
		}
		else
		{
			/*
			 * the tuple stays on this page.  we didn't move anything, so
			 * we didn't delete anything and therefore we don't have to
			 * change 'omaxoffnum'.
			 */
			Assert(bucket == obucket);
			ooffnum = OffsetNumberNext(ooffnum);