OSDN Git Service

This patch makes some improvements to the rtree index implementation:
authorNeil Conway <neilc@samurai.com>
Tue, 18 Jan 2005 23:25:55 +0000 (23:25 +0000)
committerNeil Conway <neilc@samurai.com>
Tue, 18 Jan 2005 23:25:55 +0000 (23:25 +0000)
(1) Keep a pin on the scan's current buffer and mark buffer. This
avoids the need to do a ReadBuffer() for each tuple produced by the
scan. Since ReadBuffer() is expensive, this is a significant win.

(2) Convert a ReleaseBuffer(); ReadBuffer() pair into
ReleaseAndReadBuffer(). Surely not a huge win, but it saves a lock
acquire/release...

(3) Remove a bunch of duplicated code in rtget.c; make rtnext() handle
both the "initial result" and "subsequent result" cases.

(4) Add support for index tuple killing

(5) Remove rtscancache(): it is dead code, for the same reason that
gistscancache() is dead code (an index scan ought not be invoked with
NoMovementScanDirection).

The end result is about a 10% improvement in rtree index scan perf,
according to contrib/rtree_gist/bench.

src/backend/access/rtree/rtget.c
src/backend/access/rtree/rtree.c
src/backend/access/rtree/rtscan.c
src/include/access/rtree.h

index 868b178..31963e8 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.33 2004/12/31 21:59:26 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.34 2005/01/18 23:25:43 neilc Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "access/relscan.h"
 #include "access/rtree.h"
 
-static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n,
+static OffsetNumber findnext(IndexScanDesc s, OffsetNumber n,
                 ScanDirection dir);
-static bool rtscancache(IndexScanDesc s, ScanDirection dir);
-static bool rtfirst(IndexScanDesc s, ScanDirection dir);
 static bool rtnext(IndexScanDesc s, ScanDirection dir);
 
 
@@ -31,138 +29,106 @@ rtgettuple(PG_FUNCTION_ARGS)
 {
        IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
        ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
-       bool            res;
-
-       /* if we have it cached in the scan desc, just return the value */
-       if (rtscancache(s, dir))
-               PG_RETURN_BOOL(true);
-
-       /* not cached, so we'll have to do some work */
-       if (ItemPointerIsValid(&(s->currentItemData)))
-               res = rtnext(s, dir);
-       else
-               res = rtfirst(s, dir);
-       PG_RETURN_BOOL(res);
-}
-
-static bool
-rtfirst(IndexScanDesc s, ScanDirection dir)
-{
-       Buffer          b;
-       Page            p;
-       OffsetNumber n;
-       OffsetNumber maxoff;
-       RTreePageOpaque po;
+       Page page;
+       OffsetNumber offnum;
        RTreeScanOpaque so;
-       RTSTACK    *stk;
-       BlockNumber blk;
-       IndexTuple      it;
 
-       b = ReadBuffer(s->indexRelation, P_ROOT);
-       p = BufferGetPage(b);
-       po = (RTreePageOpaque) PageGetSpecialPointer(p);
        so = (RTreeScanOpaque) s->opaque;
 
-       for (;;)
+       /*
+        * If we've already produced a tuple and the executor has informed
+        * us that it should be marked "killed", do so know.
+        */
+       if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
        {
-               maxoff = PageGetMaxOffsetNumber(p);
-               if (ScanDirectionIsBackward(dir))
-                       n = findnext(s, p, maxoff, dir);
-               else
-                       n = findnext(s, p, FirstOffsetNumber, dir);
-
-               while (n < FirstOffsetNumber || n > maxoff)
-               {
-                       ReleaseBuffer(b);
-                       if (so->s_stack == NULL)
-                               return false;
-
-                       stk = so->s_stack;
-                       b = ReadBuffer(s->indexRelation, stk->rts_blk);
-                       p = BufferGetPage(b);
-                       po = (RTreePageOpaque) PageGetSpecialPointer(p);
-                       maxoff = PageGetMaxOffsetNumber(p);
+               offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+               page = BufferGetPage(so->curbuf);
+               PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
+               SetBufferCommitInfoNeedsSave(so->curbuf);
+       }
 
-                       if (ScanDirectionIsBackward(dir))
-                               n = OffsetNumberPrev(stk->rts_child);
-                       else
-                               n = OffsetNumberNext(stk->rts_child);
-                       so->s_stack = stk->rts_parent;
-                       pfree(stk);
+       /*
+        * Get the next tuple that matches the search key; if asked to
+        * skip killed tuples, find the first non-killed tuple that
+        * matches. Return as soon as we've run out of matches or we've
+        * found an acceptable match.
+        */
+       for (;;)
+       {
+               bool res = rtnext(s, dir);
 
-                       n = findnext(s, p, n, dir);
-               }
-               if (po->flags & F_LEAF)
+               if (res == true && s->ignore_killed_tuples)
                {
-                       ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
-                       it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
-                       s->xs_ctup.t_self = it->t_tid;
-
-                       ReleaseBuffer(b);
-                       return true;
+                       offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+                       page = BufferGetPage(so->curbuf);
+                       if (ItemIdDeleted(PageGetItemId(page, offnum)))
+                               continue;
                }
-               else
-               {
-                       stk = (RTSTACK *) palloc(sizeof(RTSTACK));
-                       stk->rts_child = n;
-                       stk->rts_blk = BufferGetBlockNumber(b);
-                       stk->rts_parent = so->s_stack;
-                       so->s_stack = stk;
-
-                       it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-                       blk = ItemPointerGetBlockNumber(&(it->t_tid));
 
-                       ReleaseBuffer(b);
-                       b = ReadBuffer(s->indexRelation, blk);
-                       p = BufferGetPage(b);
-                       po = (RTreePageOpaque) PageGetSpecialPointer(p);
-               }
+               PG_RETURN_BOOL(res);
        }
 }
 
 static bool
 rtnext(IndexScanDesc s, ScanDirection dir)
 {
-       Buffer          b;
        Page            p;
        OffsetNumber n;
-       OffsetNumber maxoff;
        RTreePageOpaque po;
        RTreeScanOpaque so;
-       RTSTACK    *stk;
-       BlockNumber blk;
-       IndexTuple      it;
 
-       blk = ItemPointerGetBlockNumber(&(s->currentItemData));
-       n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+       so = (RTreeScanOpaque) s->opaque;
 
-       if (ScanDirectionIsForward(dir))
-               n = OffsetNumberNext(n);
-       else
-               n = OffsetNumberPrev(n);
+       if (!ItemPointerIsValid(&(s->currentItemData)))
+       {
+               /* first call: start at the root */
+               Assert(BufferIsValid(so->curbuf) == false);
+               so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
+       }
 
-       b = ReadBuffer(s->indexRelation, blk);
-       p = BufferGetPage(b);
+       p = BufferGetPage(so->curbuf);
        po = (RTreePageOpaque) PageGetSpecialPointer(p);
-       so = (RTreeScanOpaque) s->opaque;
+
+       if (!ItemPointerIsValid(&(s->currentItemData)))
+       {
+               /* first call: start at first/last offset */
+               if (ScanDirectionIsForward(dir))
+                       n = FirstOffsetNumber;
+               else
+                       n = PageGetMaxOffsetNumber(p);
+       }
+       else
+       {
+               /* go on to the next offset */
+               n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+               if (ScanDirectionIsForward(dir))
+                       n = OffsetNumberNext(n);
+               else
+                       n = OffsetNumberPrev(n);
+       }
 
        for (;;)
        {
-               maxoff = PageGetMaxOffsetNumber(p);
-               n = findnext(s, p, n, dir);
+               IndexTuple      it;
+               RTSTACK    *stk;
+
+               n = findnext(s, n, dir);
 
-               while (n < FirstOffsetNumber || n > maxoff)
+               /* no match on this page, so read in the next stack entry */
+               if (n == InvalidOffsetNumber)
                {
-                       ReleaseBuffer(b);
+                       /* if out of stack entries, we're done */
                        if (so->s_stack == NULL)
+                       {
+                               ReleaseBuffer(so->curbuf);
+                               so->curbuf = InvalidBuffer;
                                return false;
+                       }
 
                        stk = so->s_stack;
-                       b = ReadBuffer(s->indexRelation, stk->rts_blk);
-                       p = BufferGetPage(b);
-                       maxoff = PageGetMaxOffsetNumber(p);
+                       so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
+                                                                                         stk->rts_blk);
+                       p = BufferGetPage(so->curbuf);
                        po = (RTreePageOpaque) PageGetSpecialPointer(p);
 
                        if (ScanDirectionIsBackward(dir))
@@ -172,33 +138,41 @@ rtnext(IndexScanDesc s, ScanDirection dir)
                        so->s_stack = stk->rts_parent;
                        pfree(stk);
 
-                       n = findnext(s, p, n, dir);
+                       continue;
                }
+
                if (po->flags & F_LEAF)
                {
-                       ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
+                       ItemPointerSet(&(s->currentItemData),
+                                                  BufferGetBlockNumber(so->curbuf),
+                                                  n);
                        it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
                        s->xs_ctup.t_self = it->t_tid;
-
-                       ReleaseBuffer(b);
                        return true;
                }
                else
                {
+                       BlockNumber blk;
+
                        stk = (RTSTACK *) palloc(sizeof(RTSTACK));
                        stk->rts_child = n;
-                       stk->rts_blk = BufferGetBlockNumber(b);
+                       stk->rts_blk = BufferGetBlockNumber(so->curbuf);
                        stk->rts_parent = so->s_stack;
                        so->s_stack = stk;
 
                        it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
                        blk = ItemPointerGetBlockNumber(&(it->t_tid));
 
-                       ReleaseBuffer(b);
-                       b = ReadBuffer(s->indexRelation, blk);
-                       p = BufferGetPage(b);
+                       /*
+                        * Note that we release the pin on the page as we descend
+                        * down the tree, even though there's a good chance we'll
+                        * eventually need to re-read the buffer later in this
+                        * scan. This may or may not be optimal, but it doesn't
+                        * seem likely to make a huge performance difference
+                        * either way.
+                        */
+                       so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
+                       p = BufferGetPage(so->curbuf);
                        po = (RTreePageOpaque) PageGetSpecialPointer(p);
 
                        if (ScanDirectionIsBackward(dir))
@@ -209,17 +183,26 @@ rtnext(IndexScanDesc s, ScanDirection dir)
        }
 }
 
+/*
+ * Return the offset of the next matching index entry. We begin the
+ * search at offset "n" and search for matches in the direction
+ * "dir". If no more matching entries are found on the page,
+ * InvalidOffsetNumber is returned.
+ */
 static OffsetNumber
-findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
+findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
 {
        OffsetNumber maxoff;
        IndexTuple      it;
        RTreePageOpaque po;
        RTreeScanOpaque so;
+       Page p;
+
+       so = (RTreeScanOpaque) s->opaque;
+       p = BufferGetPage(so->curbuf);
 
        maxoff = PageGetMaxOffsetNumber(p);
        po = (RTreePageOpaque) PageGetSpecialPointer(p);
-       so = (RTreeScanOpaque) s->opaque;
 
        /*
         * If we modified the index during the scan, we may have a pointer to
@@ -256,28 +239,8 @@ findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
                        n = OffsetNumberNext(n);
        }
 
-       return n;
-}
-
-static bool
-rtscancache(IndexScanDesc s, ScanDirection dir)
-{
-       Buffer          b;
-       Page            p;
-       OffsetNumber n;
-       IndexTuple      it;
-
-       if (!(ScanDirectionIsNoMovement(dir)
-                 && ItemPointerIsValid(&(s->currentItemData))))
-               return false;
-
-       b = ReadBuffer(s->indexRelation,
-                                  ItemPointerGetBlockNumber(&(s->currentItemData)));
-       p = BufferGetPage(b);
-       n = ItemPointerGetOffsetNumber(&(s->currentItemData));
-       it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-       s->xs_ctup.t_self = it->t_tid;
-       ReleaseBuffer(b);
-
-       return true;
+       if (n >= FirstOffsetNumber && n <= maxoff)
+               return n;                                               /* found a match on this page */
+       else
+               return InvalidOffsetNumber;             /* no match, go to next page */
 }
index c2dd2bb..b1762bf 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.85 2004/12/31 21:59:26 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.86 2005/01/18 23:25:47 neilc Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -280,12 +280,8 @@ rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
 
        do
        {
-               /* let go of current buffer before getting next */
-               if (buffer != InvalidBuffer)
-                       ReleaseBuffer(buffer);
-
-               /* get next buffer */
-               buffer = ReadBuffer(r, blk);
+               /* release the current buffer, read in the next one */
+               buffer = ReleaseAndReadBuffer(buffer, r, blk);
                page = (Page) BufferGetPage(buffer);
 
                opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
index 7801c5d..e608cb4 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.56 2004/12/31 21:59:26 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.57 2005/01/18 23:25:48 neilc Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -89,12 +89,24 @@ rtrescan(PG_FUNCTION_ARGS)
                freestack(p->s_markstk);
                p->s_stack = p->s_markstk = NULL;
                p->s_flags = 0x0;
+               /* drop pins on buffers -- no locks held */
+               if (BufferIsValid(p->curbuf))
+               {
+                       ReleaseBuffer(p->curbuf);
+                       p->curbuf = InvalidBuffer;
+               }
+               if (BufferIsValid(p->markbuf))
+               {
+                       ReleaseBuffer(p->markbuf);
+                       p->markbuf = InvalidBuffer;
+               }
        }
        else
        {
                /* initialize opaque data */
                p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
                p->s_stack = p->s_markstk = NULL;
+               p->curbuf = p->markbuf = InvalidBuffer;
                p->s_internalNKey = s->numberOfKeys;
                p->s_flags = 0x0;
                s->opaque = p;
@@ -175,6 +187,18 @@ rtmarkpos(PG_FUNCTION_ARGS)
        freestack(p->s_markstk);
        p->s_markstk = o;
 
+       /* Update markbuf: make sure to bump ref count on curbuf */
+       if (BufferIsValid(p->markbuf))
+       {
+               ReleaseBuffer(p->markbuf);
+               p->markbuf = InvalidBuffer;
+       }
+       if (BufferIsValid(p->curbuf))
+       {
+               IncrBufferRefCount(p->curbuf);
+               p->markbuf = p->curbuf;
+       }
+
        PG_RETURN_VOID();
 }
 
@@ -211,6 +235,18 @@ rtrestrpos(PG_FUNCTION_ARGS)
        freestack(p->s_stack);
        p->s_stack = o;
 
+       /* Update curbuf; be sure to bump ref count on markbuf */
+       if (BufferIsValid(p->curbuf))
+       {
+               ReleaseBuffer(p->curbuf);
+               p->curbuf = InvalidBuffer;
+       }
+       if (BufferIsValid(p->markbuf))
+       {
+               IncrBufferRefCount(p->markbuf);
+               p->curbuf = p->markbuf;
+       }
+
        PG_RETURN_VOID();
 }
 
@@ -226,11 +262,14 @@ rtendscan(PG_FUNCTION_ARGS)
        {
                freestack(p->s_stack);
                freestack(p->s_markstk);
+               if (BufferIsValid(p->curbuf))
+                       ReleaseBuffer(p->curbuf);
+               if (BufferIsValid(p->markbuf))
+                       ReleaseBuffer(p->markbuf);
                pfree(s->opaque);
        }
 
        rtdropscan(s);
-       /* XXX don't unset read lock -- two-phase locking */
 
        PG_RETURN_VOID();
 }
index a5c7685..d06ccdc 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.36 2004/12/31 22:03:21 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.37 2005/01/18 23:25:55 neilc Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -59,11 +59,14 @@ typedef struct RTSTACK
 /*
  *     When we're doing a scan, we need to keep track of the parent stack
  *     for the marked and current items.  Also, rtrees have the following
- *     property:  if you're looking for the box (1,1,2,2), on the internal
- *     nodes you have to search for all boxes that *contain* (1,1,2,2), and
- *     not the ones that match it.  We have a private scan key for internal
- *     nodes in the opaque structure for rtrees for this reason.  See
- *     access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized.
+ *     property: if you're looking for the box (1,1,2,2), on the internal
+ *     nodes you have to search for all boxes that *contain* (1,1,2,2),
+ *     and not the ones that match it.  We have a private scan key for
+ *     internal nodes in the opaque structure for rtrees for this reason.
+ *     See access/index-rtree/rtscan.c and rtstrat.c for how it gets
+ *     initialized. We also keep pins on the scan's current buffer and
+ *     marked buffer, if any: this avoids the need to invoke ReadBuffer()
+ *     for each tuple produced by the index scan.
  */
 
 typedef struct RTreeScanOpaqueData
@@ -73,6 +76,8 @@ typedef struct RTreeScanOpaqueData
        uint16          s_flags;
        int                     s_internalNKey;
        ScanKey         s_internalKey;
+       Buffer          curbuf;
+       Buffer          markbuf;
 } RTreeScanOpaqueData;
 
 typedef RTreeScanOpaqueData *RTreeScanOpaque;