1 /*-------------------------------------------------------------------------
4 * Routines to support bitmapped scans of relations
6 * NOTE: it is critical that this plan type only be used with MVCC-compliant
7 * snapshots (ie, regular snapshots, not SnapshotNow or one of the other
8 * special snapshots). The reason is that since index and heap scans are
9 * decoupled, there can be no assurance that the index tuple prompting a
10 * visit to a particular heap TID still exists when the visit is made.
11 * Therefore the tuple might not exist anymore either (which is OK because
12 * heap_fetch will cope) --- but worse, the tuple slot could have been
13 * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
14 * certain to fail the time qual and so it will not be mistakenly returned.
15 * With SnapshotNow we might return a tuple that doesn't meet the required
16 * index qual conditions.
19 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
20 * Portions Copyright (c) 1994, Regents of the University of California
24 * src/backend/executor/nodeBitmapHeapscan.c
26 *-------------------------------------------------------------------------
30 * ExecBitmapHeapScan scans a relation using bitmap info
31 * ExecBitmapHeapNext workhorse for above
32 * ExecInitBitmapHeapScan creates and initializes state info.
33 * ExecReScanBitmapHeapScan prepares to rescan the plan.
34 * ExecEndBitmapHeapScan releases all storage.
38 #include "access/heapam.h"
39 #include "access/relscan.h"
40 #include "access/transam.h"
41 #include "executor/execdebug.h"
42 #include "executor/nodeBitmapHeapscan.h"
44 #include "storage/bufmgr.h"
45 #include "storage/predicate.h"
46 #include "utils/memutils.h"
47 #include "utils/rel.h"
48 #include "utils/snapmgr.h"
49 #include "utils/tqual.h"
52 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
56 /* ----------------------------------------------------------------
59 * Retrieve next tuple from the BitmapHeapScan node's currentRelation
60 * ----------------------------------------------------------------
62 static TupleTableSlot *
63 BitmapHeapNext(BitmapHeapScanState *node)
/*
 * Workhorse for ExecBitmapHeapScan: return the next tuple satisfying the
 * bitmap produced by the child plan, or an empty slot at end of scan.
 *
 * NOTE(review): interior lines of this function appear elided in this view
 * (braces, some declarations, and loop structure are missing); comments below
 * annotate only the statements that are visible.
 */
65 ExprContext *econtext;
68 TBMIterator *tbmiterator;
69 TBMIterateResult *tbmres;
70 TBMIterator *prefetch_iterator;
71 OffsetNumber targoffset;
75 * extract necessary information from index scan node
/* Cache frequently-used node state in locals. */
77 econtext = node->ss.ps.ps_ExprContext;
78 slot = node->ss.ss_ScanTupleSlot;
79 scan = node->ss.ss_currentScanDesc;
81 tbmiterator = node->tbmiterator;
82 tbmres = node->tbmres;
83 prefetch_iterator = node->prefetch_iterator;
86 * If we haven't yet performed the underlying index scan, do it, and begin
87 * the iteration over the bitmap.
89 * For prefetching, we use *two* iterators, one for the pages we are
90 * actually scanning and another that runs ahead of the first for
91 * prefetching. node->prefetch_pages tracks exactly how many pages ahead
92 * the prefetch iterator is. Also, node->prefetch_target tracks the
93 * desired prefetch distance, which starts small and increases up to the
94 * GUC-controlled maximum, target_prefetch_pages. This is to avoid doing
95 * a lot of prefetching in a scan that stops after a few tuples because of
/*
 * First call: run the child (bitmap index scan) plan to completion to
 * obtain the TIDBitmap, then set up the iterator(s) over it.
 */
100 tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
102 if (!tbm || !IsA(tbm, TIDBitmap))
103 elog(ERROR, "unrecognized result from subplan");
106 node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
107 node->tbmres = tbmres = NULL;
/* Prefetch support is compiled in only under USE_PREFETCH. */
110 if (target_prefetch_pages > 0)
112 node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
113 node->prefetch_pages = 0;
/* -1 so the target becomes 0 after the first increment below. */
114 node->prefetch_target = -1;
116 #endif /* USE_PREFETCH */
125 * Get next page of results if needed
/* Advance the main iterator to the next bitmap page. */
129 node->tbmres = tbmres = tbm_iterate(tbmiterator);
132 /* no more entries in the bitmap */
137 if (node->prefetch_pages > 0)
139 /* The main iterator has closed the distance by one page */
140 node->prefetch_pages--;
142 else if (prefetch_iterator)
144 /* Do not let the prefetch iterator get behind the main one */
145 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
147 if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
148 elog(ERROR, "prefetch and main iterators are out of sync");
150 #endif /* USE_PREFETCH */
153 * Ignore any claimed entries past what we think is the end of the
154 * relation. (This is probably not necessary given that we got at
155 * least AccessShareLock on the table before performing any of the
156 * indexscans, but let's be safe.)
158 if (tbmres->blockno >= scan->rs_nblocks)
160 node->tbmres = tbmres = NULL;
165 * Fetch the current heap page and identify candidate tuples.
167 bitgetpage(scan, tbmres);
170 * Set rs_cindex to first slot to examine
177 * Increase prefetch target if it's not yet at the max. Note that
178 * we will increase it to zero after fetching the very first
179 * page/tuple, then to one after the second tuple is fetched, then
180 * it doubles as later pages are fetched.
182 if (node->prefetch_target >= target_prefetch_pages)
183 /* don't increase any further */ ;
184 else if (node->prefetch_target >= target_prefetch_pages / 2)
185 node->prefetch_target = target_prefetch_pages;
186 else if (node->prefetch_target > 0)
187 node->prefetch_target *= 2;
189 node->prefetch_target++;
190 #endif /* USE_PREFETCH */
195 * Continuing in previously obtained page; advance rs_cindex
202 * Try to prefetch at least a few pages even before we get to the
203 * second page if we don't stop reading after the first tuple.
205 if (node->prefetch_target < target_prefetch_pages)
206 node->prefetch_target++;
207 #endif /* USE_PREFETCH */
211 * Out of range? If so, nothing more to look at on this page
213 if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
/* Force the outer loop to fetch the next bitmap page. */
215 node->tbmres = tbmres = NULL;
222 * We issue prefetch requests *after* fetching the current page to try
223 * to avoid having prefetching interfere with the main I/O. Also, this
224 * should happen only when we have determined there is still something
225 * to do on the current page, else we may uselessly prefetch the same
226 * page we are just about to request for real.
228 if (prefetch_iterator)
230 while (node->prefetch_pages < node->prefetch_target)
232 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
236 /* No more pages to prefetch */
237 tbm_end_iterate(prefetch_iterator);
238 node->prefetch_iterator = prefetch_iterator = NULL;
241 node->prefetch_pages++;
/* Async read-ahead hint for the upcoming heap page. */
242 PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
245 #endif /* USE_PREFETCH */
248 * Okay to fetch the tuple
/*
 * Point scan->rs_ctup at the tuple identified by rs_vistuples[rs_cindex]
 * on the currently pinned page.
 */
250 targoffset = scan->rs_vistuples[scan->rs_cindex];
251 dp = (Page) BufferGetPage(scan->rs_cbuf);
252 lp = PageGetItemId(dp, targoffset);
253 Assert(ItemIdIsNormal(lp));
255 scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
256 scan->rs_ctup.t_len = ItemIdGetLength(lp);
257 ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
259 pgstat_count_heap_fetch(scan->rs_rd);
262 * Set up the result slot to point to this tuple. Note that the slot
263 * acquires a pin on the buffer.
265 ExecStoreTuple(&scan->rs_ctup,
271 * If we are using lossy info, we have to recheck the qual conditions
/* Recheck is done by re-evaluating the original index quals on the tuple. */
276 econtext->ecxt_scantuple = slot;
277 ResetExprContext(econtext);
279 if (!ExecQual(node->bitmapqualorig, econtext, false))
281 /* Fails recheck, so drop it and loop back for another */
282 ExecClearTuple(slot);
287 /* OK to return this tuple */
292 * if we get here it means we are at the end of the scan..
/* Empty slot signals end-of-scan to the caller. */
294 return ExecClearTuple(slot);
298 * bitgetpage - subroutine for BitmapHeapNext()
300 * This routine reads and pins the specified page of the relation, then
301 * builds an array indicating which tuples on the page are both potentially
302 * interesting according to the bitmap, and visible according to the snapshot.
/*
 * Results are returned in scan->rs_vistuples[] (offsets of qualifying
 * tuples, in offset order) and scan->rs_ntuples (their count).
 *
 * NOTE(review): some interior lines (braces, a few declarations) appear
 * elided in this view of the file.
 */
305 bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
307 BlockNumber page = tbmres->blockno;
313 * Acquire pin on the target heap page, trading in any pin we held before.
315 Assert(page < scan->rs_nblocks);
317 scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
320 buffer = scan->rs_cbuf;
321 snapshot = scan->rs_snapshot;
326 * Prune and repair fragmentation for the whole page, if possible.
328 Assert(TransactionIdIsValid(RecentGlobalXmin));
329 heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
332 * We must hold share lock on the buffer content while examining tuple
333 * visibility. Afterwards, however, the tuples we have found to be
334 * visible are guaranteed good as long as we hold the buffer pin.
336 LockBuffer(buffer, BUFFER_LOCK_SHARE);
339 * We need two separate strategies for lossy and non-lossy cases.
/* ntuples >= 0 means the bitmap page is exact (lists specific offsets). */
341 if (tbmres->ntuples >= 0)
344 * Bitmap is non-lossy, so we just look through the offsets listed in
345 * tbmres; but we have to follow any HOT chain starting at each such
350 for (curslot = 0; curslot < tbmres->ntuples; curslot++)
352 OffsetNumber offnum = tbmres->offsets[curslot];
354 HeapTupleData heapTuple;
356 ItemPointerSet(&tid, page, offnum);
/* heap_hot_search_buffer resolves the HOT chain and checks visibility. */
357 if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
358 &heapTuple, NULL, true))
359 scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
365 * Bitmap is lossy, so we must examine each item pointer on the page.
366 * But we can ignore HOT chains, since we'll check each tuple anyway.
368 Page dp = (Page) BufferGetPage(buffer);
369 OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
372 for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
375 HeapTupleData loctup;
378 lp = PageGetItemId(dp, offnum);
/* Skip unused/redirected/dead line pointers. */
379 if (!ItemIdIsNormal(lp))
381 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
382 loctup.t_len = ItemIdGetLength(lp);
383 loctup.t_tableOid = scan->rs_rd->rd_id;
384 ItemPointerSet(&loctup.t_self, page, offnum);
385 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
388 scan->rs_vistuples[ntup++] = offnum;
/* SSI bookkeeping: record the read for serializable transactions. */
389 PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
391 CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
396 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
398 Assert(ntup <= MaxHeapTuplesPerPage);
399 scan->rs_ntuples = ntup;
403 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
/*
 * Returns true if the tuple in 'slot' still satisfies the original bitmap
 * index quals (node->bitmapqualorig); used by ExecScan's recheck hook.
 */
406 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
408 ExprContext *econtext;
411 * extract necessary information from index scan node
413 econtext = node->ss.ps.ps_ExprContext;
415 /* Does the tuple meet the original qual conditions? */
416 econtext->ecxt_scantuple = slot;
/* Reset per-tuple memory before evaluating the quals. */
418 ResetExprContext(econtext);
420 return ExecQual(node->bitmapqualorig, econtext, false);
423 /* ----------------------------------------------------------------
424 * ExecBitmapHeapScan(node)
425 * ----------------------------------------------------------------
/*
 * Entry point for the executor: delegates to the generic ExecScan driver,
 * supplying BitmapHeapNext as the access method and BitmapHeapRecheck as
 * the EvalPlanQual recheck method.
 */
428 ExecBitmapHeapScan(BitmapHeapScanState *node)
430 return ExecScan(&node->ss,
431 (ExecScanAccessMtd) BitmapHeapNext,
432 (ExecScanRecheckMtd) BitmapHeapRecheck);
435 /* ----------------------------------------------------------------
436 * ExecReScanBitmapHeapScan(node)
437 * ----------------------------------------------------------------
/*
 * Reset the node so the next ExecProcNode call restarts the scan from
 * scratch: release page pins, discard bitmap iterators, and propagate
 * the rescan to the child plan when appropriate.
 */
440 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
442 /* rescan to release any page pin */
443 heap_rescan(node->ss.ss_currentScanDesc, NULL);
/* Shut down any active bitmap iteration from the previous scan. */
445 if (node->tbmiterator)
446 tbm_end_iterate(node->tbmiterator);
447 if (node->prefetch_iterator)
448 tbm_end_iterate(node->prefetch_iterator);
/* Clear cached iterator state so BitmapHeapNext rebuilds the bitmap. */
452 node->tbmiterator = NULL;
454 node->prefetch_iterator = NULL;
456 ExecScanReScan(&node->ss);
459 * if chgParam of subnode is not null then plan will be re-scanned by
460 * first ExecProcNode.
462 if (node->ss.ps.lefttree->chgParam == NULL)
463 ExecReScan(node->ss.ps.lefttree);
466 /* ----------------------------------------------------------------
467 * ExecEndBitmapHeapScan
468 * ----------------------------------------------------------------
/*
 * Release all resources held by the node: expression context, tuple
 * slots, the child subplan, bitmap iterators, the heap scan descriptor,
 * and the scan relation itself.
 */
471 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
474 HeapScanDesc scanDesc;
477 * extract information from the node
479 relation = node->ss.ss_currentRelation;
480 scanDesc = node->ss.ss_currentScanDesc;
483 * Free the exprcontext
485 ExecFreeExprContext(&node->ss.ps);
488 * clear out tuple table slots
490 ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
491 ExecClearTuple(node->ss.ss_ScanTupleSlot);
494 * close down subplans
496 ExecEndNode(outerPlanState(node));
499 * release bitmap if any
501 if (node->tbmiterator)
502 tbm_end_iterate(node->tbmiterator);
503 if (node->prefetch_iterator)
504 tbm_end_iterate(node->prefetch_iterator);
/* Also releases the current page pin, if any. */
511 heap_endscan(scanDesc);
514 * close the heap relation.
516 ExecCloseScanRelation(relation);
519 /* ----------------------------------------------------------------
520 * ExecInitBitmapHeapScan
522 * Initializes the scan's state information.
523 * ----------------------------------------------------------------
/*
 * Build and return the BitmapHeapScanState for this plan node.
 *
 * NOTE(review): the tail of this function (presumably "return scanstate;")
 * lies beyond the visible portion of this file view.
 */
525 BitmapHeapScanState *
526 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
528 BitmapHeapScanState *scanstate;
529 Relation currentRelation;
531 /* check for unsupported flags */
532 Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
535 * Assert caller didn't ask for an unsafe snapshot --- see comments at
/* Bitmap scans require an MVCC snapshot; see the header comment. */
538 Assert(IsMVCCSnapshot(estate->es_snapshot));
541 * create state structure
543 scanstate = makeNode(BitmapHeapScanState);
544 scanstate->ss.ps.plan = (Plan *) node;
545 scanstate->ss.ps.state = estate;
/* Bitmap/prefetch state starts empty; filled in by BitmapHeapNext. */
547 scanstate->tbm = NULL;
548 scanstate->tbmiterator = NULL;
549 scanstate->tbmres = NULL;
550 scanstate->prefetch_iterator = NULL;
551 scanstate->prefetch_pages = 0;
552 scanstate->prefetch_target = 0;
555 * Miscellaneous initialization
557 * create expression context for node
559 ExecAssignExprContext(estate, &scanstate->ss.ps);
561 scanstate->ss.ps.ps_TupFromTlist = false;
564 * initialize child expressions
566 scanstate->ss.ps.targetlist = (List *)
567 ExecInitExpr((Expr *) node->scan.plan.targetlist,
568 (PlanState *) scanstate);
569 scanstate->ss.ps.qual = (List *)
570 ExecInitExpr((Expr *) node->scan.plan.qual,
571 (PlanState *) scanstate);
/* Original index quals, kept for rechecking lossy-bitmap tuples. */
572 scanstate->bitmapqualorig = (List *)
573 ExecInitExpr((Expr *) node->bitmapqualorig,
574 (PlanState *) scanstate);
577 * tuple table initialization
579 ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
580 ExecInitScanTupleSlot(estate, &scanstate->ss);
583 * open the base relation and acquire appropriate lock on it.
585 currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);
587 scanstate->ss.ss_currentRelation = currentRelation;
590 * Even though we aren't going to do a conventional seqscan, it is useful
591 * to create a HeapScanDesc --- most of the fields in it are usable.
593 scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
599 * get the scan type from the relation descriptor.
601 ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
604 * Initialize result tuple type and projection info.
606 ExecAssignResultTypeFromTL(&scanstate->ss.ps);
607 ExecAssignScanProjectionInfo(&scanstate->ss);
610 * initialize child nodes
612 * We do this last because the child nodes will open indexscans on our
613 * relation's indexes, and we want to be sure we have acquired a lock on
614 * the relation first.
616 outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);