/*-------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.c
 *    Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotNow or one of the other
 * special snapshots).  The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned.
 * With SnapshotNow we might return a tuple that doesn't meet the required
 * index qual conditions.
 *
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/executor/nodeBitmapHeapscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *      ExecBitmapHeapScan          scans a relation using bitmap info
 *      ExecBitmapHeapNext          workhorse for above
 *      ExecInitBitmapHeapScan      creates and initializes state info.
 *      ExecReScanBitmapHeapScan    prepares to rescan the plan.
 *      ExecEndBitmapHeapScan       releases all storage.
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/relscan.h"
#include "access/transam.h"
#include "executor/execdebug.h"
#include "executor/nodeBitmapHeapscan.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"


static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);


/* ----------------------------------------------------------------
 *      BitmapHeapNext
 *
 *      Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
    ExprContext *econtext;
    HeapScanDesc scan;
    TIDBitmap  *tbm;
    TBMIterator *tbmiterator;
    TBMIterateResult *tbmres;
    TBMIterator *prefetch_iterator;
    OffsetNumber targoffset;
    TupleTableSlot *slot;

    /*
     * extract necessary information from index scan node
     */
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;
    scan = node->ss.ss_currentScanDesc;
    tbm = node->tbm;
    tbmiterator = node->tbmiterator;
    tbmres = node->tbmres;
    prefetch_iterator = node->prefetch_iterator;

    /*
     * If we haven't yet performed the underlying index scan, do it, and begin
     * the iteration over the bitmap.
     *
     * For prefetching, we use *two* iterators, one for the pages we are
     * actually scanning and another that runs ahead of the first for
     * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
     * the prefetch iterator is.  Also, node->prefetch_target tracks the
     * desired prefetch distance, which starts small and increases up to the
     * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
     * a lot of prefetching in a scan that stops after a few tuples because of
     * a LIMIT.
     */
    if (tbm == NULL)
    {
        tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

        if (!tbm || !IsA(tbm, TIDBitmap))
            elog(ERROR, "unrecognized result from subplan");

        node->tbm = tbm;
        node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
        node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
        if (target_prefetch_pages > 0)
        {
            node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
            node->prefetch_pages = 0;
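            /*
             * Start the target at -1 so that the ramp-up logic in the main
             * loop leaves it at 0 after the first page/tuple is fetched;
             * prefetching only begins once the scan demonstrably continues
             * past that.
             */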
            node->prefetch_target = -1;
        }
#endif   /* USE_PREFETCH */
    }

    for (;;)
    {
        Page        dp;
        ItemId      lp;

        /*
         * Get next page of results if needed
         */
        if (tbmres == NULL)
        {
            node->tbmres = tbmres = tbm_iterate(tbmiterator);
            if (tbmres == NULL)
            {
                /* no more entries in the bitmap */
                break;
            }

#ifdef USE_PREFETCH
            if (node->prefetch_pages > 0)
            {
                /* The main iterator has closed the distance by one page */
                node->prefetch_pages--;
            }
            else if (prefetch_iterator)
            {
                /* Do not let the prefetch iterator get behind the main one */
                TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

                if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
                    elog(ERROR, "prefetch and main iterators are out of sync");
            }
#endif   /* USE_PREFETCH */

            /*
             * Ignore any claimed entries past what we think is the end of the
             * relation.  (This is probably not necessary given that we got at
             * least AccessShareLock on the table before performing any of the
             * indexscans, but let's be safe.)
             */
            if (tbmres->blockno >= scan->rs_nblocks)
            {
                node->tbmres = tbmres = NULL;
                continue;
            }

            /*
             * Fetch the current heap page and identify candidate tuples.
             */
            bitgetpage(scan, tbmres);

            /*
             * Set rs_cindex to first slot to examine
             */
            scan->rs_cindex = 0;

#ifdef USE_PREFETCH

            /*
             * Increase prefetch target if it's not yet at the max.  Note that
             * we will increase it to zero after fetching the very first
             * page/tuple, then to one after the second tuple is fetched, then
             * it doubles as later pages are fetched.
             */
            if (node->prefetch_target >= target_prefetch_pages)
                 /* don't increase any further */ ;
            else if (node->prefetch_target >= target_prefetch_pages / 2)
                node->prefetch_target = target_prefetch_pages;
            else if (node->prefetch_target > 0)
                node->prefetch_target *= 2;
            else
                node->prefetch_target++;
#endif   /* USE_PREFETCH */
        }
        else
        {
            /*
             * Continuing in previously obtained page; advance rs_cindex
             */
            scan->rs_cindex++;

#ifdef USE_PREFETCH

            /*
             * Try to prefetch at least a few pages even before we get to the
             * second page if we don't stop reading after the first tuple.
             */
            if (node->prefetch_target < target_prefetch_pages)
                node->prefetch_target++;
#endif   /* USE_PREFETCH */
        }

        /*
         * Out of range?  If so, nothing more to look at on this page
         */
        if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
        {
            node->tbmres = tbmres = NULL;
            continue;
        }

#ifdef USE_PREFETCH

        /*
         * We issue prefetch requests *after* fetching the current page to try
         * to avoid having prefetching interfere with the main I/O. Also, this
         * should happen only when we have determined there is still something
         * to do on the current page, else we may uselessly prefetch the same
         * page we are just about to request for real.
         */
        if (prefetch_iterator)
        {
            while (node->prefetch_pages < node->prefetch_target)
            {
                TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

                if (tbmpre == NULL)
                {
                    /* No more pages to prefetch */
                    tbm_end_iterate(prefetch_iterator);
                    node->prefetch_iterator = prefetch_iterator = NULL;
                    break;
                }
                node->prefetch_pages++;
                PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
            }
        }
#endif   /* USE_PREFETCH */

        /*
         * Okay to fetch the tuple
         */
        targoffset = scan->rs_vistuples[scan->rs_cindex];
        dp = (Page) BufferGetPage(scan->rs_cbuf);
        lp = PageGetItemId(dp, targoffset);
        Assert(ItemIdIsNormal(lp));

        scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
        scan->rs_ctup.t_len = ItemIdGetLength(lp);
        ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

        pgstat_count_heap_fetch(scan->rs_rd);

        /*
         * Set up the result slot to point to this tuple. Note that the slot
         * acquires a pin on the buffer.
         */
        ExecStoreTuple(&scan->rs_ctup,
                       slot,
                       scan->rs_cbuf,
                       false);

        /*
         * If we are using lossy info, we have to recheck the qual conditions
         * at every tuple.
         */
        if (tbmres->recheck)
        {
            econtext->ecxt_scantuple = slot;
            ResetExprContext(econtext);

            if (!ExecQual(node->bitmapqualorig, econtext, false))
            {
                /* Fails recheck, so drop it and loop back for another */
                ExecClearTuple(slot);
                continue;
            }
        }

        /* OK to return this tuple */
        return slot;
    }

    /*
     * if we get here it means we are at the end of the scan..
     */
    return ExecClearTuple(slot);
}

/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
    BlockNumber page = tbmres->blockno;
    Buffer      buffer;
    Snapshot    snapshot;
    int         ntup;

    /*
     * Acquire pin on the target heap page, trading in any pin we held before.
     */
    Assert(page < scan->rs_nblocks);

    scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
                                         scan->rs_rd,
                                         page);
    buffer = scan->rs_cbuf;
    snapshot = scan->rs_snapshot;

    ntup = 0;

    /*
     * Prune and repair fragmentation for the whole page, if possible.
     */
    Assert(TransactionIdIsValid(RecentGlobalXmin));
    heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);

    /*
     * We must hold share lock on the buffer content while examining tuple
     * visibility.  Afterwards, however, the tuples we have found to be
     * visible are guaranteed good as long as we hold the buffer pin.
     */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    /*
     * We need two separate strategies for lossy and non-lossy cases.
     */
    if (tbmres->ntuples >= 0)
    {
        /*
         * Bitmap is non-lossy, so we just look through the offsets listed in
         * tbmres; but we have to follow any HOT chain starting at each such
         * offset.
         */
        int         curslot;

        for (curslot = 0; curslot < tbmres->ntuples; curslot++)
        {
            OffsetNumber offnum = tbmres->offsets[curslot];
            ItemPointerData tid;
            HeapTupleData heapTuple;

            ItemPointerSet(&tid, page, offnum);
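            /*
             * heap_hot_search_buffer() follows the HOT chain rooted at this
             * offset; on success it updates "tid" to the offset of the chain
             * member that is visible to our snapshot, which is the offset we
             * must remember.
             */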
            if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
                                       &heapTuple, NULL, true))
                scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
        }
    }
    else
    {
        /*
         * Bitmap is lossy, so we must examine each item pointer on the page.
         * But we can ignore HOT chains, since we'll check each tuple anyway.
         */
        Page        dp = (Page) BufferGetPage(buffer);
        OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
        OffsetNumber offnum;

        for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
        {
            ItemId      lp;
            HeapTupleData loctup;
            bool        valid;

            lp = PageGetItemId(dp, offnum);
            if (!ItemIdIsNormal(lp))
                continue;
            loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
            loctup.t_len = ItemIdGetLength(lp);
            loctup.t_tableOid = scan->rs_rd->rd_id;
            ItemPointerSet(&loctup.t_self, page, offnum);
            valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
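            /*
             * With a lossy bitmap the index scan could not have taken
             * predicate locks on the individual matching tuples (the
             * non-lossy path gets that via heap_hot_search_buffer), so
             * serializable transactions must predicate-lock each visible
             * tuple here.
             */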
            if (valid)
            {
                scan->rs_vistuples[ntup++] = offnum;
                PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
            }
            CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
                                            buffer, snapshot);
        }
    }

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    Assert(ntup <= MaxHeapTuplesPerPage);
    scan->rs_ntuples = ntup;
}

/*
 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
static bool
BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
{
    ExprContext *econtext;

    /*
     * extract necessary information from index scan node
     */
    econtext = node->ss.ps.ps_ExprContext;

    /* Does the tuple meet the original qual conditions? */
    econtext->ecxt_scantuple = slot;

    ResetExprContext(econtext);

    return ExecQual(node->bitmapqualorig, econtext, false);
}

/* ----------------------------------------------------------------
 *      ExecBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecBitmapHeapScan(BitmapHeapScanState *node)
{
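    /*
     * Let ExecScan handle qual checking and projection; BitmapHeapNext
     * supplies the tuples, and BitmapHeapRecheck is used to recheck a tuple
     * during EvalPlanQual processing.
     */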
    return ExecScan(&node->ss,
                    (ExecScanAccessMtd) BitmapHeapNext,
                    (ExecScanRecheckMtd) BitmapHeapRecheck);
}

/* ----------------------------------------------------------------
 *      ExecReScanBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
void
ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
    /* rescan to release any page pin */
    heap_rescan(node->ss.ss_currentScanDesc, NULL);

    if (node->tbmiterator)
        tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
        tbm_end_iterate(node->prefetch_iterator);
    if (node->tbm)
        tbm_free(node->tbm);
    node->tbm = NULL;
    node->tbmiterator = NULL;
    node->tbmres = NULL;
    node->prefetch_iterator = NULL;

    ExecScanReScan(&node->ss);

    /*
     * if chgParam of subnode is not null then plan will be re-scanned by
     * first ExecProcNode.
     */
    if (node->ss.ps.lefttree->chgParam == NULL)
        ExecReScan(node->ss.ps.lefttree);
}

/* ----------------------------------------------------------------
 *      ExecEndBitmapHeapScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapHeapScan(BitmapHeapScanState *node)
{
    Relation    relation;
    HeapScanDesc scanDesc;

    /*
     * extract information from the node
     */
    relation = node->ss.ss_currentRelation;
    scanDesc = node->ss.ss_currentScanDesc;

    /*
     * Free the exprcontext
     */
    ExecFreeExprContext(&node->ss.ps);

    /*
     * clear out tuple table slots
     */
    ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
    ExecClearTuple(node->ss.ss_ScanTupleSlot);

    /*
     * close down subplans
     */
    ExecEndNode(outerPlanState(node));

    /*
     * release bitmap if any
     */
    if (node->tbmiterator)
        tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
        tbm_end_iterate(node->prefetch_iterator);
    if (node->tbm)
        tbm_free(node->tbm);

    /*
     * close heap scan
     */
    heap_endscan(scanDesc);

    /*
     * close the heap relation.
     */
    ExecCloseScanRelation(relation);
}

/* ----------------------------------------------------------------
 *      ExecInitBitmapHeapScan
 *
 *      Initializes the scan's state information.
 * ----------------------------------------------------------------
 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
    BitmapHeapScanState *scanstate;
    Relation    currentRelation;

    /* check for unsupported flags */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

    /*
     * Assert caller didn't ask for an unsafe snapshot --- see comments at
     * head of file.
     */
    Assert(IsMVCCSnapshot(estate->es_snapshot));

    /*
     * create state structure
     */
    scanstate = makeNode(BitmapHeapScanState);
    scanstate->ss.ps.plan = (Plan *) node;
    scanstate->ss.ps.state = estate;

    scanstate->tbm = NULL;
    scanstate->tbmiterator = NULL;
    scanstate->tbmres = NULL;
    scanstate->prefetch_iterator = NULL;
    scanstate->prefetch_pages = 0;
    scanstate->prefetch_target = 0;

    /*
     * Miscellaneous initialization
     *
     * create expression context for node
     */
    ExecAssignExprContext(estate, &scanstate->ss.ps);

    scanstate->ss.ps.ps_TupFromTlist = false;

    /*
     * initialize child expressions
     */
    scanstate->ss.ps.targetlist = (List *)
        ExecInitExpr((Expr *) node->scan.plan.targetlist,
                     (PlanState *) scanstate);
    scanstate->ss.ps.qual = (List *)
        ExecInitExpr((Expr *) node->scan.plan.qual,
                     (PlanState *) scanstate);
    scanstate->bitmapqualorig = (List *)
        ExecInitExpr((Expr *) node->bitmapqualorig,
                     (PlanState *) scanstate);

    /*
     * tuple table initialization
     */
    ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
    ExecInitScanTupleSlot(estate, &scanstate->ss);

    /*
     * open the base relation and acquire appropriate lock on it.
     */
    currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);

    scanstate->ss.ss_currentRelation = currentRelation;

    /*
     * Even though we aren't going to do a conventional seqscan, it is useful
     * to create a HeapScanDesc --- most of the fields in it are usable.
     */
    scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
                                                         estate->es_snapshot,
                                                         0,
                                                         NULL);

    /*
     * get the scan type from the relation descriptor.
     */
    ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));

    /*
     * Initialize result tuple type and projection info.
     */
    ExecAssignResultTypeFromTL(&scanstate->ss.ps);
    ExecAssignScanProjectionInfo(&scanstate->ss);

    /*
     * initialize child nodes
     *
     * We do this last because the child nodes will open indexscans on our
     * relation's indexes, and we want to be sure we have acquired a lock on
     * the relation first.
     */
    outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

    /*
     * all done.
     */
    return scanstate;
}