OSDN Git Service

Fix typo in sslmode documentation
[pg-rex/syncrep.git] / src / backend / executor / nodeBitmapHeapscan.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeBitmapHeapscan.c
4  *        Routines to support bitmapped scans of relations
5  *
6  * NOTE: it is critical that this plan type only be used with MVCC-compliant
7  * snapshots (ie, regular snapshots, not SnapshotNow or one of the other
8  * special snapshots).  The reason is that since index and heap scans are
9  * decoupled, there can be no assurance that the index tuple prompting a
10  * visit to a particular heap TID still exists when the visit is made.
11  * Therefore the tuple might not exist anymore either (which is OK because
12  * heap_fetch will cope) --- but worse, the tuple slot could have been
13  * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
14  * certain to fail the time qual and so it will not be mistakenly returned.
15  * With SnapshotNow we might return a tuple that doesn't meet the required
16  * index qual conditions.
17  *
18  *
19  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
20  * Portions Copyright (c) 1994, Regents of the University of California
21  *
22  *
23  * IDENTIFICATION
24  *        src/backend/executor/nodeBitmapHeapscan.c
25  *
26  *-------------------------------------------------------------------------
27  */
28 /*
29  * INTERFACE ROUTINES
30  *              ExecBitmapHeapScan                      scans a relation using bitmap info
31  *              ExecBitmapHeapNext                      workhorse for above
32  *              ExecInitBitmapHeapScan          creates and initializes state info.
33  *              ExecReScanBitmapHeapScan        prepares to rescan the plan.
34  *              ExecEndBitmapHeapScan           releases all storage.
35  */
36 #include "postgres.h"
37
38 #include "access/heapam.h"
39 #include "access/relscan.h"
40 #include "access/transam.h"
41 #include "executor/execdebug.h"
42 #include "executor/nodeBitmapHeapscan.h"
43 #include "pgstat.h"
44 #include "storage/bufmgr.h"
45 #include "storage/predicate.h"
46 #include "utils/memutils.h"
47 #include "utils/rel.h"
48 #include "utils/snapmgr.h"
49 #include "utils/tqual.h"
50
51
52 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
54
55
56 /* ----------------------------------------------------------------
57  *              BitmapHeapNext
58  *
59  *              Retrieve next tuple from the BitmapHeapScan node's currentRelation
60  * ----------------------------------------------------------------
61  */
62 static TupleTableSlot *
63 BitmapHeapNext(BitmapHeapScanState *node)
64 {
65         ExprContext *econtext;
66         HeapScanDesc scan;
67         TIDBitmap  *tbm;
68         TBMIterator *tbmiterator;
69         TBMIterateResult *tbmres;
70         TBMIterator *prefetch_iterator;
71         OffsetNumber targoffset;
72         TupleTableSlot *slot;
73
74         /*
75          * extract necessary information from index scan node
76          */
77         econtext = node->ss.ps.ps_ExprContext;
78         slot = node->ss.ss_ScanTupleSlot;
79         scan = node->ss.ss_currentScanDesc;
80         tbm = node->tbm;
81         tbmiterator = node->tbmiterator;
82         tbmres = node->tbmres;
83         prefetch_iterator = node->prefetch_iterator;
84
85         /*
86          * If we haven't yet performed the underlying index scan, do it, and begin
87          * the iteration over the bitmap.
88          *
89          * For prefetching, we use *two* iterators, one for the pages we are
90          * actually scanning and another that runs ahead of the first for
91          * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
92          * the prefetch iterator is.  Also, node->prefetch_target tracks the
93          * desired prefetch distance, which starts small and increases up to the
94          * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
95          * a lot of prefetching in a scan that stops after a few tuples because of
96          * a LIMIT.
97          */
98         if (tbm == NULL)
99         {
100                 tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
101
102                 if (!tbm || !IsA(tbm, TIDBitmap))
103                         elog(ERROR, "unrecognized result from subplan");
104
105                 node->tbm = tbm;
106                 node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
107                 node->tbmres = tbmres = NULL;
108
109 #ifdef USE_PREFETCH
110                 if (target_prefetch_pages > 0)
111                 {
112                         node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
113                         node->prefetch_pages = 0;
114                         node->prefetch_target = -1;
115                 }
116 #endif   /* USE_PREFETCH */
117         }
118
119         for (;;)
120         {
121                 Page            dp;
122                 ItemId          lp;
123
124                 /*
125                  * Get next page of results if needed
126                  */
127                 if (tbmres == NULL)
128                 {
129                         node->tbmres = tbmres = tbm_iterate(tbmiterator);
130                         if (tbmres == NULL)
131                         {
132                                 /* no more entries in the bitmap */
133                                 break;
134                         }
135
136 #ifdef USE_PREFETCH
137                         if (node->prefetch_pages > 0)
138                         {
139                                 /* The main iterator has closed the distance by one page */
140                                 node->prefetch_pages--;
141                         }
142                         else if (prefetch_iterator)
143                         {
144                                 /* Do not let the prefetch iterator get behind the main one */
145                                 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
146
147                                 if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
148                                         elog(ERROR, "prefetch and main iterators are out of sync");
149                         }
150 #endif   /* USE_PREFETCH */
151
152                         /*
153                          * Ignore any claimed entries past what we think is the end of the
154                          * relation.  (This is probably not necessary given that we got at
155                          * least AccessShareLock on the table before performing any of the
156                          * indexscans, but let's be safe.)
157                          */
158                         if (tbmres->blockno >= scan->rs_nblocks)
159                         {
160                                 node->tbmres = tbmres = NULL;
161                                 continue;
162                         }
163
164                         /*
165                          * Fetch the current heap page and identify candidate tuples.
166                          */
167                         bitgetpage(scan, tbmres);
168
169                         /*
170                          * Set rs_cindex to first slot to examine
171                          */
172                         scan->rs_cindex = 0;
173
174 #ifdef USE_PREFETCH
175
176                         /*
177                          * Increase prefetch target if it's not yet at the max.  Note that
178                          * we will increase it to zero after fetching the very first
179                          * page/tuple, then to one after the second tuple is fetched, then
180                          * it doubles as later pages are fetched.
181                          */
182                         if (node->prefetch_target >= target_prefetch_pages)
183                                  /* don't increase any further */ ;
184                         else if (node->prefetch_target >= target_prefetch_pages / 2)
185                                 node->prefetch_target = target_prefetch_pages;
186                         else if (node->prefetch_target > 0)
187                                 node->prefetch_target *= 2;
188                         else
189                                 node->prefetch_target++;
190 #endif   /* USE_PREFETCH */
191                 }
192                 else
193                 {
194                         /*
195                          * Continuing in previously obtained page; advance rs_cindex
196                          */
197                         scan->rs_cindex++;
198
199 #ifdef USE_PREFETCH
200
201                         /*
202                          * Try to prefetch at least a few pages even before we get to the
203                          * second page if we don't stop reading after the first tuple.
204                          */
205                         if (node->prefetch_target < target_prefetch_pages)
206                                 node->prefetch_target++;
207 #endif   /* USE_PREFETCH */
208                 }
209
210                 /*
211                  * Out of range?  If so, nothing more to look at on this page
212                  */
213                 if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
214                 {
215                         node->tbmres = tbmres = NULL;
216                         continue;
217                 }
218
219 #ifdef USE_PREFETCH
220
221                 /*
222                  * We issue prefetch requests *after* fetching the current page to try
223                  * to avoid having prefetching interfere with the main I/O. Also, this
224                  * should happen only when we have determined there is still something
225                  * to do on the current page, else we may uselessly prefetch the same
226                  * page we are just about to request for real.
227                  */
228                 if (prefetch_iterator)
229                 {
230                         while (node->prefetch_pages < node->prefetch_target)
231                         {
232                                 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
233
234                                 if (tbmpre == NULL)
235                                 {
236                                         /* No more pages to prefetch */
237                                         tbm_end_iterate(prefetch_iterator);
238                                         node->prefetch_iterator = prefetch_iterator = NULL;
239                                         break;
240                                 }
241                                 node->prefetch_pages++;
242                                 PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
243                         }
244                 }
245 #endif   /* USE_PREFETCH */
246
247                 /*
248                  * Okay to fetch the tuple
249                  */
250                 targoffset = scan->rs_vistuples[scan->rs_cindex];
251                 dp = (Page) BufferGetPage(scan->rs_cbuf);
252                 lp = PageGetItemId(dp, targoffset);
253                 Assert(ItemIdIsNormal(lp));
254
255                 scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
256                 scan->rs_ctup.t_len = ItemIdGetLength(lp);
257                 ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
258
259                 pgstat_count_heap_fetch(scan->rs_rd);
260
261                 /*
262                  * Set up the result slot to point to this tuple. Note that the slot
263                  * acquires a pin on the buffer.
264                  */
265                 ExecStoreTuple(&scan->rs_ctup,
266                                            slot,
267                                            scan->rs_cbuf,
268                                            false);
269
270                 /*
271                  * If we are using lossy info, we have to recheck the qual conditions
272                  * at every tuple.
273                  */
274                 if (tbmres->recheck)
275                 {
276                         econtext->ecxt_scantuple = slot;
277                         ResetExprContext(econtext);
278
279                         if (!ExecQual(node->bitmapqualorig, econtext, false))
280                         {
281                                 /* Fails recheck, so drop it and loop back for another */
282                                 ExecClearTuple(slot);
283                                 continue;
284                         }
285                 }
286
287                 /* OK to return this tuple */
288                 return slot;
289         }
290
291         /*
292          * if we get here it means we are at the end of the scan..
293          */
294         return ExecClearTuple(slot);
295 }
296
297 /*
298  * bitgetpage - subroutine for BitmapHeapNext()
299  *
300  * This routine reads and pins the specified page of the relation, then
301  * builds an array indicating which tuples on the page are both potentially
302  * interesting according to the bitmap, and visible according to the snapshot.
303  */
304 static void
305 bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
306 {
307         BlockNumber page = tbmres->blockno;
308         Buffer          buffer;
309         Snapshot        snapshot;
310         int                     ntup;
311
312         /*
313          * Acquire pin on the target heap page, trading in any pin we held before.
314          */
315         Assert(page < scan->rs_nblocks);
316
317         scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
318                                                                                  scan->rs_rd,
319                                                                                  page);
320         buffer = scan->rs_cbuf;
321         snapshot = scan->rs_snapshot;
322
323         ntup = 0;
324
325         /*
326          * Prune and repair fragmentation for the whole page, if possible.
327          */
328         Assert(TransactionIdIsValid(RecentGlobalXmin));
329         heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
330
331         /*
332          * We must hold share lock on the buffer content while examining tuple
333          * visibility.  Afterwards, however, the tuples we have found to be
334          * visible are guaranteed good as long as we hold the buffer pin.
335          */
336         LockBuffer(buffer, BUFFER_LOCK_SHARE);
337
338         /*
339          * We need two separate strategies for lossy and non-lossy cases.
340          */
341         if (tbmres->ntuples >= 0)
342         {
343                 /*
344                  * Bitmap is non-lossy, so we just look through the offsets listed in
345                  * tbmres; but we have to follow any HOT chain starting at each such
346                  * offset.
347                  */
348                 int                     curslot;
349
350                 for (curslot = 0; curslot < tbmres->ntuples; curslot++)
351                 {
352                         OffsetNumber offnum = tbmres->offsets[curslot];
353                         ItemPointerData tid;
354                         HeapTupleData   heapTuple;
355
356                         ItemPointerSet(&tid, page, offnum);
357                         if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
358                                                                            &heapTuple, NULL, true))
359                                 scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
360                 }
361         }
362         else
363         {
364                 /*
365                  * Bitmap is lossy, so we must examine each item pointer on the page.
366                  * But we can ignore HOT chains, since we'll check each tuple anyway.
367                  */
368                 Page            dp = (Page) BufferGetPage(buffer);
369                 OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
370                 OffsetNumber offnum;
371
372                 for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
373                 {
374                         ItemId          lp;
375                         HeapTupleData loctup;
376                         bool            valid;
377
378                         lp = PageGetItemId(dp, offnum);
379                         if (!ItemIdIsNormal(lp))
380                                 continue;
381                         loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
382                         loctup.t_len = ItemIdGetLength(lp);
383                         loctup.t_tableOid = scan->rs_rd->rd_id;
384                         ItemPointerSet(&loctup.t_self, page, offnum);
385                         valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
386                         if (valid)
387                         {
388                                 scan->rs_vistuples[ntup++] = offnum;
389                                 PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
390                         }
391                         CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
392                                                                                         buffer, snapshot);
393                 }
394         }
395
396         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
397
398         Assert(ntup <= MaxHeapTuplesPerPage);
399         scan->rs_ntuples = ntup;
400 }
401
402 /*
403  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
404  */
405 static bool
406 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
407 {
408         ExprContext *econtext;
409
410         /*
411          * extract necessary information from index scan node
412          */
413         econtext = node->ss.ps.ps_ExprContext;
414
415         /* Does the tuple meet the original qual conditions? */
416         econtext->ecxt_scantuple = slot;
417
418         ResetExprContext(econtext);
419
420         return ExecQual(node->bitmapqualorig, econtext, false);
421 }
422
423 /* ----------------------------------------------------------------
424  *              ExecBitmapHeapScan(node)
425  * ----------------------------------------------------------------
426  */
427 TupleTableSlot *
428 ExecBitmapHeapScan(BitmapHeapScanState *node)
429 {
430         return ExecScan(&node->ss,
431                                         (ExecScanAccessMtd) BitmapHeapNext,
432                                         (ExecScanRecheckMtd) BitmapHeapRecheck);
433 }
434
435 /* ----------------------------------------------------------------
436  *              ExecReScanBitmapHeapScan(node)
437  * ----------------------------------------------------------------
438  */
439 void
440 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
441 {
442         /* rescan to release any page pin */
443         heap_rescan(node->ss.ss_currentScanDesc, NULL);
444
445         if (node->tbmiterator)
446                 tbm_end_iterate(node->tbmiterator);
447         if (node->prefetch_iterator)
448                 tbm_end_iterate(node->prefetch_iterator);
449         if (node->tbm)
450                 tbm_free(node->tbm);
451         node->tbm = NULL;
452         node->tbmiterator = NULL;
453         node->tbmres = NULL;
454         node->prefetch_iterator = NULL;
455
456         ExecScanReScan(&node->ss);
457
458         /*
459          * if chgParam of subnode is not null then plan will be re-scanned by
460          * first ExecProcNode.
461          */
462         if (node->ss.ps.lefttree->chgParam == NULL)
463                 ExecReScan(node->ss.ps.lefttree);
464 }
465
466 /* ----------------------------------------------------------------
467  *              ExecEndBitmapHeapScan
468  * ----------------------------------------------------------------
469  */
470 void
471 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
472 {
473         Relation        relation;
474         HeapScanDesc scanDesc;
475
476         /*
477          * extract information from the node
478          */
479         relation = node->ss.ss_currentRelation;
480         scanDesc = node->ss.ss_currentScanDesc;
481
482         /*
483          * Free the exprcontext
484          */
485         ExecFreeExprContext(&node->ss.ps);
486
487         /*
488          * clear out tuple table slots
489          */
490         ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
491         ExecClearTuple(node->ss.ss_ScanTupleSlot);
492
493         /*
494          * close down subplans
495          */
496         ExecEndNode(outerPlanState(node));
497
498         /*
499          * release bitmap if any
500          */
501         if (node->tbmiterator)
502                 tbm_end_iterate(node->tbmiterator);
503         if (node->prefetch_iterator)
504                 tbm_end_iterate(node->prefetch_iterator);
505         if (node->tbm)
506                 tbm_free(node->tbm);
507
508         /*
509          * close heap scan
510          */
511         heap_endscan(scanDesc);
512
513         /*
514          * close the heap relation.
515          */
516         ExecCloseScanRelation(relation);
517 }
518
519 /* ----------------------------------------------------------------
520  *              ExecInitBitmapHeapScan
521  *
522  *              Initializes the scan's state information.
523  * ----------------------------------------------------------------
524  */
525 BitmapHeapScanState *
526 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
527 {
528         BitmapHeapScanState *scanstate;
529         Relation        currentRelation;
530
531         /* check for unsupported flags */
532         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
533
534         /*
535          * Assert caller didn't ask for an unsafe snapshot --- see comments at
536          * head of file.
537          */
538         Assert(IsMVCCSnapshot(estate->es_snapshot));
539
540         /*
541          * create state structure
542          */
543         scanstate = makeNode(BitmapHeapScanState);
544         scanstate->ss.ps.plan = (Plan *) node;
545         scanstate->ss.ps.state = estate;
546
547         scanstate->tbm = NULL;
548         scanstate->tbmiterator = NULL;
549         scanstate->tbmres = NULL;
550         scanstate->prefetch_iterator = NULL;
551         scanstate->prefetch_pages = 0;
552         scanstate->prefetch_target = 0;
553
554         /*
555          * Miscellaneous initialization
556          *
557          * create expression context for node
558          */
559         ExecAssignExprContext(estate, &scanstate->ss.ps);
560
561         scanstate->ss.ps.ps_TupFromTlist = false;
562
563         /*
564          * initialize child expressions
565          */
566         scanstate->ss.ps.targetlist = (List *)
567                 ExecInitExpr((Expr *) node->scan.plan.targetlist,
568                                          (PlanState *) scanstate);
569         scanstate->ss.ps.qual = (List *)
570                 ExecInitExpr((Expr *) node->scan.plan.qual,
571                                          (PlanState *) scanstate);
572         scanstate->bitmapqualorig = (List *)
573                 ExecInitExpr((Expr *) node->bitmapqualorig,
574                                          (PlanState *) scanstate);
575
576         /*
577          * tuple table initialization
578          */
579         ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
580         ExecInitScanTupleSlot(estate, &scanstate->ss);
581
582         /*
583          * open the base relation and acquire appropriate lock on it.
584          */
585         currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);
586
587         scanstate->ss.ss_currentRelation = currentRelation;
588
589         /*
590          * Even though we aren't going to do a conventional seqscan, it is useful
591          * to create a HeapScanDesc --- most of the fields in it are usable.
592          */
593         scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
594                                                                                                                  estate->es_snapshot,
595                                                                                                                  0,
596                                                                                                                  NULL);
597
598         /*
599          * get the scan type from the relation descriptor.
600          */
601         ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
602
603         /*
604          * Initialize result tuple type and projection info.
605          */
606         ExecAssignResultTypeFromTL(&scanstate->ss.ps);
607         ExecAssignScanProjectionInfo(&scanstate->ss);
608
609         /*
610          * initialize child nodes
611          *
612          * We do this last because the child nodes will open indexscans on our
613          * relation's indexes, and we want to be sure we have acquired a lock on
614          * the relation first.
615          */
616         outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
617
618         /*
619          * all done.
620          */
621         return scanstate;
622 }