1 /*-------------------------------------------------------------------------
4 * Utility code for Postgres btree implementation.
6 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/nbtree/nbtutils.c
13 *-------------------------------------------------------------------------
20 #include "access/genam.h"
21 #include "access/nbtree.h"
22 #include "access/reloptions.h"
23 #include "access/relscan.h"
24 #include "executor/execdebug.h"
25 #include "miscadmin.h"
26 #include "storage/bufmgr.h"
27 #include "storage/lwlock.h"
28 #include "storage/shmem.h"
29 #include "utils/lsyscache.h"
32 static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
33 ScanKey leftarg, ScanKey rightarg,
35 static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
36 static void _bt_mark_scankey_required(ScanKey skey);
37 static bool _bt_check_rowcompare(ScanKey skey,
38 IndexTuple tuple, TupleDesc tupdesc,
39 ScanDirection dir, bool *continuescan);
44 * Build an insertion scan key that contains comparison data from itup
45 * as well as comparator routines appropriate to the key datatypes.
47 * The result is intended for use with _bt_compare().
50 _bt_mkscankey(Relation rel, IndexTuple itup)
58 itupdesc = RelationGetDescr(rel);
59 natts = RelationGetNumberOfAttributes(rel);
60 indoption = rel->rd_indoption;
62 skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
64 for (i = 0; i < natts; i++)
72 * We can use the cached (default) support procs since no cross-type
73 * comparison can be needed.
75 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
76 arg = index_getattr(itup, i + 1, itupdesc, &null);
77 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
78 ScanKeyEntryInitializeWithInfo(&skey[i],
83 rel->rd_indcollation[i],
92 * _bt_mkscankey_nodata
93 * Build an insertion scan key that contains 3-way comparator routines
94 * appropriate to the key datatypes, but no comparison data. The
95 * comparison data ultimately used must match the key datatypes.
97 * The result cannot be used with _bt_compare(), unless comparison
98 * data is first stored into the key entries. Currently this
99 * routine is only called by nbtsort.c and tuplesort.c, which have
100 * their own comparison routines.
103 _bt_mkscankey_nodata(Relation rel)
110 natts = RelationGetNumberOfAttributes(rel);
111 indoption = rel->rd_indoption;
113 skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
115 for (i = 0; i < natts; i++)
121 * We can use the cached (default) support procs since no cross-type
122 * comparison can be needed.
124 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
125 flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
126 ScanKeyEntryInitializeWithInfo(&skey[i],
128 (AttrNumber) (i + 1),
131 rel->rd_indcollation[i],
140 * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
143 _bt_freeskey(ScanKey skey)
149 * free a retracement stack made by _bt_search.
152 _bt_freestack(BTStack stack)
156 while (stack != NULL)
159 stack = stack->bts_parent;
166 * _bt_preprocess_keys() -- Preprocess scan keys
168 * The caller-supplied search-type keys (in scan->keyData[]) are copied to
169 * so->keyData[] with possible transformation. scan->numberOfKeys is
170 * the number of input keys, so->numberOfKeys gets the number of output
171 * keys (possibly less, never greater).
173 * The output keys are marked with additional sk_flag bits beyond the
174 * system-standard bits supplied by the caller. The DESC and NULLS_FIRST
175 * indoption bits for the relevant index attribute are copied into the flags.
176 * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
177 * so that the index sorts in the desired direction.
179 * One key purpose of this routine is to discover how many scan keys
180 * must be satisfied to continue the scan. It also attempts to eliminate
181 * redundant keys and detect contradictory keys. (If the index opfamily
182 * provides incomplete sets of cross-type operators, we may fail to detect
183 * redundant or contradictory keys, but we can survive that.)
185 * The output keys must be sorted by index attribute. Presently we expect
186 * (but verify) that the input keys are already so sorted --- this is done
187 * by group_clauses_by_indexkey() in indxpath.c. Some reordering of the keys
188 * within each attribute may be done as a byproduct of the processing here,
189 * but no other code depends on that.
191 * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
192 * if they must be satisfied in order to continue the scan forward or backward
193 * respectively. _bt_checkkeys uses these flags. For example, if the quals
194 * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
195 * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
196 * But once we reach tuples like (1,4,z) we can stop scanning because no
197 * later tuples could match. This is reflected by marking the x and y keys,
198 * but not the z key, with SK_BT_REQFWD. In general, the keys for leading
199 * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
200 * For the first attribute without an "=" key, any "<" and "<=" keys are
201 * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
202 * This can be seen to be correct by considering the above example. Note
203 * in particular that if there are no keys for a given attribute, the keys for
204 * subsequent attributes can never be required; for instance "WHERE y = 4"
205 * requires a full-index scan.
207 * If possible, redundant keys are eliminated: we keep only the tightest
208 * >/>= bound and the tightest </<= bound, and if there's an = key then
209 * that's the only one returned. (So, we return either a single = key,
210 * or one or two boundary-condition keys for each attr.) However, if we
211 * cannot compare two keys for lack of a suitable cross-type operator,
212 * we cannot eliminate either. If there are two such keys of the same
213 * operator strategy, the second one is just pushed into the output array
214 * without further processing here. We may also emit both >/>= or both
215 * </<= keys if we can't compare them. The logic about required keys still
216 * works if we don't eliminate redundant keys.
218 * As a byproduct of this work, we can detect contradictory quals such
219 * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = FALSE,
220 * indicating the scan need not be run at all since no tuples can match.
221 * (In this case we do not bother completing the output key array!)
222 * Again, missing cross-type operators might cause us to fail to prove the
223 * quals contradictory when they really are, but the scan will work correctly.
225 * Row comparison keys are currently also treated without any smarts:
226 * we just transfer them into the preprocessed array without any
227 * editorialization. We can treat them the same as an ordinary inequality
228 * comparison on the row's first index column, for the purposes of the logic
229 * about required keys.
231 * Note: the reason we have to copy the preprocessed scan keys into private
232 * storage is that we are modifying the array based on comparisons of the
233 * key argument values, which could change on a rescan. Therefore we can't
234 * overwrite the caller's data structure.
237 _bt_preprocess_keys(IndexScanDesc scan)
239 BTScanOpaque so = (BTScanOpaque) scan->opaque;
240 int numberOfKeys = scan->numberOfKeys;
241 int16 *indoption = scan->indexRelation->rd_indoption;
242 int new_numberOfKeys;
243 int numberOfEqualCols;
247 ScanKey xform[BTMaxStrategyNumber];
253 /* initialize result variables */
255 so->numberOfKeys = 0;
257 if (numberOfKeys < 1)
258 return; /* done if qual-less scan */
260 inkeys = scan->keyData;
261 outkeys = so->keyData;
263 /* we check that input keys are correctly ordered */
264 if (cur->sk_attno < 1)
265 elog(ERROR, "btree index keys must be ordered by attribute");
267 /* We can short-circuit most of the work if there's just one key */
268 if (numberOfKeys == 1)
270 /* Apply indoption to scankey (might change sk_strategy!) */
271 if (!_bt_fix_scankey_strategy(cur, indoption))
273 memcpy(outkeys, cur, sizeof(ScanKeyData));
274 so->numberOfKeys = 1;
275 /* We can mark the qual as required if it's for first index col */
276 if (cur->sk_attno == 1)
277 _bt_mark_scankey_required(outkeys);
282 * Otherwise, do the full set of pushups.
284 new_numberOfKeys = 0;
285 numberOfEqualCols = 0;
288 * Initialize for processing of keys for attr 1.
290 * xform[i] points to the currently best scan key of strategy type i+1; it
291 * is NULL if we haven't yet found such a key for this attr.
294 memset(xform, 0, sizeof(xform));
297 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
298 * handle after-last-key processing. Actual exit from the loop is at the
299 * "break" statement below.
301 for (i = 0;; cur++, i++)
303 if (i < numberOfKeys)
305 /* Apply indoption to scankey (might change sk_strategy!) */
306 if (!_bt_fix_scankey_strategy(cur, indoption))
308 /* NULL can't be matched, so give up */
315 * If we are at the end of the keys for a particular attr, finish up
316 * processing and emit the cleaned-up keys.
318 if (i == numberOfKeys || cur->sk_attno != attno)
320 int priorNumberOfEqualCols = numberOfEqualCols;
322 /* check input keys are correctly ordered */
323 if (i < numberOfKeys && cur->sk_attno < attno)
324 elog(ERROR, "btree index keys must be ordered by attribute");
327 * If = has been specified, all other keys can be eliminated as
328 * redundant. If we have a case like key = 1 AND key > 2, we can
329 * set qual_ok to false and abandon further processing.
331 * We also have to deal with the case of "key IS NULL", which is
332 * unsatisfiable in combination with any other index condition.
333 * By the time we get here, that's been classified as an equality
334 * check, and we've rejected any combination of it with a regular
335 * equality condition; but not with other types of conditions.
337 if (xform[BTEqualStrategyNumber - 1])
339 ScanKey eq = xform[BTEqualStrategyNumber - 1];
341 for (j = BTMaxStrategyNumber; --j >= 0;)
343 ScanKey chk = xform[j];
345 if (!chk || j == (BTEqualStrategyNumber - 1))
348 if (eq->sk_flags & SK_SEARCHNULL)
350 /* IS NULL is contradictory to anything else */
355 if (_bt_compare_scankey_args(scan, chk, eq, chk,
360 /* keys proven mutually contradictory */
364 /* else discard the redundant non-equality key */
367 /* else, cannot determine redundancy, keep both keys */
369 /* track number of attrs for which we have "=" keys */
373 /* try to keep only one of <, <= */
374 if (xform[BTLessStrategyNumber - 1]
375 && xform[BTLessEqualStrategyNumber - 1])
377 ScanKey lt = xform[BTLessStrategyNumber - 1];
378 ScanKey le = xform[BTLessEqualStrategyNumber - 1];
380 if (_bt_compare_scankey_args(scan, le, lt, le,
384 xform[BTLessEqualStrategyNumber - 1] = NULL;
386 xform[BTLessStrategyNumber - 1] = NULL;
390 /* try to keep only one of >, >= */
391 if (xform[BTGreaterStrategyNumber - 1]
392 && xform[BTGreaterEqualStrategyNumber - 1])
394 ScanKey gt = xform[BTGreaterStrategyNumber - 1];
395 ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
397 if (_bt_compare_scankey_args(scan, ge, gt, ge,
401 xform[BTGreaterEqualStrategyNumber - 1] = NULL;
403 xform[BTGreaterStrategyNumber - 1] = NULL;
408 * Emit the cleaned-up keys into the outkeys[] array, and then
409 * mark them if they are required. They are required (possibly
410 * only in one direction) if all attrs before this one had "=".
412 for (j = BTMaxStrategyNumber; --j >= 0;)
416 ScanKey outkey = &outkeys[new_numberOfKeys++];
418 memcpy(outkey, xform[j], sizeof(ScanKeyData));
419 if (priorNumberOfEqualCols == attno - 1)
420 _bt_mark_scankey_required(outkey);
425 * Exit loop here if done.
427 if (i == numberOfKeys)
430 /* Re-initialize for new attno */
431 attno = cur->sk_attno;
432 memset(xform, 0, sizeof(xform));
435 /* check strategy this key's operator corresponds to */
436 j = cur->sk_strategy - 1;
438 /* if row comparison, push it directly to the output array */
439 if (cur->sk_flags & SK_ROW_HEADER)
441 ScanKey outkey = &outkeys[new_numberOfKeys++];
443 memcpy(outkey, cur, sizeof(ScanKeyData));
444 if (numberOfEqualCols == attno - 1)
445 _bt_mark_scankey_required(outkey);
448 * We don't support RowCompare using equality; such a qual would
449 * mess up the numberOfEqualCols tracking.
451 Assert(j != (BTEqualStrategyNumber - 1));
455 /* have we seen one of these before? */
456 if (xform[j] == NULL)
458 /* nope, so remember this scankey */
463 /* yup, keep only the more restrictive key */
464 if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
469 else if (j == (BTEqualStrategyNumber - 1))
471 /* key == a && key == b, but a != b */
475 /* else old key is more restrictive, keep it */
480 * We can't determine which key is more restrictive. Keep the
481 * previous one in xform[j] and push this one directly to the
484 ScanKey outkey = &outkeys[new_numberOfKeys++];
486 memcpy(outkey, cur, sizeof(ScanKeyData));
487 if (numberOfEqualCols == attno - 1)
488 _bt_mark_scankey_required(outkey);
493 so->numberOfKeys = new_numberOfKeys;
497 * Compare two scankey values using a specified operator.
499 * The test we want to perform is logically "leftarg op rightarg", where
500 * leftarg and rightarg are the sk_argument values in those ScanKeys, and
501 * the comparison operator is the one in the op ScanKey. However, in
502 * cross-data-type situations we may need to look up the correct operator in
503 * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
504 * and amoplefttype/amoprighttype equal to the two argument datatypes.
506 * If the opfamily doesn't supply a complete set of cross-type operators we
507 * may not be able to make the comparison. If we can make the comparison
508 * we store the operator result in *result and return TRUE. We return FALSE
509 * if the comparison could not be made.
511 * Note: op always points at the same ScanKey as either leftarg or rightarg.
512 * Since we don't scribble on the scankeys, this aliasing should cause no
515 * Note: this routine needs to be insensitive to any DESC option applied
516 * to the index column. For example, "x < 4" is a tighter constraint than
517 * "x < 5" regardless of which way the index is sorted.
520 _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
521 ScanKey leftarg, ScanKey rightarg,
524 Relation rel = scan->indexRelation;
530 StrategyNumber strat;
533 * First, deal with cases where one or both args are NULL. This should
534 * only happen when the scankeys represent IS NULL/NOT NULL conditions.
536 if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
541 if (leftarg->sk_flags & SK_ISNULL)
543 Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
548 if (rightarg->sk_flags & SK_ISNULL)
550 Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
557 * We treat NULL as either greater than or less than all other values.
558 * Since true > false, the tests below work correctly for NULLS LAST
559 * logic. If the index is NULLS FIRST, we need to flip the strategy.
561 strat = op->sk_strategy;
562 if (op->sk_flags & SK_BT_NULLS_FIRST)
563 strat = BTCommuteStrategyNumber(strat);
567 case BTLessStrategyNumber:
568 *result = (leftnull < rightnull);
570 case BTLessEqualStrategyNumber:
571 *result = (leftnull <= rightnull);
573 case BTEqualStrategyNumber:
574 *result = (leftnull == rightnull);
576 case BTGreaterEqualStrategyNumber:
577 *result = (leftnull >= rightnull);
579 case BTGreaterStrategyNumber:
580 *result = (leftnull > rightnull);
583 elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
584 *result = false; /* keep compiler quiet */
591 * The opfamily we need to worry about is identified by the index column.
593 Assert(leftarg->sk_attno == rightarg->sk_attno);
595 opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
598 * Determine the actual datatypes of the ScanKey arguments. We have to
599 * support the convention that sk_subtype == InvalidOid means the opclass
600 * input type; this is a hack to simplify life for ScanKeyInit().
602 lefttype = leftarg->sk_subtype;
603 if (lefttype == InvalidOid)
604 lefttype = opcintype;
605 righttype = rightarg->sk_subtype;
606 if (righttype == InvalidOid)
607 righttype = opcintype;
608 optype = op->sk_subtype;
609 if (optype == InvalidOid)
613 * If leftarg and rightarg match the types expected for the "op" scankey,
614 * we can use its already-looked-up comparison function.
616 if (lefttype == opcintype && righttype == optype)
618 *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
620 leftarg->sk_argument,
621 rightarg->sk_argument));
626 * Otherwise, we need to go to the syscache to find the appropriate
627 * operator. (This cannot result in infinite recursion, since no
628 * indexscan initiated by syscache lookup will use cross-data-type
631 * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
632 * un-flip it to get the correct opfamily member.
634 strat = op->sk_strategy;
635 if (op->sk_flags & SK_BT_DESC)
636 strat = BTCommuteStrategyNumber(strat);
638 cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
642 if (OidIsValid(cmp_op))
644 RegProcedure cmp_proc = get_opcode(cmp_op);
646 if (RegProcedureIsValid(cmp_proc))
648 *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
650 leftarg->sk_argument,
651 rightarg->sk_argument));
656 /* Can't make the comparison */
657 *result = false; /* suppress compiler warnings */
662 * Adjust a scankey's strategy and flags setting as needed for indoptions.
664 * We copy the appropriate indoption value into the scankey sk_flags
665 * (shifting to avoid clobbering system-defined flag bits). Also, if
666 * the DESC option is set, commute (flip) the operator strategy number.
668 * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
669 * the strategy field correctly for them.
671 * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
672 * NULL comparison value. Since all btree operators are assumed strict,
673 * a NULL means that the qual cannot be satisfied. We return TRUE if the
674 * comparison value isn't NULL, or FALSE if the scan should be abandoned.
676 * This function is applied to the *input* scankey structure; therefore
677 * on a rescan we will be looking at already-processed scankeys. Hence
678 * we have to be careful not to re-commute the strategy if we already did it.
679 * It's a bit ugly to modify the caller's copy of the scankey but in practice
680 * there shouldn't be any problem, since the index's indoptions are certainly
681 * not going to change while the scankey survives.
684 _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
688 addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
691 * We treat all btree operators as strict (even if they're not so marked
692 * in pg_proc). This means that it is impossible for an operator condition
693 * with a NULL comparison constant to succeed, and we can reject it right
696 * However, we now also support "x IS NULL" clauses as search conditions,
697 * so in that case keep going. The planner has not filled in any
698 * particular strategy in this case, so set it to BTEqualStrategyNumber
699 * --- we can treat IS NULL as an equality operator for purposes of search
702 * Likewise, "x IS NOT NULL" is supported. We treat that as either "less
703 * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
706 * Note: someday we might have to fill in sk_collation from the index
707 * column's collation. At the moment this is a non-issue because we'll
708 * never actually call the comparison operator on a NULL.
710 if (skey->sk_flags & SK_ISNULL)
712 /* SK_ISNULL shouldn't be set in a row header scankey */
713 Assert(!(skey->sk_flags & SK_ROW_HEADER));
715 /* Set indoption flags in scankey (might be done already) */
716 skey->sk_flags |= addflags;
718 /* Set correct strategy for IS NULL or NOT NULL search */
719 if (skey->sk_flags & SK_SEARCHNULL)
721 skey->sk_strategy = BTEqualStrategyNumber;
722 skey->sk_subtype = InvalidOid;
723 skey->sk_collation = InvalidOid;
725 else if (skey->sk_flags & SK_SEARCHNOTNULL)
727 if (skey->sk_flags & SK_BT_NULLS_FIRST)
728 skey->sk_strategy = BTGreaterStrategyNumber;
730 skey->sk_strategy = BTLessStrategyNumber;
731 skey->sk_subtype = InvalidOid;
732 skey->sk_collation = InvalidOid;
736 /* regular qual, so it cannot be satisfied */
740 /* Needn't do the rest */
744 /* Adjust strategy for DESC, if we didn't already */
745 if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
746 skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
747 skey->sk_flags |= addflags;
749 /* If it's a row header, fix row member flags and strategies similarly */
750 if (skey->sk_flags & SK_ROW_HEADER)
752 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
756 Assert(subkey->sk_flags & SK_ROW_MEMBER);
757 addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
758 if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
759 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
760 subkey->sk_flags |= addflags;
761 if (subkey->sk_flags & SK_ROW_END)
771 * Mark a scankey as "required to continue the scan".
773 * Depending on the operator type, the key may be required for both scan
774 * directions or just one. Also, if the key is a row comparison header,
775 * we have to mark the appropriate subsidiary ScanKeys as required. In
776 * such cases, the first subsidiary key is required, but subsequent ones
777 * are required only as long as they correspond to successive index columns
778 * and match the leading column as to sort direction.
779 * Otherwise the row comparison ordering is different from the index ordering
780 * and so we can't stop the scan on the basis of those lower-order columns.
782 * Note: when we set required-key flag bits in a subsidiary scankey, we are
783 * scribbling on a data structure belonging to the index AM's caller, not on
784 * our private copy. This should be OK because the marking will not change
785 * from scan to scan within a query, and so we'd just re-mark the same way
786 * anyway on a rescan. Something to keep an eye on though.
789 _bt_mark_scankey_required(ScanKey skey)
793 switch (skey->sk_strategy)
795 case BTLessStrategyNumber:
796 case BTLessEqualStrategyNumber:
797 addflags = SK_BT_REQFWD;
799 case BTEqualStrategyNumber:
800 addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
802 case BTGreaterEqualStrategyNumber:
803 case BTGreaterStrategyNumber:
804 addflags = SK_BT_REQBKWD;
807 elog(ERROR, "unrecognized StrategyNumber: %d",
808 (int) skey->sk_strategy);
809 addflags = 0; /* keep compiler quiet */
813 skey->sk_flags |= addflags;
815 if (skey->sk_flags & SK_ROW_HEADER)
817 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
818 AttrNumber attno = skey->sk_attno;
820 /* First subkey should be same as the header says */
821 Assert(subkey->sk_attno == attno);
825 Assert(subkey->sk_flags & SK_ROW_MEMBER);
826 if (subkey->sk_attno != attno)
827 break; /* non-adjacent key, so not required */
828 if (subkey->sk_strategy != skey->sk_strategy)
829 break; /* wrong direction, so not required */
830 subkey->sk_flags |= addflags;
831 if (subkey->sk_flags & SK_ROW_END)
840 * Test whether an indextuple satisfies all the scankey conditions.
842 * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
843 * If not, return FALSE (xs_ctup is not changed).
845 * If the tuple fails to pass the qual, we also determine whether there's
846 * any need to continue the scan beyond this tuple, and set *continuescan
847 * accordingly. See comments for _bt_preprocess_keys(), above, about how
850 * scan: index scan descriptor (containing a search-type scankey)
851 * page: buffer page containing index tuple
852 * offnum: offset number of index tuple (must be a valid item!)
853 * dir: direction we are scanning in
854 * continuescan: output parameter (will be set correctly in all cases)
857 _bt_checkkeys(IndexScanDesc scan,
858 Page page, OffsetNumber offnum,
859 ScanDirection dir, bool *continuescan)
861 ItemId iid = PageGetItemId(page, offnum);
870 *continuescan = true; /* default assumption */
873 * If the scan specifies not to return killed tuples, then we treat a
874 * killed tuple as not passing the qual. Most of the time, it's a win to
875 * not bother examining the tuple's index keys, but just return
876 * immediately with continuescan = true to proceed to the next tuple.
877 * However, if this is the last tuple on the page, we should check the
878 * index keys to prevent uselessly advancing to the next page.
880 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
882 /* return immediately if there are more tuples on the page */
883 if (ScanDirectionIsForward(dir))
885 if (offnum < PageGetMaxOffsetNumber(page))
890 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
892 if (offnum > P_FIRSTDATAKEY(opaque))
897 * OK, we want to check the keys, but we'll return FALSE even if the
898 * tuple passes the key tests.
905 tuple = (IndexTuple) PageGetItem(page, iid);
907 tupdesc = RelationGetDescr(scan->indexRelation);
908 so = (BTScanOpaque) scan->opaque;
909 keysz = so->numberOfKeys;
911 for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
917 /* row-comparison keys need special processing */
918 if (key->sk_flags & SK_ROW_HEADER)
920 if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
925 datum = index_getattr(tuple,
930 if (key->sk_flags & SK_ISNULL)
932 /* Handle IS NULL/NOT NULL tests */
933 if (key->sk_flags & SK_SEARCHNULL)
936 continue; /* tuple satisfies this qual */
940 Assert(key->sk_flags & SK_SEARCHNOTNULL);
942 continue; /* tuple satisfies this qual */
946 * Tuple fails this qual. If it's a required qual for the current
947 * scan direction, then we can conclude no further tuples will
950 if ((key->sk_flags & SK_BT_REQFWD) &&
951 ScanDirectionIsForward(dir))
952 *continuescan = false;
953 else if ((key->sk_flags & SK_BT_REQBKWD) &&
954 ScanDirectionIsBackward(dir))
955 *continuescan = false;
958 * In any case, this indextuple doesn't match the qual.
965 if (key->sk_flags & SK_BT_NULLS_FIRST)
968 * Since NULLs are sorted before non-NULLs, we know we have
969 * reached the lower limit of the range of values for this
970 * index attr. On a backward scan, we can stop if this qual
971 * is one of the "must match" subset. On a forward scan,
972 * however, we should keep going.
974 if ((key->sk_flags & SK_BT_REQBKWD) &&
975 ScanDirectionIsBackward(dir))
976 *continuescan = false;
981 * Since NULLs are sorted after non-NULLs, we know we have
982 * reached the upper limit of the range of values for this
983 * index attr. On a forward scan, we can stop if this qual is
984 * one of the "must match" subset. On a backward scan,
985 * however, we should keep going.
987 if ((key->sk_flags & SK_BT_REQFWD) &&
988 ScanDirectionIsForward(dir))
989 *continuescan = false;
993 * In any case, this indextuple doesn't match the qual.
998 test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
999 datum, key->sk_argument);
1001 if (!DatumGetBool(test))
1004 * Tuple fails this qual. If it's a required qual for the current
1005 * scan direction, then we can conclude no further tuples will
1008 * Note: because we stop the scan as soon as any required equality
1009 * qual fails, it is critical that equality quals be used for the
1010 * initial positioning in _bt_first() when they are available. See
1011 * comments in _bt_first().
1013 if ((key->sk_flags & SK_BT_REQFWD) &&
1014 ScanDirectionIsForward(dir))
1015 *continuescan = false;
1016 else if ((key->sk_flags & SK_BT_REQBKWD) &&
1017 ScanDirectionIsBackward(dir))
1018 *continuescan = false;
1021 * In any case, this indextuple doesn't match the qual.
1027 /* If we get here, the tuple passes all index quals. */
1029 scan->xs_ctup.t_self = tuple->t_tid;
1035 * Test whether an indextuple satisfies a row-comparison scan condition.
1037 * Return true if so, false if not. If not, also clear *continuescan if
1038 * it's not possible for any future tuples in the current scan direction
1041 * This is a subroutine for _bt_checkkeys, which see for more info.
1044 _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
1045 ScanDirection dir, bool *continuescan)
1047 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1048 int32 cmpresult = 0;
1051 /* First subkey should be same as the header says */
1052 Assert(subkey->sk_attno == skey->sk_attno);
1054 /* Loop over columns of the row condition */
1060 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1062 datum = index_getattr(tuple,
1069 if (subkey->sk_flags & SK_BT_NULLS_FIRST)
1072 * Since NULLs are sorted before non-NULLs, we know we have
1073 * reached the lower limit of the range of values for this
1074 * index attr. On a backward scan, we can stop if this qual is
1075 * one of the "must match" subset. On a forward scan,
1076 * however, we should keep going.
1078 if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1079 ScanDirectionIsBackward(dir))
1080 *continuescan = false;
1085 * Since NULLs are sorted after non-NULLs, we know we have
1086 * reached the upper limit of the range of values for this
1087 * index attr. On a forward scan, we can stop if this qual is
1088 * one of the "must match" subset. On a backward scan,
1089 * however, we should keep going.
1091 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1092 ScanDirectionIsForward(dir))
1093 *continuescan = false;
1097 * In any case, this indextuple doesn't match the qual.
1102 if (subkey->sk_flags & SK_ISNULL)
1105 * Unlike the simple-scankey case, this isn't a disallowed case.
1106 * But it can never match. If all the earlier row comparison
1107 * columns are required for the scan direction, we can stop the
1108 * scan, because there can't be another tuple that will succeed.
1110 if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
1112 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1113 ScanDirectionIsForward(dir))
1114 *continuescan = false;
1115 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1116 ScanDirectionIsBackward(dir))
1117 *continuescan = false;
1121 /* Perform the test --- three-way comparison not bool operator */
1122 cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
1123 subkey->sk_collation,
1125 subkey->sk_argument));
1127 if (subkey->sk_flags & SK_BT_DESC)
1128 cmpresult = -cmpresult;
1130 /* Done comparing if unequal, else advance to next column */
1134 if (subkey->sk_flags & SK_ROW_END)
1140 * At this point cmpresult indicates the overall result of the row
1141 * comparison, and subkey points to the deciding column (or the last
1142 * column if the result is "=").
1144 switch (subkey->sk_strategy)
1146 /* EQ and NE cases aren't allowed here */
1147 case BTLessStrategyNumber:
1148 result = (cmpresult < 0);
1150 case BTLessEqualStrategyNumber:
1151 result = (cmpresult <= 0);
1153 case BTGreaterEqualStrategyNumber:
1154 result = (cmpresult >= 0);
1156 case BTGreaterStrategyNumber:
1157 result = (cmpresult > 0);
1160 elog(ERROR, "unrecognized RowCompareType: %d",
1161 (int) subkey->sk_strategy);
1162 result = 0; /* keep compiler quiet */
1169 * Tuple fails this qual. If it's a required qual for the current
1170 * scan direction, then we can conclude no further tuples will pass,
1171 * either. Note we have to look at the deciding column, not
1172 * necessarily the first or last column of the row condition.
1174 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1175 ScanDirectionIsForward(dir))
1176 *continuescan = false;
1177 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1178 ScanDirectionIsBackward(dir))
1179 *continuescan = false;
1186 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
1187 * told us were killed
1189 * scan->so contains information about the current page and killed tuples
1190 * thereon (generally, this should only be called if so->numKilled > 0).
1192 * The caller must have pin on so->currPos.buf, but may or may not have
1193 * read-lock, as indicated by haveLock. Note that we assume read-lock
1194 * is sufficient for setting LP_DEAD status (which is only a hint).
1196 * We match items by heap TID before assuming they are the right ones to
1197 * delete. We cope with cases where items have moved right due to insertions.
1198 * If an item has moved off the current page due to a split, we'll fail to
1199 * find it and do nothing (this is not an error case --- we assume the item
1200 * will eventually get marked in a future indexscan). Note that because we
1201 * hold pin on the target page continuously from initially reading the items
1202 * until applying this function, VACUUM cannot have deleted any items from
1203 * the page, and so there is no need to search left from the recorded offset.
1204 * (This observation also guarantees that the item is still the right one
1205 * to delete, which might otherwise be questionable since heap TIDs can get
1209 _bt_killitems(IndexScanDesc scan, bool haveLock)
1211 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1213 BTPageOpaque opaque;
1214 OffsetNumber minoff;
1215 OffsetNumber maxoff;
1217 bool killedsomething = false;
1219 Assert(BufferIsValid(so->currPos.buf));
1222 LockBuffer(so->currPos.buf, BT_READ);
1224 page = BufferGetPage(so->currPos.buf);
1225 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1226 minoff = P_FIRSTDATAKEY(opaque);
1227 maxoff = PageGetMaxOffsetNumber(page);
1229 for (i = 0; i < so->numKilled; i++)
1231 int itemIndex = so->killedItems[i];
1232 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
1233 OffsetNumber offnum = kitem->indexOffset;
1235 Assert(itemIndex >= so->currPos.firstItem &&
1236 itemIndex <= so->currPos.lastItem);
1237 if (offnum < minoff)
1238 continue; /* pure paranoia */
1239 while (offnum <= maxoff)
1241 ItemId iid = PageGetItemId(page, offnum);
1242 IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
1244 if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
1246 /* found the item */
1247 ItemIdMarkDead(iid);
1248 killedsomething = true;
1249 break; /* out of inner search loop */
1251 offnum = OffsetNumberNext(offnum);
1256 * Since this can be redone later if needed, it's treated the same as a
1257 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
1258 * but don't make a WAL log entry.
1260 * Whenever we mark anything LP_DEAD, we also set the page's
1261 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
1263 if (killedsomething)
1265 opaque->btpo_flags |= BTP_HAS_GARBAGE;
1266 SetBufferCommitInfoNeedsSave(so->currPos.buf);
1270 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1273 * Always reset the scan state, so we don't look for same items on other
1281 * The following routines manage a shared-memory area in which we track
1282 * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
1283 * operations. There is a single counter which increments each time we
1284 * start a vacuum to assign it a cycle ID. Since multiple vacuums could
1285 * be active concurrently, we have to track the cycle ID for each active
1286 * vacuum; this requires at most MaxBackends entries (usually far fewer).
1287 * We assume at most one vacuum can be active for a given index.
1289 * Access to the shared memory area is controlled by BtreeVacuumLock.
1290 * In principle we could use a separate lmgr locktag for each index,
1291 * but a single LWLock is much cheaper, and given the short time that
1292 * the lock is ever held, the concurrency hit should be minimal.
1295 typedef struct BTOneVacInfo
1297 LockRelId relid; /* global identifier of an index */
1298 BTCycleId cycleid; /* cycle ID for its active VACUUM */
1301 typedef struct BTVacInfo
1303 BTCycleId cycle_ctr; /* cycle ID most recently assigned */
1304 int num_vacuums; /* number of currently active VACUUMs */
1305 int max_vacuums; /* allocated length of vacuums[] array */
1306 BTOneVacInfo vacuums[1]; /* VARIABLE LENGTH ARRAY */
1309 static BTVacInfo *btvacinfo;
1313 * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
1314 * or zero if there is no active VACUUM
1316 * Note: for correct interlocking, the caller must already hold pin and
1317 * exclusive lock on each buffer it will store the cycle ID into. This
1318 * ensures that even if a VACUUM starts immediately afterwards, it cannot
1319 * process those pages until the page split is complete.
1322 _bt_vacuum_cycleid(Relation rel)
1324 BTCycleId result = 0;
1327 /* Share lock is enough since this is a read-only operation */
1328 LWLockAcquire(BtreeVacuumLock, LW_SHARED);
1330 for (i = 0; i < btvacinfo->num_vacuums; i++)
1332 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1334 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1335 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1337 result = vac->cycleid;
1342 LWLockRelease(BtreeVacuumLock);
1347 * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
1349 * Note: the caller must guarantee that it will eventually call
1350 * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
1351 * that this happens even in elog(FATAL) scenarios, the appropriate coding
1352 * is not just a PG_TRY, but
1353 * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
1356 _bt_start_vacuum(Relation rel)
1362 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1365 * Assign the next cycle ID, being careful to avoid zero as well as the
1366 * reserved high values.
1368 result = ++(btvacinfo->cycle_ctr);
1369 if (result == 0 || result > MAX_BT_CYCLE_ID)
1370 result = btvacinfo->cycle_ctr = 1;
1372 /* Let's just make sure there's no entry already for this index */
1373 for (i = 0; i < btvacinfo->num_vacuums; i++)
1375 vac = &btvacinfo->vacuums[i];
1376 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1377 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1380 * Unlike most places in the backend, we have to explicitly
1381 * release our LWLock before throwing an error. This is because
1382 * we expect _bt_end_vacuum() to be called before transaction
1383 * abort cleanup can run to release LWLocks.
1385 LWLockRelease(BtreeVacuumLock);
1386 elog(ERROR, "multiple active vacuums for index \"%s\"",
1387 RelationGetRelationName(rel));
1391 /* OK, add an entry */
1392 if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
1394 LWLockRelease(BtreeVacuumLock);
1395 elog(ERROR, "out of btvacinfo slots");
1397 vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
1398 vac->relid = rel->rd_lockInfo.lockRelId;
1399 vac->cycleid = result;
1400 btvacinfo->num_vacuums++;
1402 LWLockRelease(BtreeVacuumLock);
1407 * _bt_end_vacuum --- mark a btree VACUUM operation as done
1409 * Note: this is deliberately coded not to complain if no entry is found;
1410 * this allows the caller to put PG_TRY around the start_vacuum operation.
1413 _bt_end_vacuum(Relation rel)
1417 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1419 /* Find the array entry */
1420 for (i = 0; i < btvacinfo->num_vacuums; i++)
1422 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1424 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1425 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1427 /* Remove it by shifting down the last entry */
1428 *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
1429 btvacinfo->num_vacuums--;
1434 LWLockRelease(BtreeVacuumLock);
1438 * _bt_end_vacuum wrapped as an on_shmem_exit callback function
1441 _bt_end_vacuum_callback(int code, Datum arg)
1443 _bt_end_vacuum((Relation) DatumGetPointer(arg));
1447 * BTreeShmemSize --- report amount of shared memory space needed
1450 BTreeShmemSize(void)
1454 size = offsetof(BTVacInfo, vacuums[0]);
1455 size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
1460 * BTreeShmemInit --- initialize this module's shared memory
1463 BTreeShmemInit(void)
1467 btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
1471 if (!IsUnderPostmaster)
1473 /* Initialize shared memory area */
1477 * It doesn't really matter what the cycle counter starts at, but
1478 * having it always start the same doesn't seem good. Seed with
1479 * low-order bits of time() instead.
1481 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
1483 btvacinfo->num_vacuums = 0;
1484 btvacinfo->max_vacuums = MaxBackends;
1491 btoptions(PG_FUNCTION_ARGS)
1493 Datum reloptions = PG_GETARG_DATUM(0);
1494 bool validate = PG_GETARG_BOOL(1);
1497 result = default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);
1499 PG_RETURN_BYTEA_P(result);