OSDN Git Service

Fix cost_mergejoin's failure to adjust for rescanning of non-unique merge join
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 6 Feb 2009 23:43:24 +0000 (23:43 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 6 Feb 2009 23:43:24 +0000 (23:43 +0000)
keys when considering a semi or anti join.  This requires estimating the
selectivity of the merge qual as though it were a regular inner join condition.
To allow caching both that and the real outer-join-aware selectivity, split
RestrictInfo.this_selec into two fields.

This fixes one of the problems reported by Kevin Grittner.

src/backend/nodes/copyfuncs.c
src/backend/nodes/outfuncs.c
src/backend/optimizer/path/clausesel.c
src/backend/optimizer/path/costsize.c
src/backend/optimizer/path/equivclass.c
src/backend/optimizer/path/orindxpath.c
src/backend/optimizer/prep/prepunion.c
src/backend/optimizer/util/restrictinfo.c
src/include/nodes/relation.h

index de57c87..bc4232b 100644 (file)
@@ -15,7 +15,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.422 2009/02/02 19:31:39 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.423 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1606,7 +1606,8 @@ _copyRestrictInfo(RestrictInfo *from)
        /* EquivalenceClasses are never copied, so shallow-copy the pointers */
        COPY_SCALAR_FIELD(parent_ec);
        COPY_SCALAR_FIELD(eval_cost);
-       COPY_SCALAR_FIELD(this_selec);
+       COPY_SCALAR_FIELD(norm_selec);
+       COPY_SCALAR_FIELD(outer_selec);
        COPY_NODE_FIELD(mergeopfamilies);
        /* EquivalenceClasses are never copied, so shallow-copy the pointers */
        COPY_SCALAR_FIELD(left_ec);
index 74df2f3..5dc9db9 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.351 2009/02/02 19:31:39 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.352 2009/02/06 23:43:23 tgl Exp $
  *
  * NOTES
  *       Every node type that can appear in stored rules' parsetrees *must*
@@ -1609,7 +1609,8 @@ _outRestrictInfo(StringInfo str, RestrictInfo *node)
        WRITE_BITMAPSET_FIELD(right_relids);
        WRITE_NODE_FIELD(orclause);
        /* don't write parent_ec, leads to infinite recursion in plan tree dump */
-       WRITE_FLOAT_FIELD(this_selec, "%.4f");
+       WRITE_FLOAT_FIELD(norm_selec, "%.4f");
+       WRITE_FLOAT_FIELD(outer_selec, "%.4f");
        WRITE_NODE_FIELD(mergeopfamilies);
        /* don't write left_ec, leads to infinite recursion in plan tree dump */
        /* don't write right_ec, leads to infinite recursion in plan tree dump */
index e9a94e7..ee02689 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.96 2009/01/01 17:23:43 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.97 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -516,21 +516,34 @@ clause_selectivity(PlannerInfo *root,
                /*
                 * If the clause is marked redundant, always return 1.0.
                 */
-               if (rinfo->this_selec > 1)
+               if (rinfo->norm_selec > 1)
                        return (Selectivity) 1.0;
 
                /*
                 * If possible, cache the result of the selectivity calculation for
                 * the clause.  We can cache if varRelid is zero or the clause
                 * contains only vars of that relid --- otherwise varRelid will affect
-                * the result, so mustn't cache.
+                * the result, so mustn't cache.  Outer join quals might be examined
+                * with either their join's actual jointype or JOIN_INNER, so we need
+                * two cache variables to remember both cases.  Note: we assume the
+                * result won't change if we are switching the input relations or
+                * considering a unique-ified case, so we only need one cache variable
+                * for all non-JOIN_INNER cases.
                 */
                if (varRelid == 0 ||
                        bms_is_subset_singleton(rinfo->clause_relids, varRelid))
                {
                        /* Cacheable --- do we already have the result? */
-                       if (rinfo->this_selec >= 0)
-                               return rinfo->this_selec;
+                       if (jointype == JOIN_INNER)
+                       {
+                               if (rinfo->norm_selec >= 0)
+                                       return rinfo->norm_selec;
+                       }
+                       else
+                       {
+                               if (rinfo->outer_selec >= 0)
+                                       return rinfo->outer_selec;
+                       }
                        cacheable = true;
                }
 
@@ -753,7 +766,12 @@ clause_selectivity(PlannerInfo *root,
 
        /* Cache the result if possible */
        if (cacheable)
-               rinfo->this_selec = s1;
+       {
+               if (jointype == JOIN_INNER)
+                       rinfo->norm_selec = s1;
+               else
+                       rinfo->outer_selec = s1;
+       }
 
 #ifdef SELECTIVITY_DEBUG
        elog(DEBUG4, "clause_selectivity: s1 %f", s1);
index 1f8f623..07ddf43 100644 (file)
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.203 2009/01/01 17:23:43 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.204 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -120,7 +120,7 @@ static MergeScanSelCache *cached_scansel(PlannerInfo *root,
                           PathKey *pathkey);
 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
-                                                                List *quals, SpecialJoinInfo *sjinfo);
+                                                                List *quals);
 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
 static double relation_byte_size(double tuples, int width);
 static double page_size(double tuples, int width);
@@ -1507,11 +1507,9 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
 
        /*
         * Get approx # tuples passing the mergequals.  We use approx_tuple_count
-        * here for speed --- in most cases, any errors won't affect the result
-        * much.
+        * here because we need an estimate done with JOIN_INNER semantics.
         */
-       mergejointuples = approx_tuple_count(root, &path->jpath,
-                                                                                mergeclauses, sjinfo);
+       mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
 
        /*
         * When there are equal merge keys in the outer relation, the mergejoin
@@ -1539,16 +1537,10 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
         * when we should not.  Can we do better without expensive selectivity
         * computations?
         *
-        * For SEMI and ANTI joins, only one inner tuple need be rescanned for
-        * each group of same-keyed outer tuples (assuming that all joinquals
-        * are merge quals).  This makes the effect small enough to ignore,
-        * so we just set rescannedtuples = 0.  Likewise, the whole issue is
-        * moot if we are working from a unique-ified outer input.
+        * The whole issue is moot if we are working from a unique-ified outer
+        * input.
         */
-       if (sjinfo->jointype == JOIN_SEMI ||
-               sjinfo->jointype == JOIN_ANTI)
-               rescannedtuples = 0;
-       else if (IsA(outer_path, UniquePath))
+       if (IsA(outer_path, UniquePath))
                rescannedtuples = 0;
        else
        {
@@ -1847,11 +1839,9 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
 
        /*
         * Get approx # tuples passing the hashquals.  We use approx_tuple_count
-        * here for speed --- in most cases, any errors won't affect the result
-        * much.
+        * here because we need an estimate done with JOIN_INNER semantics.
         */
-       hashjointuples = approx_tuple_count(root, &path->jpath,
-                                                                               hashclauses, sjinfo);
+       hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
 
        /* cost of source data */
        startup_cost += outer_path->startup_cost;
@@ -2324,6 +2314,11 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
  * The quals can be either an implicitly-ANDed list of boolean expressions,
  * or a list of RestrictInfo nodes (typically the latter).
  *
+ * We intentionally compute the selectivity under JOIN_INNER rules, even
+ * if it's some type of outer join.  This is appropriate because we are
+ * trying to figure out how many tuples pass the initial merge or hash
+ * join step.
+ *
  * This is quick-and-dirty because we bypass clauselist_selectivity, and
  * simply multiply the independent clause selectivities together.  Now
  * clauselist_selectivity often can't do any better than that anyhow, but
@@ -2336,31 +2331,40 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
  * seems OK to live with the approximation.
  */
 static double
-approx_tuple_count(PlannerInfo *root, JoinPath *path,
-                                  List *quals, SpecialJoinInfo *sjinfo)
+approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
 {
        double          tuples;
        double          outer_tuples = path->outerjoinpath->parent->rows;
        double          inner_tuples = path->innerjoinpath->parent->rows;
+       SpecialJoinInfo sjinfo;
        Selectivity selec = 1.0;
        ListCell   *l;
 
+       /*
+        * Make up a SpecialJoinInfo for JOIN_INNER semantics.
+        */
+       sjinfo.type = T_SpecialJoinInfo;
+       sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
+       sjinfo.min_righthand = path->innerjoinpath->parent->relids;
+       sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
+       sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
+       sjinfo.jointype = JOIN_INNER;
+       /* we don't bother trying to make the remaining fields valid */
+       sjinfo.lhs_strict = false;
+       sjinfo.delay_upper_joins = false;
+       sjinfo.join_quals = NIL;
+
        /* Get the approximate selectivity */
        foreach(l, quals)
        {
                Node       *qual = (Node *) lfirst(l);
 
                /* Note that clause_selectivity will be able to cache its result */
-               selec *= clause_selectivity(root, qual, 0, sjinfo->jointype, sjinfo);
+               selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
        }
 
-       /* Apply it correctly using the input relation sizes */
-       if (sjinfo->jointype == JOIN_SEMI)
-               tuples = selec * outer_tuples;
-       else if (sjinfo->jointype == JOIN_ANTI)
-               tuples = (1.0 - selec) * outer_tuples;
-       else
-               tuples = selec * outer_tuples * inner_tuples;
+       /* Apply it to the input relation sizes */
+       tuples = selec * outer_tuples * inner_tuples;
 
        return clamp_row_est(tuples);
 }
index cbad816..bc4544e 100644 (file)
@@ -10,7 +10,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/equivclass.c,v 1.16 2009/01/01 17:23:43 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/equivclass.c,v 1.17 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1200,7 +1200,8 @@ reconsider_outer_join_clauses(PlannerInfo *root)
                                        list_delete_cell(root->left_join_clauses, cell, prev);
                                /* we throw it back anyway (see notes above) */
                                /* but the thrown-back clause has no extra selectivity */
-                               rinfo->this_selec = 2.0;
+                               rinfo->norm_selec = 2.0;
+                               rinfo->outer_selec = 1.0;
                                distribute_restrictinfo_to_rels(root, rinfo);
                        }
                        else
@@ -1222,7 +1223,8 @@ reconsider_outer_join_clauses(PlannerInfo *root)
                                        list_delete_cell(root->right_join_clauses, cell, prev);
                                /* we throw it back anyway (see notes above) */
                                /* but the thrown-back clause has no extra selectivity */
-                               rinfo->this_selec = 2.0;
+                               rinfo->norm_selec = 2.0;
+                               rinfo->outer_selec = 1.0;
                                distribute_restrictinfo_to_rels(root, rinfo);
                        }
                        else
@@ -1244,7 +1246,8 @@ reconsider_outer_join_clauses(PlannerInfo *root)
                                        list_delete_cell(root->full_join_clauses, cell, prev);
                                /* we throw it back anyway (see notes above) */
                                /* but the thrown-back clause has no extra selectivity */
-                               rinfo->this_selec = 2.0;
+                               rinfo->norm_selec = 2.0;
+                               rinfo->outer_selec = 1.0;
                                distribute_restrictinfo_to_rels(root, rinfo);
                        }
                        else
index a82f1c8..638078e 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.86 2009/01/01 17:23:44 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.87 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -174,10 +174,11 @@ create_or_index_quals(PlannerInfo *root, RelOptInfo *rel)
        {
                orig_selec = clause_selectivity(root, (Node *) bestrinfo,
                                                                                0, JOIN_INNER, NULL);
-               bestrinfo->this_selec = orig_selec / or_selec;
+               bestrinfo->norm_selec = orig_selec / or_selec;
                /* clamp result to sane range */
-               if (bestrinfo->this_selec > 1)
-                       bestrinfo->this_selec = 1;
+               if (bestrinfo->norm_selec > 1)
+                       bestrinfo->norm_selec = 1;
+               /* It isn't an outer join clause, so no need to adjust outer_selec */
        }
 
        /* Tell caller to recompute rel's rows estimate */
index 1ef4532..b9ce6d2 100644 (file)
@@ -22,7 +22,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.164 2009/01/01 17:23:44 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.165 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1662,7 +1662,8 @@ adjust_appendrel_attrs_mutator(Node *node, AppendRelInfo *context)
                 * different values when considering the child relation.
                 */
                newinfo->eval_cost.startup = -1;
-               newinfo->this_selec = -1;
+               newinfo->norm_selec = -1;
+               newinfo->outer_selec = -1;
                newinfo->left_ec = NULL;
                newinfo->right_ec = NULL;
                newinfo->left_em = NULL;
index ddf7daf..22e2aeb 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/util/restrictinfo.c,v 1.56 2009/01/01 17:23:45 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/util/restrictinfo.c,v 1.57 2009/02/06 23:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -338,7 +338,8 @@ make_restrictinfo_internal(Expr *clause,
        restrictinfo->parent_ec = NULL;
 
        restrictinfo->eval_cost.startup = -1;
-       restrictinfo->this_selec = -1;
+       restrictinfo->norm_selec = -1;
+       restrictinfo->outer_selec = -1;
 
        restrictinfo->mergeopfamilies = NIL;
 
index 259b6e1..f00d1be 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.167 2009/01/01 17:24:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.168 2009/02/06 23:43:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -992,8 +992,11 @@ typedef struct RestrictInfo
 
        /* cache space for cost and selectivity */
        QualCost        eval_cost;              /* eval cost of clause; -1 if not yet set */
-       Selectivity this_selec;         /* selectivity; -1 if not yet set; >1 means
+       Selectivity norm_selec;         /* selectivity for "normal" (JOIN_INNER)
+                                                                * semantics; -1 if not yet set; >1 means
                                                                 * a redundant clause */
+       Selectivity outer_selec;        /* selectivity for outer join semantics;
+                                                                * -1 if not yet set */
 
        /* valid if clause is mergejoinable, else NIL */
        List       *mergeopfamilies;    /* opfamilies containing clause operator */