OSDN Git Service

Implement "join removal" for cases where the inner side of a left join
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
is unique and is not referenced above the join.  In this case the inner
side doesn't affect the query result and can be thrown away entirely.
Although perhaps nobody would ever write such a thing by hand, it's
a reasonably common case in machine-generated SQL.

The current implementation only recognizes the case where the inner side
is a simple relation with a unique index matching the query conditions.
This is enough for the use-cases that have been shown so far, but we
might want to try to handle other cases later.

Robert Haas, somewhat rewritten by Tom

src/backend/nodes/outfuncs.c
src/backend/optimizer/README
src/backend/optimizer/path/allpaths.c
src/backend/optimizer/path/indxpath.c
src/backend/optimizer/path/joinpath.c
src/backend/optimizer/plan/createplan.c
src/backend/optimizer/util/pathnode.c
src/include/nodes/nodes.h
src/include/nodes/relation.h
src/include/optimizer/pathnode.h
src/include/optimizer/paths.h

index 454a8d0..0a7aa25 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.363 2009/07/30 02:45:37 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.364 2009/09/17 20:49:28 tgl Exp $
  *
  * NOTES
  *       Every node type that can appear in stored rules' parsetrees *must*
@@ -1422,6 +1422,16 @@ _outUniquePath(StringInfo str, UniquePath *node)
 }
 
 static void
+_outNoOpPath(StringInfo str, NoOpPath *node)
+{
+       WRITE_NODE_TYPE("NOOPPATH");
+
+       _outPathInfo(str, (Path *) node);
+
+       WRITE_NODE_FIELD(subpath);
+}
+
+static void
 _outNestPath(StringInfo str, NestPath *node)
 {
        WRITE_NODE_TYPE("NESTPATH");
@@ -2634,6 +2644,9 @@ _outNode(StringInfo str, void *obj)
                        case T_UniquePath:
                                _outUniquePath(str, obj);
                                break;
+                       case T_NoOpPath:
+                               _outNoOpPath(str, obj);
+                               break;
                        case T_NestPath:
                                _outNestPath(str, obj);
                                break;
index f406e64..560e429 100644 (file)
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.50 2009/07/21 02:02:44 tgl Exp $
+$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.51 2009/09/17 20:49:28 tgl Exp $
 
 Optimizer
 =========
@@ -354,6 +354,7 @@ RelOptInfo      - a relation or joined relations
   NestPath      - nested-loop joins
   MergePath     - merge joins
   HashPath      - hash joins
+  NoOpPath      - same as its input path (used when a join is removed)
 
  EquivalenceClass - a data structure representing a set of values known equal
 
index ada9140..5b7f0ff 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.185 2009/09/02 17:52:24 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.186 2009/09/17 20:49:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1387,6 +1387,10 @@ print_path(PlannerInfo *root, Path *path, int indent)
                        ptype = "Unique";
                        subpath = ((UniquePath *) path)->subpath;
                        break;
+               case T_NoOpPath:
+                       ptype = "NoOp";
+                       subpath = ((NoOpPath *) path)->subpath;
+                       break;
                case T_NestPath:
                        ptype = "NestLoop";
                        join = true;
index 3930acf..7bb5d89 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.241 2009/08/04 16:08:36 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.242 2009/09/17 20:49:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1918,6 +1918,86 @@ find_clauses_for_join(PlannerInfo *root, RelOptInfo *rel,
        return clause_list;
 }
 
+/*
+ * relation_has_unique_index_for
+ *       Determine whether the relation provably has at most one row satisfying
+ *       a set of equality conditions, because the conditions constrain all
+ *       columns of some unique index.
+ *
+ * The conditions are provided as a list of RestrictInfo nodes, where the
+ * caller has already determined that each condition is a mergejoinable
+ * equality with an expression in this relation on one side, and an
+ * expression not involving this relation on the other.  The transient
+ * outer_is_left flag is used to identify which side we should look at:
+ * left side if outer_is_left is false, right side if it is true.
+ */
+bool
+relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+                                                         List *restrictlist)
+{
+       ListCell   *ic;
+
+       /* Short-circuit the easy case */
+       if (restrictlist == NIL)
+               return false;
+
+       /* Examine each index of the relation ... */
+       foreach(ic, rel->indexlist)
+       {
+               IndexOptInfo   *ind = (IndexOptInfo *) lfirst(ic);
+               int                             c;
+
+               /*
+                * If the index is not unique or if it's a partial index that doesn't
+                * match the query, it's useless here.
+                */
+               if (!ind->unique || (ind->indpred != NIL && !ind->predOK))
+                       continue;
+
+               /*
+                * Try to find each index column in the list of conditions.  This is
+                * O(n^2) or worse, but we expect all the lists to be short.
+                */
+               for (c = 0; c < ind->ncolumns; c++)
+               {
+                       ListCell   *lc;
+
+                       foreach(lc, restrictlist)
+                       {
+                               RestrictInfo   *rinfo = (RestrictInfo *) lfirst(lc);
+                               Node   *rexpr;
+
+                               /*
+                                * The condition's equality operator must be a member of the
+                                * index opfamily, else it is not asserting the right kind
+                                * of equality behavior for this index.  We check this first
+                                * since it's probably cheaper than match_index_to_operand().
+                                */
+                               if (!list_member_oid(rinfo->mergeopfamilies, ind->opfamily[c]))
+                                       continue;
+
+                               /* OK, see if the condition operand matches the index key */
+                               if (rinfo->outer_is_left)
+                                       rexpr = get_rightop(rinfo->clause);
+                               else
+                                       rexpr = get_leftop(rinfo->clause);
+
+                               if (match_index_to_operand(rexpr, c, ind))
+                                       break;          /* found a match; column is unique */
+                       }
+
+                       if (lc == NULL)
+                               break;                  /* no match; this index doesn't help us */
+               }
+
+               /* Matched all columns of this index? */
+               if (c == ind->ncolumns)
+                       return true;
+       }
+
+       return false;
+}
+
 
 /****************************************************************************
  *                             ----  PATH CREATION UTILITIES  ----
index 269c482..827e18a 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.123 2009/09/12 22:12:04 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.124 2009/09/17 20:49:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "optimizer/paths.h"
 
 
+static bool join_is_removable(PlannerInfo *root, RelOptInfo *joinrel,
+                                 RelOptInfo *outerrel, RelOptInfo *innerrel,
+                                 List *restrictlist, JoinType jointype);
+static void generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
+                                       RelOptInfo *outerrel);
 static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel,
                                         RelOptInfo *outerrel, RelOptInfo *innerrel,
                                         List *restrictlist, List *mergeclause_list,
@@ -79,10 +84,25 @@ add_paths_to_joinrel(PlannerInfo *root,
        List       *mergeclause_list = NIL;
 
        /*
+        * 0. Consider join removal.  This is always the most efficient strategy,
+        * so if it works, there's no need to consider anything further.
+        */
+       if (join_is_removable(root, joinrel, outerrel, innerrel,
+                                                 restrictlist, jointype))
+       {
+               generate_outer_only(root, joinrel, outerrel);
+               return;
+       }
+
+       /*
         * Find potential mergejoin clauses.  We can skip this if we are not
         * interested in doing a mergejoin.  However, mergejoin is currently our
         * only way of implementing full outer joins, so override mergejoin
         * disable if it's a full join.
+        *
+        * Note: do this after join_is_removable(), because this sets the
+        * outer_is_left flags in the mergejoin clauses, while join_is_removable
+        * uses those flags for its own purposes.
         */
        if (enable_mergejoin || jointype == JOIN_FULL)
                mergeclause_list = select_mergejoin_clauses(root,
@@ -134,6 +154,180 @@ add_paths_to_joinrel(PlannerInfo *root,
 }
 
 /*
+ * join_is_removable
+ *       Determine whether we need not perform the join at all, because
+ *       it will just duplicate its left input.
+ *
+ * This is true for a left join for which the join condition cannot match
+ * more than one inner-side row.  (There are other possibly interesting
+ * cases, but we don't have the infrastructure to prove them.)
+ *
+ * Note: there is no need to consider the symmetrical case of duplicating the
+ * right input, because add_paths_to_joinrel() will be called with each rel
+ * on the outer side.
+ */
+static bool
+join_is_removable(PlannerInfo *root,
+                                 RelOptInfo *joinrel,
+                                 RelOptInfo *outerrel,
+                                 RelOptInfo *innerrel,
+                                 List *restrictlist,
+                                 JoinType jointype)
+{
+       List       *clause_list = NIL;
+       ListCell   *l;
+       int                     attroff;
+
+       /*
+        * Currently, we only know how to remove left joins to a baserel with
+        * unique indexes.  We can check most of these criteria pretty trivially
+        * to avoid doing useless extra work.  But checking whether any of the
+        * indexes are unique would require iterating over the indexlist, so for
+        * now we just make sure there are indexes of some sort or other.  If none
+        * of them are unique, join removal will still fail, just slightly later.
+        */
+       if (jointype != JOIN_LEFT ||
+               innerrel->reloptkind == RELOPT_JOINREL ||
+               innerrel->rtekind != RTE_RELATION ||
+               innerrel->indexlist == NIL)
+               return false;
+
+       /*
+        * We can't remove the join if any inner-rel attributes are used above
+        * the join.
+        *
+        * As a micro-optimization, it seems better to start with max_attr and
+        * count down rather than starting with min_attr and counting up, on the
+        * theory that the system attributes are somewhat less likely to be wanted
+        * and should be tested last.
+        */
+       for (attroff = innerrel->max_attr - innerrel->min_attr;
+                attroff >= 0;
+                attroff--)
+       {
+               if (!bms_is_subset(innerrel->attr_needed[attroff], joinrel->relids))
+                       return false;
+       }
+
+       /*
+        * Search for mergejoinable clauses that constrain the inner rel against
+        * either the outer rel or a pseudoconstant.  If an operator is
+        * mergejoinable then it behaves like equality for some btree opclass,
+        * so it's what we want.  The mergejoinability test also eliminates
+        * clauses containing volatile functions, which we couldn't depend on.
+        */
+       foreach(l, restrictlist)
+       {
+               RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
+
+               /*
+                * We are always considering an outer join here, so ignore pushed-down
+                * clauses.  Also ignore anything that doesn't have a mergejoinable
+                * operator.
+                */
+               if (restrictinfo->is_pushed_down)
+                       continue;
+
+               if (!restrictinfo->can_join ||
+                       restrictinfo->mergeopfamilies == NIL)
+                       continue;                       /* not mergejoinable */
+
+               /*
+                * Check if clause is usable with these input rels.  All the vars
+                * needed on each side of the clause must be available from one or the
+                * other of the input rels.
+                */
+               if (bms_is_subset(restrictinfo->left_relids, outerrel->relids) &&
+                       bms_is_subset(restrictinfo->right_relids, innerrel->relids))
+               {
+                       /* righthand side is inner */
+                       restrictinfo->outer_is_left = true;
+               }
+               else if (bms_is_subset(restrictinfo->left_relids, innerrel->relids) &&
+                                bms_is_subset(restrictinfo->right_relids, outerrel->relids))
+               {
+                       /* lefthand side is inner */
+                       restrictinfo->outer_is_left = false;
+               }
+               else
+                       continue;                       /* no good for these input relations */
+
+               /* OK, add to list */
+               clause_list = lappend(clause_list, restrictinfo);
+       }
+
+       /* Now examine the rel's restriction clauses for var = const clauses */
+       foreach(l, innerrel->baserestrictinfo)
+       {
+               RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
+
+               /*
+                * Note: can_join won't be set for a restriction clause, but
+                * mergeopfamilies will be if it has a mergejoinable operator
+                * and doesn't contain volatile functions.
+                */
+               if (restrictinfo->mergeopfamilies == NIL)
+                       continue;                       /* not mergejoinable */
+
+               /*
+                * The clause certainly doesn't refer to anything but the given
+                * rel.  If either side is pseudoconstant then we can use it.
+                */
+               if (bms_is_empty(restrictinfo->left_relids))
+               {
+                       /* righthand side is inner */
+                       restrictinfo->outer_is_left = true;
+               }
+               else if (bms_is_empty(restrictinfo->right_relids))
+               {
+                       /* lefthand side is inner */
+                       restrictinfo->outer_is_left = false;
+               }
+               else
+                       continue;
+
+               /* OK, add to list */
+               clause_list = lappend(clause_list, restrictinfo);
+       }
+
+       /* Now examine the indexes to see if we have a matching unique index */
+       if (relation_has_unique_index_for(root, innerrel, clause_list))
+               return true;
+
+       /*
+        * Some day it would be nice to check for other methods of establishing
+        * distinctness.
+        */
+       return false;
+}
+
+/*
+ * generate_outer_only
+ *       Generate "join" paths when we have found the join is removable.
+ */
+static void
+generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
+                                       RelOptInfo *outerrel)
+{
+       ListCell   *lc;
+
+       /*
+        * For the moment, replicate all of the outerrel's paths as join paths.
+        * Some of them might not really be interesting above the join, if they
+        * have sort orderings that have no real use except to do a mergejoin
+        * for the join we've just found we don't need.  But distinguishing that
+        * case probably isn't worth the extra code it would take.
+        */
+       foreach(lc, outerrel->pathlist)
+       {
+               Path   *outerpath = (Path *) lfirst(lc);
+
+               add_path(joinrel, (Path *)
+                                create_noop_path(root, joinrel, outerpath));
+       }
+}
+
+/*
  * sort_inner_and_outer
  *       Create mergejoin join paths by explicitly sorting both the outer and
  *       inner join relations on each available merge ordering.
index ac9b962..0bb53d3 100644 (file)
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.262 2009/09/12 22:12:04 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.263 2009/09/17 20:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -164,6 +164,11 @@ create_plan(PlannerInfo *root, Path *best_path)
                case T_WorkTableScan:
                        plan = create_scan_plan(root, best_path);
                        break;
+               case T_Join:
+                       /* this is only used for no-op joins */
+                       Assert(IsA(best_path, NoOpPath));
+                       plan = create_plan(root, ((NoOpPath *) best_path)->subpath);
+                       break;
                case T_HashJoin:
                case T_MergeJoin:
                case T_NestLoop:
index e2de249..e973389 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.153 2009/09/12 22:12:04 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.154 2009/09/17 20:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1216,6 +1216,26 @@ distinct_col_search(int colno, List *colnos, List *opids)
 }
 
 /*
+ * create_noop_path
+ *       Creates a path equivalent to the input subpath, but having a different
+ *       parent rel.  This is used when a join is found to be removable.
+ */
+NoOpPath *
+create_noop_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
+{
+       NoOpPath   *pathnode = makeNode(NoOpPath);
+
+       pathnode->path.pathtype = T_Join;                       /* by convention */
+       pathnode->path.parent = rel;
+       pathnode->path.startup_cost = subpath->startup_cost;
+       pathnode->path.total_cost = subpath->total_cost;
+       pathnode->path.pathkeys = subpath->pathkeys;
+       pathnode->subpath = subpath;
+
+       return pathnode;
+}
+
+/*
  * create_subqueryscan_path
  *       Creates a path corresponding to a sequential scan of a subquery,
  *       returning the pathnode.
index b29f720..6be4bf3 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.224 2009/07/30 02:45:37 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.225 2009/09/17 20:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -211,6 +211,7 @@ typedef enum NodeTag
        T_ResultPath,
        T_MaterialPath,
        T_UniquePath,
+       T_NoOpPath,
        T_EquivalenceClass,
        T_EquivalenceMember,
        T_PathKey,
index 4432252..9b59d63 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.174 2009/07/16 20:55:44 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.175 2009/09/17 20:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -784,6 +784,22 @@ typedef struct UniquePath
 } UniquePath;
 
 /*
+ * NoOpPath represents exactly the same plan as its subpath.  This is used
+ * when we have determined that a join can be eliminated.  The difference
+ * between the NoOpPath and its subpath is just that the NoOpPath's parent
+ * is the whole join relation while the subpath is for one of the joined
+ * relations (and the other one isn't needed).
+ *
+ * Note: path.pathtype is always T_Join, but this won't actually give rise
+ * to a Join plan node.
+ */
+typedef struct NoOpPath
+{
+       Path            path;
+       Path       *subpath;
+} NoOpPath;
+
+/*
  * All join-type paths share these fields.
  */
 
index b275b1d..8b71500 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.80 2009/01/01 17:24:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.81 2009/09/17 20:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -51,6 +51,8 @@ extern ResultPath *create_result_path(List *quals);
 extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath);
 extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel,
                                   Path *subpath, SpecialJoinInfo *sjinfo);
+extern NoOpPath *create_noop_path(PlannerInfo *root, RelOptInfo *rel,
+                                                                 Path *subpath);
 extern Path *create_subqueryscan_path(RelOptInfo *rel, List *pathkeys);
 extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel);
 extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel);
index cbc8214..d9e9167 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.107 2009/06/11 14:49:11 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.108 2009/09/17 20:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,6 +57,8 @@ extern List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
 extern void best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel,
                                         RelOptInfo *outer_rel, JoinType jointype,
                                         Path **cheapest_startup, Path **cheapest_total);
+extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+                                                         List *restrictlist);
 extern List *group_clauses_by_indexkey(IndexOptInfo *index,
                                                  List *clauses, List *outer_clauses,
                                                  Relids outer_relids,