OSDN Git Service

Fix eqjoinsel() to make use of new statistics.
author: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 27 May 2001 17:37:48 +0000 (17:37 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 27 May 2001 17:37:48 +0000 (17:37 +0000)
src/backend/utils/adt/selfuncs.c

index 07c4da1..1c9b3c6 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -940,9 +940,7 @@ Datum
 eqjoinsel(PG_FUNCTION_ARGS)
 {
        Query      *root = (Query *) PG_GETARG_POINTER(0);
-#ifdef NOT_USED                                        /* see neqjoinsel() before removing me! */
        Oid                     operator = PG_GETARG_OID(1);
-#endif
        List       *args = (List *) PG_GETARG_POINTER(2);
        Var                *var1;
        Var                *var2;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
                HeapTuple       statsTuple2 = NULL;
                Form_pg_statistic stats1 = NULL;
                Form_pg_statistic stats2 = NULL;
-               double          nd1,
-                                       nd2;
-
-               if (var1 == NULL)
-               {
-                       nd1 = DEFAULT_NUM_DISTINCT;
-               }
-               else
+               double          nd1 = DEFAULT_NUM_DISTINCT;
+               double          nd2 = DEFAULT_NUM_DISTINCT;
+               bool            have_mcvs1 = false;
+               Datum      *values1 = NULL;
+               int                     nvalues1 = 0;
+               float4     *numbers1 = NULL;
+               int                     nnumbers1 = 0;
+               bool            have_mcvs2 = false;
+               Datum      *values2 = NULL;
+               int                     nvalues2 = 0;
+               float4     *numbers2 = NULL;
+               int                     nnumbers2 = 0;
+
+               if (var1 != NULL)
                {
                        /* get stats for the attribute, if available */
                        Oid             relid1 = getrelid(var1->varno, root->rtable);
 
-                       if (relid1 == InvalidOid)
-                               nd1 = DEFAULT_NUM_DISTINCT;
-                       else
+                       if (relid1 != InvalidOid)
                        {
                                statsTuple1 = SearchSysCache(STATRELATT,
                                                                                         ObjectIdGetDatum(relid1),
                                                                                         Int16GetDatum(var1->varattno),
                                                                                         0, 0);
                                if (HeapTupleIsValid(statsTuple1))
+                               {
                                        stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
+                                       have_mcvs1 = get_attstatsslot(statsTuple1,
+                                                                                                 var1->vartype,
+                                                                                                 var1->vartypmod,
+                                                                                                 STATISTIC_KIND_MCV,
+                                                                                                 InvalidOid,
+                                                                                                 &values1, &nvalues1,
+                                                                                                 &numbers1, &nnumbers1);
+                               }
 
                                nd1 = get_att_numdistinct(root, var1, stats1);
                        }
                }
 
-               if (var2 == NULL)
-               {
-                       nd2 = DEFAULT_NUM_DISTINCT;
-               }
-               else
+               if (var2 != NULL)
                {
                        /* get stats for the attribute, if available */
                        Oid             relid2 = getrelid(var2->varno, root->rtable);
 
-                       if (relid2 == InvalidOid)
-                               nd2 = DEFAULT_NUM_DISTINCT;
-                       else
+                       if (relid2 != InvalidOid)
                        {
                                statsTuple2 = SearchSysCache(STATRELATT,
                                                                                         ObjectIdGetDatum(relid2),
                                                                                         Int16GetDatum(var2->varattno),
                                                                                         0, 0);
                                if (HeapTupleIsValid(statsTuple2))
+                               {
                                        stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
+                                       have_mcvs2 = get_attstatsslot(statsTuple2,
+                                                                                                 var2->vartype,
+                                                                                                 var2->vartypmod,
+                                                                                                 STATISTIC_KIND_MCV,
+                                                                                                 InvalidOid,
+                                                                                                 &values2, &nvalues2,
+                                                                                                 &numbers2, &nnumbers2);
+                               }
 
                                nd2 = get_att_numdistinct(root, var2, stats2);
                        }
                }
 
-               /*
-                * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
-                * (can we produce any theory for this)?
-                *
-                * XXX possibility to do better: if both attributes have histograms
-                * then we could determine the exact join selectivity between the
-                * MCV sets, and only have to assume the join behavior of the non-MCV
-                * values.  This could be a big win when the MCVs cover a large part
-                * of the population.
-                *
-                * XXX what about nulls?
-                */
-               selec = 1.0 / sqrt(nd1 * nd2);
-               if (selec > 1.0)
-                       selec = 1.0;
+               if (have_mcvs1 && have_mcvs2)
+               {
+                       /*
+                        * We have most-common-value lists for both relations.  Run
+                        * through the lists to see which MCVs actually join to each
+                        * other with the given operator.  This allows us to determine
+                        * the exact join selectivity for the portion of the relations
+                        * represented by the MCV lists.  We still have to estimate for
+                        * the remaining population, but in a skewed distribution this
+                        * gives us a big leg up in accuracy.  For motivation see the
+                        * analysis in Y. Ioannidis and S. Christodoulakis, "On the
+                        * propagation of errors in the size of join results", Technical
+                        * Report 1018, Computer Science Dept., University of Wisconsin,
+                        * Madison, March 1991 (available from ftp.cs.wisc.edu).
+                        */
+                       FmgrInfo        eqproc;
+                       bool       *hasmatch1;
+                       bool       *hasmatch2;
+                       double          matchprodfreq,
+                                               matchfreq1,
+                                               matchfreq2,
+                                               unmatchfreq1,
+                                               unmatchfreq2,
+                                               otherfreq1,
+                                               otherfreq2,
+                                               totalsel1,
+                                               totalsel2;
+                       int                     i,
+                                               nmatches;
+
+                       fmgr_info(get_opcode(operator), &eqproc);
+                       hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
+                       memset(hasmatch1, 0, nvalues1 * sizeof(bool));
+                       hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
+                       memset(hasmatch2, 0, nvalues2 * sizeof(bool));
+                       /*
+                        * Note we assume that each MCV will match at most one member of
+                        * the other MCV list.  If the operator isn't really equality,
+                        * there could be multiple matches --- but we don't look for them,
+                        * both for speed and because the math wouldn't add up...
+                        */
+                       matchprodfreq = 0.0;
+                       nmatches = 0;
+                       for (i = 0; i < nvalues1; i++)
+                       {
+                               int             j;
 
+                               for (j = 0; j < nvalues2; j++)
+                               {
+                                       if (hasmatch2[j])
+                                               continue;
+                                       if (DatumGetBool(FunctionCall2(&eqproc,
+                                                                                                  values1[i],
+                                                                                                  values2[j])))
+                                       {
+                                               hasmatch1[i] = hasmatch2[j] = true;
+                                               matchprodfreq += numbers1[i] * numbers2[j];
+                                               nmatches++;
+                                               break;
+                                       }
+                               }
+                       }
+                       /* Sum up frequencies of matched and unmatched MCVs */
+                       matchfreq1 = unmatchfreq1 = 0.0;
+                       for (i = 0; i < nvalues1; i++)
+                       {
+                               if (hasmatch1[i])
+                                       matchfreq1 += numbers1[i];
+                               else
+                                       unmatchfreq1 += numbers1[i];
+                       }
+                       matchfreq2 = unmatchfreq2 = 0.0;
+                       for (i = 0; i < nvalues2; i++)
+                       {
+                               if (hasmatch2[i])
+                                       matchfreq2 += numbers2[i];
+                               else
+                                       unmatchfreq2 += numbers2[i];
+                       }
+                       pfree(hasmatch1);
+                       pfree(hasmatch2);
+                       /*
+                        * Compute total frequency of non-null values that are not in
+                        * the MCV lists.
+                        */
+                       otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
+                       otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
+                       /*
+                        * We can estimate the total selectivity from the point of view
+                        * of relation 1 as: the known selectivity for matched MCVs, plus
+                        * unmatched MCVs that are assumed to match against random members
+                        * of relation 2's non-MCV population, plus non-MCV values that
+                        * are assumed to match against random members of relation 2's
+                        * unmatched MCVs plus non-MCV values.
+                        */
+                       totalsel1 = matchprodfreq;
+                       if (nd2 > nvalues2)
+                               totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
+                       if (nd2 > nmatches)
+                               totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
+                                       (nd2 - nmatches);
+                       /* Same estimate from the point of view of relation 2. */
+                       totalsel2 = matchprodfreq;
+                       if (nd1 > nvalues1)
+                               totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
+                       if (nd1 > nmatches)
+                               totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
+                                       (nd1 - nmatches);
+                       /*
+                        * For robustness, we average the two estimates.  (Can a case
+                        * be made for taking the min or max instead?)
+                        */
+                       selec = (totalsel1 + totalsel2) * 0.5;
+               }
+               else
+               {
+                       /*
+                        * We do not have MCV lists for both sides.  Estimate the
+                        * join selectivity as MIN(1/nd1, 1/nd2).  This is plausible
+                        * if we assume that the values are about equally distributed:
+                        * a given tuple of rel1 will join to either 0 or N2/nd2 rows
+                        * of rel2, so total join rows are at most N1*N2/nd2 giving
+                        * a join selectivity of not more than 1/nd2.  By the same logic
+                        * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
+                        * bound.  Using the MIN() means we estimate from the point of
+                        * view of the relation with smaller nd (since the larger nd is
+                        * determining the MIN).  It is reasonable to assume that most
+                        * tuples in this rel will have join partners, so the bound is
+                        * probably reasonably tight and should be taken as-is.
+                        *
+                        * XXX Can we be smarter if we have an MCV list for just one side?
+                        * It seems that if we assume equal distribution for the other
+                        * side, we end up with the same answer anyway.
+                        */
+                       if (nd1 > nd2)
+                               selec = 1.0 / nd1;
+                       else
+                               selec = 1.0 / nd2;
+               }
+
+               if (have_mcvs1)
+                       free_attstatsslot(var1->vartype, values1, nvalues1,
+                                                         numbers1, nnumbers1);
+               if (have_mcvs2)
+                       free_attstatsslot(var2->vartype, values2, nvalues2,
+                                                         numbers2, nnumbers2);
                if (HeapTupleIsValid(statsTuple1))
                        ReleaseSysCache(statsTuple1);
                if (HeapTupleIsValid(statsTuple2))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
 Datum
 neqjoinsel(PG_FUNCTION_ARGS)
 {
+       Query      *root = (Query *) PG_GETARG_POINTER(0);
+       Oid                     operator = PG_GETARG_OID(1);
+       List       *args = (List *) PG_GETARG_POINTER(2);
+       Oid                     eqop;
        float8          result;
 
        /*
-        * XXX we skip looking up the negator operator here because we know
-        * eqjoinsel() won't look at it anyway.  If eqjoinsel() ever does
-        * look, this routine will need to look more like neqsel() does.
+        * We want 1 - eqjoinsel() where the equality operator is the one
+        * associated with this != operator, that is, its negator.
         */
-       result = DatumGetFloat8(eqjoinsel(fcinfo));
+       eqop = get_negator(operator);
+       if (eqop)
+       {
+               result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
+                                                                                        PointerGetDatum(root),
+                                                                                        ObjectIdGetDatum(eqop),
+                                                                                        PointerGetDatum(args)));
+
+       }
+       else
+       {
+               /* Use default selectivity (should we raise an error instead?) */
+               result = DEFAULT_EQ_SEL;
+       }
        result = 1.0 - result;
        PG_RETURN_FLOAT8(result);
 }