src/backend/optimizer/path/costsize.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * costsize.c
   4  *        Routines to compute (and set) relation sizes and path costs
   5  *
   6  * Path costs are measured in units of disk accesses: one sequential page
   7  * fetch has cost 1.  All else is scaled relative to a page fetch, using
   8  * the scaling parameters
   9  *
  10  *      random_page_cost        Cost of a non-sequential page fetch
  11  *      cpu_tuple_cost          Cost of typical CPU time to process a tuple
  12  *      cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
  13  *      cpu_operator_cost       Cost of CPU time to process a typical WHERE operator
  14  *
  15  * We also use a rough estimate "effective_cache_size" of the number of
  16  * disk pages in Postgres + OS-level disk cache.  (We can't simply use
  17  * NBuffers for this purpose because that would ignore the effects of
  18  * the kernel's disk cache.)
  19  *
  20  * Obviously, taking constants for these values is an oversimplification,
  21  * but it's tough enough to get any useful estimates even at this level of
  22  * detail.      Note that all of these parameters are user-settable, in case
  23  * the default values are drastically off for a particular platform.
  24  *
  25  * We compute two separate costs for each path:
  26  *              total_cost: total estimated cost to fetch all tuples
  27  *              startup_cost: cost that is expended before first tuple is fetched
  28  * In some scenarios, such as when there is a LIMIT or we are implementing
  29  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
  30  * path's result.  A caller can estimate the cost of fetching a partial
  31  * result by interpolating between startup_cost and total_cost.  In detail:
  32  *              actual_cost = startup_cost +
  33  *                      (total_cost - startup_cost) * tuples_to_fetch / path->parent->rows;
  34  * Note that a base relation's rows count (and, by extension, plan_rows for
  35  * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
  36  * that this equation works properly.  (Also, these routines guarantee not to
  37  * set the rows count to zero, so there will be no zero divide.)  The LIMIT is
  38  * applied as a top-level plan node.
  39  *
  40  *
  41  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  42  * Portions Copyright (c) 1994, Regents of the University of California
  43  *
  44  * IDENTIFICATION
  45  *        $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.90 2002/09/04 20:31:20 momjian Exp $
  46  *
  47  *-------------------------------------------------------------------------
  48  */
  49
  50 #include "postgres.h"
  51
  52 #include <math.h>
  53
  54 #include "catalog/pg_statistic.h"
  55 #include "executor/nodeHash.h"
  56 #include "miscadmin.h"
  57 #include "optimizer/clauses.h"
  58 #include "optimizer/cost.h"
  59 #include "optimizer/pathnode.h"
  60 #include "parser/parsetree.h"
  61 #include "utils/selfuncs.h"
  62 #include "utils/lsyscache.h"
  63 #include "utils/syscache.h"
  64
  65
  66 #define LOG2(x)  (log(x) / 0.693147180559945)
  67 #define LOG6(x)  (log(x) / 1.79175946922805)
  68
  69
  70 double          effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
  71 double          random_page_cost = DEFAULT_RANDOM_PAGE_COST;
  72 double          cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
  73 double          cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
  74 double          cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
  75
  76 Cost            disable_cost = 100000000.0;
  77
  78 bool            enable_seqscan = true;
  79 bool            enable_indexscan = true;
  80 bool            enable_tidscan = true;
  81 bool            enable_sort = true;
  82 bool            enable_nestloop = true;
  83 bool            enable_mergejoin = true;
  84 bool            enable_hashjoin = true;
  85
  86
  87 static Selectivity estimate_hash_bucketsize(Query *root, Var *var);
  88 static bool cost_qual_eval_walker(Node *node, Cost *total);
  89 static Selectivity approx_selectivity(Query *root, List *quals);
  90 static void set_rel_width(Query *root, RelOptInfo *rel);
  91 static double relation_byte_size(double tuples, int width);
  92 static double page_size(double tuples, int width);
  93
  94
  95 /*
  96  * cost_seqscan
  97  *        Determines and returns the cost of scanning a relation sequentially.
  98  *
  99  * Note: for historical reasons, this routine and the others in this module
 100  * use the passed result Path only to store their startup_cost and total_cost
 101  * results into.  All the input data they need is passed as separate
 102  * parameters, even though much of it could be extracted from the Path.
 103  */
 104 void
 105 cost_seqscan(Path *path, Query *root,
 106                          RelOptInfo *baserel)
 107 {
 108         Cost            startup_cost = 0;
 109         Cost            run_cost = 0;
 110         Cost            cpu_per_tuple;
 111
 112         /* Should only be applied to base relations */
 113         Assert(length(baserel->relids) == 1);
 114         Assert(baserel->rtekind == RTE_RELATION);
 115
 116         if (!enable_seqscan)
 117                 startup_cost += disable_cost;
 118
 119         /*
 120          * disk costs
 121          *
 122          * The cost of reading a page sequentially is 1.0, by definition. Note
 123          * that the Unix kernel will typically do some amount of read-ahead
 124          * optimization, so that this cost is less than the true cost of
 125          * reading a page from disk.  We ignore that issue here, but must take
 126          * it into account when estimating the cost of non-sequential
 127          * accesses!
 128          */
 129         run_cost += baserel->pages; /* sequential fetches with cost 1.0 */
 130
 131         /* CPU costs */
 132         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 133         run_cost += cpu_per_tuple * baserel->tuples;
 134
 135         path->startup_cost = startup_cost;
 136         path->total_cost = startup_cost + run_cost;
 137 }
 138
 139 /*
 140  * cost_nonsequential_access
 141  *        Estimate the cost of accessing one page at random from a relation
 142  *        (or sort temp file) of the given size in pages.
 143  *
 144  * The simplistic model that the cost is random_page_cost is what we want
 145  * to use for large relations; but for small ones that is a serious
 146  * overestimate because of the effects of caching.      This routine tries to
 147  * account for that.
 148  *
 149  * Unfortunately we don't have any good way of estimating the effective cache
 150  * size we are working with --- we know that Postgres itself has NBuffers
 151  * internal buffers, but the size of the kernel's disk cache is uncertain,
 152  * and how much of it we get to use is even less certain.  We punt the problem
 153  * for now by assuming we are given an effective_cache_size parameter.
 154  *
 155  * Given a guesstimated cache size, we estimate the actual I/O cost per page
 156  * with the entirely ad-hoc equations:
 157  *      if relpages >= effective_cache_size:
 158  *              random_page_cost * (1 - (effective_cache_size/relpages)/2)
 159  *      if relpages < effective_cache_size:
 160  *              1 + (random_page_cost/2-1) * (relpages/effective_cache_size) ** 2
 161  * These give the right asymptotic behavior (=> 1.0 as relpages becomes
 162  * small, => random_page_cost as it becomes large) and meet in the middle
 163  * with the estimate that the cache is about 50% effective for a relation
 164  * of the same size as effective_cache_size.  (XXX this is probably all
 165  * wrong, but I haven't been able to find any theory about how effective
 166  * a disk cache should be presumed to be.)
 167  */
 168 static Cost
 169 cost_nonsequential_access(double relpages)
 170 {
 171         double          relsize;
 172
 173         /* don't crash on bad input data */
 174         if (relpages <= 0.0 || effective_cache_size <= 0.0)
 175                 return random_page_cost;
 176
 177         relsize = relpages / effective_cache_size;
 178
 179         if (relsize >= 1.0)
 180                 return random_page_cost * (1.0 - 0.5 / relsize);
 181         else
 182                 return 1.0 + (random_page_cost * 0.5 - 1.0) * relsize * relsize;
 183 }
 184
 185 /*
 186  * cost_index
 187  *        Determines and returns the cost of scanning a relation using an index.
 188  *
 189  *        NOTE: an indexscan plan node can actually represent several passes,
 190  *        but here we consider the cost of just one pass.
 191  *
 192  * 'root' is the query root
 193  * 'baserel' is the base relation the index is for
 194  * 'index' is the index to be used
 195  * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics)
 196  * 'is_injoin' is T if we are considering using the index scan as the inside
 197  *              of a nestloop join (hence, some of the indexQuals are join clauses)
 198  *
 199  * NOTE: 'indexQuals' must contain only clauses usable as index restrictions.
 200  * Any additional quals evaluated as qpquals may reduce the number of returned
 201  * tuples, but they won't reduce the number of tuples we have to fetch from
 202  * the table, so they don't reduce the scan cost.
 203  */
 204 void
 205 cost_index(Path *path, Query *root,
 206                    RelOptInfo *baserel,
 207                    IndexOptInfo *index,
 208                    List *indexQuals,
 209                    bool is_injoin)
 210 {
 211         Cost            startup_cost = 0;
 212         Cost            run_cost = 0;
 213         Cost            indexStartupCost;
 214         Cost            indexTotalCost;
 215         Selectivity indexSelectivity;
 216         double          indexCorrelation,
 217                                 csquared;
 218         Cost            min_IO_cost,
 219                                 max_IO_cost;
 220         Cost            cpu_per_tuple;
 221         double          tuples_fetched;
 222         double          pages_fetched;
 223         double          T,
 224                                 b;
 225
 226         /* Should only be applied to base relations */
 227         Assert(IsA(baserel, RelOptInfo) &&
 228                    IsA(index, IndexOptInfo));
 229         Assert(length(baserel->relids) == 1);
 230         Assert(baserel->rtekind == RTE_RELATION);
 231
 232         if (!enable_indexscan && !is_injoin)
 233                 startup_cost += disable_cost;
 234
 235         /*
 236          * Call index-access-method-specific code to estimate the processing
 237          * cost for scanning the index, as well as the selectivity of the
 238          * index (ie, the fraction of main-table tuples we will have to
 239          * retrieve) and its correlation to the main-table tuple order.
 240          */
 241         OidFunctionCall8(index->amcostestimate,
 242                                          PointerGetDatum(root),
 243                                          PointerGetDatum(baserel),
 244                                          PointerGetDatum(index),
 245                                          PointerGetDatum(indexQuals),
 246                                          PointerGetDatum(&indexStartupCost),
 247                                          PointerGetDatum(&indexTotalCost),
 248                                          PointerGetDatum(&indexSelectivity),
 249                                          PointerGetDatum(&indexCorrelation));
 250
 251         /* all costs for touching index itself included here */
 252         startup_cost += indexStartupCost;
 253         run_cost += indexTotalCost - indexStartupCost;
 254
 255         /*----------
 256          * Estimate number of main-table tuples and pages fetched.
 257          *
 258          * When the index ordering is uncorrelated with the table ordering,
 259          * we use an approximation proposed by Mackert and Lohman, "Index Scans
 260          * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
 261          * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
 262          * The Mackert and Lohman approximation is that the number of pages
 263          * fetched is
 264          *      PF =
 265          *              min(2TNs/(2T+Ns), T)                    when T <= b
 266          *              2TNs/(2T+Ns)                                    when T > b and Ns <= 2Tb/(2T-b)
 267          *              b + (Ns - 2Tb/(2T-b))*(T-b)/T   when T > b and Ns > 2Tb/(2T-b)
 268          * where
 269          *              T = # pages in table
 270          *              N = # tuples in table
 271          *              s = selectivity = fraction of table to be scanned
 272          *              b = # buffer pages available (we include kernel space here)
 273          *
 274          * When the index ordering is exactly correlated with the table ordering
 275          * (just after a CLUSTER, for example), the number of pages fetched should
 276          * be just sT.  What's more, these will be sequential fetches, not the
 277          * random fetches that occur in the uncorrelated case.  So, depending on
 278          * the extent of correlation, we should estimate the actual I/O cost
 279          * somewhere between s * T * 1.0 and PF * random_cost.  We currently
 280          * interpolate linearly between these two endpoints based on the
 281          * correlation squared (XXX is that appropriate?).
 282          *
 283          * In any case the number of tuples fetched is Ns.
 284          *----------
 285          */
 286
 287         tuples_fetched = indexSelectivity * baserel->tuples;
 288         /* Don't believe estimates less than 1... */
 289         if (tuples_fetched < 1.0)
 290                 tuples_fetched = 1.0;
 291
 292         /* This part is the Mackert and Lohman formula */
 293
 294         T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
 295         b = (effective_cache_size > 1) ? effective_cache_size : 1.0;
 296
 297         if (T <= b)
 298         {
 299                 pages_fetched =
 300                         (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
 301                 if (pages_fetched > T)
 302                         pages_fetched = T;
 303         }
 304         else
 305         {
 306                 double          lim;
 307
 308                 lim = (2.0 * T * b) / (2.0 * T - b);
 309                 if (tuples_fetched <= lim)
 310                 {
 311                         pages_fetched =
 312                                 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
 313                 }
 314                 else
 315                 {
 316                         pages_fetched =
 317                                 b + (tuples_fetched - lim) * (T - b) / T;
 318                 }
 319         }
 320
 321         /*
 322          * min_IO_cost corresponds to the perfectly correlated case
 323          * (csquared=1), max_IO_cost to the perfectly uncorrelated case
 324          * (csquared=0).  Note that we just charge random_page_cost per page
 325          * in the uncorrelated case, rather than using
 326          * cost_nonsequential_access, since we've already accounted for
 327          * caching effects by using the Mackert model.
 328          */
 329         min_IO_cost = ceil(indexSelectivity * T);
 330         max_IO_cost = pages_fetched * random_page_cost;
 331
 332         /*
 333          * Now interpolate based on estimated index order correlation to get
 334          * total disk I/O cost for main table accesses.
 335          */
 336         csquared = indexCorrelation * indexCorrelation;
 337
 338         run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
 339
 340         /*
 341          * Estimate CPU costs per tuple.
 342          *
 343          * Normally the indexquals will be removed from the list of restriction
 344          * clauses that we have to evaluate as qpquals, so we should subtract
 345          * their costs from baserestrictcost.  XXX For a lossy index, not all
 346          * the quals will be removed and so we really shouldn't subtract their
 347          * costs; but detecting that seems more expensive than it's worth.
 348          * Also, if we are doing a join then some of the indexquals are join
 349          * clauses and shouldn't be subtracted.  Rather than work out exactly
 350          * how much to subtract, we don't subtract anything.
 351          */
 352         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 353
 354         if (!is_injoin)
 355                 cpu_per_tuple -= cost_qual_eval(indexQuals);
 356
 357         run_cost += cpu_per_tuple * tuples_fetched;
 358
 359         path->startup_cost = startup_cost;
 360         path->total_cost = startup_cost + run_cost;
 361 }
 362
 363 /*
 364  * cost_tidscan
 365  *        Determines and returns the cost of scanning a relation using TIDs.
 366  */
 367 void
 368 cost_tidscan(Path *path, Query *root,
 369                          RelOptInfo *baserel, List *tideval)
 370 {
 371         Cost            startup_cost = 0;
 372         Cost            run_cost = 0;
 373         Cost            cpu_per_tuple;
 374         int                     ntuples = length(tideval);
 375
 376         /* Should only be applied to base relations */
 377         Assert(length(baserel->relids) == 1);
 378         Assert(baserel->rtekind == RTE_RELATION);
 379
 380         if (!enable_tidscan)
 381                 startup_cost += disable_cost;
 382
 383         /* disk costs --- assume each tuple on a different page */
 384         run_cost += random_page_cost * ntuples;
 385
 386         /* CPU costs */
 387         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 388         run_cost += cpu_per_tuple * ntuples;
 389
 390         path->startup_cost = startup_cost;
 391         path->total_cost = startup_cost + run_cost;
 392 }
 393
 394 /*
 395  * cost_functionscan
 396  *        Determines and returns the cost of scanning a function RTE.
 397  */
 398 void
 399 cost_functionscan(Path *path, Query *root, RelOptInfo *baserel)
 400 {
 401         Cost            startup_cost = 0;
 402         Cost            run_cost = 0;
 403         Cost            cpu_per_tuple;
 404
 405         /* Should only be applied to base relations that are functions */
 406         Assert(length(baserel->relids) == 1);
 407         Assert(baserel->rtekind == RTE_FUNCTION);
 408
 409         /*
 410          * For now, estimate function's cost at one operator eval per function
 411          * call.  Someday we should revive the function cost estimate columns
 412          * in pg_proc...
 413          */
 414         cpu_per_tuple = cpu_operator_cost;
 415
 416         /* Add scanning CPU costs */
 417         cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost;
 418         run_cost += cpu_per_tuple * baserel->tuples;
 419
 420         path->startup_cost = startup_cost;
 421         path->total_cost = startup_cost + run_cost;
 422 }
 423
 424 /*
 425  * cost_sort
 426  *        Determines and returns the cost of sorting a relation.
 427  *
 428  * The cost of supplying the input data is NOT included; the caller should
 429  * add that cost to both startup and total costs returned from this routine!
 430  *
 431  * If the total volume of data to sort is less than SortMem, we will do
 432  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 433  * comparisons for t tuples.
 434  *
 435  * If the total volume exceeds SortMem, we switch to a tape-style merge
 436  * algorithm.  There will still be about t*log2(t) tuple comparisons in
 437  * total, but we will also need to write and read each tuple once per
 438  * merge pass.  We expect about ceil(log6(r)) merge passes where r is the
 439  * number of initial runs formed (log6 because tuplesort.c uses six-tape
 440  * merging).  Since the average initial run should be about twice SortMem,
 441  * we have
 442  *              disk traffic = 2 * relsize * ceil(log6(p / (2*SortMem)))
 443  *              cpu = comparison_cost * t * log2(t)
 444  *
 445  * The disk traffic is assumed to be half sequential and half random
 446  * accesses (XXX can't we refine that guess?)
 447  *
 448  * We charge two operator evals per tuple comparison, which should be in
 449  * the right ballpark in most cases.
 450  *
 451  * 'pathkeys' is a list of sort keys
 452  * 'tuples' is the number of tuples in the relation
 453  * 'width' is the average tuple width in bytes
 454  *
 455  * NOTE: some callers currently pass NIL for pathkeys because they
 456  * can't conveniently supply the sort keys.  Since this routine doesn't
 457  * currently do anything with pathkeys anyway, that doesn't matter...
 458  * but if it ever does, it should react gracefully to lack of key data.
 459  */
 460 void
 461 cost_sort(Path *path, Query *root,
 462                   List *pathkeys, double tuples, int width)
 463 {
 464         Cost            startup_cost = 0;
 465         Cost            run_cost = 0;
 466         double          nbytes = relation_byte_size(tuples, width);
 467         long            sortmembytes = SortMem * 1024L;
 468
 469         if (!enable_sort)
 470                 startup_cost += disable_cost;
 471
 472         /*
 473          * We want to be sure the cost of a sort is never estimated as zero,
 474          * even if passed-in tuple count is zero.  Besides, mustn't do
 475          * log(0)...
 476          */
 477         if (tuples < 2.0)
 478                 tuples = 2.0;
 479
 480         /*
 481          * CPU costs
 482          *
 483          * Assume about two operator evals per tuple comparison and N log2 N
 484          * comparisons
 485          */
 486         startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
 487
 488         /* disk costs */
 489         if (nbytes > sortmembytes)
 490         {
 491                 double          npages = ceil(nbytes / BLCKSZ);
 492                 double          nruns = nbytes / (sortmembytes * 2);
 493                 double          log_runs = ceil(LOG6(nruns));
 494                 double          npageaccesses;
 495
 496                 if (log_runs < 1.0)
 497                         log_runs = 1.0;
 498                 npageaccesses = 2.0 * npages * log_runs;
 499                 /* Assume half are sequential (cost 1), half are not */
 500                 startup_cost += npageaccesses *
 501                         (1.0 + cost_nonsequential_access(npages)) * 0.5;
 502         }
 503
 504         /*
 505          * Also charge a small amount (arbitrarily set equal to operator cost)
 506          * per extracted tuple.
 507          */
 508         run_cost += cpu_operator_cost * tuples;
 509
 510         path->startup_cost = startup_cost;
 511         path->total_cost = startup_cost + run_cost;
 512 }
 513
 514
 515 /*
 516  * cost_nestloop
 517  *        Determines and returns the cost of joining two relations using the
 518  *        nested loop algorithm.
 519  *
 520  * 'outer_path' is the path for the outer relation
 521  * 'inner_path' is the path for the inner relation
 522  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 523  */
 524 void
 525 cost_nestloop(Path *path, Query *root,
 526                           Path *outer_path,
 527                           Path *inner_path,
 528                           List *restrictlist)
 529 {
 530         Cost            startup_cost = 0;
 531         Cost            run_cost = 0;
 532         Cost            cpu_per_tuple;
 533         double          ntuples;
 534
 535         if (!enable_nestloop)
 536                 startup_cost += disable_cost;
 537
 538         /* cost of source data */
 539
 540         /*
 541          * NOTE: clearly, we must pay both outer and inner paths' startup_cost
 542          * before we can start returning tuples, so the join's startup cost is
 543          * their sum.  What's not so clear is whether the inner path's
 544          * startup_cost must be paid again on each rescan of the inner path.
 545          * This is not true if the inner path is materialized, but probably is
 546          * true otherwise.      Since we don't yet have clean handling of the
 547          * decision whether to materialize a path, we can't tell here which
 548          * will happen.  As a compromise, charge 50% of the inner startup cost
 549          * for each restart.
 550          */
 551         startup_cost += outer_path->startup_cost + inner_path->startup_cost;
 552         run_cost += outer_path->total_cost - outer_path->startup_cost;
 553         run_cost += outer_path->parent->rows *
 554                 (inner_path->total_cost - inner_path->startup_cost);
 555         if (outer_path->parent->rows > 1)
 556                 run_cost += (outer_path->parent->rows - 1) *
 557                         inner_path->startup_cost * 0.5;
 558
 559         /*
 560          * Number of tuples processed (not number emitted!).  If inner path is
 561          * an indexscan, be sure to use its estimated output row count, which
 562          * may be lower than the restriction-clause-only row count of its
 563          * parent.
 564          */
 565         if (IsA(inner_path, IndexPath))
 566                 ntuples = ((IndexPath *) inner_path)->rows;
 567         else
 568                 ntuples = inner_path->parent->rows;
 569         ntuples *= outer_path->parent->rows;
 570
 571         /* CPU costs */
 572         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 573         run_cost += cpu_per_tuple * ntuples;
 574
 575         path->startup_cost = startup_cost;
 576         path->total_cost = startup_cost + run_cost;
 577 }
 578
 579 /*
 580  * cost_mergejoin
 581  *        Determines and returns the cost of joining two relations using the
 582  *        merge join algorithm.
 583  *
 584  * 'outer_path' is the path for the outer relation
 585  * 'inner_path' is the path for the inner relation
 586  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 587  * 'mergeclauses' are the RestrictInfo nodes to use as merge clauses
 588  *              (this should be a subset of the restrictlist)
 589  * 'outersortkeys' and 'innersortkeys' are lists of the keys to be used
 590  *                              to sort the outer and inner relations, or NIL if no explicit
 591  *                              sort is needed because the source path is already ordered
 592  */
 593 void
 594 cost_mergejoin(Path *path, Query *root,
 595                            Path *outer_path,
 596                            Path *inner_path,
 597                            List *restrictlist,
 598                            List *mergeclauses,
 599                            List *outersortkeys,
 600                            List *innersortkeys)
 601 {
 602         Cost            startup_cost = 0;
 603         Cost            run_cost = 0;
 604         Cost            cpu_per_tuple;
 605         RestrictInfo *firstclause;
 606         Var                *leftvar;
 607         double          outer_rows,
 608                                 inner_rows;
 609         double          ntuples;
 610         Selectivity outerscansel,
 611                                 innerscansel;
 612         Path            sort_path;              /* dummy for result of cost_sort */
 613
 614         if (!enable_mergejoin)
 615                 startup_cost += disable_cost;
 616
 617         /*
 618          * A merge join will stop as soon as it exhausts either input stream.
 619          * Estimate fraction of the left and right inputs that will actually
 620          * need to be scanned.  We use only the first (most significant) merge
 621          * clause for this purpose.
 622          *
 623          * Since this calculation is somewhat expensive, and will be the same for
 624          * all mergejoin paths associated with the merge clause, we cache the
 625          * results in the RestrictInfo node.
 626          */
 627         firstclause = (RestrictInfo *) lfirst(mergeclauses);
 628         if (firstclause->left_mergescansel < 0)         /* not computed yet? */
 629                 mergejoinscansel(root, (Node *) firstclause->clause,
 630                                                  &firstclause->left_mergescansel,
 631                                                  &firstclause->right_mergescansel);
 632
 633         leftvar = get_leftop(firstclause->clause);
 634         Assert(IsA(leftvar, Var));
 635         if (VARISRELMEMBER(leftvar->varno, outer_path->parent))
 636         {
 637                 /* left side of clause is outer */
 638                 outerscansel = firstclause->left_mergescansel;
 639                 innerscansel = firstclause->right_mergescansel;
 640         }
 641         else
 642         {
 643                 /* left side of clause is inner */
 644                 outerscansel = firstclause->right_mergescansel;
 645                 innerscansel = firstclause->left_mergescansel;
 646         }
 647
 648         outer_rows = outer_path->parent->rows * outerscansel;
 649         inner_rows = inner_path->parent->rows * innerscansel;
 650
 651         /* cost of source data */
 652
 653         /*
 654          * Note we are assuming that each source tuple is fetched just once,
 655          * which is not right in the presence of equal keys.  If we had a way
 656          * of estimating the proportion of equal keys, we could apply a
 657          * correction factor...
 658          */
 659         if (outersortkeys)                      /* do we need to sort outer? */
 660         {
 661                 startup_cost += outer_path->total_cost;
 662                 cost_sort(&sort_path,
 663                                   root,
 664                                   outersortkeys,
 665                                   outer_path->parent->rows,
 666                                   outer_path->parent->width);
 667                 startup_cost += sort_path.startup_cost;
 668                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
 669                         * outerscansel;
 670         }
 671         else
 672         {
 673                 startup_cost += outer_path->startup_cost;
 674                 run_cost += (outer_path->total_cost - outer_path->startup_cost)
 675                         * outerscansel;
 676         }
 677
 678         if (innersortkeys)                      /* do we need to sort inner? */
 679         {
 680                 startup_cost += inner_path->total_cost;
 681                 cost_sort(&sort_path,
 682                                   root,
 683                                   innersortkeys,
 684                                   inner_path->parent->rows,
 685                                   inner_path->parent->width);
 686                 startup_cost += sort_path.startup_cost;
 687                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
 688                         * innerscansel;
 689         }
 690         else
 691         {
 692                 startup_cost += inner_path->startup_cost;
 693                 run_cost += (inner_path->total_cost - inner_path->startup_cost)
 694                         * innerscansel;
 695         }
 696
 697         /*
 698          * The number of tuple comparisons needed depends drastically on the
 699          * number of equal keys in the two source relations, which we have no
 700          * good way of estimating.      (XXX could the MCV statistics help?)
 701          * Somewhat arbitrarily, we charge one tuple comparison (one
 702          * cpu_operator_cost) for each tuple in the two source relations.
 703          * This is probably a lower bound.
 704          */
 705         run_cost += cpu_operator_cost * (outer_rows + inner_rows);
 706
 707         /*
 708          * For each tuple that gets through the mergejoin proper, we charge
 709          * cpu_tuple_cost plus the cost of evaluating additional restriction
 710          * clauses that are to be applied at the join.  It's OK to use an
 711          * approximate selectivity here, since in most cases this is a minor
 712          * component of the cost.  NOTE: it's correct to use the unscaled rows
 713          * counts here, not the scaled-down counts we obtained above.
 714          */
 715         ntuples = approx_selectivity(root, mergeclauses) *
 716                 outer_path->parent->rows * inner_path->parent->rows;
 717
 718         /* CPU costs */
 719         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 720         run_cost += cpu_per_tuple * ntuples;
 721
 722         path->startup_cost = startup_cost;
 723         path->total_cost = startup_cost + run_cost;
 724 }
 725
 726 /*
 727  * cost_hashjoin
 728  *        Determines and returns the cost of joining two relations using the
 729  *        hash join algorithm.
 730  *
 731  * 'outer_path' is the path for the outer relation
 732  * 'inner_path' is the path for the inner relation
 733  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 734  * 'hashclauses' is a list of the hash join clause (always a 1-element list)
 735  *              (this should be a subset of the restrictlist)
 736  */
 737 void
 738 cost_hashjoin(Path *path, Query *root,
 739                           Path *outer_path,
 740                           Path *inner_path,
 741                           List *restrictlist,
 742                           List *hashclauses)
 743 {
 744         Cost            startup_cost = 0;
 745         Cost            run_cost = 0;
 746         Cost            cpu_per_tuple;
 747         double          ntuples;
 748         double          outerbytes = relation_byte_size(outer_path->parent->rows,
 749                                                                                           outer_path->parent->width);
 750         double          innerbytes = relation_byte_size(inner_path->parent->rows,
 751                                                                                           inner_path->parent->width);
 752         long            hashtablebytes = SortMem * 1024L;
 753         RestrictInfo *restrictinfo;
 754         Var                *left,
 755                            *right;
 756         Selectivity innerbucketsize;
 757
 758         if (!enable_hashjoin)
 759                 startup_cost += disable_cost;
 760
 761         /* cost of source data */
 762         startup_cost += outer_path->startup_cost;
 763         run_cost += outer_path->total_cost - outer_path->startup_cost;
 764         startup_cost += inner_path->total_cost;
 765
 766         /* cost of computing hash function: must do it once per input tuple */
 767         startup_cost += cpu_operator_cost * inner_path->parent->rows;
 768         run_cost += cpu_operator_cost * outer_path->parent->rows;
 769
 770         /*
 771          * Determine bucketsize fraction for inner relation.  First we have to
 772          * figure out which side of the hashjoin clause is the inner side.
 773          */
 774         Assert(length(hashclauses) == 1);
 775         Assert(IsA(lfirst(hashclauses), RestrictInfo));
 776         restrictinfo = (RestrictInfo *) lfirst(hashclauses);
 777         /* these must be OK, since check_hashjoinable accepted the clause */
 778         left = get_leftop(restrictinfo->clause);
 779         right = get_rightop(restrictinfo->clause);
 780
 781         /*
 782          * Since we tend to visit the same clauses over and over when planning
 783          * a large query, we cache the bucketsize estimate in the RestrictInfo
 784          * node to avoid repeated lookups of statistics.
 785          */
 786         if (VARISRELMEMBER(right->varno, inner_path->parent))
 787         {
 788                 /* righthand side is inner */
 789                 innerbucketsize = restrictinfo->right_bucketsize;
 790                 if (innerbucketsize < 0)
 791                 {
 792                         /* not cached yet */
 793                         innerbucketsize = estimate_hash_bucketsize(root, right);
 794                         restrictinfo->right_bucketsize = innerbucketsize;
 795                 }
 796         }
 797         else
 798         {
 799                 Assert(VARISRELMEMBER(left->varno, inner_path->parent));
 800                 /* lefthand side is inner */
 801                 innerbucketsize = restrictinfo->left_bucketsize;
 802                 if (innerbucketsize < 0)
 803                 {
 804                         /* not cached yet */
 805                         innerbucketsize = estimate_hash_bucketsize(root, left);
 806                         restrictinfo->left_bucketsize = innerbucketsize;
 807                 }
 808         }
 809
 810         /*
 811          * The number of tuple comparisons needed is the number of outer
 812          * tuples times the typical number of tuples in a hash bucket, which
 813          * is the inner relation size times its bucketsize fraction. We charge
 814          * one cpu_operator_cost per tuple comparison.
 815          */
 816         run_cost += cpu_operator_cost * outer_path->parent->rows *
 817                 ceil(inner_path->parent->rows * innerbucketsize);
 818
 819         /*
 820          * For each tuple that gets through the hashjoin proper, we charge
 821          * cpu_tuple_cost plus the cost of evaluating additional restriction
 822          * clauses that are to be applied at the join.  It's OK to use an
 823          * approximate selectivity here, since in most cases this is a minor
 824          * component of the cost.
 825          */
 826         ntuples = approx_selectivity(root, hashclauses) *
 827                 outer_path->parent->rows * inner_path->parent->rows;
 828
 829         /* CPU costs */
 830         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 831         run_cost += cpu_per_tuple * ntuples;
 832
 833         /*
 834          * if inner relation is too big then we will need to "batch" the join,
 835          * which implies writing and reading most of the tuples to disk an
 836          * extra time.  Charge one cost unit per page of I/O (correct since it
 837          * should be nice and sequential...).  Writing the inner rel counts as
 838          * startup cost, all the rest as run cost.
 839          */
 840         if (innerbytes > hashtablebytes)
 841         {
 842                 double          outerpages = page_size(outer_path->parent->rows,
 843                                                                                    outer_path->parent->width);
 844                 double          innerpages = page_size(inner_path->parent->rows,
 845                                                                                    inner_path->parent->width);
 846
 847                 startup_cost += innerpages;
 848                 run_cost += innerpages + 2 * outerpages;
 849         }
 850
 851         /*
 852          * Bias against putting larger relation on inside.      We don't want an
 853          * absolute prohibition, though, since larger relation might have
 854          * better bucketsize --- and we can't trust the size estimates
 855          * unreservedly, anyway.  Instead, inflate the startup cost by the
 856          * square root of the size ratio.  (Why square root?  No real good
 857          * reason, but it seems reasonable...)
 858          */
 859         if (innerbytes > outerbytes && outerbytes > 0)
 860                 startup_cost *= sqrt(innerbytes / outerbytes);
 861
 862         path->startup_cost = startup_cost;
 863         path->total_cost = startup_cost + run_cost;
 864 }
 865
 866 /*
 867  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
 868  * divided by total tuples in relation) if the specified Var is used
 869  * as a hash key.
 870  *
 871  * XXX This is really pretty bogus since we're effectively assuming that the
 872  * distribution of hash keys will be the same after applying restriction
 873  * clauses as it was in the underlying relation.  However, we are not nearly
 874  * smart enough to figure out how the restrict clauses might change the
 875  * distribution, so this will have to do for now.
 876  *
 877  * We can get the number of buckets the executor will use for the given
 878  * input relation.      If the data were perfectly distributed, with the same
 879  * number of tuples going into each available bucket, then the bucketsize
 880  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
 881  * only if (a) there are at least nbuckets distinct data values, and (b)
 882  * we have a not-too-skewed data distribution.  Otherwise the buckets will
 883  * be nonuniformly occupied.  If the other relation in the join has a key
 884  * distribution similar to this one's, then the most-loaded buckets are
 885  * exactly those that will be probed most often.  Therefore, the "average"
 886  * bucket size for costing purposes should really be taken as something close
 887  * to the "worst case" bucket size.  We try to estimate this by adjusting the
 888  * fraction if there are too few distinct data values, and then scaling up
 889  * by the ratio of the most common value's frequency to the average frequency.
 890  *
 891  * If no statistics are available, use a default estimate of 0.1.  This will
 892  * discourage use of a hash rather strongly if the inner relation is large,
 893  * which is what we want.  We do not want to hash unless we know that the
 894  * inner rel is well-dispersed (or the alternatives seem much worse).
 895  */
 896 static Selectivity
 897 estimate_hash_bucketsize(Query *root, Var *var)
 898 {
 899         Oid                     relid;
 900         RelOptInfo *rel;
 901         int                     virtualbuckets;
 902         int                     physicalbuckets;
 903         int                     numbatches;
 904         HeapTuple       tuple;
 905         Form_pg_statistic stats;
 906         double          estfract,
 907                                 ndistinct,
 908                                 mcvfreq,
 909                                 avgfreq;
 910         float4     *numbers;
 911         int                     nnumbers;
 912
 913         /*
 914          * Lookup info about var's relation and attribute; if none available,
 915          * return default estimate.
 916          */
 917         if (!IsA(var, Var))
 918                 return 0.1;
 919
 920         relid = getrelid(var->varno, root->rtable);
 921         if (relid == InvalidOid)
 922                 return 0.1;
 923
 924         rel = find_base_rel(root, var->varno);
 925
 926         if (rel->tuples <= 0.0 || rel->rows <= 0.0)
 927                 return 0.1;                             /* ensure we can divide below */
 928
 929         /* Get hash table size that executor would use for this relation */
 930         ExecChooseHashTableSize(rel->rows, rel->width,
 931                                                         &virtualbuckets,
 932                                                         &physicalbuckets,
 933                                                         &numbatches);
 934
 935         tuple = SearchSysCache(STATRELATT,
 936                                                    ObjectIdGetDatum(relid),
 937                                                    Int16GetDatum(var->varattno),
 938                                                    0, 0);
 939         if (!HeapTupleIsValid(tuple))
 940         {
 941                 /*
 942                  * Perhaps the Var is a system attribute; if so, it will have no
 943                  * entry in pg_statistic, but we may be able to guess something
 944                  * about its distribution anyway.
 945                  */
 946                 switch (var->varattno)
 947                 {
 948                         case ObjectIdAttributeNumber:
 949                         case SelfItemPointerAttributeNumber:
 950                                 /* these are unique, so buckets should be well-distributed */
 951                                 return 1.0 / (double) virtualbuckets;
 952                         case TableOidAttributeNumber:
 953                                 /* hashing this is a terrible idea... */
 954                                 return 1.0;
 955                 }
 956                 return 0.1;
 957         }
 958         stats = (Form_pg_statistic) GETSTRUCT(tuple);
 959
 960         /*
 961          * Obtain number of distinct data values in raw relation.
 962          */
 963         ndistinct = stats->stadistinct;
 964         if (ndistinct < 0.0)
 965                 ndistinct = -ndistinct * rel->tuples;
 966
 967         if (ndistinct <= 0.0)           /* ensure we can divide */
 968         {
 969                 ReleaseSysCache(tuple);
 970                 return 0.1;
 971         }
 972
 973         /* Also compute avg freq of all distinct data values in raw relation */
 974         avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
 975
 976         /*
 977          * Adjust ndistinct to account for restriction clauses.  Observe we
 978          * are assuming that the data distribution is affected uniformly by
 979          * the restriction clauses!
 980          *
 981          * XXX Possibly better way, but much more expensive: multiply by
 982          * selectivity of rel's restriction clauses that mention the target
 983          * Var.
 984          */
 985         ndistinct *= rel->rows / rel->tuples;
 986
 987         /*
 988          * Initial estimate of bucketsize fraction is 1/nbuckets as long as
 989          * the number of buckets is less than the expected number of distinct
 990          * values; otherwise it is 1/ndistinct.
 991          */
 992         if (ndistinct > (double) virtualbuckets)
 993                 estfract = 1.0 / (double) virtualbuckets;
 994         else
 995                 estfract = 1.0 / ndistinct;
 996
 997         /*
 998          * Look up the frequency of the most common value, if available.
 999          */
1000         mcvfreq = 0.0;
1001
1002         if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
1003                                                  STATISTIC_KIND_MCV, InvalidOid,
1004                                                  NULL, NULL, &numbers, &nnumbers))
1005         {
1006                 /*
1007                  * The first MCV stat is for the most common value.
1008                  */
1009                 if (nnumbers > 0)
1010                         mcvfreq = numbers[0];
1011                 free_attstatsslot(var->vartype, NULL, 0,
1012                                                   numbers, nnumbers);
1013         }
1014
1015         /*
1016          * Adjust estimated bucketsize upward to account for skewed
1017          * distribution.
1018          */
1019         if (avgfreq > 0.0 && mcvfreq > avgfreq)
1020                 estfract *= mcvfreq / avgfreq;
1021
1022         ReleaseSysCache(tuple);
1023
1024         return (Selectivity) estfract;
1025 }
1026
1027
1028 /*
1029  * cost_qual_eval
1030  *              Estimate the CPU cost of evaluating a WHERE clause (once).
1031  *              The input can be either an implicitly-ANDed list of boolean
1032  *              expressions, or a list of RestrictInfo nodes.
1033  */
1034 Cost
1035 cost_qual_eval(List *quals)
1036 {
1037         Cost            total = 0;
1038         List       *l;
1039
1040         /* We don't charge any cost for the implicit ANDing at top level ... */
1041
1042         foreach(l, quals)
1043         {
1044                 Node       *qual = (Node *) lfirst(l);
1045
1046                 /*
1047                  * RestrictInfo nodes contain an eval_cost field reserved for this
1048                  * routine's use, so that it's not necessary to evaluate the qual
1049                  * clause's cost more than once.  If the clause's cost hasn't been
1050                  * computed yet, the field will contain -1.
1051                  */
1052                 if (qual && IsA(qual, RestrictInfo))
1053                 {
1054                         RestrictInfo *restrictinfo = (RestrictInfo *) qual;
1055
1056                         if (restrictinfo->eval_cost < 0)
1057                         {
1058                                 restrictinfo->eval_cost = 0;
1059                                 cost_qual_eval_walker((Node *) restrictinfo->clause,
1060                                                                           &restrictinfo->eval_cost);
1061                         }
1062                         total += restrictinfo->eval_cost;
1063                 }
1064                 else
1065                 {
1066                         /* If it's a bare expression, must always do it the hard way */
1067                         cost_qual_eval_walker(qual, &total);
1068                 }
1069         }
1070         return total;
1071 }
1072
1073 static bool
1074 cost_qual_eval_walker(Node *node, Cost *total)
1075 {
1076         if (node == NULL)
1077                 return false;
1078
1079         /*
1080          * Our basic strategy is to charge one cpu_operator_cost for each
1081          * operator or function node in the given tree.  Vars and Consts are
1082          * charged zero, and so are boolean operators (AND, OR, NOT).
1083          * Simplistic, but a lot better than no model at all.
1084          *
1085          * Should we try to account for the possibility of short-circuit
1086          * evaluation of AND/OR?
1087          */
1088         if (IsA(node, Expr))
1089         {
1090                 Expr       *expr = (Expr *) node;
1091
1092                 switch (expr->opType)
1093                 {
1094                         case OP_EXPR:
1095                         case DISTINCT_EXPR:
1096                         case FUNC_EXPR:
1097                                 *total += cpu_operator_cost;
1098                                 break;
1099                         case OR_EXPR:
1100                         case AND_EXPR:
1101                         case NOT_EXPR:
1102                                 break;
1103                         case SUBPLAN_EXPR:
1104
1105                                 /*
1106                                  * A subplan node in an expression indicates that the
1107                                  * subplan will be executed on each evaluation, so charge
1108                                  * accordingly. (We assume that sub-selects that can be
1109                                  * executed as InitPlans have already been removed from
1110                                  * the expression.)
1111                                  *
1112                                  * NOTE: this logic should agree with the estimates used by
1113                                  * make_subplan() in plan/subselect.c.
1114                                  */
1115                                 {
1116                                         SubPlan    *subplan = (SubPlan *) expr->oper;
1117                                         Plan       *plan = subplan->plan;
1118                                         Cost            subcost;
1119
1120                                         if (subplan->sublink->subLinkType == EXISTS_SUBLINK)
1121                                         {
1122                                                 /* we only need to fetch 1 tuple */
1123                                                 subcost = plan->startup_cost +
1124                                                         (plan->total_cost - plan->startup_cost) / plan->plan_rows;
1125                                         }
1126                                         else if (subplan->sublink->subLinkType == ALL_SUBLINK ||
1127                                                          subplan->sublink->subLinkType == ANY_SUBLINK)
1128                                         {
1129                                                 /* assume we need 50% of the tuples */
1130                                                 subcost = plan->startup_cost +
1131                                                         0.50 * (plan->total_cost - plan->startup_cost);
1132                                                 /* XXX what if subplan has been materialized? */
1133                                         }
1134                                         else
1135                                         {
1136                                                 /* assume we need all tuples */
1137                                                 subcost = plan->total_cost;
1138                                         }
1139                                         *total += subcost;
1140                                 }
1141                                 break;
1142                 }
1143                 /* fall through to examine args of Expr node */
1144         }
1145         return expression_tree_walker(node, cost_qual_eval_walker,
1146                                                                   (void *) total);
1147 }
1148
1149
1150 /*
1151  * approx_selectivity
1152  *              Quick-and-dirty estimation of clause selectivities.
1153  *              The input can be either an implicitly-ANDed list of boolean
1154  *              expressions, or a list of RestrictInfo nodes (typically the latter).
1155  *
1156  * The "quick" part comes from caching the selectivity estimates so we can
1157  * avoid recomputing them later.  (Since the same clauses are typically
1158  * examined over and over in different possible join trees, this makes a
1159  * big difference.)
1160  *
1161  * The "dirty" part comes from the fact that the selectivities of multiple
1162  * clauses are estimated independently and multiplied together.  Now
1163  * clauselist_selectivity often can't do any better than that anyhow, but
1164  * for some situations (such as range constraints) it is smarter.
1165  *
1166  * Since we are only using the results to estimate how many potential
1167  * output tuples are generated and passed through qpqual checking, it
1168  * seems OK to live with the approximation.
1169  */
1170 static Selectivity
1171 approx_selectivity(Query *root, List *quals)
1172 {
1173         Selectivity total = 1.0;
1174         List       *l;
1175
1176         foreach(l, quals)
1177         {
1178                 Node       *qual = (Node *) lfirst(l);
1179                 Selectivity selec;
1180
1181                 /*
1182                  * RestrictInfo nodes contain a this_selec field reserved for this
1183                  * routine's use, so that it's not necessary to evaluate the qual
1184                  * clause's selectivity more than once.  If the clause's
1185                  * selectivity hasn't been computed yet, the field will contain
1186                  * -1.
1187                  */
1188                 if (qual && IsA(qual, RestrictInfo))
1189                 {
1190                         RestrictInfo *restrictinfo = (RestrictInfo *) qual;
1191
1192                         if (restrictinfo->this_selec < 0)
1193                                 restrictinfo->this_selec =
1194                                         clause_selectivity(root,
1195                                                                            (Node *) restrictinfo->clause,
1196                                                                            0);
1197                         selec = restrictinfo->this_selec;
1198                 }
1199                 else
1200                 {
1201                         /* If it's a bare expression, must always do it the hard way */
1202                         selec = clause_selectivity(root, qual, 0);
1203                 }
1204                 total *= selec;
1205         }
1206         return total;
1207 }
1208
1209
1210 /*
1211  * set_baserel_size_estimates
1212  *              Set the size estimates for the given base relation.
1213  *
1214  * The rel's targetlist and restrictinfo list must have been constructed
1215  * already.
1216  *
1217  * We set the following fields of the rel node:
1218  *      rows: the estimated number of output tuples (after applying
1219  *                restriction clauses).
1220  *      width: the estimated average output tuple width in bytes.
1221  *      baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
1222  */
1223 void
1224 set_baserel_size_estimates(Query *root, RelOptInfo *rel)
1225 {
1226         /* Should only be applied to base relations */
1227         Assert(length(rel->relids) == 1);
1228
1229         rel->rows = rel->tuples *
1230                 restrictlist_selectivity(root,
1231                                                                  rel->baserestrictinfo,
1232                                                                  lfirsti(rel->relids));
1233
1234         /*
1235          * Force estimate to be at least one row, to make explain output look
1236          * better and to avoid possible divide-by-zero when interpolating
1237          * cost.
1238          */
1239         if (rel->rows < 1.0)
1240                 rel->rows = 1.0;
1241
1242         rel->baserestrictcost = cost_qual_eval(rel->baserestrictinfo);
1243
1244         set_rel_width(root, rel);
1245 }
1246
1247 /*
1248  * set_joinrel_size_estimates
1249  *              Set the size estimates for the given join relation.
1250  *
1251  * The rel's targetlist must have been constructed already, and a
1252  * restriction clause list that matches the given component rels must
1253  * be provided.
1254  *
1255  * Since there is more than one way to make a joinrel for more than two
1256  * base relations, the results we get here could depend on which component
1257  * rel pair is provided.  In theory we should get the same answers no matter
1258  * which pair is provided; in practice, since the selectivity estimation
1259  * routines don't handle all cases equally well, we might not.  But there's
1260  * not much to be done about it.  (Would it make sense to repeat the
1261  * calculations for each pair of input rels that's encountered, and somehow
1262  * average the results?  Probably way more trouble than it's worth.)
1263  *
1264  * We set the same relnode fields as set_baserel_size_estimates() does.
1265  */
1266 void
1267 set_joinrel_size_estimates(Query *root, RelOptInfo *rel,
1268                                                    RelOptInfo *outer_rel,
1269                                                    RelOptInfo *inner_rel,
1270                                                    JoinType jointype,
1271                                                    List *restrictlist)
1272 {
1273         double          temp;
1274
1275         /* Start with the Cartesian product */
1276         temp = outer_rel->rows * inner_rel->rows;
1277
1278         /*
1279          * Apply join restrictivity.  Note that we are only considering
1280          * clauses that become restriction clauses at this join level; we are
1281          * not double-counting them because they were not considered in
1282          * estimating the sizes of the component rels.
1283          */
1284         temp *= restrictlist_selectivity(root,
1285                                                                          restrictlist,
1286                                                                          0);
1287
1288         /*
1289          * If we are doing an outer join, take that into account: the output
1290          * must be at least as large as the non-nullable input.  (Is there any
1291          * chance of being even smarter?)
1292          */
1293         switch (jointype)
1294         {
1295                 case JOIN_INNER:
1296                         break;
1297                 case JOIN_LEFT:
1298                         if (temp < outer_rel->rows)
1299                                 temp = outer_rel->rows;
1300                         break;
1301                 case JOIN_RIGHT:
1302                         if (temp < inner_rel->rows)
1303                                 temp = inner_rel->rows;
1304                         break;
1305                 case JOIN_FULL:
1306                         if (temp < outer_rel->rows)
1307                                 temp = outer_rel->rows;
1308                         if (temp < inner_rel->rows)
1309                                 temp = inner_rel->rows;
1310                         break;
1311                 default:
1312                         elog(ERROR, "set_joinrel_size_estimates: unsupported join type %d",
1313                                  (int) jointype);
1314                         break;
1315         }
1316
1317         /*
1318          * Force estimate to be at least one row, to make explain output look
1319          * better and to avoid possible divide-by-zero when interpolating
1320          * cost.
1321          */
1322         if (temp < 1.0)
1323                 temp = 1.0;
1324
1325         rel->rows = temp;
1326
1327         /*
1328          * We could apply set_rel_width() to compute the output tuple width
1329          * from scratch, but at present it's always just the sum of the input
1330          * widths, so why work harder than necessary?  If relnode.c is ever
1331          * taught to remove unneeded columns from join targetlists, go back to
1332          * using set_rel_width here.
1333          */
1334         rel->width = outer_rel->width + inner_rel->width;
1335 }
1336
1337 /*
1338  * set_function_size_estimates
1339  *              Set the size estimates for a base relation that is a function call.
1340  *
1341  * The rel's targetlist and restrictinfo list must have been constructed
1342  * already.
1343  *
1344  * We set the following fields of the rel node:
1345  *      rows: the estimated number of output tuples (after applying
1346  *                restriction clauses).
1347  *      width: the estimated average output tuple width in bytes.
1348  *      baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
1349  */
1350 void
1351 set_function_size_estimates(Query *root, RelOptInfo *rel)
1352 {
1353         /* Should only be applied to base relations that are functions */
1354         Assert(length(rel->relids) == 1);
1355         Assert(rel->rtekind == RTE_FUNCTION);
1356
1357         /*
1358          * Estimate number of rows the function itself will return.
1359          *
1360          * XXX no idea how to do this yet; but should at least check whether
1361          * function returns set or not...
1362          */
1363         rel->tuples = 1000;
1364
1365         /* Now estimate number of output rows */
1366         rel->rows = rel->tuples *
1367                 restrictlist_selectivity(root,
1368                                                                  rel->baserestrictinfo,
1369                                                                  lfirsti(rel->relids));
1370
1371         /*
1372          * Force estimate to be at least one row, to make explain output look
1373          * better and to avoid possible divide-by-zero when interpolating
1374          * cost.
1375          */
1376         if (rel->rows < 1.0)
1377                 rel->rows = 1.0;
1378
1379         rel->baserestrictcost = cost_qual_eval(rel->baserestrictinfo);
1380
1381         set_rel_width(root, rel);
1382 }
1383
1384
1385 /*
1386  * set_rel_width
1387  *              Set the estimated output width of the relation.
1388  *
1389  * NB: this works best on base relations because it prefers to look at
1390  * real Vars.  It will fail to make use of pg_statistic info when applied
1391  * to a subquery relation, even if the subquery outputs are simple vars
1392  * that we could have gotten info for.  Is it worth trying to be smarter
1393  * about subqueries?
1394  */
1395 static void
1396 set_rel_width(Query *root, RelOptInfo *rel)
1397 {
1398         int32           tuple_width = 0;
1399         List       *tllist;
1400
1401         foreach(tllist, rel->targetlist)
1402         {
1403                 TargetEntry *tle = (TargetEntry *) lfirst(tllist);
1404                 int32           item_width;
1405
1406                 /*
1407                  * If it's a Var, try to get statistical info from pg_statistic.
1408                  */
1409                 if (tle->expr && IsA(tle->expr, Var))
1410                 {
1411                         Var                *var = (Var *) tle->expr;
1412                         Oid                     relid;
1413
1414                         relid = getrelid(var->varno, root->rtable);
1415                         if (relid != InvalidOid)
1416                         {
1417                                 item_width = get_attavgwidth(relid, var->varattno);
1418                                 if (item_width > 0)
1419                                 {
1420                                         tuple_width += item_width;
1421                                         continue;
1422                                 }
1423                         }
1424                 }
1425
1426                 /*
1427                  * Not a Var, or can't find statistics for it.  Estimate using
1428                  * just the type info.
1429                  */
1430                 item_width = get_typavgwidth(tle->resdom->restype,
1431                                                                          tle->resdom->restypmod);
1432                 Assert(item_width > 0);
1433                 tuple_width += item_width;
1434         }
1435         Assert(tuple_width >= 0);
1436         rel->width = tuple_width;
1437 }
1438
1439 /*
1440  * relation_byte_size
1441  *        Estimate the storage space in bytes for a given number of tuples
1442  *        of a given width (size in bytes).
1443  */
1444 static double
1445 relation_byte_size(double tuples, int width)
1446 {
1447         return tuples * ((double) MAXALIGN(width + sizeof(HeapTupleData)));
1448 }
1449
1450 /*
1451  * page_size
1452  *        Returns an estimate of the number of pages covered by a given
1453  *        number of tuples of a given width (size in bytes).
1454  */
1455 static double
1456 page_size(double tuples, int width)
1457 {
1458         return ceil(relation_byte_size(tuples, width) / BLCKSZ);
1459 }