From a18ebc5541c20bf6aca70532bbf1a0531d1b2659 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 30 May 2006 19:24:25 +0000 Subject: [PATCH] Code review for EXPLAIN patch. Fix some typos, make it behave sanely across multiple loops, get rid of the shaky assumption that exactly one tuple is returned per node iteration. --- src/backend/executor/instrument.c | 178 +++++++++++++++++++++++--------------- src/include/executor/instrument.h | 11 ++- 2 files changed, 113 insertions(+), 76 deletions(-) diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 41b061bd52..d094ffbbcf 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -7,90 +7,117 @@ * Copyright (c) 2001-2006, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" -#include #include +#include #include "executor/instrument.h" -/* This is the function that is used to determine the sampling intervals. In - * general, if the function is f(x), then for N tuples we will take on the - * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the - * number of samples that would be collected over 1,000,000 tuples. - f(x) = x => log2(N) 20 - f(x) = x^(1/2) => 2 * N^(1/2) 2000 - f(x) = x^(1/3) => 1.5 * N^(2/3) 15000 - +/* + * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE + * by not calling INSTR_TIME_SET_CURRENT() for every single node execution. + * (Because that requires a kernel call on most systems, it's expensive.) + * + * This macro determines the sampling interval: that is, after how many more + * iterations will we take the next time sample, given that niters iterations + * have occurred already. In general, if the function is f(x), then for N + * iterations we will take on the order of integral(1/f(x), x=0..N) + * samples. Some examples follow, with the number of samples that would be + * collected over 1,000,000 iterations: + * + * f(x) = x => log2(N) 20 + * f(x) = x^(1/2) => 2 * N^(1/2) 2000 + * f(x) = x^(1/3) => 1.5 * N^(2/3) 15000 + * * I've chosen the last one as it seems to provide a good compromise between * low overhead but still getting a meaningful number of samples. However, * not all machines have the cbrt() function so on those we substitute * sqrt(). The difference is not very significant in the tests I made. -*/ + * + * The actual sampling interval is randomized with the SampleFunc() value + * as the mean; this hopefully will reduce any measurement bias due to + * cyclic variation in the node execution time. + */ #ifdef HAVE_CBRT -#define SampleFunc cbrt +#define SampleFunc(niters) cbrt(niters) #else -#define SampleFunc sqrt +#define SampleFunc(niters) sqrt(niters) #endif +#define SampleInterval(niters) \ + (SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2)) + +/* + * We sample at every node iteration until we've reached this threshold, + * so that nodes not called a large number of times are completely accurate. + * (Perhaps this should be a GUC variable? But beware of messing up + * CalculateSampleOverhead if value is too small.) + */ #define SAMPLE_THRESHOLD 50 +/* + * Determine the sampling overhead. This only needs to be done once per + * backend (actually, probably once per postmaster would do ...) + */ static double SampleOverhead; -static bool SampleOverheadCalculated; +static bool SampleOverheadCalculated = false; static void -CalculateSampleOverhead() +CalculateSampleOverhead(void) { Instrumentation instr; int i; - - /* We want to determine the sampling overhead, to correct - * calculations later. This only needs to be done once per backend. - * Is this the place? A wrong value here (due to a mistimed - * task-switch) will cause bad calculations later. - * + + /* + * We could get a wrong result due to being interrupted by task switch. * To minimize the risk we do it a few times and take the lowest. */ - SampleOverhead = 1.0e6; - - for( i = 0; i<5; i++ ) + + for (i = 0; i < 5; i++) { int j; double overhead; - - memset( &instr, 0, sizeof(instr) ); - - /* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */ - for( j=0; jstarttime)) { - /* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */ - if (instr->tuplecount < SAMPLE_THRESHOLD) + /* + * Always sample if not yet up to threshold, else check whether + * next threshold has been reached + */ + if (instr->itercount < SAMPLE_THRESHOLD) instr->sampling = true; - else + else if (instr->itercount >= instr->nextsample) { - /* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */ - if( instr->tuplecount > instr->nextsample ) - { - instr->sampling = true; - /* The doubling is so the random will average 1 over time */ - instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX; - } + instr->sampling = true; + instr->nextsample = + instr->itercount + SampleInterval(instr->itercount); } if (instr->sampling) INSTR_TIME_SET_CURRENT(instr->starttime); @@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr) void InstrStopNode(Instrumentation *instr, double nTuples) { - instr_time endtime; - - /* count the returned tuples */ + /* count the returned tuples and iterations */ instr->tuplecount += nTuples; + instr->itercount += 1; + /* measure runtime if appropriate */ if (instr->sampling) { + instr_time endtime; + if (INSTR_TIME_IS_ZERO(instr->starttime)) { elog(DEBUG2, "InstrStopNode called without start"); @@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples) #endif INSTR_TIME_SET_ZERO(instr->starttime); - instr->samplecount += nTuples; + + instr->samplecount += 1; instr->sampling = false; } @@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(DEBUG2, "InstrEndLoop called on running node"); - /* Accumulate per-cycle statistics into totals */ + /* Compute time spent in node */ totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); - instr->startup += instr->firsttuple; - - /* Here we take into account sampling effects. Doing it naively ends - * up assuming the sampling overhead applies to all tuples, even the - * ones we didn't measure. We've calculated an overhead, so we - * subtract that for all samples we didn't measure. The first tuple - * is also special cased, because it usually takes longer. */ - - if( instr->samplecount < instr->tuplecount ) + /* + * If we didn't measure runtime on every iteration, then we must increase + * the measured total to account for the other iterations. Naively + * multiplying totaltime by itercount/samplecount would be wrong because + * it effectively assumes the sampling overhead applies to all iterations, + * even the ones we didn't measure. (Note that what we are trying to + * estimate here is the actual time spent in the node, including the + * actual measurement overhead; not the time exclusive of measurement + * overhead.) We exclude the first iteration from the correction basis, + * because it often takes longer than others. + */ + if (instr->itercount > instr->samplecount) { - double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1); - instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1)) - + ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount)); + double per_iter; + + per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1) + - SampleOverhead; + if (per_iter > 0) /* sanity check */ + totaltime += per_iter * (instr->itercount - instr->samplecount); } - else - instr->total += totaltime; - + + /* Accumulate per-cycle statistics into totals */ + instr->startup += instr->firsttuple; + instr->total += totaltime; instr->ntuples += instr->tuplecount; - instr->nsamples += instr->samplecount; instr->nloops += 1; /* Reset for next cycle (if any) */ instr->running = false; + instr->sampling = false; INSTR_TIME_SET_ZERO(instr->starttime); INSTR_TIME_SET_ZERO(instr->counter); instr->firsttuple = 0; - instr->samplecount = 0; instr->tuplecount = 0; + instr->itercount = 0; + instr->samplecount = 0; + instr->nextsample = 0; } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index fbf578d473..fd57f86578 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -6,7 +6,7 @@ * * Copyright (c) 2001-2006, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.14 2006/05/30 14:01:58 momjian Exp $ + * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.15 2006/05/30 19:24:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -57,20 +57,19 @@ typedef struct Instrumentation { /* Info about current plan cycle: */ bool running; /* TRUE if we've completed first tuple */ + bool sampling; /* Are we sampling in current iteration? */ instr_time starttime; /* Start time of current iteration of node */ instr_time counter; /* Accumulated runtime for this node */ double firsttuple; /* Time for first tuple of this cycle */ double tuplecount; /* Tuples emitted so far this cycle */ - double samplecount; /* Samples collected this cycle */ + double itercount; /* Plan node iterations this cycle */ + double samplecount; /* Iterations in which we sampled runtime */ + double nextsample; /* Next itercount to sample at */ /* Accumulated statistics across all completed cycles: */ double startup; /* Total startup time (in seconds) */ double total; /* Total total time (in seconds) */ double ntuples; /* Total tuples produced */ double nloops; /* # of run cycles for this node */ - double nsamples; /* # of samples taken */ - /* Tracking for sampling */ - bool sampling; /* Are we sampling this iteration */ - double nextsample; /* The next tuplecount we're going to sample */ } Instrumentation; extern Instrumentation *InstrAlloc(int n); -- 2.11.0