1 /*-------------------------------------------------------------------------
4 * Take statistics of plan selection across a whole database cluster.
6 * Execution costs are totaled for each distinct plan for each query,
7 * and plan and queryid are kept in a shared hashtable, each record in
8 * which is associated with a record in pg_stat_statements, if any, by
11 * Postgres 9.3 or earlier does not expose query id, so
12 * pg_store_plans needs to calculate it based on the given query
13 * string using different algorithm from pg_stat_statements, and later
14 * the id will be matched against the one made from query string
15 * stored in pg_stat_statements. For the reason, queryid matching in
16 * this way will fail if the query string kept in pg_stat_statements
17 * is truncated in the middle.
19 * Plans are identified by fingerprinting plan representations in
20 * "shortened" JSON format with constants and unstable values such as
21 * rows, width, loops ignored. Nevertheless, stored plan entries hold
22 * them of the latest execution. Entry eviction is done in the same
23 * way as pg_stat_statements.
25 * Copyright (c) 2008-2020, PostgreSQL Global Development Group
26 * Copyright (c) 2012-2021, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
29 * pg_store_plans/pg_store_plans.c
31 *-------------------------------------------------------------------------
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
59 #include "utils/timestamp.h"
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
75 static int max_plan_len = 5000;
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration) (1.0)
79 #define USAGE_INIT (1.0) /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
86 /* In PostgreSQL 11, queryid becomes a uint64 internally.
88 #if PG_VERSION_NUM >= 110000
89 typedef uint64 queryid_t;
90 #define PGSP_NO_QUERYID UINT64CONST(0)
92 typedef uint32 queryid_t;
93 #define PGSP_NO_QUERYID 0
97 * Extension version number, for supporting older extension versions' objects
99 typedef enum pgspVersion
106 * Hashtable key that defines the identity of a hashtable entry. We separate
107 * queries by user and by database even if they are otherwise identical.
109 * Presently, the query encoding is fully determined by the source database
110 * and so we don't really need it to be in the key. But that might not always
111 * be true. Anyway it's notationally convenient to pass it as part of the key.
113 typedef struct pgspHashKey
115 Oid userid; /* user OID */
116 Oid dbid; /* database OID */
117 queryid_t queryid; /* query identifier */
118 uint32 planid; /* plan identifier */
122 * The actual stats counters kept within pgspEntry.
124 typedef struct Counters
126 int64 calls; /* # of times executed */
127 double total_time; /* total execution time, in msec */
128 double min_time; /* minimum execution time in msec */
129 double max_time; /* maximum execution time in msec */
130 double mean_time; /* mean execution time in msec */
131 double sum_var_time; /* sum of variances in execution time in msec */
132 int64 rows; /* total # of retrieved or affected rows */
133 int64 shared_blks_hit; /* # of shared buffer hits */
134 int64 shared_blks_read; /* # of shared disk blocks read */
135 int64 shared_blks_dirtied;/* # of shared disk blocks dirtied */
136 int64 shared_blks_written;/* # of shared disk blocks written */
137 int64 local_blks_hit; /* # of local buffer hits */
138 int64 local_blks_read; /* # of local disk blocks read */
139 int64 local_blks_dirtied; /* # of local disk blocks dirtied */
140 int64 local_blks_written; /* # of local disk blocks written */
141 int64 temp_blks_read; /* # of temp blocks read */
142 int64 temp_blks_written; /* # of temp blocks written */
143 double blk_read_time; /* time spent reading, in msec */
144 double blk_write_time; /* time spent writing, in msec */
145 TimestampTz first_call; /* timestamp of first call */
146 TimestampTz last_call; /* timestamp of last call */
147 double usage; /* usage factor */
151 * Global statistics for pg_store_plans
153 typedef struct pgspGlobalStats
155 int64 dealloc; /* # of times entries were deallocated */
156 TimestampTz stats_reset; /* timestamp with all stats reset */
160 * Statistics per plan
162 * NB: see the file read/write code before changing field order here.
164 typedef struct pgspEntry
166 pgspHashKey key; /* hash key of entry - MUST BE FIRST */
167 Counters counters; /* the statistics for this query */
168 Size plan_offset; /* plan text offset in extern file */
169 int plan_len; /* # of valid bytes in plan string */
170 int encoding; /* query encoding */
171 slock_t mutex; /* protects the counters only */
175 * Global shared state
177 typedef struct pgspSharedState
179 LWLock *lock; /* protects hashtable search/modification */
180 int plan_size; /* max plan length in bytes */
181 double cur_median_usage; /* current median usage in hashtable */
182 Size mean_plan_len; /* current mean entry text length */
183 slock_t mutex; /* protects following fields only: */
184 Size extent; /* current extent of plan file */
185 int n_writers; /* number of active writers to plan file */
186 int gc_count; /* plan file garbage collection cycle count */
187 pgspGlobalStats stats; /* global statistics for pgsp */
190 /*---- Local variables ----*/
192 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
193 static int nested_level = 0;
195 /* Saved hook values in case of unload */
196 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
197 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
198 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
199 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
200 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
201 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
203 /* Links to shared memory state */
204 static pgspSharedState *shared_state = NULL;
205 static HTAB *hash_table = NULL;
207 /*---- GUC variables ----*/
211 TRACK_LEVEL_NONE, /* track no statements */
212 TRACK_LEVEL_TOP, /* only top level statements */
213 TRACK_LEVEL_ALL, /* all statements, including nested ones */
214 TRACK_LEVEL_FORCE /* all statements, including nested ones */
217 static const struct config_enum_entry track_options[] =
219 {"none", TRACK_LEVEL_NONE, false},
220 {"top", TRACK_LEVEL_TOP, false},
221 {"all", TRACK_LEVEL_ALL, false},
227 PLAN_FORMAT_RAW, /* No conversion. Shorten JSON */
228 PLAN_FORMAT_TEXT, /* Traditional text representation */
229 PLAN_FORMAT_JSON, /* JSON representation */
230 PLAN_FORMAT_YAML, /* YAML */
231 PLAN_FORMAT_XML, /* XML */
234 static const struct config_enum_entry plan_formats[] =
236 {"raw" , PLAN_FORMAT_RAW , false},
237 {"text", PLAN_FORMAT_TEXT, false},
238 {"json", PLAN_FORMAT_JSON, false},
239 {"yaml", PLAN_FORMAT_YAML, false},
240 {"xml" , PLAN_FORMAT_XML , false},
244 /* options for plan storage */
247 PLAN_STORAGE_SHMEM, /* plan is stored as a part of hash entry */
248 PLAN_STORAGE_FILE /* plan is stored in a separate file */
251 static const struct config_enum_entry plan_storage_options[] =
253 {"shmem", PLAN_STORAGE_SHMEM, false},
254 {"file", PLAN_STORAGE_FILE, false},
258 static int store_size; /* max # statements to track */
259 static int track_level; /* tracking level */
260 static int min_duration; /* min duration to record */
261 static bool dump_on_shutdown; /* whether to save stats across shutdown */
262 static bool log_analyze; /* Similar to EXPLAIN (ANALYZE *) */
263 static bool log_verbose; /* Similar to EXPLAIN (VERBOSE *) */
264 static bool log_buffers; /* Similar to EXPLAIN (BUFFERS *) */
265 static bool log_timing; /* Similar to EXPLAIN (TIMING *) */
266 static bool log_triggers; /* whether to log trigger statistics */
267 static int plan_format; /* Plan representation style in
268 * pg_store_plans.plan */
269 static int plan_storage; /* Plan storage type */
271 #if PG_VERSION_NUM >= 140000
273 * For pg14 and later, we rely on core queryid calculation. If
274 * it's not available it means that the admin explicitly refused to
275 * compute it, for performance reason or other. In that case, we
276 * will also consider that this extension is disabled.
278 #define pgsp_enabled(q) \
279 ((track_level == TRACK_LEVEL_ALL || \
280 (track_level == TRACK_LEVEL_TOP && nested_level == 0)) && \
281 (q != PGSP_NO_QUERYID))
283 #define pgsp_enabled(q) \
284 (track_level == TRACK_LEVEL_ALL || \
285 (track_level == TRACK_LEVEL_TOP && nested_level == 0))
288 #define SHMEM_PLAN_PTR(ent) (((char *) ent) + sizeof(pgspEntry))
290 /*---- Function declarations ----*/
295 Datum pg_store_plans_reset(PG_FUNCTION_ARGS);
296 Datum pg_store_plans_hash_query(PG_FUNCTION_ARGS);
297 Datum pg_store_plans(PG_FUNCTION_ARGS);
298 Datum pg_store_plans_shorten(PG_FUNCTION_ARGS);
299 Datum pg_store_plans_normalize(PG_FUNCTION_ARGS);
300 Datum pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
301 Datum pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
302 Datum pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
303 Datum pg_store_plans_textplan(PG_FUNCTION_ARGS);
304 Datum pg_store_plans_info(PG_FUNCTION_ARGS);
306 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
307 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
308 PG_FUNCTION_INFO_V1(pg_store_plans);
309 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
310 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
311 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
312 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
313 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
314 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
315 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
316 PG_FUNCTION_INFO_V1(pg_store_plans_info);
318 #if PG_VERSION_NUM < 130000
319 #define COMPTAG_TYPE char
321 #define COMPTAG_TYPE QueryCompletion
324 #if PG_VERSION_NUM < 140000
325 #define ROLE_PG_READ_ALL_STATS DEFAULT_ROLE_READ_ALL_STATS
328 static void pgsp_shmem_startup(void);
329 static void pgsp_shmem_shutdown(int code, Datum arg);
330 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
331 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
332 ScanDirection direction,
333 uint64 count, bool execute_once);
334 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
335 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
336 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
337 #if PG_VERSION_NUM >= 140000
340 ProcessUtilityContext context, ParamListInfo params,
341 QueryEnvironment *queryEnv,
342 DestReceiver *dest, COMPTAG_TYPE *completionTag);
343 static uint32 hash_query(const char* query);
344 static void pgsp_store(char *plan, queryid_t queryId,
345 double total_time, uint64 rows,
346 const BufferUsage *bufusage);
347 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
348 pgspVersion api_version);
349 static Size shared_mem_size(void);
350 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
352 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
354 static char *ptext_load_file(Size *buffer_size);
355 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
357 static bool need_gc_ptexts(void);
358 static void gc_ptexts(void);
359 static void entry_dealloc(void);
360 static void entry_reset(void);
363 * Module load callback
369 * In order to create our shared memory area, we have to be loaded via
370 * shared_preload_libraries. If not, fall out without hooking into any of
371 * the main system. (We don't throw error here because it seems useful to
372 * allow the pg_store_plans functions to be created even when the
373 * module isn't active. The functions must protect themselves against
374 * being called then, however.)
376 if (!process_shared_preload_libraries_in_progress)
379 #if PG_VERSION_NUM >= 140000
381 * Inform the postmaster that we want to enable query_id calculation if
382 * compute_query_id is set to auto.
388 * Define (or redefine) custom GUC variables.
390 DefineCustomIntVariable("pg_store_plans.max",
391 "Sets the maximum number of plans tracked by pg_store_plans.",
403 DefineCustomIntVariable("pg_store_plans.max_plan_length",
404 "Sets the maximum length of plans stored by pg_store_plans.",
416 DefineCustomEnumVariable("pg_store_plans.plan_storage",
417 "Selects where to store plan texts.",
421 plan_storage_options,
428 DefineCustomEnumVariable("pg_store_plans.track",
429 "Selects which plans are tracked by pg_store_plans.",
440 DefineCustomEnumVariable("pg_store_plans.plan_format",
441 "Selects which format to be appied for plan representation in pg_store_plans.",
452 DefineCustomIntVariable("pg_store_plans.min_duration",
453 "Minimum duration to record plan in milliseconds.",
465 DefineCustomBoolVariable("pg_store_plans.save",
466 "Save pg_store_plans statistics across server shutdowns.",
476 DefineCustomBoolVariable("pg_store_plans.log_analyze",
477 "Use EXPLAIN ANALYZE for plan logging.",
487 DefineCustomBoolVariable("pg_store_plans.log_buffers",
498 DefineCustomBoolVariable("pg_store_plans.log_timing",
509 DefineCustomBoolVariable("pg_store_plans.log_triggers",
510 "Log trigger trace.",
520 DefineCustomBoolVariable("pg_store_plans.log_verbose",
521 "Set VERBOSE for EXPLAIN on logging.",
531 EmitWarningsOnPlaceholders("pg_store_plans");
534 * Request additional shared resources. (These are no-ops if we're not in
535 * the postmaster process.) We'll allocate or attach to the shared
536 * resources in pgsp_shmem_startup().
538 RequestAddinShmemSpace(shared_mem_size());
539 RequestNamedLWLockTranche("pg_store_plans", 1);
544 prev_shmem_startup_hook = shmem_startup_hook;
545 shmem_startup_hook = pgsp_shmem_startup;
546 prev_ExecutorStart = ExecutorStart_hook;
547 ExecutorStart_hook = pgsp_ExecutorStart;
548 prev_ExecutorRun = ExecutorRun_hook;
549 ExecutorRun_hook = pgsp_ExecutorRun;
550 prev_ExecutorFinish = ExecutorFinish_hook;
551 ExecutorFinish_hook = pgsp_ExecutorFinish;
552 prev_ExecutorEnd = ExecutorEnd_hook;
553 ExecutorEnd_hook = pgsp_ExecutorEnd;
554 prev_ProcessUtility = ProcessUtility_hook;
555 ProcessUtility_hook = pgsp_ProcessUtility;
559 * Module unload callback
564 /* Uninstall hooks. */
565 shmem_startup_hook = prev_shmem_startup_hook;
566 ExecutorStart_hook = prev_ExecutorStart;
567 ExecutorRun_hook = prev_ExecutorRun;
568 ExecutorFinish_hook = prev_ExecutorFinish;
569 ExecutorEnd_hook = prev_ExecutorEnd;
570 ProcessUtility_hook = prev_ProcessUtility;
574 * shmem_startup hook: allocate or attach to shared memory,
575 * then load any pre-existing statistics from file.
578 pgsp_shmem_startup(void)
592 if (prev_shmem_startup_hook)
593 prev_shmem_startup_hook();
595 /* reset in case this is a restart within the postmaster */
600 * Create or attach to the shared memory state, including hash table
602 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
604 shared_state = ShmemInitStruct("pg_store_plans",
605 sizeof(pgspSharedState),
610 /* First time through ... */
611 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
612 shared_state->plan_size = max_plan_len;
613 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
614 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
615 SpinLockInit(&shared_state->mutex);
616 shared_state->extent = 0;
617 shared_state->n_writers = 0;
618 shared_state->gc_count = 0;
619 shared_state->stats.dealloc = 0;
620 shared_state->stats.stats_reset = GetCurrentTimestamp();
623 /* Be sure everyone agrees on the hash table entry size */
624 plan_size = shared_state->plan_size;
626 memset(&info, 0, sizeof(info));
627 info.keysize = sizeof(pgspHashKey);
628 info.entrysize = sizeof(pgspEntry);
629 if (plan_storage == PLAN_STORAGE_SHMEM)
630 info.entrysize += max_plan_len;
631 hash_table = ShmemInitHash("pg_store_plans hash",
632 store_size, store_size,
636 LWLockRelease(AddinShmemInitLock);
639 * If we're in the postmaster (or a standalone backend...), set up a shmem
640 * exit hook to dump the statistics to disk.
642 if (!IsUnderPostmaster)
643 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
646 * Done if some other process already completed our initialization.
652 * Note: we don't bother with locks here, because there should be no other
653 * processes running when this code is reached.
656 /* Unlink plan text file possibly left over from crash */
657 unlink(PGSP_TEXT_FILE);
659 if (plan_storage == PLAN_STORAGE_FILE)
661 /* Allocate new query text temp file */
662 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
668 * If we were told not to load old statistics, we're done. (Note we do
669 * not try to unlink any old dump file in this case. This seems a bit
670 * questionable but it's the historical behavior.)
672 if (!dump_on_shutdown)
680 * Attempt to load old statistics from the dump file.
682 file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
686 return; /* ignore not-found error */
687 /* No existing persisted stats file, so we're done */
691 buffer_size = plan_size;
692 buffer = (char *) palloc(buffer_size);
694 if (fread(&header, sizeof(uint32), 1, file) != 1 ||
695 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
696 fread(&num, sizeof(int32), 1, file) != 1)
699 if (header != PGSP_FILE_HEADER ||
700 pgver != PGSP_PG_MAJOR_VERSION)
703 for (i = 0; i < num; i++)
707 Size plan_offset = 0;
709 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
712 /* Encoding is the only field we can easily sanity-check */
713 if (!PG_VALID_BE_ENCODING(temp.encoding))
716 /* Previous incarnation might have had a larger plan_size */
717 if (temp.plan_len >= buffer_size)
719 buffer = (char *) repalloc(buffer, temp.plan_len + 1);
720 buffer_size = temp.plan_len + 1;
723 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
726 /* Skip loading "sticky" entries */
727 if (temp.counters.calls == 0)
730 /* Clip to available length if needed */
731 if (temp.plan_len >= plan_size)
732 temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
737 buffer[temp.plan_len] = '\0';
739 if (plan_storage == PLAN_STORAGE_FILE)
741 /* Store the plan text */
742 plan_offset = shared_state->extent;
743 if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
746 shared_state->extent += temp.plan_len + 1;
749 /* make the hashtable entry (discards old entries if too many) */
750 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
752 if (plan_storage == PLAN_STORAGE_SHMEM)
753 memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
755 /* copy in the actual stats */
756 entry->counters = temp.counters;
766 * Remove the file so it's not included in backups/replication slaves,
767 * etc. A new file will be written on next shutdown.
769 unlink(PGSP_DUMP_FILE);
775 (errcode_for_file_access(),
776 errmsg("could not read file \"%s\": %m",
781 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
782 errmsg("ignoring invalid data in file \"%s\"",
787 (errcode_for_file_access(),
788 errmsg("could not write file \"%s\": %m",
797 /* If possible, throw away the bogus file; ignore any error */
798 unlink(PGSP_DUMP_FILE);
801 * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
802 * server is running with pg_store_plans enabled
807 * shmem_shutdown hook: Dump statistics into file.
809 * Note: we don't bother with acquiring lock, because there should be no
810 * other processes running when this is called.
813 pgsp_shmem_shutdown(int code, Datum arg)
816 char *pbuffer = NULL;
817 Size pbuffer_size = 0;
818 HASH_SEQ_STATUS hash_seq;
822 /* Don't try to dump during a crash. */
826 /* Safety check ... shouldn't get here unless shmem is set up. */
827 if (!shared_state || !hash_table)
830 /* Don't dump if told not to. */
831 if (!dump_on_shutdown)
834 file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
838 if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
840 if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
842 num_entries = hash_get_num_entries(hash_table);
843 if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
846 if (plan_storage == PLAN_STORAGE_FILE)
848 pbuffer = ptext_load_file(&pbuffer_size);
853 hash_seq_init(&hash_seq, hash_table);
854 while ((entry = hash_seq_search(&hash_seq)) != NULL)
856 int len = entry->plan_len;
859 if (plan_storage == PLAN_STORAGE_FILE)
860 pstr = ptext_fetch(entry->plan_offset, len,
861 pbuffer, pbuffer_size);
863 pstr = SHMEM_PLAN_PTR(entry);
866 continue; /* Ignore any entries with bogus texts */
868 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
869 fwrite(pstr, 1, len + 1, file) != len + 1)
871 /* note: we assume hash_seq_term won't change errno */
872 hash_seq_term(&hash_seq);
884 * Rename file into place, so we atomically replace the old one.
886 if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
888 (errcode_for_file_access(),
889 errmsg("could not rename pg_store_plans file \"%s\": %m",
890 PGSP_DUMP_FILE ".tmp")));
892 /* Unlink plan-texts file; it's not needed once the server is shut down */
893 unlink(PGSP_TEXT_FILE);
899 (errcode_for_file_access(),
900 errmsg("could not write pg_store_plans file \"%s\": %m",
901 PGSP_DUMP_FILE ".tmp")));
904 unlink(PGSP_DUMP_FILE ".tmp");
909 * ExecutorStart hook: start up tracking if needed
912 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
915 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
917 queryDesc->instrument_options |=
918 (log_timing ? INSTRUMENT_TIMER : 0)|
919 (log_timing ? 0: INSTRUMENT_ROWS)|
920 (log_buffers ? INSTRUMENT_BUFFERS : 0);
923 if (prev_ExecutorStart)
924 prev_ExecutorStart(queryDesc, eflags);
926 standard_ExecutorStart(queryDesc, eflags);
929 * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
930 * context so as to be free at ExecutorEnd.
932 if (queryDesc->totaltime == NULL &&
933 pgsp_enabled(queryDesc->plannedstmt->queryId))
935 MemoryContext oldcxt;
937 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
938 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
939 #if PG_VERSION_NUM >= 140000
943 MemoryContextSwitchTo(oldcxt);
949 * ExecutorRun hook: all we need do is track nesting depth
952 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
958 if (prev_ExecutorRun)
959 prev_ExecutorRun(queryDesc, direction, count, execute_once);
961 standard_ExecutorRun(queryDesc, direction, count, execute_once);
973 * ExecutorFinish hook: all we need do is track nesting depth
976 pgsp_ExecutorFinish(QueryDesc *queryDesc)
981 if (prev_ExecutorFinish)
982 prev_ExecutorFinish(queryDesc);
984 standard_ExecutorFinish(queryDesc);
996 * ExecutorEnd hook: store results if needed
999 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1001 if (queryDesc->totaltime)
1004 * Make sure stats accumulation is done. (Note: it's okay if several
1005 * levels of hook all do this.)
1007 InstrEndLoop(queryDesc->totaltime);
1009 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1010 queryDesc->totaltime->total &&
1011 queryDesc->totaltime->total >=
1012 (double)min_duration / 1000.0)
1018 es = NewExplainState();
1021 es->analyze = queryDesc->instrument_options;
1022 es->verbose = log_verbose;
1023 es->buffers = (es->analyze && log_buffers);
1024 es->timing = (es->analyze && log_timing);
1025 es->format = EXPLAIN_FORMAT_JSON;
1027 ExplainBeginOutput(es);
1028 ExplainPrintPlan(es, queryDesc);
1030 pgspExplainTriggers(es, queryDesc);
1031 ExplainEndOutput(es);
1033 /* Remove last line break */
1034 if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1035 es_str->data[--es_str->len] = '\0';
1037 /* JSON outmost braces. */
1038 es_str->data[0] = '{';
1039 es_str->data[es_str->len - 1] = '}';
1041 queryid = queryDesc->plannedstmt->queryId;
1042 #if PG_VERSION_NUM < 140000
1044 * For versions before pg14, a queryid is only available if
1045 * pg_stat_statements extension (or similar) is configured. We
1046 * don't want a hard requirement for such an extension so fallback
1047 * to an internal queryid calculation in some case.
1048 * For pg14 and above, core postgres can compute a queryid so we
1051 if (queryid == PGSP_NO_QUERYID)
1052 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1054 Assert(queryid != PGSP_NO_QUERYID);
1057 pgsp_store(es_str->data,
1059 queryDesc->totaltime->total * 1000.0, /* convert to msec */
1060 queryDesc->estate->es_processed,
1061 &queryDesc->totaltime->bufusage);
1062 pfree(es_str->data);
1066 if (prev_ExecutorEnd)
1067 prev_ExecutorEnd(queryDesc);
1069 standard_ExecutorEnd(queryDesc);
1073 * ProcessUtility hook
1076 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1077 #if PG_VERSION_NUM >= 140000
1080 ProcessUtilityContext context, ParamListInfo params,
1081 QueryEnvironment *queryEnv,
1082 DestReceiver *dest, COMPTAG_TYPE *completionTag)
1084 if (prev_ProcessUtility)
1085 prev_ProcessUtility(pstmt, queryString,
1086 #if PG_VERSION_NUM >= 140000
1089 context, params, queryEnv,
1090 dest, completionTag);
1092 standard_ProcessUtility(pstmt, queryString,
1093 #if PG_VERSION_NUM >= 140000
1096 context, params, queryEnv,
1097 dest, completionTag);
1101 * hash_query: calculate internal query ID for a query
1103 * As of PG11, Query.queryId has been widened to 64 bit to reduce collision of
1104 * queries to practical level. On the other hand pg_store_plans uses the
1105 * combination of query hash and plan hash values as the hash table key and
1106 * the resolution of the hash value effectively has the same degree so we
1107 * continue to use uint32 as internal queryid.
1109 * This may merge plans from different queries into single internal query id
1110 * but it is not a problem when pg_stat_statements is used together since the
1111 * extension gives enough resolution on queries.
1114 hash_query(const char* query)
1118 char *normquery = pstrdup(query);
1119 normalize_expr(normquery, false);
1120 queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1123 /* If we are unlucky enough to get a hash of zero, use 1 instead */
1132 * Store some statistics for a plan.
1134 * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1135 * value of the given plan, which is calculated in this function.
1138 pgsp_store(char *plan, queryid_t queryId,
1139 double total_time, uint64 rows,
1140 const BufferUsage *bufusage)
1144 char *norm_query = NULL;
1146 char *normalized_plan = NULL;
1147 char *shorten_plan = NULL;
1148 volatile pgspEntry *e;
1149 Size plan_offset = 0;
1152 Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1154 /* Safety check... */
1155 if (!shared_state || !hash_table)
1158 /* Set up key for hashtable search */
1159 key.userid = GetUserId();
1160 key.dbid = MyDatabaseId;
1161 key.queryid = queryId;
1163 normalized_plan = pgsp_json_normalize(plan);
1164 shorten_plan = pgsp_json_shorten(plan);
1165 elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1166 elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1167 elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1168 plan_len = strlen(shorten_plan);
1170 key.planid = hash_any((const unsigned char *)normalized_plan,
1171 strlen(normalized_plan));
1172 pfree(normalized_plan);
1174 if (plan_len >= shared_state->plan_size)
1175 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1178 shared_state->plan_size - 1);
1181 /* Look up the hash table entry with shared lock. */
1182 LWLockAcquire(shared_state->lock, LW_SHARED);
1184 entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1186 /* Store the plan text, if the entry not present */
1187 if (!entry && plan_storage == PLAN_STORAGE_FILE)
1192 /* Append new plan text to file with only shared lock held */
1193 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1196 * Determine whether we need to garbage collect external query texts
1197 * while the shared lock is still held. This micro-optimization
1198 * avoids taking the time to decide this while holding exclusive lock.
1200 do_gc = need_gc_ptexts();
1202 /* Acquire exclusive lock as required by entry_alloc() */
1203 LWLockRelease(shared_state->lock);
1204 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1207 * A garbage collection may have occurred while we weren't holding the
1208 * lock. In the unlikely event that this happens, the plan text we
1209 * stored above will have been garbage collected, so write it again.
1210 * This should be infrequent enough that doing it while holding
1211 * exclusive lock isn't a performance problem.
1213 if (!stored || shared_state->gc_count != gc_count)
1214 stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1216 /* If we failed to write to the text file, give up */
1222 /* Create new entry, if not present */
1225 entry = entry_alloc(&key, plan_offset, plan_len, false);
1227 /* shorten_plan is terminated by NUL */
1228 if (plan_storage == PLAN_STORAGE_SHMEM)
1229 memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1232 /* If needed, perform garbage collection while exclusive lock held */
1237 /* Increment the counts, except when jstate is not NULL */
1240 * Grab the spinlock while updating the counters (see comment about
1241 * locking rules at the head of the file)
1244 e = (volatile pgspEntry *) entry;
1245 SpinLockAcquire(&e->mutex);
1247 /* "Unstick" entry if it was previously sticky */
1248 if (e->counters.calls == 0)
1250 e->counters.usage = USAGE_INIT;
1251 e->counters.first_call = GetCurrentTimestamp();
1254 e->counters.calls += 1;
1255 e->counters.total_time += total_time;
1256 if (e->counters.calls == 1)
1258 e->counters.min_time = total_time;
1259 e->counters.max_time = total_time;
1260 e->counters.mean_time = total_time;
1265 * Welford's method for accurately computing variance. See
1266 * <http://www.johndcook.com/blog/standard_deviation/>
1268 double old_mean = e->counters.mean_time;
1270 e->counters.mean_time +=
1271 (total_time - old_mean) / e->counters.calls;
1272 e->counters.sum_var_time +=
1273 (total_time - old_mean) * (total_time - e->counters.mean_time);
1275 /* calculate min and max time */
1276 if (e->counters.min_time > total_time)
1277 e->counters.min_time = total_time;
1278 if (e->counters.max_time < total_time)
1279 e->counters.max_time = total_time;
1282 e->counters.rows += rows;
1283 e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1284 e->counters.shared_blks_read += bufusage->shared_blks_read;
1285 e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1286 e->counters.shared_blks_written += bufusage->shared_blks_written;
1287 e->counters.local_blks_hit += bufusage->local_blks_hit;
1288 e->counters.local_blks_read += bufusage->local_blks_read;
1289 e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1290 e->counters.local_blks_written += bufusage->local_blks_written;
1291 e->counters.temp_blks_read += bufusage->temp_blks_read;
1292 e->counters.temp_blks_written += bufusage->temp_blks_written;
1293 e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1294 e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1295 e->counters.last_call = GetCurrentTimestamp();
1296 e->counters.usage += USAGE_EXEC(total_time);
1298 SpinLockRelease(&e->mutex);
1301 LWLockRelease(shared_state->lock);
1303 /* We postpone this pfree until we're out of the lock */
/*
 * pg_store_plans_reset: SQL-callable entry point (PG_FUNCTION_ARGS).
 *
 * Reports ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE when either shared_state
 * or hash_table has not been set up.
 *
 * NOTE(review): the statements that follow the check are not visible in this
 * excerpt; presumably they perform the actual reset and return void --
 * confirm against the full file.
 */
1309 * Reset all statement statistics.
1312 pg_store_plans_reset(PG_FUNCTION_ARGS)
1314 if (!shared_state || !hash_table)
1316 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1317 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1322 /* Number of output arguments (columns) for various API versions */
/*
 * PG_STORE_PLANS_COLS sizes the values[]/nulls[] arrays in
 * pg_store_plans_internal(); the per-version counts are what its closing
 * Assert compares the number of filled columns against.
 */
1323 #define PG_STORE_PLANS_COLS_V1_5 27
1324 #define PG_STORE_PLANS_COLS_V1_6 26
1325 #define PG_STORE_PLANS_COLS 27 /* maximum of above */
/*
 * pg_store_plans_1_6: SQL-callable entry point that delegates to
 * pg_store_plans_internal() with api_version PGSP_V1_6.
 *
 * NOTE(review): the lines after the call (presumably the return statement)
 * are not visible in this excerpt.
 */
1328 * Retrieve statement statistics.
1331 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1333 pg_store_plans_internal(fcinfo, PGSP_V1_6);
/*
 * pg_store_plans: SQL-callable entry point that delegates to
 * pg_store_plans_internal() with api_version PGSP_V1_5.
 *
 * NOTE(review): the lines after the call (presumably the return statement)
 * are not visible in this excerpt.
 */
1339 pg_store_plans(PG_FUNCTION_ARGS)
1341 pg_store_plans_internal(fcinfo, PGSP_V1_5);
/*
 * pg_store_plans_internal: common implementation behind the SQL-callable
 * wrappers, parameterized by api_version.
 *
 * Builds a tuplestore in the per-query memory context and returns it through
 * rsinfo in SFRM_Materialize mode, adding one row per hash table entry.  The
 * hash table is scanned while holding shared_state->lock in shared mode, and
 * each entry's mutex is taken around the read of its counters.
 *
 * queryid, planid and the plan text are only produced when the caller is a
 * member of ROLE_PG_READ_ALL_STATS or the entry belongs to the calling user;
 * the other visible assignments store zeros and "<insufficient privilege>".
 *
 * When plan_storage is PLAN_STORAGE_FILE the plan text file is loaded
 * speculatively before the lock is taken, and loaded again under the lock
 * when shared_state->extent or shared_state->gc_count differ from the values
 * sampled earlier.
 *
 * NOTE(review): a number of short source lines (braces, else/break lines,
 * ereport calls, a few declarations and statements) are not visible in this
 * excerpt, so the exact branch structure must be confirmed against the full
 * file.
 */
1347 pg_store_plans_internal(FunctionCallInfo fcinfo,
1348 pgspVersion api_version)
1350 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1352 Tuplestorestate *tupstore;
1353 MemoryContext per_query_ctx;
1354 MemoryContext oldcontext;
1355 Oid userid = GetUserId();
1356 bool is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1358 char *pbuffer = NULL;
1359 Size pbuffer_size = 0;
1362 HASH_SEQ_STATUS hash_seq;
1365 if (!shared_state || !hash_table)
1367 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1368 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1370 /* check to see if caller supports us returning a tuplestore */
1371 if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1373 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1374 errmsg("set-valued function called in context that cannot accept a set")));
1375 if (!(rsinfo->allowedModes & SFRM_Materialize))
1377 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1378 errmsg("materialize mode required, but it is not " \
1379 "allowed in this context")));
1381 /* Build a tuple descriptor for our result type */
1382 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1383 elog(ERROR, "return type must be a row type");
/* The tuplestore is created while the per-query context is current */
1385 per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1386 oldcontext = MemoryContextSwitchTo(per_query_ctx);
1388 tupstore = tuplestore_begin_heap(true, false, work_mem);
1389 rsinfo->returnMode = SFRM_Materialize;
1390 rsinfo->setResult = tupstore;
1391 rsinfo->setDesc = tupdesc;
1393 MemoryContextSwitchTo(oldcontext);
1396 * We'd like to load the plan text file (if needed) while not holding any
1397 * lock on shared_state->lock. In the worst case we'll have to do this
1398 * again after we have the lock, but it's unlikely enough to make this a
1399 * win despite occasional duplicated work. We need to reload if anybody
1400 * writes to the file (either a retail ptext_store(), or a garbage
1401 * collection) between this point and where we've gotten shared lock. If a
1402 * ptext_store is actually in progress when we look, we might as well skip
1403 * the speculative load entirely.
1406 /* Take the mutex so we can examine variables */
1408 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1410 SpinLockAcquire(&s->mutex);
1412 n_writers = s->n_writers;
1413 gc_count = s->gc_count;
1414 SpinLockRelease(&s->mutex);
1417 /* No point in loading file now if there are active writers */
1418 if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1419 pbuffer = ptext_load_file(&pbuffer_size);
1422 * Get shared lock, load or reload the plan text file if we must, and
1423 * iterate over the hashtable entries.
1425 * With a large hash table, we might be holding the lock rather longer
1426 * than one could wish. However, this only blocks creation of new hash
1427 * table entries, and the larger the hash table the less likely that is to
1428 * be needed. So we can hope this is okay. Perhaps someday we'll decide
1429 * we need to partition the hash table to limit the time spent holding any
1432 LWLockAcquire(shared_state->lock, LW_SHARED);
1435 * Here it is safe to examine extent and gc_count without taking the mutex.
1436 * Note that although other processes might change shared_state->extent
1437 * just after we look at it, the strings they then write into the file
1438 * cannot yet be referenced in the hashtable, so we don't care whether we
1441 * If ptext_load_file fails, we just press on; we'll return NULL for every
1444 if (plan_storage == PLAN_STORAGE_FILE &&
1446 shared_state->extent != extent ||
1447 shared_state->gc_count != gc_count))
1451 pbuffer = ptext_load_file(&pbuffer_size);
1454 hash_seq_init(&hash_seq, hash_table);
1455 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1457 Datum values[PG_STORE_PLANS_COLS];
1458 bool nulls[PG_STORE_PLANS_COLS];
1460 int64 queryid = entry->key.queryid;
1461 int64 planid = entry->key.planid;
1465 memset(values, 0, sizeof(values));
1466 memset(nulls, 0, sizeof(nulls));
1468 values[i++] = ObjectIdGetDatum(entry->key.userid);
1469 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1470 if (is_allowed_role || entry->key.userid == userid)
1472 values[i++] = Int64GetDatumFast(queryid);
1473 values[i++] = Int64GetDatumFast(planid);
/* PGSP_V1_5 has one more column here, built from queryid via ObjectIdGetDatum */
1474 if (api_version == PGSP_V1_5)
1475 values[i++] = ObjectIdGetDatum(queryid);
/*
 * Zero placeholders for the same id columns.  NOTE(review): presumably the
 * branch taken for callers without privilege -- the "else" line is not
 * visible in this excerpt.
 */
1479 values[i++] = Int64GetDatumFast(0);
1480 values[i++] = Int64GetDatumFast(0);
1481 if (api_version == PGSP_V1_5)
1482 values[i++] = Int64GetDatumFast(0);
1485 if (is_allowed_role || entry->key.userid == userid)
1487 char *pstr; /* Plan string */
1488 char *mstr; /* Modified plan string */
1489 char *estr; /* Encoded modified plan string */
/* The stored plan comes from the loaded file image or from the entry's shared memory area */
1491 if (plan_storage == PLAN_STORAGE_FILE)
1492 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1493 pbuffer, pbuffer_size);
1495 pstr = SHMEM_PLAN_PTR(entry);
/* Convert the stored plan into the representation selected by plan_format */
1497 switch (plan_format)
1499 case PLAN_FORMAT_TEXT:
1500 mstr = pgsp_json_textize(pstr);
1502 case PLAN_FORMAT_JSON:
1503 mstr = pgsp_json_inflate(pstr);
1505 case PLAN_FORMAT_YAML:
1506 mstr = pgsp_json_yamlize(pstr);
1508 case PLAN_FORMAT_XML:
1509 mstr = pgsp_json_xmlize(pstr);
1516 pg_do_encoding_conversion((unsigned char *) mstr,
1519 GetDatabaseEncoding());
1520 values[i++] = CStringGetTextDatum(estr);
1528 /* pstr is a pointer onto pbuffer */
1531 values[i++] = CStringGetTextDatum("<insufficient privilege>");
1533 /* copy counters to a local variable to keep locking time short */
1535 volatile pgspEntry *e = (volatile pgspEntry *) entry;
1537 SpinLockAcquire(&e->mutex);
1539 SpinLockRelease(&e->mutex);
1542 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1546 values[i++] = Int64GetDatumFast(tmp.calls);
1547 values[i++] = Float8GetDatumFast(tmp.total_time);
1548 values[i++] = Float8GetDatumFast(tmp.min_time);
1549 values[i++] = Float8GetDatumFast(tmp.max_time);
1550 values[i++] = Float8GetDatumFast(tmp.mean_time);
1553 * Note we are calculating the population variance here, not the
1554 * sample variance, as we have data for the whole population, so
1555 * Bessel's correction is not used, and we don't divide by
1559 stddev = sqrt(tmp.sum_var_time / tmp.calls);
1562 values[i++] = Float8GetDatumFast(stddev);
1564 values[i++] = Int64GetDatumFast(tmp.rows);
1565 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1566 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1567 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1568 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1569 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1570 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1571 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1572 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1573 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1574 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1575 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1576 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1577 values[i++] = TimestampTzGetDatum(tmp.first_call);
1578 values[i++] = TimestampTzGetDatum(tmp.last_call);
/* The number of columns filled must match the count defined for this api_version */
1580 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1581 api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1582 -1 /* fail if you forget to update this assert */ ));
1584 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1587 LWLockRelease(shared_state->lock);
1589 /* clean up and return the tuplestore */
1590 tuplestore_donestoring(tupstore);
1593 /* Number of output arguments (columns) for pg_store_plans_info */
/* pg_store_plans_info() fills two columns: stats.dealloc and stats.stats_reset */
1594 #define PG_STORE_PLANS_INFO_COLS 2
1597 * Return statistics of pg_store_plans.
/*
 * pg_store_plans_info: SQL-callable entry point returning a single composite
 * row whose two columns are stats.dealloc (int64) and stats.stats_reset
 * (timestamptz).
 *
 * Reports ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE when shared_state or
 * hash_table has not been set up, and raises an error when the declared
 * result type is not a composite type.  shared_state->mutex is held around
 * the read of the global statistics.
 *
 * NOTE(review): the statement that copies the statistics into "stats" between
 * SpinLockAcquire and SpinLockRelease is not visible in this excerpt.
 */
1600 pg_store_plans_info(PG_FUNCTION_ARGS)
1602 pgspGlobalStats stats;
1604 Datum values[PG_STORE_PLANS_INFO_COLS];
1605 bool nulls[PG_STORE_PLANS_INFO_COLS];
1607 if (!shared_state || !hash_table)
1609 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1610 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1612 /* Build a tuple descriptor for our result type */
1613 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1614 elog(ERROR, "return type must be a row type");
1616 MemSet(values, 0, sizeof(values));
1617 MemSet(nulls, 0, sizeof(nulls));
1619 /* Read global statistics for pg_store_plans */
1621 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1623 SpinLockAcquire(&s->mutex);
1625 SpinLockRelease(&s->mutex);
1628 values[0] = Int64GetDatum(stats.dealloc);
1629 values[1] = TimestampTzGetDatum(stats.stats_reset);
1631 PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
/*
 * shared_mem_size: computes the shared memory requirement as
 * MAXALIGN(sizeof(pgspSharedState)) plus hash_estimate_size() for store_size
 * entries.  The per-entry size is sizeof(pgspEntry), increased by
 * max_plan_len when plan_storage is PLAN_STORAGE_SHMEM.
 *
 * NOTE(review): the local declarations and the return statement are not
 * visible in this excerpt.
 */
1635 * Estimate shared memory space needed.
1638 shared_mem_size(void)
1643 size = MAXALIGN(sizeof(pgspSharedState));
1644 entry_size = sizeof(pgspEntry);
1646 /* plan text is appended to the struct body */
1647 if (plan_storage == PLAN_STORAGE_SHMEM)
1648 entry_size += max_plan_len;
1650 size = add_size(size, hash_estimate_size(store_size, entry_size));
/*
 * entry_alloc: finds or creates the hash table entry for "key" using
 * hash_search(..., HASH_ENTER, &found).
 *
 * key         - hash key of the entry
 * plan_offset - stored into entry->plan_offset
 * plan_len    - stored into entry->plan_len; asserted to be >= 0 and smaller
 *               than shared_state->plan_size
 * sticky      - selects shared_state->cur_median_usage instead of USAGE_INIT
 *               as the initial usage value
 *
 * The initialization shown below zeroes the counters, sets the usage value,
 * initializes the entry's spinlock and records plan_offset, plan_len and the
 * current database encoding.
 *
 * NOTE(review): the body of the "make space" loop, the test on "found" and
 * the return statement are not visible in this excerpt.
 */
1656 * Allocate a new hashtable entry.
1657 * caller must hold an exclusive lock on shared_state->lock
1659 * "plan" need not be null-terminated; we rely on plan_len instead
1661 * If "sticky" is true, make the new entry artificially sticky so that it will
1662 * probably still be there when the query finishes execution. We do this by
1663 * giving it a median usage value rather than the normal value. (Strictly
1664 * speaking, query strings are normalized on a best effort basis, though it
1665 * would be difficult to demonstrate this even under artificial conditions.)
1667 * Note: despite needing exclusive lock, it's not an error for the target
1668 * entry to already exist. This is because pgsp_store releases and
1669 * reacquires lock after failing to find a match; so someone else could
1670 * have made the entry while we waited to get exclusive lock.
1673 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1678 /* Make space if needed */
1679 while (hash_get_num_entries(hash_table) >= store_size)
1682 /* Find or create an entry with desired hash code */
1683 entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1687 /* New entry, initialize it */
1689 /* reset the statistics */
1690 memset(&entry->counters, 0, sizeof(Counters));
1691 /* set the appropriate initial usage count */
1692 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1693 /* re-initialize the mutex each time ... we assume no one using it */
1694 SpinLockInit(&entry->mutex);
1695 /* ... and don't forget the plan text location and length */
1696 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1697 entry->plan_offset = plan_offset;
1698 entry->plan_len = plan_len;
1699 entry->encoding = GetDatabaseEncoding();
1706 * qsort comparator for sorting into increasing usage order
1709 entry_cmp(const void *lhs, const void *rhs)
1711 double l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1712 double r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1714 if (l_usage < r_usage)
1716 else if (l_usage > r_usage)
/*
 * Eviction of the least used hash table entries.
 *
 * Collects all entries into a palloc'd array while multiplying each entry's
 * usage by STICKY_DECREASE_FACTOR (entries with calls == 0) or
 * USAGE_DECREASE_FACTOR, sorts the array with entry_cmp, records
 * entries[i / 2]'s usage as shared_state->cur_median_usage, updates
 * shared_state->mean_plan_len, and removes the first
 * Max(10, i * USAGE_DEALLOC_PERCENT / 100) entries (capped at i) from the
 * hash table.  Finally stats.dealloc is incremented under
 * shared_state->mutex.
 *
 * NOTE(review): the function's signature line is not visible in this excerpt
 * (presumably entry_dealloc(void)); neither are the remaining local
 * declarations, the guard in front of the median computation, or the release
 * of the entries array.
 */
1723 * Deallocate least used entries.
1724 * Caller must hold an exclusive lock on shared_state->lock.
1729 HASH_SEQ_STATUS hash_seq;
1730 pgspEntry **entries;
1738 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1739 * While we're scanning the table, apply the decay factor to the usage
1743 entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1746 hash_seq_init(&hash_seq, hash_table);
1747 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1749 entries[i++] = entry;
1750 /* "Sticky" entries get a different usage decay rate. */
1751 if (entry->counters.calls == 0)
1752 entry->counters.usage *= STICKY_DECREASE_FACTOR;
1754 entry->counters.usage *= USAGE_DECREASE_FACTOR;
1756 /* In the mean length computation, ignore dropped texts. */
1757 if (entry->plan_len >= 0)
1759 tottextlen += entry->plan_len + 1;
1764 qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1766 /* Also, record the (approximate) median usage */
1768 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1769 /* Record the mean plan length */
1770 if (nvalidtexts > 0)
1771 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1773 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
/* Evict at least 10 entries, but never more than exist */
1775 nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1776 nvictims = Min(nvictims, i);
1778 for (i = 0; i < nvictims; i++)
1780 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1785 /* Increment the number of times entries are deallocated */
1787 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1789 SpinLockAcquire(&s->mutex);
1790 s->stats.dealloc += 1;
1791 SpinLockRelease(&s->mutex);
/*
 * ptext_store: appends one plan string to the external plan text file
 * (only valid when plan_storage is PLAN_STORAGE_FILE, see the Assert).
 *
 * plan        - plan text; plan_len bytes are written, followed by one
 *               '\0' byte
 * plan_len    - length of plan in bytes, without the terminator
 * plan_offset - output, see the description below
 * gc_count    - optional output, receives s->gc_count as read under the
 *               spinlock
 *
 * Space is reserved by advancing s->extent by plan_len + 1 under
 * shared_state->mutex; the data is then written with pg_pwrite() at offset
 * "off".  Both the success path and the error path close the file and take
 * the spinlock again to mark the write complete.
 *
 * NOTE(review): the assignment of "off", the n_writers updates, the return
 * statements and the error label are not visible in this excerpt.
 */
1796 * Given a plan string (not necessarily null-terminated), allocate a new
1797 * entry in the external plan text file and store the string there.
1799 * If successful, returns true, and stores the new entry's offset in the file
1800 * into *plan_offset. Also, if gc_count isn't NULL, *gc_count is set to the
1801 * number of garbage collections that have occurred so far.
1803 * On failure, returns false.
1805 * At least a shared lock on shared_state->lock must be held by the caller, so
1806 * as to prevent a concurrent garbage collection. Share-lock-holding callers
1807 * should pass a gc_count pointer to obtain the number of garbage collections,
1808 * so that they can recheck the count after obtaining exclusive lock to detect
1809 * whether a garbage collection occurred (and removed this entry).
1812 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1817 Assert (plan_storage == PLAN_STORAGE_FILE);
1820 * We use a spinlock to protect extent/n_writers/gc_count, so that
1821 * multiple processes may execute this function concurrently.
1824 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1826 SpinLockAcquire(&s->mutex);
1828 s->extent += plan_len + 1;
1831 *gc_count = s->gc_count;
1832 SpinLockRelease(&s->mutex);
1837 /* Now write the data into the successfully-reserved part of the file */
1838 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
/* Plan bytes first, then a separate one-byte write for the terminator */
1842 if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1844 if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1847 CloseTransientFile(fd);
1849 /* Mark our write complete */
1851 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1853 SpinLockAcquire(&s->mutex);
1855 SpinLockRelease(&s->mutex);
1862 (errcode_for_file_access(),
1863 errmsg("could not write file \"%s\": %m",
1867 CloseTransientFile(fd);
1869 /* Mark our write complete */
1871 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1873 SpinLockAcquire(&s->mutex);
1875 SpinLockRelease(&s->mutex);
/*
 * ptext_load_file: reads the whole of PGSP_TEXT_FILE into a buffer obtained
 * with malloc() (only valid when plan_storage is PLAN_STORAGE_FILE, see the
 * Assert).
 *
 * buffer_size - output; set to the number of bytes read (nread) on success
 *
 * The file size comes from fstat(); the buffer is only allocated when
 * st_size <= MaxAllocHugeSize.  The contents are read in pieces of at most
 * 1024 * 1024 * 1024 bytes, and a read() that returns anything other than
 * the requested count ends the attempt.  A failure to open the file is not
 * reported when errno is ENOENT.
 *
 * NOTE(review): the ereport levels, the return statements and the
 * free()/nread bookkeeping lines are not visible in this excerpt.
 */
1882 * Read the external plan text file into a malloc'd buffer.
1884 * Returns NULL (without throwing an error) if unable to read, eg
1885 * file not there or insufficient memory.
1887 * On success, the buffer size is also returned into *buffer_size.
1889 * This can be called without any lock on shared_state->lock, but in that case
1890 * the caller is responsible for verifying that the result is sane.
1893 ptext_load_file(Size *buffer_size)
1900 Assert (plan_storage == PLAN_STORAGE_FILE);
1902 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1905 if (errno != ENOENT)
1907 (errcode_for_file_access(),
1908 errmsg("could not read file \"%s\": %m",
1913 /* Get file length */
1914 if (fstat(fd, &stat))
1917 (errcode_for_file_access(),
1918 errmsg("could not stat file \"%s\": %m",
1920 CloseTransientFile(fd);
1924 /* Allocate buffer; beware that off_t might be wider than size_t */
1925 if (stat.st_size <= MaxAllocHugeSize)
1926 buf = (char *) malloc(stat.st_size);
1932 (errcode(ERRCODE_OUT_OF_MEMORY),
1933 errmsg("out of memory"),
1934 errdetail("Could not allocate enough memory to read file \"%s\".",
1936 CloseTransientFile(fd);
1941 * OK, slurp in the file. Windows fails if we try to read more than
1942 * INT_MAX bytes at once, and other platforms might not like that either,
1943 * so read a very large file in 1GB segments.
1946 while (nread < stat.st_size)
1948 int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
1951 * If we get a short read and errno doesn't get set, the reason is
1952 * probably that garbage collection truncated the file since we did
1953 * the fstat(), so we don't log a complaint --- but we don't return
1954 * the data, either, since it's most likely corrupt due to concurrent
1955 * writes from garbage collection.
1958 if (read(fd, buf + nread, toread) != toread)
1962 (errcode_for_file_access(),
1963 errmsg("could not read file \"%s\": %m",
1966 CloseTransientFile(fd);
1972 if (CloseTransientFile(fd) != 0)
1974 (errcode_for_file_access(),
1975 errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
1977 *buffer_size = nread;
/*
 * ptext_fetch: returns a pointer to the plan text located at plan_offset
 * inside "buffer" (only valid when plan_storage is PLAN_STORAGE_FILE, see
 * the Assert).
 *
 * plan_offset - byte offset of the text inside buffer
 * plan_len    - length of the text without its terminator
 * buffer      - file image to look into
 * buffer_size - number of valid bytes in buffer
 *
 * The range is rejected when plan_offset + plan_len >= buffer_size, and the
 * byte at buffer[plan_offset + plan_len] must be '\0'.  The returned pointer
 * points into buffer; no copy is made.
 *
 * NOTE(review): the NULL-buffer test, the first half of the offset/length
 * condition and the "return NULL" lines are not visible in this excerpt.
 */
1982 * Locate a plan text in the file image previously read by ptext_load_file().
1984 * We validate the given offset/length, and return NULL if bogus. Otherwise,
1985 * the result points to a null-terminated string within the buffer.
1988 ptext_fetch(Size plan_offset, int plan_len,
1989 char *buffer, Size buffer_size)
1991 Assert (plan_storage == PLAN_STORAGE_FILE);
1993 /* File read failed? */
1996 /* Bogus offset/length? */
1998 plan_offset + plan_len >= buffer_size)
2000 /* As a further sanity check, make sure there's a trailing null */
2001 if (buffer[plan_offset + plan_len] != '\0')
2004 return buffer + plan_offset;
/*
 * need_gc_ptexts: decides whether the external plan text file should be
 * garbage-collected (only valid when plan_storage is PLAN_STORAGE_FILE, see
 * the Assert).
 *
 * Two thresholds are checked against "extent": 512 * store_size, and
 * shared_state->mean_plan_len * store_size * 2.  shared_state->mutex is held
 * around the read of the shared extent.
 *
 * NOTE(review): the statement that reads the extent and all return
 * statements are not visible in this excerpt; presumably each "Don't
 * proceed" test returns false and the fall-through returns true.
 */
2008 * Do we need to garbage-collect the external plan text file?
2010 * Caller should hold at least a shared lock on shared_state->lock.
2013 need_gc_ptexts(void)
2017 Assert (plan_storage == PLAN_STORAGE_FILE);
2019 /* Read shared extent pointer */
2021 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2023 SpinLockAcquire(&s->mutex);
2025 SpinLockRelease(&s->mutex);
2028 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2029 if (extent < 512 * store_size)
2033 * Don't proceed if file is less than about 50% bloat. Nothing can or
2034 * should be done in the event of unusually large query texts accounting
2035 * for file's large size. We go to the trouble of maintaining the mean
2036 * query length in order to prevent garbage collection from thrashing
2039 if (extent < shared_state->mean_plan_len * store_size * 2)
/*
 * Garbage collection of the external plan text file (only valid when
 * plan_storage is PLAN_STORAGE_FILE, see the Assert).
 *
 * Main path: re-checks need_gc_ptexts(), loads the current file with
 * ptext_load_file(), reopens PGSP_TEXT_FILE for writing, and for every hash
 * table entry fetches its text from the loaded image and writes it
 * (plan_len + 1 bytes) to the new file, storing the running "extent" as the
 * entry's new plan_offset.  Entries handled by the "Trouble" branch get
 * plan_offset = 0 and plan_len = -1.  Afterwards the file is truncated to
 * "extent", closed, and shared_state->extent and shared_state->mean_plan_len
 * are updated.
 *
 * Failure path (second half of the block): marks every entry's text invalid
 * (plan_offset = 0, plan_len = -1), unlinks and recreates PGSP_TEXT_FILE,
 * and resets shared_state->extent to 0 and mean_plan_len to
 * ASSUMED_LENGTH_INIT.
 *
 * NOTE(review): the function's signature line is not visible in this excerpt
 * (presumably gc_ptexts(void)); neither are the goto/label lines, the
 * ereport levels, the nentries bookkeeping or the gc_count update.
 */
2046 * Garbage-collect orphaned plan texts in external file.
2048 * This won't be called often in the typical case, since it's likely that
2049 * there won't be too much churn, and besides, a similar compaction process
2050 * occurs when serializing to disk at shutdown or as part of resetting.
2051 * Despite this, it seems prudent to plan for the edge case where the file
2052 * becomes unreasonably large, with no other method of compaction likely to
2053 * occur in the foreseeable future.
2055 * The caller must hold an exclusive lock on shared_state->lock.
2057 * At the first sign of trouble we unlink the query text file to get a clean
2058 * slate (although existing statistics are retained), rather than risk
2059 * thrashing by allowing the same problem case to recur indefinitely.
2067 HASH_SEQ_STATUS hash_seq;
2072 Assert (plan_storage == PLAN_STORAGE_FILE);
2075 * When called from store_entry, some other session might have proceeded
2076 * with garbage collection in the no-lock-held interim of lock strength
2077 * escalation. Check once more that this is actually necessary.
2079 if (!need_gc_ptexts())
2083 * Load the old texts file. If we fail (out of memory, for instance),
2084 * invalidate query texts. Hopefully this is rare. It might seem better
2085 * to leave things alone on an OOM failure, but the problem is that the
2086 * file is only going to get bigger; hoping for a future non-OOM result is
2087 * risky and can easily lead to complete denial of service.
2089 pbuffer = ptext_load_file(&pbuffer_size);
2090 if (pbuffer == NULL)
2094 * We overwrite the plan texts file in place, so as to reduce the risk of
2095 * an out-of-disk-space failure. Since the file is guaranteed not to get
2096 * larger, this should always work on traditional filesystems; though we
2097 * could still lose on copy-on-write filesystems.
2099 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2103 (errcode_for_file_access(),
2104 errmsg("could not write file \"%s\": %m",
2112 hash_seq_init(&hash_seq, hash_table);
2113 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2115 int plan_len = entry->plan_len;
2116 char *plan = ptext_fetch(entry->plan_offset,
2123 /* Trouble ... drop the text */
2124 entry->plan_offset = 0;
2125 entry->plan_len = -1;
2126 /* entry will not be counted in mean plan length computation */
/* Write the text including its terminator, hence plan_len + 1 bytes */
2130 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2133 (errcode_for_file_access(),
2134 errmsg("could not write file \"%s\": %m",
/* The sequential scan is ended explicitly because the loop is left early */
2136 hash_seq_term(&hash_seq);
2140 entry->plan_offset = extent;
2141 extent += plan_len + 1;
2146 * Truncate away any now-unused space. If this fails for some odd reason,
2147 * we log it, but there's no need to fail.
2149 if (ftruncate(fileno(pfile), extent) != 0)
2151 (errcode_for_file_access(),
2152 errmsg("could not truncate file \"%s\": %m",
2155 if (FreeFile(pfile))
2158 (errcode_for_file_access(),
2159 errmsg("could not write file \"%s\": %m",
2165 elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2166 shared_state->extent, extent);
2168 /* Reset the shared extent pointer */
2169 shared_state->extent = extent;
2172 * Also update the mean plan length, to be sure that need_gc_ptexts()
2173 * won't still think we have a problem.
2176 shared_state->mean_plan_len = extent / nentries;
2178 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2185 /* clean up resources */
2192 * Since the contents of the external file are now uncertain, mark all
2193 * hashtable entries as having invalid texts.
2195 hash_seq_init(&hash_seq, hash_table);
2196 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2198 entry->plan_offset = 0;
2199 entry->plan_len = -1;
2203 * Destroy the query text file and create a new, empty one
2205 (void) unlink(PGSP_TEXT_FILE);
2206 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2209 (errcode_for_file_access(),
2210 errmsg("could not recreate file \"%s\": %m",
2215 /* Reset the shared extent pointer */
2216 shared_state->extent = 0;
2218 /* Reset mean_plan_len to match the new state */
2219 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
/*
 * Removal of every hash table entry plus a reset of the global statistics.
 *
 * Reports ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE when shared_state or
 * hash_table has not been set up.  Otherwise, while holding
 * shared_state->lock exclusively, it removes each entry with HASH_REMOVE,
 * sets stats.dealloc to 0 and stats.stats_reset to the current timestamp
 * under shared_state->mutex, reopens PGSP_TEXT_FILE for writing and
 * truncates it to length 0, and finally sets shared_state->extent to 0
 * before releasing the lock.
 *
 * NOTE(review): the function's signature line is not visible in this excerpt
 * (presumably entry_reset(void)); neither are the ereport levels, the
 * handling of a failed AllocateFile, or the FreeFile call.
 */
2223 * Release all entries.
2228 HASH_SEQ_STATUS hash_seq;
2232 if (!shared_state || !hash_table)
2234 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2235 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2237 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2239 hash_seq_init(&hash_seq, hash_table);
2240 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2242 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2246 * Reset global statistics for pg_store_plans.
2249 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
/* The timestamp is taken before the spinlock is acquired */
2250 TimestampTz stats_reset = GetCurrentTimestamp();
2252 SpinLockAcquire(&s->mutex);
2253 s->stats.dealloc = 0;
2254 s->stats.stats_reset = stats_reset;
2255 SpinLockRelease(&s->mutex);
2259 * Write new empty plan file, perhaps even creating a new one to recover
2260 * if the file was missing.
2262 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2266 (errcode_for_file_access(),
2267 errmsg("could not create file \"%s\": %m",
2272 /* If ftruncate fails, log it, but it's not a fatal problem */
2273 if (ftruncate(fileno(pfile), 0) != 0)
2275 (errcode_for_file_access(),
2276 errmsg("could not truncate file \"%s\": %m",
2282 shared_state->extent = 0;
2283 LWLockRelease(shared_state->lock);
2287 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2289 PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2293 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2295 text *short_plan = PG_GETARG_TEXT_P(0);
2296 char *cjson = text_to_cstring(short_plan);
2297 char *cshorten = pgsp_json_shorten(cjson);
2298 PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2302 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2304 text *short_plan = PG_GETARG_TEXT_P(0);
2305 char *cjson = text_to_cstring(short_plan);
2306 char *cnormalized = pgsp_json_normalize(cjson);
2307 PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2311 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2313 text *short_plan = PG_GETARG_TEXT_P(0);
2314 char *cshort = text_to_cstring(short_plan);
2315 char *cinflated = pgsp_json_inflate(cshort);
2316 PG_RETURN_TEXT_P(cstring_to_text(cinflated));
/*
 * pg_store_plans_textplan: SQL-callable; converts the first (text) argument
 * to a C string, passes it through pgsp_json_textize() and returns the
 * result as text.
 *
 * NOTE(review): one source line between the conversion and the return is not
 * visible in this excerpt.
 */
2320 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2322 text *short_plan = PG_GETARG_TEXT_P(0);
2323 char *cshort = text_to_cstring(short_plan);
2324 char *ctextized = pgsp_json_textize(cshort);
2326 PG_RETURN_TEXT_P(cstring_to_text(ctextized));
/*
 * pg_store_plans_yamlplan: SQL-callable; converts the first (text) argument
 * to a C string, passes it through pgsp_json_yamlize() and returns the
 * result as text.
 *
 * NOTE(review): one source line between the conversion and the return is not
 * visible in this excerpt.
 */
2330 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2332 text *short_plan = PG_GETARG_TEXT_P(0);
2333 char *cshort = text_to_cstring(short_plan);
2334 char *cyamlized = pgsp_json_yamlize(cshort);
2336 PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
/*
 * pg_store_plans_xmlplan: SQL-callable; converts the first (text) argument
 * to a C string, passes it through pgsp_json_xmlize() and returns the
 * result as text.
 *
 * NOTE(review): one source line between the conversion and the return is not
 * visible in this excerpt.
 */
2340 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2342 text *short_plan = PG_GETARG_TEXT_P(0);
2343 char *cshort = text_to_cstring(short_plan);
2344 char *cxmlized = pgsp_json_xmlize(cshort);
2346 PG_RETURN_TEXT_P(cstring_to_text(cxmlized));