1 /*-------------------------------------------------------------------------
4 * Take statistics of plan selection across a whole database cluster.
6 * Execution costs are totaled for each distinct plan for each query,
7 * and plan and queryid are kept in a shared hashtable, each record in
8 * which is associated with a record in pg_stat_statements, if any, by
11 * Postgres 9.3 or earlier does not expose the query id, so
12 * pg_store_plans needs to calculate it based on the given query
13 * string using different algorithm from pg_stat_statements, and later
14 * the id will be matched against the one made from query string
15 * stored in pg_stat_statements. For the reason, queryid matching in
16 * this way will fail if the query string kept in pg_stat_statements
17 * is truncated in the middle.
19 * Plans are identified by fingerprinting plan representations in
20 * "shortened" JSON format with constants and unstable values such as
21 * rows, width, loops ignored. Nevertheless, stored plan entries hold
22 * those values as of the latest execution. Entry eviction is done in the
23 * same way as in pg_stat_statements.
25 * Copyright (c) 2008-2020, PostgreSQL Global Development Group
26 * Copyright (c) 2012-2021, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
29 * pg_store_plans/pg_store_plans.c
31 *-------------------------------------------------------------------------
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
59 #include "utils/timestamp.h"
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
/*
 * Upper bound on the stored plan text length, in bytes. It is copied into
 * shared_state->plan_size the first time shared memory is initialized, and
 * it is added to the hash entry size when plans are kept in shared memory.
 * NOTE(review): presumably the backing variable of the
 * pg_store_plans.max_plan_length GUC -- confirm in _PG_init.
 */
75 static int max_plan_len = 5000;
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration) (1.0)
79 #define USAGE_INIT (1.0) /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
86 /* In PostgreSQL 11, queryid becomes a uint64 internally. */
87 #if PG_VERSION_NUM >= 110000
88 typedef uint64 queryid_t;
89 #define PGSP_NO_QUERYID UINT64CONST(0)
91 typedef uint32 queryid_t;
92 #define PGSP_NO_QUERYID 0
96 * Extension version number, for supporting older extension versions' objects
98 typedef enum pgspVersion
105 * Hashtable key that defines the identity of a hashtable entry. We separate
106 * queries by user and by database even if they are otherwise identical.
108 * Presently, the query encoding is fully determined by the source database
109 * and so we don't really need it to be in the key. But that might not always
110 * be true. Anyway it's notationally convenient to pass it as part of the key.
112 typedef struct pgspHashKey
114 Oid userid; /* user OID */
115 Oid dbid; /* database OID */
116 queryid_t queryid; /* query identifier */
117 uint32 planid; /* plan identifier */
121 * The actual stats counters kept within pgspEntry.
123 typedef struct Counters
125 int64 calls; /* # of times executed */
126 double total_time; /* total execution time, in msec */
127 double min_time; /* minimum execution time in msec */
128 double max_time; /* maximum execution time in msec */
129 double mean_time; /* mean execution time in msec */
130 double sum_var_time; /* sum of variances in execution time in msec */
131 int64 rows; /* total # of retrieved or affected rows */
132 int64 shared_blks_hit; /* # of shared buffer hits */
133 int64 shared_blks_read; /* # of shared disk blocks read */
134 int64 shared_blks_dirtied;/* # of shared disk blocks dirtied */
135 int64 shared_blks_written;/* # of shared disk blocks written */
136 int64 local_blks_hit; /* # of local buffer hits */
137 int64 local_blks_read; /* # of local disk blocks read */
138 int64 local_blks_dirtied; /* # of local disk blocks dirtied */
139 int64 local_blks_written; /* # of local disk blocks written */
140 int64 temp_blks_read; /* # of temp blocks read */
141 int64 temp_blks_written; /* # of temp blocks written */
142 double blk_read_time; /* time spent reading, in msec */
143 double blk_write_time; /* time spent writing, in msec */
144 TimestampTz first_call; /* timestamp of first call */
145 TimestampTz last_call; /* timestamp of last call */
146 double usage; /* usage factor */
150 * Global statistics for pg_store_plans
152 typedef struct pgspGlobalStats
154 int64 dealloc; /* # of times entries were deallocated */
155 TimestampTz stats_reset; /* timestamp with all stats reset */
159 * Statistics per plan
161 * NB: see the file read/write code before changing field order here.
163 typedef struct pgspEntry
165 pgspHashKey key; /* hash key of entry - MUST BE FIRST */
166 Counters counters; /* the statistics for this plan */
167 Size plan_offset; /* plan text offset in external file (left 0 when the text is kept in shared memory) */
168 int plan_len; /* # of valid bytes in plan text, not counting the trailing NUL */
169 int encoding; /* query encoding; passed to pg_encoding_mbcliplen() when clipping the plan text at load */
170 slock_t mutex; /* protects the counters only */
174 * Global shared state
176 typedef struct pgspSharedState
178 LWLock *lock; /* protects hashtable search/modification */
179 int plan_size; /* max plan text size in bytes, set from max_plan_len; longer texts are clipped to plan_size - 1 */
180 double cur_median_usage; /* current median usage in hashtable */
181 Size mean_plan_len; /* current mean entry text length */
182 slock_t mutex; /* protects following fields only: */
183 Size extent; /* current extent of plan file */
184 int n_writers; /* number of active writers to query file (presumably the plan text file -- confirm) */
185 int gc_count; /* plan file garbage collection cycle count */
186 pgspGlobalStats stats; /* global statistics for pgsp */
189 /*---- Local variables ----*/
191 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
192 static int nested_level = 0;
194 /* Saved hook values in case of unload */
195 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
196 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
197 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
198 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
199 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
200 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
202 /* Links to shared memory state */
203 static pgspSharedState *shared_state = NULL;
204 static HTAB *hash_table = NULL;
206 /*---- GUC variables ----*/
210 TRACK_LEVEL_NONE, /* track no statements */
211 TRACK_LEVEL_TOP, /* only top level statements */
212 TRACK_LEVEL_ALL, /* all statements, including nested ones */
213 TRACK_LEVEL_VERBOSE /* all statements, including internal ones */
216 static const struct config_enum_entry track_options[] =
218 {"none", TRACK_LEVEL_NONE, false},
219 {"top", TRACK_LEVEL_TOP, false},
220 {"all", TRACK_LEVEL_ALL, false},
221 {"verbose", TRACK_LEVEL_VERBOSE, false},
227 PLAN_FORMAT_RAW, /* No conversion. Shorten JSON */
228 PLAN_FORMAT_TEXT, /* Traditional text representation */
229 PLAN_FORMAT_JSON, /* JSON representation */
230 PLAN_FORMAT_YAML, /* YAML */
231 PLAN_FORMAT_XML, /* XML */
234 static const struct config_enum_entry plan_formats[] =
236 {"raw" , PLAN_FORMAT_RAW , false},
237 {"text", PLAN_FORMAT_TEXT, false},
238 {"json", PLAN_FORMAT_JSON, false},
239 {"yaml", PLAN_FORMAT_YAML, false},
240 {"xml" , PLAN_FORMAT_XML , false},
244 /* options for plan storage */
247 PLAN_STORAGE_SHMEM, /* plan is stored as a part of hash entry */
248 PLAN_STORAGE_FILE /* plan is stored in a separate file */
251 static const struct config_enum_entry plan_storage_options[] =
253 {"shmem", PLAN_STORAGE_SHMEM, false},
254 {"file", PLAN_STORAGE_FILE, false},
258 static int store_size; /* max # plans to track; used as the hash table size */
259 static int track_level; /* tracking level */
260 static int min_duration; /* min duration to record, in milliseconds */
261 static bool dump_on_shutdown; /* whether to save stats across shutdown */
262 static bool log_analyze; /* Similar to EXPLAIN (ANALYZE *) */
263 static bool log_verbose; /* Similar to EXPLAIN (VERBOSE *) */
264 static bool log_buffers; /* Similar to EXPLAIN (BUFFERS *) */
265 static bool log_timing; /* Similar to EXPLAIN (TIMING *) */
266 static bool log_triggers; /* whether to log trigger statistics */
267 static int plan_format; /* Plan representation style in
268 * pg_store_plans.plan */
269 static int plan_storage; /* Plan storage type */
272 /* disables tracking overriding track_level */
273 static bool force_disabled = false;
275 #if PG_VERSION_NUM >= 140000
277 * For pg14 and later, we rely on core queryid calculation. If
278 * it's not available it means that the admin explicitly refused to
279 * compute it, for performance reason or other. In that case, we
280 * will also consider that this extension is disabled.
282 #define pgsp_enabled(q) \
283 (!force_disabled && \
284 (track_level >= TRACK_LEVEL_ALL || \
285 (track_level == TRACK_LEVEL_TOP && nested_level == 0)) && \
286 (q != PGSP_NO_QUERYID))
288 #define pgsp_enabled(q) \
289 (!force_disabled && \
290 (track_level >= TRACK_LEVEL_ALL || \
291 (track_level == TRACK_LEVEL_TOP && nested_level == 0)))
/*
 * Address of the plan text stored inline, immediately after the pgspEntry
 * struct. Used when plan_storage is PLAN_STORAGE_SHMEM, in which case each
 * hash entry is sized sizeof(pgspEntry) + max_plan_len.
 */
294 #define SHMEM_PLAN_PTR(ent) (((char *) ent) + sizeof(pgspEntry))
296 /*---- Function declarations ----*/
301 Datum pg_store_plans_reset(PG_FUNCTION_ARGS);
302 Datum pg_store_plans_hash_query(PG_FUNCTION_ARGS);
303 Datum pg_store_plans(PG_FUNCTION_ARGS);
304 Datum pg_store_plans_shorten(PG_FUNCTION_ARGS);
305 Datum pg_store_plans_normalize(PG_FUNCTION_ARGS);
306 Datum pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
307 Datum pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
308 Datum pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
309 Datum pg_store_plans_textplan(PG_FUNCTION_ARGS);
310 Datum pg_store_plans_info(PG_FUNCTION_ARGS);
312 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
313 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
314 PG_FUNCTION_INFO_V1(pg_store_plans);
315 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
316 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
317 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
318 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
319 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
320 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
321 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
322 PG_FUNCTION_INFO_V1(pg_store_plans_info);
324 #if PG_VERSION_NUM < 130000
325 #define COMPTAG_TYPE char
327 #define COMPTAG_TYPE QueryCompletion
330 #if PG_VERSION_NUM < 140000
331 #define ROLE_PG_READ_ALL_STATS DEFAULT_ROLE_READ_ALL_STATS
334 static void pgsp_shmem_startup(void);
335 static void pgsp_shmem_shutdown(int code, Datum arg);
336 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
337 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
338 ScanDirection direction,
339 uint64 count, bool execute_once);
340 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
341 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
342 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
343 #if PG_VERSION_NUM >= 140000
346 ProcessUtilityContext context, ParamListInfo params,
347 QueryEnvironment *queryEnv,
348 DestReceiver *dest, COMPTAG_TYPE *completionTag);
349 static uint32 hash_query(const char* query);
350 static void pgsp_store(char *plan, queryid_t queryId,
351 double total_time, uint64 rows,
352 const BufferUsage *bufusage);
353 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
354 pgspVersion api_version);
355 static Size shared_mem_size(void);
356 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
358 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
360 static char *ptext_load_file(Size *buffer_size);
361 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
363 static bool need_gc_ptexts(void);
364 static void gc_ptexts(void);
365 static void entry_dealloc(void);
366 static void entry_reset(void);
369 * Module load callback
375 * In order to create our shared memory area, we have to be loaded via
376 * shared_preload_libraries. If not, fall out without hooking into any of
377 * the main system. (We don't throw error here because it seems useful to
378 * allow the pg_store_plans functions to be created even when the
379 * module isn't active. The functions must protect themselves against
380 * being called then, however.)
382 if (!process_shared_preload_libraries_in_progress)
385 #if PG_VERSION_NUM >= 140000
387 * Inform the postmaster that we want to enable query_id calculation if
388 * compute_query_id is set to auto.
394 * Define (or redefine) custom GUC variables.
396 DefineCustomIntVariable("pg_store_plans.max",
397 "Sets the maximum number of plans tracked by pg_store_plans.",
409 DefineCustomIntVariable("pg_store_plans.max_plan_length",
410 "Sets the maximum length of plans stored by pg_store_plans.",
422 DefineCustomEnumVariable("pg_store_plans.plan_storage",
423 "Selects where to store plan texts.",
427 plan_storage_options,
434 DefineCustomEnumVariable("pg_store_plans.track",
435 "Selects which plans are tracked by pg_store_plans.",
446 DefineCustomEnumVariable("pg_store_plans.plan_format",
447 "Selects which format to be appied for plan representation in pg_store_plans.",
458 DefineCustomIntVariable("pg_store_plans.min_duration",
459 "Minimum duration to record plan in milliseconds.",
471 DefineCustomBoolVariable("pg_store_plans.save",
472 "Save pg_store_plans statistics across server shutdowns.",
482 DefineCustomBoolVariable("pg_store_plans.log_analyze",
483 "Use EXPLAIN ANALYZE for plan logging.",
493 DefineCustomBoolVariable("pg_store_plans.log_buffers",
504 DefineCustomBoolVariable("pg_store_plans.log_timing",
515 DefineCustomBoolVariable("pg_store_plans.log_triggers",
516 "Log trigger trace.",
526 DefineCustomBoolVariable("pg_store_plans.log_verbose",
527 "Set VERBOSE for EXPLAIN on logging.",
537 EmitWarningsOnPlaceholders("pg_store_plans");
540 * Request additional shared resources. (These are no-ops if we're not in
541 * the postmaster process.) We'll allocate or attach to the shared
542 * resources in pgsp_shmem_startup().
544 RequestAddinShmemSpace(shared_mem_size());
545 RequestNamedLWLockTranche("pg_store_plans", 1);
550 prev_shmem_startup_hook = shmem_startup_hook;
551 shmem_startup_hook = pgsp_shmem_startup;
552 prev_ExecutorStart = ExecutorStart_hook;
553 ExecutorStart_hook = pgsp_ExecutorStart;
554 prev_ExecutorRun = ExecutorRun_hook;
555 ExecutorRun_hook = pgsp_ExecutorRun;
556 prev_ExecutorFinish = ExecutorFinish_hook;
557 ExecutorFinish_hook = pgsp_ExecutorFinish;
558 prev_ExecutorEnd = ExecutorEnd_hook;
559 ExecutorEnd_hook = pgsp_ExecutorEnd;
560 prev_ProcessUtility = ProcessUtility_hook;
561 ProcessUtility_hook = pgsp_ProcessUtility;
565 * Module unload callback
570 /* Uninstall hooks. */
571 shmem_startup_hook = prev_shmem_startup_hook;
572 ExecutorStart_hook = prev_ExecutorStart;
573 ExecutorRun_hook = prev_ExecutorRun;
574 ExecutorFinish_hook = prev_ExecutorFinish;
575 ExecutorEnd_hook = prev_ExecutorEnd;
576 ProcessUtility_hook = prev_ProcessUtility;
580 * shmem_startup hook: allocate or attach to shared memory,
581 * then load any pre-existing statistics from file.
584 pgsp_shmem_startup(void)
598 if (prev_shmem_startup_hook)
599 prev_shmem_startup_hook();
601 /* reset in case this is a restart within the postmaster */
606 * Create or attach to the shared memory state, including hash table
608 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
610 shared_state = ShmemInitStruct("pg_store_plans",
611 sizeof(pgspSharedState),
616 /* First time through ... */
617 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
618 shared_state->plan_size = max_plan_len;
619 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
620 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
621 SpinLockInit(&shared_state->mutex);
622 shared_state->extent = 0;
623 shared_state->n_writers = 0;
624 shared_state->gc_count = 0;
625 shared_state->stats.dealloc = 0;
626 shared_state->stats.stats_reset = GetCurrentTimestamp();
629 /* Be sure everyone agrees on the hash table entry size */
630 plan_size = shared_state->plan_size;
632 memset(&info, 0, sizeof(info));
633 info.keysize = sizeof(pgspHashKey);
634 info.entrysize = sizeof(pgspEntry);
635 if (plan_storage == PLAN_STORAGE_SHMEM)
636 info.entrysize += max_plan_len;
637 hash_table = ShmemInitHash("pg_store_plans hash",
638 store_size, store_size,
642 LWLockRelease(AddinShmemInitLock);
645 * If we're in the postmaster (or a standalone backend...), set up a shmem
646 * exit hook to dump the statistics to disk.
648 if (!IsUnderPostmaster)
649 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
652 * Done if some other process already completed our initialization.
658 * Note: we don't bother with locks here, because there should be no other
659 * processes running when this code is reached.
662 /* Unlink query text file possibly left over from crash */
663 unlink(PGSP_TEXT_FILE);
665 if (plan_storage == PLAN_STORAGE_FILE)
667 /* Allocate new query text temp file */
668 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
674 * If we were told not to load old statistics, we're done. (Note we do
675 * not try to unlink any old dump file in this case. This seems a bit
676 * questionable but it's the historical behavior.)
678 if (!dump_on_shutdown)
686 * Attempt to load old statistics from the dump file.
688 file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
692 return; /* ignore not-found error */
693 /* No existing persisted stats file, so we're done */
697 buffer_size = plan_size;
698 buffer = (char *) palloc(buffer_size);
700 if (fread(&header, sizeof(uint32), 1, file) != 1 ||
701 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
702 fread(&num, sizeof(int32), 1, file) != 1)
705 if (header != PGSP_FILE_HEADER ||
706 pgver != PGSP_PG_MAJOR_VERSION)
709 for (i = 0; i < num; i++)
713 Size plan_offset = 0;
715 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
718 /* Encoding is the only field we can easily sanity-check */
719 if (!PG_VALID_BE_ENCODING(temp.encoding))
722 /* Previous incarnation might have had a larger plan_size */
723 if (temp.plan_len >= buffer_size)
725 buffer = (char *) repalloc(buffer, temp.plan_len + 1);
726 buffer_size = temp.plan_len + 1;
729 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
732 /* Skip loading "sticky" entries */
733 if (temp.counters.calls == 0)
736 /* Clip to available length if needed */
737 if (temp.plan_len >= plan_size)
738 temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
743 buffer[temp.plan_len] = '\0';
745 if (plan_storage == PLAN_STORAGE_FILE)
747 /* Store the plan text */
748 plan_offset = shared_state->extent;
749 if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
752 shared_state->extent += temp.plan_len + 1;
755 /* make the hashtable entry (discards old entries if too many) */
756 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
758 if (plan_storage == PLAN_STORAGE_SHMEM)
759 memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
761 /* copy in the actual stats */
762 entry->counters = temp.counters;
772 * Remove the file so it's not included in backups/replication slaves,
773 * etc. A new file will be written on next shutdown.
775 unlink(PGSP_DUMP_FILE);
781 (errcode_for_file_access(),
782 errmsg("could not read file \"%s\": %m",
787 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
788 errmsg("ignoring invalid data in file \"%s\"",
793 (errcode_for_file_access(),
794 errmsg("could not write file \"%s\": %m",
803 /* If possible, throw away the bogus file; ignore any error */
804 unlink(PGSP_DUMP_FILE);
807 * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
808 * server is running with pg_store_plans enabled
813 * shmem_shutdown hook: Dump statistics into file.
815 * Note: we don't bother with acquiring lock, because there should be no
816 * other processes running when this is called.
819 pgsp_shmem_shutdown(int code, Datum arg)
822 char *pbuffer = NULL;
823 Size pbuffer_size = 0;
824 HASH_SEQ_STATUS hash_seq;
828 /* Don't try to dump during a crash. */
832 /* Safety check ... shouldn't get here unless shmem is set up. */
833 if (!shared_state || !hash_table)
836 /* Don't dump if told not to. */
837 if (!dump_on_shutdown)
840 file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
844 if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
846 if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
848 num_entries = hash_get_num_entries(hash_table);
849 if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
852 if (plan_storage == PLAN_STORAGE_FILE)
854 pbuffer = ptext_load_file(&pbuffer_size);
859 hash_seq_init(&hash_seq, hash_table);
860 while ((entry = hash_seq_search(&hash_seq)) != NULL)
862 int len = entry->plan_len;
865 if (plan_storage == PLAN_STORAGE_FILE)
866 pstr = ptext_fetch(entry->plan_offset, len,
867 pbuffer, pbuffer_size);
869 pstr = SHMEM_PLAN_PTR(entry);
872 continue; /* Ignore any entries with bogus texts */
874 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
875 fwrite(pstr, 1, len + 1, file) != len + 1)
877 /* note: we assume hash_seq_term won't change errno */
878 hash_seq_term(&hash_seq);
890 * Rename file into place, so we atomically replace the old one.
892 if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
894 (errcode_for_file_access(),
895 errmsg("could not rename pg_store_plans file \"%s\": %m",
896 PGSP_DUMP_FILE ".tmp")));
898 /* Unlink query-texts file; it's not needed while shutdown */
899 unlink(PGSP_TEXT_FILE);
905 (errcode_for_file_access(),
906 errmsg("could not write pg_store_plans file \"%s\": %m",
907 PGSP_DUMP_FILE ".tmp")));
910 unlink(PGSP_DUMP_FILE ".tmp");
915 * ExecutorStart hook: start up tracking if needed
918 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
921 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
923 queryDesc->instrument_options |=
924 (log_timing ? INSTRUMENT_TIMER : 0)|
925 (log_timing ? 0: INSTRUMENT_ROWS)|
926 (log_buffers ? INSTRUMENT_BUFFERS : 0);
929 if (prev_ExecutorStart)
930 prev_ExecutorStart(queryDesc, eflags);
932 standard_ExecutorStart(queryDesc, eflags);
935 * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
936 * context so as to be free at ExecutorEnd.
938 if (queryDesc->totaltime == NULL &&
939 pgsp_enabled(queryDesc->plannedstmt->queryId))
941 MemoryContext oldcxt;
943 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
944 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
945 #if PG_VERSION_NUM >= 140000
949 MemoryContextSwitchTo(oldcxt);
955 * ExecutorRun hook: all we need do is track nesting depth
958 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
964 if (prev_ExecutorRun)
965 prev_ExecutorRun(queryDesc, direction, count, execute_once);
967 standard_ExecutorRun(queryDesc, direction, count, execute_once);
979 * ExecutorFinish hook: all we need do is track nesting depth
982 pgsp_ExecutorFinish(QueryDesc *queryDesc)
987 if (prev_ExecutorFinish)
988 prev_ExecutorFinish(queryDesc);
990 standard_ExecutorFinish(queryDesc);
1002 * ExecutorEnd hook: store results if needed
1005 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1007 if (queryDesc->totaltime)
1010 * Make sure stats accumulation is done. (Note: it's okay if several
1011 * levels of hook all do this.)
1013 InstrEndLoop(queryDesc->totaltime);
1015 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1016 queryDesc->totaltime->total &&
1017 queryDesc->totaltime->total >=
1018 (double)min_duration / 1000.0)
1024 es = NewExplainState();
1027 es->analyze = queryDesc->instrument_options;
1028 es->verbose = log_verbose;
1029 es->buffers = (es->analyze && log_buffers);
1030 es->timing = (es->analyze && log_timing);
1031 es->format = EXPLAIN_FORMAT_JSON;
1033 ExplainBeginOutput(es);
1034 ExplainPrintPlan(es, queryDesc);
1036 pgspExplainTriggers(es, queryDesc);
1037 ExplainEndOutput(es);
1039 /* Remove last line break */
1040 if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1041 es_str->data[--es_str->len] = '\0';
1043 /* JSON outmost braces. */
1044 es_str->data[0] = '{';
1045 es_str->data[es_str->len - 1] = '}';
1047 queryid = queryDesc->plannedstmt->queryId;
1048 #if PG_VERSION_NUM < 140000
1050 * For versions before pg14, a queryid is only available if
1051 * pg_stat_statements extension (or similar) is configured. We
1052 * don't want a hard requirement for such an extension, so fall back
1053 * to an internal queryid calculation in that case.
1054 * For pg14 and above, core postgres can compute a queryid so we
1057 if (queryid == PGSP_NO_QUERYID)
1058 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1060 Assert(queryid != PGSP_NO_QUERYID);
1063 pgsp_store(es_str->data,
1065 queryDesc->totaltime->total * 1000.0, /* convert to msec */
1066 queryDesc->estate->es_processed,
1067 &queryDesc->totaltime->bufusage);
1068 pfree(es_str->data);
1072 if (prev_ExecutorEnd)
1073 prev_ExecutorEnd(queryDesc);
1075 standard_ExecutorEnd(queryDesc);
1079 * ProcessUtility hook
1082 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1083 #if PG_VERSION_NUM >= 140000
1086 ProcessUtilityContext context, ParamListInfo params,
1087 QueryEnvironment *queryEnv,
1088 DestReceiver *dest, COMPTAG_TYPE *completionTag)
1090 int tag = nodeTag(pstmt->utilityStmt);
1091 queryid_t saved_queryId = pstmt->queryId;
1092 bool reset_force_disabled = false;
1094 if (pgsp_enabled(saved_queryId) &&
1095 (tag == T_CreateExtensionStmt || tag == T_AlterExtensionStmt) &&
1096 !force_disabled && track_level < TRACK_LEVEL_VERBOSE)
1098 force_disabled = true;
1099 reset_force_disabled = true;
1104 if (prev_ProcessUtility)
1106 prev_ProcessUtility(pstmt, queryString,
1107 #if PG_VERSION_NUM >= 140000
1110 context, params, queryEnv,
1111 dest, completionTag);
1114 standard_ProcessUtility(pstmt, queryString,
1115 #if PG_VERSION_NUM >= 140000
1118 context, params, queryEnv,
1119 dest, completionTag);
1121 if (reset_force_disabled)
1122 force_disabled = false;
1126 if (reset_force_disabled)
1127 force_disabled = false;
1134 * hash_query: calculate internal query ID for a query
1136 * As of PG11, Query.queryId has been widened to 64 bits to reduce collision of
1137 * queries to practical level. On the other hand pg_store_plans uses the
1138 * combination of query hash and plan hash values as the hash table key and
1139 * the resolution of the hash value effectively has the same degree so we
1140 * continue to use uint32 as internal queryid.
1142 * This may merge plans from different queries into single internal query id
1143 * but it is not a problem when pg_stat_statements is used together since the
1144 * extension gives enough resolution on queries.
1147 hash_query(const char* query)
1151 char *normquery = pstrdup(query);
1152 normalize_expr(normquery, false);
1153 queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1156 /* If we are unlucky enough to get a hash of zero, use 1 instead */
1165 * Store some statistics for a plan.
1167 * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1168 * value of the given plan, which is calculated in this function.
1171 pgsp_store(char *plan, queryid_t queryId,
1172 double total_time, uint64 rows,
1173 const BufferUsage *bufusage)
1177 char *norm_query = NULL;
1179 char *normalized_plan = NULL;
1180 char *shorten_plan = NULL;
1181 volatile pgspEntry *e;
1182 Size plan_offset = 0;
1185 Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1187 /* Safety check... */
1188 if (!shared_state || !hash_table)
1191 /* Set up key for hashtable search */
1192 key.userid = GetUserId();
1193 key.dbid = MyDatabaseId;
1194 key.queryid = queryId;
1196 normalized_plan = pgsp_json_normalize(plan);
1197 shorten_plan = pgsp_json_shorten(plan);
1198 elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1199 elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1200 elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1201 plan_len = strlen(shorten_plan);
1203 key.planid = hash_any((const unsigned char *)normalized_plan,
1204 strlen(normalized_plan));
1205 pfree(normalized_plan);
1207 if (plan_len >= shared_state->plan_size)
1208 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1211 shared_state->plan_size - 1);
1214 /* Look up the hash table entry with shared lock. */
1215 LWLockAcquire(shared_state->lock, LW_SHARED);
1217 entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1219 /* Store the plan text, if the entry not present */
1220 if (!entry && plan_storage == PLAN_STORAGE_FILE)
1225 /* Append new plan text to file with only shared lock held */
1226 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1229 * Determine whether we need to garbage collect external query texts
1230 * while the shared lock is still held. This micro-optimization
1231 * avoids taking the time to decide this while holding exclusive lock.
1233 do_gc = need_gc_ptexts();
1235 /* Acquire exclusive lock as required by entry_alloc() */
1236 LWLockRelease(shared_state->lock);
1237 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1240 * A garbage collection may have occurred while we weren't holding the
1241 * lock. In the unlikely event that this happens, the plan text we
1242 * stored above will have been garbage collected, so write it again.
1243 * This should be infrequent enough that doing it while holding
1244 * exclusive lock isn't a performance problem.
1246 if (!stored || shared_state->gc_count != gc_count)
1247 stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1249 /* If we failed to write to the text file, give up */
1255 /* Create new entry, if not present */
1258 entry = entry_alloc(&key, plan_offset, plan_len, false);
1260 /* shorten_plan is terminated by NUL */
1261 if (plan_storage == PLAN_STORAGE_SHMEM)
1262 memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1265 /* If needed, perform garbage collection while exclusive lock held */
1270 /* Increment the counts, except when jstate is not NULL */
1273 * Grab the spinlock while updating the counters (see comment about
1274 * locking rules at the head of the file)
1277 e = (volatile pgspEntry *) entry;
1278 SpinLockAcquire(&e->mutex);
1280 /* "Unstick" entry if it was previously sticky */
1281 if (e->counters.calls == 0)
1283 e->counters.usage = USAGE_INIT;
1284 e->counters.first_call = GetCurrentTimestamp();
1287 e->counters.calls += 1;
1288 e->counters.total_time += total_time;
1289 if (e->counters.calls == 1)
1291 e->counters.min_time = total_time;
1292 e->counters.max_time = total_time;
1293 e->counters.mean_time = total_time;
1298 * Welford's method for accurately computing variance. See
1299 * <http://www.johndcook.com/blog/standard_deviation/>
1301 double old_mean = e->counters.mean_time;
1303 e->counters.mean_time +=
1304 (total_time - old_mean) / e->counters.calls;
1305 e->counters.sum_var_time +=
1306 (total_time - old_mean) * (total_time - e->counters.mean_time);
1308 /* calculate min and max time */
1309 if (e->counters.min_time > total_time)
1310 e->counters.min_time = total_time;
1311 if (e->counters.max_time < total_time)
1312 e->counters.max_time = total_time;
1315 e->counters.rows += rows;
1316 e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1317 e->counters.shared_blks_read += bufusage->shared_blks_read;
1318 e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1319 e->counters.shared_blks_written += bufusage->shared_blks_written;
1320 e->counters.local_blks_hit += bufusage->local_blks_hit;
1321 e->counters.local_blks_read += bufusage->local_blks_read;
1322 e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1323 e->counters.local_blks_written += bufusage->local_blks_written;
1324 e->counters.temp_blks_read += bufusage->temp_blks_read;
1325 e->counters.temp_blks_written += bufusage->temp_blks_written;
1326 e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1327 e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1328 e->counters.last_call = GetCurrentTimestamp();
1329 e->counters.usage += USAGE_EXEC(total_time);
1331 SpinLockRelease(&e->mutex);
1334 LWLockRelease(shared_state->lock);
1336 /* We postpone this pfree until we're out of the lock */
1342 * Reset all statement statistics.
1345 pg_store_plans_reset(PG_FUNCTION_ARGS)
/* Raise the error below if the shared state or the hash table pointer is not set. */
1347 if (!shared_state || !hash_table)
1349 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1350 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1355 /* Number of output arguments (columns) for various API versions */
1356 #define PG_STORE_PLANS_COLS_V1_5 27 /* includes queryid_stat_statements */
1357 #define PG_STORE_PLANS_COLS_V1_6 26 /* no queryid_stat_statements column */
1358 #define PG_STORE_PLANS_COLS 27 /* maximum of above */
1361 * Retrieve statement statistics.
1364 pg_store_plans_1_6(PG_FUNCTION_ARGS)
/* Version 1.6 entry point: build the result set with the PGSP_V1_6 column layout. */
1366 pg_store_plans_internal(fcinfo, PGSP_V1_6);
1372 pg_store_plans(PG_FUNCTION_ARGS)
/* Version 1.5 entry point: build the result set with the PGSP_V1_5 column layout. */
1374 pg_store_plans_internal(fcinfo, PGSP_V1_5);
1380 pg_store_plans_internal(FunctionCallInfo fcinfo,
1381 pgspVersion api_version)
/*
 * Common implementation behind pg_store_plans() and pg_store_plans_1_6().
 *
 * Materializes one row per hash table entry into a tuplestore that is handed
 * back through fcinfo->resultinfo.  api_version selects the column layout:
 * only PGSP_V1_5 emits the extra queryid_stat_statements column.  The
 * queryid, planid and plan text of an entry are shown only when the caller
 * is a member of ROLE_PG_READ_ALL_STATS or the entry's userid matches the
 * caller's; otherwise the ids are NULL and the plan column reads
 * "<insufficient privilege>".
 */
1383 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1385 Tuplestorestate *tupstore;
1386 MemoryContext per_query_ctx;
1387 MemoryContext oldcontext;
1388 Oid userid = GetUserId();
1389 bool is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1391 char *pbuffer = NULL;
1392 Size pbuffer_size = 0;
1395 HASH_SEQ_STATUS hash_seq;
1398 if (!shared_state || !hash_table)
1400 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1401 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1403 /* check to see if caller supports us returning a tuplestore */
1404 if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1406 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1407 errmsg("set-valued function called in context that cannot accept a set")));
1408 if (!(rsinfo->allowedModes & SFRM_Materialize))
1410 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1411 errmsg("materialize mode required, but it is not " \
1412 "allowed in this context")));
1414 /* Build a tuple descriptor for our result type */
1415 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1416 elog(ERROR, "return type must be a row type");
1418 per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1419 oldcontext = MemoryContextSwitchTo(per_query_ctx);
1421 tupstore = tuplestore_begin_heap(true, false, work_mem);
1422 rsinfo->returnMode = SFRM_Materialize;
1423 rsinfo->setResult = tupstore;
1424 rsinfo->setDesc = tupdesc;
1426 MemoryContextSwitchTo(oldcontext);
1429 * We'd like to load the plan text file (if needed) while not holding any
1430 * lock on shared_state->lock. In the worst case we'll have to do this
1431 * again after we have the lock, but it's unlikely enough to make this a
1432 * win despite occasional duplicated work. We need to reload if anybody
1433 * writes to the file (either a retail ptext_store(), or a garbage
1434 * collection) between this point and where we've gotten shared lock. If a
1435 * ptext_store is actually in progress when we look, we might as well skip
1436 * the speculative load entirely.
1439 /* Take the mutex so we can examine variables */
1441 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1443 SpinLockAcquire(&s->mutex);
1445 n_writers = s->n_writers;
1446 gc_count = s->gc_count;
1447 SpinLockRelease(&s->mutex);
1450 /* No point in loading file now if there are active writers */
1451 if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1452 pbuffer = ptext_load_file(&pbuffer_size);
1455 * Get shared lock, load or reload the plan text file if we must, and
1456 * iterate over the hashtable entries.
1458 * With a large hash table, we might be holding the lock rather longer
1459 * than one could wish. However, this only blocks creation of new hash
1460 * table entries, and the larger the hash table the less likely that is to
1461 * be needed. So we can hope this is okay. Perhaps someday we'll decide
1462 * we need to partition the hash table to limit the time spent holding any
1465 LWLockAcquire(shared_state->lock, LW_SHARED);
1468 * Here it is safe to examine extent and gc_count without taking the mutex.
1469 * Note that although other processes might change shared_state->extent
1470 * just after we look at it, the strings they then write into the file
1471 * cannot yet be referenced in the hashtable, so we don't care whether we
1474 * If ptext_load_file fails, we just press on; we'll return NULL for every
1477 if (plan_storage == PLAN_STORAGE_FILE &&
1479 shared_state->extent != extent ||
1480 shared_state->gc_count != gc_count))
1484 pbuffer = ptext_load_file(&pbuffer_size);
1487 hash_seq_init(&hash_seq, hash_table);
1488 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1490 Datum values[PG_STORE_PLANS_COLS];
1491 bool nulls[PG_STORE_PLANS_COLS];
1493 int64 queryid = entry->key.queryid;
1494 int64 planid = entry->key.planid;
1498 memset(values, 0, sizeof(values));
1499 memset(nulls, 0, sizeof(nulls));
1501 values[i++] = ObjectIdGetDatum(entry->key.userid);
1502 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1503 if (is_allowed_role || entry->key.userid == userid)
1505 values[i++] = Int64GetDatumFast(queryid);
1506 values[i++] = Int64GetDatumFast(planid);
1508 /* fill queryid_stat_statements with the same value with queryid */
1509 if (api_version == PGSP_V1_5)
1510 values[i++] = Int64GetDatumFast(queryid);
1514 nulls[i++] = true; /* queryid */
1515 nulls[i++] = true; /* planid */
1517 /* queryid_stat_statements */
1518 if (api_version == PGSP_V1_5)
1522 if (is_allowed_role || entry->key.userid == userid)
1524 char *pstr; /* Plan string */
1525 char *mstr; /* Modified plan string */
1526 char *estr; /* Encoded modified plan string */
1528 if (plan_storage == PLAN_STORAGE_FILE)
1529 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1530 pbuffer, pbuffer_size);
1532 pstr = SHMEM_PLAN_PTR(entry);
1534 switch (plan_format)
1536 case PLAN_FORMAT_TEXT:
1537 mstr = pgsp_json_textize(pstr);
1539 case PLAN_FORMAT_JSON:
1540 mstr = pgsp_json_inflate(pstr);
1542 case PLAN_FORMAT_YAML:
1543 mstr = pgsp_json_yamlize(pstr);
1545 case PLAN_FORMAT_XML:
1546 mstr = pgsp_json_xmlize(pstr);
1554 pg_do_encoding_conversion((unsigned char *) mstr,
1557 GetDatabaseEncoding());
1558 values[i++] = CStringGetTextDatum(estr);
1566 /* pstr is a pointer onto pbuffer */
1569 values[i++] = CStringGetTextDatum("<insufficient privilege>");
1571 /* copy counters to a local variable to keep locking time short */
1573 volatile pgspEntry *e = (volatile pgspEntry *) entry;
1575 SpinLockAcquire(&e->mutex);
1577 SpinLockRelease(&e->mutex);
1580 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1584 values[i++] = Int64GetDatumFast(tmp.calls);
1585 values[i++] = Float8GetDatumFast(tmp.total_time);
1586 values[i++] = Float8GetDatumFast(tmp.min_time);
1587 values[i++] = Float8GetDatumFast(tmp.max_time);
1588 values[i++] = Float8GetDatumFast(tmp.mean_time);
1591 * Note we are calculating the population variance here, not the
1592 * sample variance, as we have data for the whole population, so
1593 * Bessel's correction is not used, and we don't divide by
1597 stddev = sqrt(tmp.sum_var_time / tmp.calls);
1600 values[i++] = Float8GetDatumFast(stddev);
1602 values[i++] = Int64GetDatumFast(tmp.rows);
1603 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1604 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1605 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1606 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1607 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1608 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1609 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1610 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1611 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1612 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1613 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1614 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1615 values[i++] = TimestampTzGetDatum(tmp.first_call);
1616 values[i++] = TimestampTzGetDatum(tmp.last_call);
1618 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1619 api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1620 -1 /* fail if you forget to update this assert */ ));
1622 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1625 LWLockRelease(shared_state->lock);
1627 /* clean up and return the tuplestore */
1628 tuplestore_donestoring(tupstore);
1631 /* Number of output arguments (columns) for pg_store_plans_info */
1632 #define PG_STORE_PLANS_INFO_COLS 2
1635 * Return global statistics of pg_store_plans.
1638 pg_store_plans_info(PG_FUNCTION_ARGS)
/*
 * Returns a single composite row built from the local "stats" copy:
 * column 0 is stats.dealloc, column 1 is stats.stats_reset.
 */
1640 pgspGlobalStats stats;
1642 Datum values[PG_STORE_PLANS_INFO_COLS];
1643 bool nulls[PG_STORE_PLANS_INFO_COLS];
1645 if (!shared_state || !hash_table)
1647 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1648 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1650 /* Build a tuple descriptor for our result type */
1651 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1652 elog(ERROR, "return type must be a row type");
1654 MemSet(values, 0, sizeof(values));
1655 MemSet(nulls, 0, sizeof(nulls));
1657 /* Read global statistics for pg_store_plans */
1659 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1661 SpinLockAcquire(&s->mutex);
1663 SpinLockRelease(&s->mutex);
1666 values[0] = Int64GetDatum(stats.dealloc);
1667 values[1] = TimestampTzGetDatum(stats.stats_reset);
1669 PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1673 * Estimate shared memory space needed.
1676 shared_mem_size(void)
/*
 * The total is the MAXALIGN'd size of pgspSharedState plus the estimated
 * size of a hash table holding store_size entries of entry_size bytes.
 */
1681 size = MAXALIGN(sizeof(pgspSharedState));
1682 entry_size = sizeof(pgspEntry);
1684 /* plan text is appended to the struct body */
1685 if (plan_storage == PLAN_STORAGE_SHMEM)
1686 entry_size += max_plan_len;
1688 size = add_size(size, hash_estimate_size(store_size, entry_size));
1694 * Allocate a new hashtable entry.
1695 * caller must hold an exclusive lock on shared_state->lock
1697 * "plan" need not be null-terminated; we rely on plan_len instead
1699 * If "sticky" is true, make the new entry artificially sticky so that it will
1700 * probably still be there when the query finishes execution. We do this by
1701 * giving it a median usage value rather than the normal value. (Strictly
1702 * speaking, query strings are normalized on a best effort basis, though it
1703 * would be difficult to demonstrate this even under artificial conditions.)
1705 * Note: despite needing exclusive lock, it's not an error for the target
1706 * entry to already exist. This is because pgsp_store releases and
1707 * reacquires lock after failing to find a match; so someone else could
1708 * have made the entry while we waited to get exclusive lock.
1711 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
/*
 * "key" is looked up with HASH_ENTER, so the hash table either returns the
 * existing entry or creates one.  plan_offset and plan_len are recorded in
 * the entry as the location and length of its plan text.
 */
1716 /* Make space if needed */
1717 while (hash_get_num_entries(hash_table) >= store_size)
1720 /* Find or create an entry with desired hash code */
1721 entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1725 /* New entry, initialize it */
1727 /* reset the statistics */
1728 memset(&entry->counters, 0, sizeof(Counters));
1729 /* set the appropriate initial usage count */
1730 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1731 /* re-initialize the mutex each time ... we assume no one using it */
1732 SpinLockInit(&entry->mutex);
1733 /* ... and don't forget the plan text */
1734 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1735 entry->plan_offset = plan_offset;
1736 entry->plan_len = plan_len;
1737 entry->encoding = GetDatabaseEncoding();
1744 * qsort comparator for sorting into increasing usage order
1747 entry_cmp(const void *lhs, const void *rhs)
/* lhs and rhs each point at a pgspEntry pointer; the entries are ordered by counters.usage. */
1749 double l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1750 double r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1752 if (l_usage < r_usage)
1754 else if (l_usage > r_usage)
1761 * Deallocate least used entries.
1762 * Caller must hold an exclusive lock on shared_state->lock.
1767 HASH_SEQ_STATUS hash_seq;
/*
 * Overview: decay the usage of every entry, sort the entries by usage,
 * remove the nvictims least-used ones from the hash table, and finally
 * bump stats.dealloc under the shared-state spinlock.  Along the way the
 * median usage and the mean plan length are recorded in shared_state.
 */
1768 pgspEntry **entries;
1776 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1777 * While we're scanning the table, apply the decay factor to the usage
1781 entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1787 hash_seq_init(&hash_seq, hash_table);
1788 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1790 entries[i++] = entry;
1791 /* "Sticky" entries get a different usage decay rate. */
1792 if (entry->counters.calls == 0)
1793 entry->counters.usage *= STICKY_DECREASE_FACTOR;
1795 entry->counters.usage *= USAGE_DECREASE_FACTOR;
1797 /* In the mean length computation, ignore dropped texts. */
1798 if (entry->plan_len >= 0)
1800 tottextlen += entry->plan_len + 1;
1805 qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1807 /* Also, record the (approximate) median usage */
1809 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1810 /* Record the mean plan length */
1811 if (nvalidtexts > 0)
1812 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1814 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1816 nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1817 nvictims = Min(nvictims, i);
/* i.e. evict at least 10 entries, but never more than were collected */
1819 for (i = 0; i < nvictims; i++)
1821 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1826 /* Increment the number of times entries are deallocated */
1828 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1830 SpinLockAcquire(&s->mutex);
1831 s->stats.dealloc += 1;
1832 SpinLockRelease(&s->mutex);
1837 * Given a plan string (not necessarily null-terminated), allocate a new
1838 * entry in the external plan text file and store the string there.
1840 * If successful, returns true, and stores the new entry's offset in the file
1841 * into *plan_offset. Also, if gc_count isn't NULL, *gc_count is set to the
1842 * number of garbage collections that have occurred so far.
1844 * On failure, returns false.
1846 * At least a shared lock on shared_state->lock must be held by the caller, so
1847 * as to prevent a concurrent garbage collection. Share-lock-holding callers
1848 * should pass a gc_count pointer to obtain the number of garbage collections,
1849 * so that they can recheck the count after obtaining exclusive lock to detect
1850 * whether a garbage collection occurred (and removed this entry).
1853 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
/*
 * Space for plan_len + 1 bytes is reserved by advancing the shared extent
 * under the spinlock; the plan bytes and a single terminating NUL byte are
 * then written with pg_pwrite() at offsets off and off + plan_len.
 */
1858 Assert (plan_storage == PLAN_STORAGE_FILE);
1861 * We use a spinlock to protect extent/n_writers/gc_count, so that
1862 * multiple processes may execute this function concurrently.
1865 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1867 SpinLockAcquire(&s->mutex);
1869 s->extent += plan_len + 1;
1872 *gc_count = s->gc_count;
1873 SpinLockRelease(&s->mutex);
1878 /* Now write the data into the successfully-reserved part of the file */
1879 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1883 if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1885 if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1888 CloseTransientFile(fd);
1890 /* Mark our write complete */
1892 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1894 SpinLockAcquire(&s->mutex);
1896 SpinLockRelease(&s->mutex);
1903 (errcode_for_file_access(),
1904 errmsg("could not write file \"%s\": %m",
1908 CloseTransientFile(fd);
1910 /* Mark our write complete */
1912 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1914 SpinLockAcquire(&s->mutex);
1916 SpinLockRelease(&s->mutex);
1923 * Read the external plan text file into a malloc'd buffer.
1925 * Returns NULL (without throwing an error) if unable to read, eg
1926 * file not there or insufficient memory.
1928 * On success, the buffer size is also returned into *buffer_size.
1930 * This can be called without any lock on shared_state->lock, but in that case
1931 * the caller is responsible for verifying that the result is sane.
1934 ptext_load_file(Size *buffer_size)
/*
 * The file is sized with fstat(), a buffer of st_size bytes is obtained
 * with malloc(), and the contents are read in chunks of at most 1GB.  On
 * success *buffer_size is set to the number of bytes read (nread).
 */
1941 Assert (plan_storage == PLAN_STORAGE_FILE);
1943 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1946 if (errno != ENOENT)
1948 (errcode_for_file_access(),
1949 errmsg("could not read file \"%s\": %m",
1954 /* Get file length */
1955 if (fstat(fd, &stat))
1958 (errcode_for_file_access(),
1959 errmsg("could not stat file \"%s\": %m",
1961 CloseTransientFile(fd);
1965 /* Allocate buffer; beware that off_t might be wider than size_t */
1966 if (stat.st_size <= MaxAllocHugeSize)
1967 buf = (char *) malloc(stat.st_size);
1973 (errcode(ERRCODE_OUT_OF_MEMORY),
1974 errmsg("out of memory"),
1975 errdetail("Could not allocate enough memory to read file \"%s\".",
1977 CloseTransientFile(fd);
1982 * OK, slurp in the file. Windows fails if we try to read more than
1983 * INT_MAX bytes at once, and other platforms might not like that either,
1984 * so read a very large file in 1GB segments.
1987 while (nread < stat.st_size)
1989 int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
1992 * If we get a short read and errno doesn't get set, the reason is
1993 * probably that garbage collection truncated the file since we did
1994 * the fstat(), so we don't log a complaint --- but we don't return
1995 * the data, either, since it's most likely corrupt due to concurrent
1996 * writes from garbage collection.
1999 if (read(fd, buf + nread, toread) != toread)
2003 (errcode_for_file_access(),
2004 errmsg("could not read file \"%s\": %m",
2007 CloseTransientFile(fd);
2013 if (CloseTransientFile(fd) != 0)
2015 (errcode_for_file_access(),
2016 errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
2018 *buffer_size = nread;
2023 * Locate a plan text in the file image previously read by ptext_load_file().
2025 * We validate the given offset/length, and return NULL if bogus. Otherwise,
2026 * the result points to a null-terminated string within the buffer.
2029 ptext_fetch(Size plan_offset, int plan_len,
2030 char *buffer, Size buffer_size)
/*
 * The returned pointer aliases "buffer" (it is buffer + plan_offset); no
 * copy is made, so the result is only usable while the buffer is.
 */
2032 Assert (plan_storage == PLAN_STORAGE_FILE);
2034 /* File read failed? */
2037 /* Bogus offset/length? */
2039 plan_offset + plan_len >= buffer_size)
2041 /* As a further sanity check, make sure there's a trailing null */
2042 if (buffer[plan_offset + plan_len] != '\0')
2045 return buffer + plan_offset;
2049 * Do we need to garbage-collect the external plan text file?
2051 * Caller should hold at least a shared lock on shared_state->lock.
2054 need_gc_ptexts(void)
/*
 * Two size thresholds are tested against the file extent below:
 * 512 * store_size, and mean_plan_len * store_size * 2.
 */
2058 Assert (plan_storage == PLAN_STORAGE_FILE);
2060 /* Read shared extent pointer */
2062 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2064 SpinLockAcquire(&s->mutex);
2066 SpinLockRelease(&s->mutex);
2069 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2070 if (extent < 512 * store_size)
2074 * Don't proceed if file is less than about 50% bloat. Nothing can or
2075 * should be done in the event of unusually large query texts accounting
2076 * for file's large size. We go to the trouble of maintaining the mean
2077 * query length in order to prevent garbage collection from thrashing
2080 if (extent < shared_state->mean_plan_len * store_size * 2)
2087 * Garbage-collect orphaned plan texts in external file.
2089 * This won't be called often in the typical case, since it's likely that
2090 * there won't be too much churn, and besides, a similar compaction process
2091 * occurs when serializing to disk at shutdown or as part of resetting.
2092 * Despite this, it seems prudent to plan for the edge case where the file
2093 * becomes unreasonably large, with no other method of compaction likely to
2094 * occur in the foreseeable future.
2096 * The caller must hold an exclusive lock on shared_state->lock.
2098 * At the first sign of trouble we unlink the plan text file to get a clean
2099 * slate (although existing statistics are retained), rather than risk
2100 * thrashing by allowing the same problem case to recur indefinitely.
2108 HASH_SEQ_STATUS hash_seq;
/*
 * Overview: reload the current plan text file, rewrite it in place with the
 * texts still referenced by hash table entries (updating each entry's
 * plan_offset as it goes), truncate the file to the new extent, and refresh
 * shared_state->extent and shared_state->mean_plan_len.  The tail of the
 * function is the failure path: it marks every entry's text invalid
 * (plan_offset = 0, plan_len = -1) and recreates an empty file.
 */
2113 Assert (plan_storage == PLAN_STORAGE_FILE);
2116 * When called from store_entry, some other session might have proceeded
2117 * with garbage collection in the no-lock-held interim of lock strength
2118 * escalation. Check once more that this is actually necessary.
2120 if (!need_gc_ptexts())
2124 * Load the old texts file. If we fail (out of memory, for instance),
2125 * invalidate query texts. Hopefully this is rare. It might seem better
2126 * to leave things alone on an OOM failure, but the problem is that the
2127 * file is only going to get bigger; hoping for a future non-OOM result is
2128 * risky and can easily lead to complete denial of service.
2130 pbuffer = ptext_load_file(&pbuffer_size);
2131 if (pbuffer == NULL)
2135 * We overwrite the plan texts file in place, so as to reduce the risk of
2136 * an out-of-disk-space failure. Since the file is guaranteed not to get
2137 * larger, this should always work on traditional filesystems; though we
2138 * could still lose on copy-on-write filesystems.
2140 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2144 (errcode_for_file_access(),
2145 errmsg("could not write file \"%s\": %m",
2153 hash_seq_init(&hash_seq, hash_table);
2154 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2156 int plan_len = entry->plan_len;
2157 char *plan = ptext_fetch(entry->plan_offset,
2164 /* Trouble ... drop the text */
2165 entry->plan_offset = 0;
2166 entry->plan_len = -1;
2167 /* entry will not be counted in mean plan length computation */
2171 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2174 (errcode_for_file_access(),
2175 errmsg("could not write file \"%s\": %m",
2177 hash_seq_term(&hash_seq);
2181 entry->plan_offset = extent;
2182 extent += plan_len + 1;
2187 * Truncate away any now-unused space. If this fails for some odd reason,
2188 * we log it, but there's no need to fail.
2190 if (ftruncate(fileno(pfile), extent) != 0)
2192 (errcode_for_file_access(),
2193 errmsg("could not truncate file \"%s\": %m",
2196 if (FreeFile(pfile))
2199 (errcode_for_file_access(),
2200 errmsg("could not write file \"%s\": %m",
2206 elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2207 shared_state->extent, extent);
2209 /* Reset the shared extent pointer */
2210 shared_state->extent = extent;
2213 * Also update the mean plan length, to be sure that need_gc_ptexts()
2214 * won't still think we have a problem.
2217 shared_state->mean_plan_len = extent / nentries;
2219 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2226 /* clean up resources */
2233 * Since the contents of the external file are now uncertain, mark all
2234 * hashtable entries as having invalid texts.
2236 hash_seq_init(&hash_seq, hash_table);
2237 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2239 entry->plan_offset = 0;
2240 entry->plan_len = -1;
2244 * Destroy the query text file and create a new, empty one
2246 (void) unlink(PGSP_TEXT_FILE);
2247 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2250 (errcode_for_file_access(),
2251 errmsg("could not recreate file \"%s\": %m",
2256 /* Reset the shared extent pointer */
2257 shared_state->extent = 0;
2259 /* Reset mean_plan_len to match the new state */
2260 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2264 * Release all entries.
2269 HASH_SEQ_STATUS hash_seq;
/*
 * Overview: under the exclusive lock, remove every entry from the hash
 * table, zero stats.dealloc and stamp stats.stats_reset with the current
 * time, then open the plan text file for writing, truncate it to zero
 * length and reset the shared extent to 0.
 */
2273 if (!shared_state || !hash_table)
2275 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2276 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2278 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2280 hash_seq_init(&hash_seq, hash_table);
2281 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2283 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2287 * Reset global statistics for pg_store_plans.
2290 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2291 TimestampTz stats_reset = GetCurrentTimestamp();
2293 SpinLockAcquire(&s->mutex);
2294 s->stats.dealloc = 0;
2295 s->stats.stats_reset = stats_reset;
2296 SpinLockRelease(&s->mutex);
2300 * Write new empty plan file, perhaps even creating a new one to recover
2301 * if the file was missing.
2303 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2307 (errcode_for_file_access(),
2308 errmsg("could not create file \"%s\": %m",
2313 /* If ftruncate fails, log it, but it's not a fatal problem */
2314 if (ftruncate(fileno(pfile), 0) != 0)
2316 (errcode_for_file_access(),
2317 errmsg("could not truncate file \"%s\": %m",
2323 shared_state->extent = 0;
2324 LWLockRelease(shared_state->lock);
2328 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
/* Converts the text argument to a C string and returns hash_query() of it as an OID datum. */
2330 PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2334 pg_store_plans_shorten(PG_FUNCTION_ARGS)
/* Passes the text argument through pgsp_json_shorten() and returns the result as text. */
2336 text *short_plan = PG_GETARG_TEXT_P(0);
2337 char *cjson = text_to_cstring(short_plan);
2338 char *cshorten = pgsp_json_shorten(cjson);
2339 PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2343 pg_store_plans_normalize(PG_FUNCTION_ARGS)
/* Passes the text argument through pgsp_json_normalize() and returns the result as text. */
2345 text *short_plan = PG_GETARG_TEXT_P(0);
2346 char *cjson = text_to_cstring(short_plan);
2347 char *cnormalized = pgsp_json_normalize(cjson);
2348 PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2352 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
/* Passes the text argument through pgsp_json_inflate() and returns the result as text. */
2354 text *short_plan = PG_GETARG_TEXT_P(0);
2355 char *cshort = text_to_cstring(short_plan);
2356 char *cinflated = pgsp_json_inflate(cshort);
2357 PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2361 pg_store_plans_textplan(PG_FUNCTION_ARGS)
/* Passes the text argument through pgsp_json_textize() and returns the result as text. */
2363 text *short_plan = PG_GETARG_TEXT_P(0);
2364 char *cshort = text_to_cstring(short_plan);
2365 char *ctextized = pgsp_json_textize(cshort);
2367 PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2371 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
/* Passes the text argument through pgsp_json_yamlize() and returns the result as text. */
2373 text *short_plan = PG_GETARG_TEXT_P(0);
2374 char *cshort = text_to_cstring(short_plan);
2375 char *cyamlized = pgsp_json_yamlize(cshort);
2377 PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2381 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
/* Passes the text argument through pgsp_json_xmlize() and returns the result as text. */
2383 text *short_plan = PG_GETARG_TEXT_P(0);
2384 char *cshort = text_to_cstring(short_plan);
2385 char *cxmlized = pgsp_json_xmlize(cshort);
2387 PG_RETURN_TEXT_P(cstring_to_text(cxmlized));