1 /*-------------------------------------------------------------------------
4 * Take statistics of plan selection across a whole database cluster.
6 * Execution costs are totaled for each distinct plan for each query,
7 * and plan and queryid are kept in a shared hashtable, each record in
8 * which is associated with a record in pg_stat_statements, if any, by
11 * For Postgres 9.3 or earlier does not expose query id so
12 * pg_store_plans needs to calculate it based on the given query
13 * string using different algorithm from pg_stat_statements, and later
14 * the id will be matched against the one made from query string
15 * stored in pg_stat_statements. For the reason, queryid matching in
16 * this way will fail if the query string kept in pg_stat_statements
17 * is truncated in the middle.
19 * Plans are identified by fingerprinting plan representations in
20 * "shortened" JSON format with constants and unstable values such as
21 * rows, width, loops ignored. Nevertheless, stored plan entries hold
22 * them of the latest execution. Entry eviction is done in the same
23 * way to pg_stat_statements.
25 * Copyright (c) 2008-2022, PostgreSQL Global Development Group
26 * Copyright (c) 2012-2022, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
29 * pg_store_plans/pg_store_plans.c
31 *-------------------------------------------------------------------------
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
59 #include "utils/timestamp.h"
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
75 static int max_plan_len = 5000;
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration) (1.0)
79 #define USAGE_INIT (1.0) /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
86 /* In PostgreSQL 11, queryid becomes a uint64 internally. */
87 #if PG_VERSION_NUM >= 110000
88 typedef uint64 queryid_t;
89 #define PGSP_NO_QUERYID UINT64CONST(0)
91 typedef uint32 queryid_t;
92 #define PGSP_NO_QUERYID 0
96 * Extension version number, for supporting older extension versions' objects
98 typedef enum pgspVersion
105 * Hashtable key that defines the identity of a hashtable entry. We separate
106 * queries by user and by database even if they are otherwise identical.
108 * Presently, the query encoding is fully determined by the source database
109 * and so we don't really need it to be in the key. But that might not always
110 * be true. Anyway it's notationally convenient to pass it as part of the key.
112 typedef struct pgspHashKey
114 Oid userid; /* user OID */
115 Oid dbid; /* database OID */
116 queryid_t queryid; /* query identifier */
117 uint32 planid; /* plan identifier */
121 * The actual stats counters kept within pgspEntry.
123 typedef struct Counters
125 int64 calls; /* # of times executed */
126 double total_time; /* total execution time, in msec */
127 double min_time; /* minimum execution time in msec */
128 double max_time; /* maximum execution time in msec */
129 double mean_time; /* mean execution time in msec */
130 double sum_var_time; /* sum of variances in execution time in msec */
131 int64 rows; /* total # of retrieved or affected rows */
132 int64 shared_blks_hit; /* # of shared buffer hits */
133 int64 shared_blks_read; /* # of shared disk blocks read */
134 int64 shared_blks_dirtied;/* # of shared disk blocks dirtied */
135 int64 shared_blks_written;/* # of shared disk blocks written */
136 int64 local_blks_hit; /* # of local buffer hits */
137 int64 local_blks_read; /* # of local disk blocks read */
138 int64 local_blks_dirtied; /* # of local disk blocks dirtied */
139 int64 local_blks_written; /* # of local disk blocks written */
140 int64 temp_blks_read; /* # of temp blocks read */
141 int64 temp_blks_written; /* # of temp blocks written */
142 double blk_read_time; /* time spent reading, in msec */
143 double blk_write_time; /* time spent writing, in msec */
144 TimestampTz first_call; /* timestamp of first call */
145 TimestampTz last_call; /* timestamp of last call */
146 double usage; /* usage factor */
150 * Global statistics for pg_store_plans
152 typedef struct pgspGlobalStats
154 int64 dealloc; /* # of times entries were deallocated */
155 TimestampTz stats_reset; /* timestamp with all stats reset */
159 * Statistics per plan
161 * NB: see the file read/write code before changing field order here.
163 typedef struct pgspEntry
165 pgspHashKey key; /* hash key of entry - MUST BE FIRST */
166 Counters counters; /* the statistics for this query */
167 Size plan_offset; /* plan text offset in extern file */
168 int plan_len; /* # of valid bytes in query string */
169 int encoding; /* query encoding */
170 slock_t mutex; /* protects the counters only */
174 * Global shared state
176 typedef struct pgspSharedState
178 LWLock *lock; /* protects hashtable search/modification */
179 int plan_size; /* max query length in bytes */
180 double cur_median_usage; /* current median usage in hashtable */
181 Size mean_plan_len; /* current mean entry text length */
182 slock_t mutex; /* protects following fields only: */
183 Size extent; /* current extent of plan file */
184 int n_writers; /* number of active writers to query file */
185 int gc_count; /* plan file garbage collection cycle count */
186 pgspGlobalStats stats; /* global statistics for pgsp */
189 /*---- Local variables ----*/
191 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
192 static int nested_level = 0;
194 /* Saved hook values in case of unload */
195 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
196 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
197 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
198 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
199 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
200 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
202 /* Links to shared memory state */
203 static pgspSharedState *shared_state = NULL;
204 static HTAB *hash_table = NULL;
206 /*---- GUC variables ----*/
210 TRACK_LEVEL_NONE, /* track no statements */
211 TRACK_LEVEL_TOP, /* only top level statements */
212 TRACK_LEVEL_ALL, /* all statements, including nested ones */
213 TRACK_LEVEL_VERBOSE /* all statements, including internal ones */
216 static const struct config_enum_entry track_options[] =
218 {"none", TRACK_LEVEL_NONE, false},
219 {"top", TRACK_LEVEL_TOP, false},
220 {"all", TRACK_LEVEL_ALL, false},
221 {"verbose", TRACK_LEVEL_VERBOSE, false},
227 PLAN_FORMAT_RAW, /* No conversion. Shorten JSON */
228 PLAN_FORMAT_TEXT, /* Traditional text representation */
229 PLAN_FORMAT_JSON, /* JSON representation */
230 PLAN_FORMAT_YAML, /* YAML */
231 PLAN_FORMAT_XML, /* XML */
234 static const struct config_enum_entry plan_formats[] =
236 {"raw" , PLAN_FORMAT_RAW , false},
237 {"text", PLAN_FORMAT_TEXT, false},
238 {"json", PLAN_FORMAT_JSON, false},
239 {"yaml", PLAN_FORMAT_YAML, false},
240 {"xml" , PLAN_FORMAT_XML , false},
244 /* options for plan storage */
247 PLAN_STORAGE_SHMEM, /* plan is stored as a part of hash entry */
248 PLAN_STORAGE_FILE /* plan is stored in a separate file */
251 static const struct config_enum_entry plan_storage_options[] =
253 {"shmem", PLAN_STORAGE_SHMEM, false},
254 {"file", PLAN_STORAGE_FILE, false},
258 static int store_size; /* max # statements to track */
259 static int track_level; /* tracking level */
260 static int min_duration; /* min duration to record */
261 static bool dump_on_shutdown; /* whether to save stats across shutdown */
262 static bool log_analyze; /* Similar to EXPLAIN (ANALYZE *) */
263 static bool log_verbose; /* Similar to EXPLAIN (VERBOSE *) */
264 static bool log_buffers; /* Similar to EXPLAIN (BUFFERS *) */
265 static bool log_timing; /* Similar to EXPLAIN (TIMING *) */
266 static bool log_triggers; /* whether to log trigger statistics */
267 static int plan_format; /* Plan representation style in
268 * pg_store_plans.plan */
269 static int plan_storage; /* Plan storage type */
272 /* disables tracking overriding track_level */
273 static bool force_disabled = false;
275 #if PG_VERSION_NUM >= 140000
277 * For pg14 and later, we rely on core queryid calculation. If
278 * it's not available it means that the admin explicitly refused to
279 * compute it, for performance reason or other. In that case, we
280 * will also consider that this extension is disabled.
282 #define pgsp_enabled(q) \
283 (!force_disabled && \
284 (track_level >= TRACK_LEVEL_ALL || \
285 (track_level == TRACK_LEVEL_TOP && nested_level == 0)) && \
286 (q != PGSP_NO_QUERYID))
288 #define pgsp_enabled(q) \
289 (!force_disabled && \
290 (track_level >= TRACK_LEVEL_ALL || \
291 (track_level == TRACK_LEVEL_TOP && nested_level == 0)))
294 #define SHMEM_PLAN_PTR(ent) (((char *) ent) + sizeof(pgspEntry))
296 /*---- Function declarations ----*/
301 Datum pg_store_plans_reset(PG_FUNCTION_ARGS);
302 Datum pg_store_plans_hash_query(PG_FUNCTION_ARGS);
303 Datum pg_store_plans(PG_FUNCTION_ARGS);
304 Datum pg_store_plans_shorten(PG_FUNCTION_ARGS);
305 Datum pg_store_plans_normalize(PG_FUNCTION_ARGS);
306 Datum pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
307 Datum pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
308 Datum pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
309 Datum pg_store_plans_textplan(PG_FUNCTION_ARGS);
310 Datum pg_store_plans_info(PG_FUNCTION_ARGS);
312 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
313 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
314 PG_FUNCTION_INFO_V1(pg_store_plans);
315 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
316 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
317 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
318 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
319 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
320 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
321 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
322 PG_FUNCTION_INFO_V1(pg_store_plans_info);
324 #if PG_VERSION_NUM < 130000
325 #define COMPTAG_TYPE char
327 #define COMPTAG_TYPE QueryCompletion
330 #if PG_VERSION_NUM < 140000
331 #define ROLE_PG_READ_ALL_STATS DEFAULT_ROLE_READ_ALL_STATS
334 #if PG_VERSION_NUM >= 150000
335 static shmem_request_hook_type prev_shmem_request_hook = NULL;
338 static void pgsp_shmem_request(void);
339 static void pgsp_shmem_startup(void);
340 static void pgsp_shmem_shutdown(int code, Datum arg);
341 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
342 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
343 ScanDirection direction,
344 uint64 count, bool execute_once);
345 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
346 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
347 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
348 #if PG_VERSION_NUM >= 140000
351 ProcessUtilityContext context, ParamListInfo params,
352 QueryEnvironment *queryEnv,
353 DestReceiver *dest, COMPTAG_TYPE *completionTag);
354 static uint32 hash_query(const char* query);
355 static void pgsp_store(char *plan, queryid_t queryId,
356 double total_time, uint64 rows,
357 const BufferUsage *bufusage);
358 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
359 pgspVersion api_version);
360 static Size shared_mem_size(void);
361 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
363 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
365 static char *ptext_load_file(Size *buffer_size);
366 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
368 static bool need_gc_ptexts(void);
369 static void gc_ptexts(void);
370 static void entry_dealloc(void);
371 static void entry_reset(void);
374 * Module load callback
380 * In order to create our shared memory area, we have to be loaded via
381 * shared_preload_libraries. If not, fall out without hooking into any of
382 * the main system. (We don't throw error here because it seems useful to
383 * allow the pg_stat_statements functions to be created even when the
384 * module isn't active. The functions must protect themselves against
385 * being called then, however.)
387 if (!process_shared_preload_libraries_in_progress)
390 #if PG_VERSION_NUM >= 140000
392 * Inform the postmaster that we want to enable query_id calculation if
393 * compute_query_id is set to auto.
399 * Define (or redefine) custom GUC variables.
401 DefineCustomIntVariable("pg_store_plans.max",
402 "Sets the maximum number of plans tracked by pg_store_plans.",
414 DefineCustomIntVariable("pg_store_plans.max_plan_length",
415 "Sets the maximum length of plans stored by pg_store_plans.",
427 DefineCustomEnumVariable("pg_store_plans.plan_storage",
428 "Selects where to store plan texts.",
432 plan_storage_options,
439 DefineCustomEnumVariable("pg_store_plans.track",
440 "Selects which plans are tracked by pg_store_plans.",
451 DefineCustomEnumVariable("pg_store_plans.plan_format",
452 "Selects which format to be appied for plan representation in pg_store_plans.",
463 DefineCustomIntVariable("pg_store_plans.min_duration",
464 "Minimum duration to record plan in milliseconds.",
476 DefineCustomBoolVariable("pg_store_plans.save",
477 "Save pg_store_plans statistics across server shutdowns.",
487 DefineCustomBoolVariable("pg_store_plans.log_analyze",
488 "Use EXPLAIN ANALYZE for plan logging.",
498 DefineCustomBoolVariable("pg_store_plans.log_buffers",
509 DefineCustomBoolVariable("pg_store_plans.log_timing",
520 DefineCustomBoolVariable("pg_store_plans.log_triggers",
521 "Log trigger trace.",
531 DefineCustomBoolVariable("pg_store_plans.log_verbose",
532 "Set VERBOSE for EXPLAIN on logging.",
542 EmitWarningsOnPlaceholders("pg_store_plans");
544 #if PG_VERSION_NUM < 150000
545 pgsp_shmem_request();
551 #if PG_VERSION_NUM >= 150000
552 prev_shmem_request_hook = shmem_request_hook;
553 shmem_request_hook = pgsp_shmem_request;
555 prev_shmem_startup_hook = shmem_startup_hook;
556 shmem_startup_hook = pgsp_shmem_startup;
557 prev_ExecutorStart = ExecutorStart_hook;
558 ExecutorStart_hook = pgsp_ExecutorStart;
559 prev_ExecutorRun = ExecutorRun_hook;
560 ExecutorRun_hook = pgsp_ExecutorRun;
561 prev_ExecutorFinish = ExecutorFinish_hook;
562 ExecutorFinish_hook = pgsp_ExecutorFinish;
563 prev_ExecutorEnd = ExecutorEnd_hook;
564 ExecutorEnd_hook = pgsp_ExecutorEnd;
565 prev_ProcessUtility = ProcessUtility_hook;
566 ProcessUtility_hook = pgsp_ProcessUtility;
570 * Module unload callback
575 /* Uninstall hooks. */
576 shmem_startup_hook = prev_shmem_startup_hook;
577 ExecutorStart_hook = prev_ExecutorStart;
578 ExecutorRun_hook = prev_ExecutorRun;
579 ExecutorFinish_hook = prev_ExecutorFinish;
580 ExecutorEnd_hook = prev_ExecutorEnd;
581 ProcessUtility_hook = prev_ProcessUtility;
585 * pgsp_shmem_request: request shared memory to the core.
586 * Called as a hook in PG15 or later, otherwise called from _PG_init().
589 pgsp_shmem_request(void)
591 #if PG_VERSION_NUM >= 150000
592 if (prev_shmem_request_hook)
593 prev_shmem_request_hook();
596 RequestAddinShmemSpace(shared_mem_size());
597 RequestNamedLWLockTranche("pg_store_plans", 1);
601 * shmem_startup hook: allocate or attach to shared memory,
602 * then load any pre-existing statistics from file.
605 pgsp_shmem_startup(void)
619 if (prev_shmem_startup_hook)
620 prev_shmem_startup_hook();
622 /* reset in case this is a restart within the postmaster */
627 * Create or attach to the shared memory state, including hash table
629 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
631 shared_state = ShmemInitStruct("pg_store_plans",
632 sizeof(pgspSharedState),
637 /* First time through ... */
638 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
639 shared_state->plan_size = max_plan_len;
640 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
641 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
642 SpinLockInit(&shared_state->mutex);
643 shared_state->extent = 0;
644 shared_state->n_writers = 0;
645 shared_state->gc_count = 0;
646 shared_state->stats.dealloc = 0;
647 shared_state->stats.stats_reset = GetCurrentTimestamp();
650 /* Be sure everyone agrees on the hash table entry size */
651 plan_size = shared_state->plan_size;
653 memset(&info, 0, sizeof(info));
654 info.keysize = sizeof(pgspHashKey);
655 info.entrysize = sizeof(pgspEntry);
656 if (plan_storage == PLAN_STORAGE_SHMEM)
657 info.entrysize += max_plan_len;
658 hash_table = ShmemInitHash("pg_store_plans hash",
659 store_size, store_size,
663 LWLockRelease(AddinShmemInitLock);
666 * If we're in the postmaster (or a standalone backend...), set up a shmem
667 * exit hook to dump the statistics to disk.
669 if (!IsUnderPostmaster)
670 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
673 * Done if some other process already completed our initialization.
679 * Note: we don't bother with locks here, because there should be no other
680 * processes running when this code is reached.
683 /* Unlink query text file possibly left over from crash */
684 unlink(PGSP_TEXT_FILE);
686 if (plan_storage == PLAN_STORAGE_FILE)
688 /* Allocate new query text temp file */
689 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
695 * If we were told not to load old statistics, we're done. (Note we do
696 * not try to unlink any old dump file in this case. This seems a bit
697 * questionable but it's the historical behavior.)
699 if (!dump_on_shutdown)
707 * Attempt to load old statistics from the dump file.
709 file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
712 /* ignore not-found error */
716 /* No existing persisted stats file, so we're done */
722 buffer_size = plan_size;
723 buffer = (char *) palloc(buffer_size);
725 if (fread(&header, sizeof(uint32), 1, file) != 1 ||
726 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
727 fread(&num, sizeof(int32), 1, file) != 1)
730 if (header != PGSP_FILE_HEADER ||
731 pgver != PGSP_PG_MAJOR_VERSION)
734 for (i = 0; i < num; i++)
738 Size plan_offset = 0;
740 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
743 /* Encoding is the only field we can easily sanity-check */
744 if (!PG_VALID_BE_ENCODING(temp.encoding))
747 /* Previous incarnation might have had a larger plan_size */
748 if (temp.plan_len >= buffer_size)
750 buffer = (char *) repalloc(buffer, temp.plan_len + 1);
751 buffer_size = temp.plan_len + 1;
754 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
757 /* Skip loading "sticky" entries */
758 if (temp.counters.calls == 0)
761 /* Clip to available length if needed */
762 if (temp.plan_len >= plan_size)
763 temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
768 buffer[temp.plan_len] = '\0';
770 if (plan_storage == PLAN_STORAGE_FILE)
772 /* Store the plan text */
773 plan_offset = shared_state->extent;
774 if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
777 shared_state->extent += temp.plan_len + 1;
780 /* make the hashtable entry (discards old entries if too many) */
781 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
783 if (plan_storage == PLAN_STORAGE_SHMEM)
784 memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
786 /* copy in the actual stats */
787 entry->counters = temp.counters;
797 * Remove the file so it's not included in backups/replication slaves,
798 * etc. A new file will be written on next shutdown.
800 unlink(PGSP_DUMP_FILE);
806 (errcode_for_file_access(),
807 errmsg("could not read file \"%s\": %m",
812 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
813 errmsg("ignoring invalid data in file \"%s\"",
818 (errcode_for_file_access(),
819 errmsg("could not write file \"%s\": %m",
828 /* If possible, throw away the bogus file; ignore any error */
829 unlink(PGSP_DUMP_FILE);
832 * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
833 * server is running with pg_stat_statements enabled
838 * shmem_shutdown hook: Dump statistics into file.
840 * Note: we don't bother with acquiring lock, because there should be no
841 * other processes running when this is called.
844 pgsp_shmem_shutdown(int code, Datum arg)
847 char *pbuffer = NULL;
848 Size pbuffer_size = 0;
849 HASH_SEQ_STATUS hash_seq;
853 /* Don't try to dump during a crash. */
857 /* Safety check ... shouldn't get here unless shmem is set up. */
858 if (!shared_state || !hash_table)
861 /* Don't dump if told not to. */
862 if (!dump_on_shutdown)
865 file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
869 if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
871 if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
873 num_entries = hash_get_num_entries(hash_table);
874 if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
877 if (plan_storage == PLAN_STORAGE_FILE)
879 pbuffer = ptext_load_file(&pbuffer_size);
884 hash_seq_init(&hash_seq, hash_table);
885 while ((entry = hash_seq_search(&hash_seq)) != NULL)
887 int len = entry->plan_len;
890 if (plan_storage == PLAN_STORAGE_FILE)
891 pstr = ptext_fetch(entry->plan_offset, len,
892 pbuffer, pbuffer_size);
894 pstr = SHMEM_PLAN_PTR(entry);
897 continue; /* Ignore any entries with bogus texts */
899 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
900 fwrite(pstr, 1, len + 1, file) != len + 1)
902 /* note: we assume hash_seq_term won't change errno */
903 hash_seq_term(&hash_seq);
915 * Rename file into place, so we atomically replace the old one.
917 if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
919 (errcode_for_file_access(),
920 errmsg("could not rename pg_store_plans file \"%s\": %m",
921 PGSP_DUMP_FILE ".tmp")));
923 /* Unlink query-texts file; it's not needed while shutdown */
924 unlink(PGSP_TEXT_FILE);
930 (errcode_for_file_access(),
931 errmsg("could not write pg_store_plans file \"%s\": %m",
932 PGSP_DUMP_FILE ".tmp")));
935 unlink(PGSP_DUMP_FILE ".tmp");
940 * ExecutorStart hook: start up tracking if needed
943 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
946 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
948 queryDesc->instrument_options |=
949 (log_timing ? INSTRUMENT_TIMER : 0)|
950 (log_timing ? 0: INSTRUMENT_ROWS)|
951 (log_buffers ? INSTRUMENT_BUFFERS : 0);
954 if (prev_ExecutorStart)
955 prev_ExecutorStart(queryDesc, eflags);
957 standard_ExecutorStart(queryDesc, eflags);
960 * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
961 * context so as to be free at ExecutorEnd.
963 if (queryDesc->totaltime == NULL &&
964 pgsp_enabled(queryDesc->plannedstmt->queryId))
966 MemoryContext oldcxt;
968 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
969 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
970 #if PG_VERSION_NUM >= 140000
974 MemoryContextSwitchTo(oldcxt);
980 * ExecutorRun hook: all we need do is track nesting depth
983 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
989 if (prev_ExecutorRun)
990 prev_ExecutorRun(queryDesc, direction, count, execute_once);
992 standard_ExecutorRun(queryDesc, direction, count, execute_once);
1004 * ExecutorFinish hook: all we need do is track nesting depth
1007 pgsp_ExecutorFinish(QueryDesc *queryDesc)
1012 if (prev_ExecutorFinish)
1013 prev_ExecutorFinish(queryDesc);
1015 standard_ExecutorFinish(queryDesc);
1027 * ExecutorEnd hook: store results if needed
1030 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1032 if (queryDesc->totaltime)
1035 * Make sure stats accumulation is done. (Note: it's okay if several
1036 * levels of hook all do this.)
1038 InstrEndLoop(queryDesc->totaltime);
1040 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1041 queryDesc->totaltime->total &&
1042 queryDesc->totaltime->total >=
1043 (double)min_duration / 1000.0)
1049 es = NewExplainState();
1052 es->analyze = queryDesc->instrument_options;
1053 es->verbose = log_verbose;
1054 es->buffers = (es->analyze && log_buffers);
1055 es->timing = (es->analyze && log_timing);
1056 es->format = EXPLAIN_FORMAT_JSON;
1058 ExplainBeginOutput(es);
1059 ExplainPrintPlan(es, queryDesc);
1061 pgspExplainTriggers(es, queryDesc);
1062 ExplainEndOutput(es);
1064 /* Remove last line break */
1065 if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1066 es_str->data[--es_str->len] = '\0';
1068 /* JSON outmost braces. */
1069 es_str->data[0] = '{';
1070 es_str->data[es_str->len - 1] = '}';
1072 queryid = queryDesc->plannedstmt->queryId;
1073 #if PG_VERSION_NUM < 140000
1075 * For versions before pg14, a queryid is only available if
1076 * pg_stat_statements extension (or similar) if configured. We
1077 * don't want a hard requirement for such an extension so fallback
1078 * to an internal queryid calculation in some case.
1079 * For pg14 and above, core postgres can compute a queryid so we
1082 if (queryid == PGSP_NO_QUERYID)
1083 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1085 Assert(queryid != PGSP_NO_QUERYID);
1088 pgsp_store(es_str->data,
1090 queryDesc->totaltime->total * 1000.0, /* convert to msec */
1091 queryDesc->estate->es_processed,
1092 &queryDesc->totaltime->bufusage);
1093 pfree(es_str->data);
1097 if (prev_ExecutorEnd)
1098 prev_ExecutorEnd(queryDesc);
1100 standard_ExecutorEnd(queryDesc);
1104 * ProcessUtility hook
1107 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1108 #if PG_VERSION_NUM >= 140000
1111 ProcessUtilityContext context, ParamListInfo params,
1112 QueryEnvironment *queryEnv,
1113 DestReceiver *dest, COMPTAG_TYPE *completionTag)
1115 int tag = nodeTag(pstmt->utilityStmt);
1116 queryid_t saved_queryId = pstmt->queryId;
1117 bool reset_force_disabled = false;
1119 if (pgsp_enabled(saved_queryId) &&
1120 (tag == T_CreateExtensionStmt || tag == T_AlterExtensionStmt) &&
1121 !force_disabled && track_level < TRACK_LEVEL_VERBOSE)
1123 force_disabled = true;
1124 reset_force_disabled = true;
1129 if (prev_ProcessUtility)
1131 prev_ProcessUtility(pstmt, queryString,
1132 #if PG_VERSION_NUM >= 140000
1135 context, params, queryEnv,
1136 dest, completionTag);
1139 standard_ProcessUtility(pstmt, queryString,
1140 #if PG_VERSION_NUM >= 140000
1143 context, params, queryEnv,
1144 dest, completionTag);
1146 if (reset_force_disabled)
1147 force_disabled = false;
1151 if (reset_force_disabled)
1152 force_disabled = false;
1159 * hash_query: calculate internal query ID for a query
1161 * As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1162 * queries to practical level. On the other hand pg_store_plans uses the
1163 * combination of query hash and plan hash values as the hash table key and
1164 * the resolution of the hash value effectively has the same degree so we
1165 * continue to use uint32 as internal queryid.
1167 * This may merge plans from different queries into single internal query id
1168 * but it is not a problem when pg_stat_statements is used together since the
1169 * extension gives enough resolution on queries.
1172 hash_query(const char* query)
1176 char *normquery = pstrdup(query);
1177 normalize_expr(normquery, false);
1178 queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1181 /* If we are unlucky enough to get a hash of zero, use 1 instead */
1190 * Store some statistics for a plan.
1192 * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1193 * value of the given plan, which is calculated in ths function.
1196 pgsp_store(char *plan, queryid_t queryId,
1197 double total_time, uint64 rows,
1198 const BufferUsage *bufusage)
1202 char *norm_query = NULL;
1204 char *normalized_plan = NULL;
1205 char *shorten_plan = NULL;
1206 volatile pgspEntry *e;
1207 Size plan_offset = 0;
1210 Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1212 /* Safety check... */
1213 if (!shared_state || !hash_table)
1216 /* Set up key for hashtable search */
1217 key.userid = GetUserId();
1218 key.dbid = MyDatabaseId;
1219 key.queryid = queryId;
1221 normalized_plan = pgsp_json_normalize(plan);
1222 shorten_plan = pgsp_json_shorten(plan);
1223 elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1224 elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1225 elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1226 plan_len = strlen(shorten_plan);
1228 key.planid = hash_any((const unsigned char *)normalized_plan,
1229 strlen(normalized_plan));
1230 pfree(normalized_plan);
1232 if (plan_len >= shared_state->plan_size)
1233 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1236 shared_state->plan_size - 1);
1239 /* Look up the hash table entry with shared lock. */
1240 LWLockAcquire(shared_state->lock, LW_SHARED);
1242 entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1244 /* Store the plan text, if the entry not present */
1245 if (!entry && plan_storage == PLAN_STORAGE_FILE)
1250 /* Append new plan text to file with only shared lock held */
1251 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1254 * Determine whether we need to garbage collect external query texts
1255 * while the shared lock is still held. This micro-optimization
1256 * avoids taking the time to decide this while holding exclusive lock.
1258 do_gc = need_gc_ptexts();
1260 /* Acquire exclusive lock as required by entry_alloc() */
1261 LWLockRelease(shared_state->lock);
1262 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1265 * A garbage collection may have occurred while we weren't holding the
1266 * lock. In the unlikely event that this happens, the plan text we
1267 * stored above will have been garbage collected, so write it again.
1268 * This should be infrequent enough that doing it while holding
1269 * exclusive lock isn't a performance problem.
1271 if (!stored || shared_state->gc_count != gc_count)
1272 stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1274 /* If we failed to write to the text file, give up */
1280 /* Create new entry, if not present */
1283 entry = entry_alloc(&key, plan_offset, plan_len, false);
1285 /* shorten_plan is terminated by NUL */
1286 if (plan_storage == PLAN_STORAGE_SHMEM)
1287 memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1290 /* If needed, perform garbage collection while exclusive lock held */
1295 /* Increment the counts, except when jstate is not NULL */
1298 * Grab the spinlock while updating the counters (see comment about
1299 * locking rules at the head of the file)
1302 e = (volatile pgspEntry *) entry;
1303 SpinLockAcquire(&e->mutex);
1305 /* "Unstick" entry if it was previously sticky */
1306 if (e->counters.calls == 0)
1308 e->counters.usage = USAGE_INIT;
1309 e->counters.first_call = GetCurrentTimestamp();
1312 e->counters.calls += 1;
1313 e->counters.total_time += total_time;
1314 if (e->counters.calls == 1)
1316 e->counters.min_time = total_time;
1317 e->counters.max_time = total_time;
1318 e->counters.mean_time = total_time;
1323 * Welford's method for accurately computing variance. See
1324 * <http://www.johndcook.com/blog/standard_deviation/>
1326 double old_mean = e->counters.mean_time;
1328 e->counters.mean_time +=
1329 (total_time - old_mean) / e->counters.calls;
1330 e->counters.sum_var_time +=
1331 (total_time - old_mean) * (total_time - e->counters.mean_time);
1333 /* calculate min and max time */
1334 if (e->counters.min_time > total_time)
1335 e->counters.min_time = total_time;
1336 if (e->counters.max_time < total_time)
1337 e->counters.max_time = total_time;
1340 e->counters.rows += rows;
1341 e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1342 e->counters.shared_blks_read += bufusage->shared_blks_read;
1343 e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1344 e->counters.shared_blks_written += bufusage->shared_blks_written;
1345 e->counters.local_blks_hit += bufusage->local_blks_hit;
1346 e->counters.local_blks_read += bufusage->local_blks_read;
1347 e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1348 e->counters.local_blks_written += bufusage->local_blks_written;
1349 e->counters.temp_blks_read += bufusage->temp_blks_read;
1350 e->counters.temp_blks_written += bufusage->temp_blks_written;
1351 e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1352 e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1353 e->counters.last_call = GetCurrentTimestamp();
1354 e->counters.usage += USAGE_EXEC(total_time);
1356 SpinLockRelease(&e->mutex);
1359 LWLockRelease(shared_state->lock);
1361 /* We postpone this pfree until we're out of the lock */
1367 * Reset all statement statistics.
1370 pg_store_plans_reset(PG_FUNCTION_ARGS)
1372 if (!shared_state || !hash_table)
1374 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1375 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1380 /* Number of output arguments (columns) for various API versions */
1381 #define PG_STORE_PLANS_COLS_V1_5 27
1382 #define PG_STORE_PLANS_COLS_V1_6 26
1383 #define PG_STORE_PLANS_COLS 27 /* maximum of above */
1386 * Retrieve statement statistics.
1389 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1391 pg_store_plans_internal(fcinfo, PGSP_V1_6);
1397 pg_store_plans(PG_FUNCTION_ARGS)
1399 pg_store_plans_internal(fcinfo, PGSP_V1_5);
1405 pg_store_plans_internal(FunctionCallInfo fcinfo,
1406 pgspVersion api_version)
1408 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1410 Tuplestorestate *tupstore;
1411 MemoryContext per_query_ctx;
1412 MemoryContext oldcontext;
1413 Oid userid = GetUserId();
1414 bool is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1416 char *pbuffer = NULL;
1417 Size pbuffer_size = 0;
1420 HASH_SEQ_STATUS hash_seq;
1423 if (!shared_state || !hash_table)
1425 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1426 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1428 /* check to see if caller supports us returning a tuplestore */
1429 if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1431 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1432 errmsg("set-valued function called in context that cannot accept a set")));
1433 if (!(rsinfo->allowedModes & SFRM_Materialize))
1435 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1436 errmsg("materialize mode required, but it is not " \
1437 "allowed in this context")));
1439 /* Build a tuple descriptor for our result type */
1440 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1441 elog(ERROR, "return type must be a row type");
1443 per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1444 oldcontext = MemoryContextSwitchTo(per_query_ctx);
1446 tupstore = tuplestore_begin_heap(true, false, work_mem);
1447 rsinfo->returnMode = SFRM_Materialize;
1448 rsinfo->setResult = tupstore;
1449 rsinfo->setDesc = tupdesc;
1451 MemoryContextSwitchTo(oldcontext);
1454 * We'd like to load the plan text file (if needed) while not holding any
1455 * lock on shared_state->lock. In the worst case we'll have to do this
1456 * again after we have the lock, but it's unlikely enough to make this a
1457 * win despite occasional duplicated work. We need to reload if anybody
1458 * writes to the file (either a retail ptext_store(), or a garbage
1459 * collection) between this point and where we've gotten shared lock. If a
1460 * ptext_store is actually in progress when we look, we might as well skip
1461 * the speculative load entirely.
1464 /* Take the mutex so we can examine variables */
1466 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1468 SpinLockAcquire(&s->mutex);
1470 n_writers = s->n_writers;
1471 gc_count = s->gc_count;
1472 SpinLockRelease(&s->mutex);
1475 /* No point in loading file now if there are active writers */
1476 if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1477 pbuffer = ptext_load_file(&pbuffer_size);
1480 * Get shared lock, load or reload the plan text file if we must, and
1481 * iterate over the hashtable entries.
1483 * With a large hash table, we might be holding the lock rather longer
1484 * than one could wish. However, this only blocks creation of new hash
1485 * table entries, and the larger the hash table the less likely that is to
1486 * be needed. So we can hope this is okay. Perhaps someday we'll decide
1487 * we need to partition the hash table to limit the time spent holding any
1490 LWLockAcquire(shared_state->lock, LW_SHARED);
1493 * Here it is safe to examine extent and gc_count without taking the mutex.
1494 * Note that although other processes might change shared_state->extent
1495 * just after we look at it, the strings they then write into the file
1496 * cannot yet be referenced in the hashtable, so we don't care whether we
1499 * If ptext_load_file fails, we just press on; we'll return NULL for every
1502 if (plan_storage == PLAN_STORAGE_FILE &&
1504 shared_state->extent != extent ||
1505 shared_state->gc_count != gc_count))
1509 pbuffer = ptext_load_file(&pbuffer_size);
1512 hash_seq_init(&hash_seq, hash_table);
1513 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1515 Datum values[PG_STORE_PLANS_COLS];
1516 bool nulls[PG_STORE_PLANS_COLS];
1518 int64 queryid = entry->key.queryid;
1519 int64 planid = entry->key.planid;
1523 memset(values, 0, sizeof(values));
1524 memset(nulls, 0, sizeof(nulls));
1526 values[i++] = ObjectIdGetDatum(entry->key.userid);
1527 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1528 if (is_allowed_role || entry->key.userid == userid)
1530 values[i++] = Int64GetDatumFast(queryid);
1531 values[i++] = Int64GetDatumFast(planid);
1533 /* fill queryid_stat_statements with the same value with queryid */
1534 if (api_version == PGSP_V1_5)
1535 values[i++] = Int64GetDatumFast(queryid);
1539 nulls[i++] = true; /* queryid */
1540 nulls[i++] = true; /* planid */
1542 /* queryid_stat_statemetns*/
1543 if (api_version == PGSP_V1_5)
1547 if (is_allowed_role || entry->key.userid == userid)
1549 char *pstr; /* Plan string */
1550 char *mstr; /* Modified plan string */
1551 char *estr; /* Encoded modified plan string */
1553 if (plan_storage == PLAN_STORAGE_FILE)
1554 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1555 pbuffer, pbuffer_size);
1557 pstr = SHMEM_PLAN_PTR(entry);
1559 switch (plan_format)
1561 case PLAN_FORMAT_TEXT:
1562 mstr = pgsp_json_textize(pstr);
1564 case PLAN_FORMAT_JSON:
1565 mstr = pgsp_json_inflate(pstr);
1567 case PLAN_FORMAT_YAML:
1568 mstr = pgsp_json_yamlize(pstr);
1570 case PLAN_FORMAT_XML:
1571 mstr = pgsp_json_xmlize(pstr);
1579 pg_do_encoding_conversion((unsigned char *) mstr,
1582 GetDatabaseEncoding());
1583 values[i++] = CStringGetTextDatum(estr);
1591 /* pstr is a pointer onto pbuffer */
1594 values[i++] = CStringGetTextDatum("<insufficient privilege>");
1596 /* copy counters to a local variable to keep locking time short */
1598 volatile pgspEntry *e = (volatile pgspEntry *) entry;
1600 SpinLockAcquire(&e->mutex);
1602 SpinLockRelease(&e->mutex);
1605 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1609 values[i++] = Int64GetDatumFast(tmp.calls);
1610 values[i++] = Float8GetDatumFast(tmp.total_time);
1611 values[i++] = Float8GetDatumFast(tmp.min_time);
1612 values[i++] = Float8GetDatumFast(tmp.max_time);
1613 values[i++] = Float8GetDatumFast(tmp.mean_time);
1616 * Note we are calculating the population variance here, not the
1617 * sample variance, as we have data for the whole population, so
1618 * Bessel's correction is not used, and we don't divide by
1622 stddev = sqrt(tmp.sum_var_time / tmp.calls);
1625 values[i++] = Float8GetDatumFast(stddev);
1627 values[i++] = Int64GetDatumFast(tmp.rows);
1628 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1629 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1630 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1631 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1632 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1633 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1634 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1635 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1636 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1637 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1638 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1639 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1640 values[i++] = TimestampTzGetDatum(tmp.first_call);
1641 values[i++] = TimestampTzGetDatum(tmp.last_call);
1643 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1644 api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1645 -1 /* fail if you forget to update this assert */ ));
1647 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1650 LWLockRelease(shared_state->lock);
1652 /* clean up and return the tuplestore */
1653 tuplestore_donestoring(tupstore);
1656 /* Number of output arguments (columns) for pg_stat_statements_info */
1657 #define PG_STORE_PLANS_INFO_COLS 2
1660 * Return statistics of pg_stat_statements.
1663 pg_store_plans_info(PG_FUNCTION_ARGS)
1665 pgspGlobalStats stats;
1667 Datum values[PG_STORE_PLANS_INFO_COLS];
1668 bool nulls[PG_STORE_PLANS_INFO_COLS];
1670 if (!shared_state || !hash_table)
1672 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1673 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1675 /* Build a tuple descriptor for our result type */
1676 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1677 elog(ERROR, "return type must be a row type");
1679 MemSet(values, 0, sizeof(values));
1680 MemSet(nulls, 0, sizeof(nulls));
1682 /* Read global statistics for pg_stat_statements */
1684 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1686 SpinLockAcquire(&s->mutex);
1688 SpinLockRelease(&s->mutex);
1691 values[0] = Int64GetDatum(stats.dealloc);
1692 values[1] = TimestampTzGetDatum(stats.stats_reset);
1694 PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1698 * Estimate shared memory space needed.
1701 shared_mem_size(void)
1706 size = MAXALIGN(sizeof(pgspSharedState));
1707 entry_size = sizeof(pgspEntry);
1709 /* plan text is apppended to the struct body */
1710 if (plan_storage == PLAN_STORAGE_SHMEM)
1711 entry_size += max_plan_len;
1713 size = add_size(size, hash_estimate_size(store_size, entry_size));
1719 * Allocate a new hashtable entry.
1720 * caller must hold an exclusive lock on shared_state->lock
1722 * "plan" need not be null-terminated; we rely on plan_len instead
1724 * If "sticky" is true, make the new entry artificially sticky so that it will
1725 * probably still be there when the query finishes execution. We do this by
1726 * giving it a median usage value rather than the normal value. (Strictly
1727 * speaking, query strings are normalized on a best effort basis, though it
1728 * would be difficult to demonstrate this even under artificial conditions.)
1730 * Note: despite needing exclusive lock, it's not an error for the target
1731 * entry to already exist. This is because pgsp_store releases and
1732 * reacquires lock after failing to find a match; so someone else could
1733 * have made the entry while we waited to get exclusive lock.
1736 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1741 /* Make space if needed */
1742 while (hash_get_num_entries(hash_table) >= store_size)
1745 /* Find or create an entry with desired hash code */
1746 entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1750 /* New entry, initialize it */
1752 /* reset the statistics */
1753 memset(&entry->counters, 0, sizeof(Counters));
1754 /* set the appropriate initial usage count */
1755 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1756 /* re-initialize the mutex each time ... we assume no one using it */
1757 SpinLockInit(&entry->mutex);
1758 /* ... and don't forget the query text */
1759 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1760 entry->plan_offset = plan_offset;
1761 entry->plan_len = plan_len;
1762 entry->encoding = GetDatabaseEncoding();
1769 * qsort comparator for sorting into increasing usage order
1772 entry_cmp(const void *lhs, const void *rhs)
1774 double l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1775 double r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1777 if (l_usage < r_usage)
1779 else if (l_usage > r_usage)
1786 * Deallocate least used entries.
1787 * Caller must hold an exclusive lock on shared_state->lock.
1792 HASH_SEQ_STATUS hash_seq;
1793 pgspEntry **entries;
1801 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1802 * While we're scanning the table, apply the decay factor to the usage
1806 entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1812 hash_seq_init(&hash_seq, hash_table);
1813 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1815 entries[i++] = entry;
1816 /* "Sticky" entries get a different usage decay rate. */
1817 if (entry->counters.calls == 0)
1818 entry->counters.usage *= STICKY_DECREASE_FACTOR;
1820 entry->counters.usage *= USAGE_DECREASE_FACTOR;
1822 /* In the mean length computation, ignore dropped texts. */
1823 if (entry->plan_len >= 0)
1825 tottextlen += entry->plan_len + 1;
1830 qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1832 /* Also, record the (approximate) median usage */
1834 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1835 /* Record the mean plan length */
1836 if (nvalidtexts > 0)
1837 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1839 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1841 nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1842 nvictims = Min(nvictims, i);
1844 for (i = 0; i < nvictims; i++)
1846 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1851 /* Increment the number of times entries are deallocated */
1853 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1855 SpinLockAcquire(&s->mutex);
1856 s->stats.dealloc += 1;
1857 SpinLockRelease(&s->mutex);
1862 * Given a plan string (not necessarily null-terminated), allocate a new
1863 * entry in the external plan text file and store the string there.
1865 * If successful, returns true, and stores the new entry's offset in the file
1866 * into *plan_offset. Also, if gc_count isn't NULL, *gc_count is set to the
1867 * number of garbage collections that have occurred so far.
1869 * On failure, returns false.
1871 * At least a shared lock on shared_state->lock must be held by the caller, so
1872 * as to prevent a concurrent garbage collection. Share-lock-holding callers
1873 * should pass a gc_count pointer to obtain the number of garbage collections,
1874 * so that they can recheck the count after obtaining exclusive lock to detect
1875 * whether a garbage collection occurred (and removed this entry).
1878 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1883 Assert (plan_storage == PLAN_STORAGE_FILE);
1886 * We use a spinlock to protect extent/n_writers/gc_count, so that
1887 * multiple processes may execute this function concurrently.
1890 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1892 SpinLockAcquire(&s->mutex);
1894 s->extent += plan_len + 1;
1897 *gc_count = s->gc_count;
1898 SpinLockRelease(&s->mutex);
1903 /* Now write the data into the successfully-reserved part of the file */
1904 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1908 if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1910 if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1913 CloseTransientFile(fd);
1915 /* Mark our write complete */
1917 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1919 SpinLockAcquire(&s->mutex);
1921 SpinLockRelease(&s->mutex);
1928 (errcode_for_file_access(),
1929 errmsg("could not write file \"%s\": %m",
1933 CloseTransientFile(fd);
1935 /* Mark our write complete */
1937 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1939 SpinLockAcquire(&s->mutex);
1941 SpinLockRelease(&s->mutex);
1948 * Read the external plan text file into a malloc'd buffer.
1950 * Returns NULL (without throwing an error) if unable to read, eg
1951 * file not there or insufficient memory.
1953 * On success, the buffer size is also returned into *buffer_size.
1955 * This can be called without any lock on shared_state->lock, but in that case
1956 * the caller is responsible for verifying that the result is sane.
1959 ptext_load_file(Size *buffer_size)
1966 Assert (plan_storage == PLAN_STORAGE_FILE);
1968 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1971 if (errno != ENOENT)
1973 (errcode_for_file_access(),
1974 errmsg("could not read file \"%s\": %m",
1979 /* Get file length */
1980 if (fstat(fd, &stat))
1983 (errcode_for_file_access(),
1984 errmsg("could not stat file \"%s\": %m",
1986 CloseTransientFile(fd);
1990 /* Allocate buffer; beware that off_t might be wider than size_t */
1991 if (stat.st_size <= MaxAllocHugeSize)
1992 buf = (char *) malloc(stat.st_size);
1998 (errcode(ERRCODE_OUT_OF_MEMORY),
1999 errmsg("out of memory"),
2000 errdetail("Could not allocate enough memory to read file \"%s\".",
2002 CloseTransientFile(fd);
2007 * OK, slurp in the file. Windows fails if we try to read more than
2008 * INT_MAX bytes at once, and other platforms might not like that either,
2009 * so read a very large file in 1GB segments.
2012 while (nread < stat.st_size)
2014 int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2017 * If we get a short read and errno doesn't get set, the reason is
2018 * probably that garbage collection truncated the file since we did
2019 * the fstat(), so we don't log a complaint --- but we don't return
2020 * the data, either, since it's most likely corrupt due to concurrent
2021 * writes from garbage collection.
2024 if (read(fd, buf + nread, toread) != toread)
2028 (errcode_for_file_access(),
2029 errmsg("could not read file \"%s\": %m",
2032 CloseTransientFile(fd);
2038 if (CloseTransientFile(fd) != 0)
2040 (errcode_for_file_access(),
2041 errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
2043 *buffer_size = nread;
2048 * Locate a plan text in the file image previously read by ptext_load_file().
2050 * We validate the given offset/length, and return NULL if bogus. Otherwise,
2051 * the result points to a null-terminated string within the buffer.
2054 ptext_fetch(Size plan_offset, int plan_len,
2055 char *buffer, Size buffer_size)
2057 Assert (plan_storage == PLAN_STORAGE_FILE);
2059 /* File read failed? */
2062 /* Bogus offset/length? */
2064 plan_offset + plan_len >= buffer_size)
2066 /* As a further sanity check, make sure there's a trailing null */
2067 if (buffer[plan_offset + plan_len] != '\0')
2070 return buffer + plan_offset;
2074 * Do we need to garbage-collect the external plan text file?
2076 * Caller should hold at least a shared lock on shared_state->lock.
2079 need_gc_ptexts(void)
2083 Assert (plan_storage == PLAN_STORAGE_FILE);
2085 /* Read shared extent pointer */
2087 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2089 SpinLockAcquire(&s->mutex);
2091 SpinLockRelease(&s->mutex);
2094 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2095 if (extent < 512 * store_size)
2099 * Don't proceed if file is less than about 50% bloat. Nothing can or
2100 * should be done in the event of unusually large query texts accounting
2101 * for file's large size. We go to the trouble of maintaining the mean
2102 * query length in order to prevent garbage collection from thrashing
2105 if (extent < shared_state->mean_plan_len * store_size * 2)
2112 * Garbage-collect orphaned plan texts in external file.
2114 * This won't be called often in the typical case, since it's likely that
2115 * there won't be too much churn, and besides, a similar compaction process
2116 * occurs when serializing to disk at shutdown or as part of resetting.
2117 * Despite this, it seems prudent to plan for the edge case where the file
2118 * becomes unreasonably large, with no other method of compaction likely to
2119 * occur in the foreseeable future.
2121 * The caller must hold an exclusive lock on shared_state->lock.
2123 * At the first sign of trouble we unlink the query text file to get a clean
2124 * slate (although existing statistics are retained), rather than risk
2125 * thrashing by allowing the same problem case to recur indefinitely.
2133 HASH_SEQ_STATUS hash_seq;
2138 Assert (plan_storage == PLAN_STORAGE_FILE);
2141 * When called from store_entry, some other session might have proceeded
2142 * with garbage collection in the no-lock-held interim of lock strength
2143 * escalation. Check once more that this is actually necessary.
2145 if (!need_gc_ptexts())
2149 * Load the old texts file. If we fail (out of memory, for instance),
2150 * invalidate query texts. Hopefully this is rare. It might seem better
2151 * to leave things alone on an OOM failure, but the problem is that the
2152 * file is only going to get bigger; hoping for a future non-OOM result is
2153 * risky and can easily lead to complete denial of service.
2155 pbuffer = ptext_load_file(&pbuffer_size);
2156 if (pbuffer == NULL)
2160 * We overwrite the plan texts file in place, so as to reduce the risk of
2161 * an out-of-disk-space failure. Since the file is guaranteed not to get
2162 * larger, this should always work on traditional filesystems; though we
2163 * could still lose on copy-on-write filesystems.
2165 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2169 (errcode_for_file_access(),
2170 errmsg("could not write file \"%s\": %m",
2178 hash_seq_init(&hash_seq, hash_table);
2179 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2181 int plan_len = entry->plan_len;
2182 char *plan = ptext_fetch(entry->plan_offset,
2189 /* Trouble ... drop the text */
2190 entry->plan_offset = 0;
2191 entry->plan_len = -1;
2192 /* entry will not be counted in mean plan length computation */
2196 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2199 (errcode_for_file_access(),
2200 errmsg("could not write file \"%s\": %m",
2202 hash_seq_term(&hash_seq);
2206 entry->plan_offset = extent;
2207 extent += plan_len + 1;
2212 * Truncate away any now-unused space. If this fails for some odd reason,
2213 * we log it, but there's no need to fail.
2215 if (ftruncate(fileno(pfile), extent) != 0)
2217 (errcode_for_file_access(),
2218 errmsg("could not truncate file \"%s\": %m",
2221 if (FreeFile(pfile))
2224 (errcode_for_file_access(),
2225 errmsg("could not write file \"%s\": %m",
2231 elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2232 shared_state->extent, extent);
2234 /* Reset the shared extent pointer */
2235 shared_state->extent = extent;
2238 * Also update the mean plan length, to be sure that need_gc_ptexts()
2239 * won't still think we have a problem.
2242 shared_state->mean_plan_len = extent / nentries;
2244 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2251 /* clean up resources */
2258 * Since the contents of the external file are now uncertain, mark all
2259 * hashtable entries as having invalid texts.
2261 hash_seq_init(&hash_seq, hash_table);
2262 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2264 entry->plan_offset = 0;
2265 entry->plan_len = -1;
2269 * Destroy the query text file and create a new, empty one
2271 (void) unlink(PGSP_TEXT_FILE);
2272 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2275 (errcode_for_file_access(),
2276 errmsg("could not recreate file \"%s\": %m",
2281 /* Reset the shared extent pointer */
2282 shared_state->extent = 0;
2284 /* Reset mean_plan_len to match the new state */
2285 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2289 * Release all entries.
2294 HASH_SEQ_STATUS hash_seq;
2298 if (!shared_state || !hash_table)
2300 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2301 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2303 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2305 hash_seq_init(&hash_seq, hash_table);
2306 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2308 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2312 * Reset global statistics for pg_store_plans.
2315 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2316 TimestampTz stats_reset = GetCurrentTimestamp();
2318 SpinLockAcquire(&s->mutex);
2319 s->stats.dealloc = 0;
2320 s->stats.stats_reset = stats_reset;
2321 SpinLockRelease(&s->mutex);
2325 * Write new empty plan file, perhaps even creating a new one to recover
2326 * if the file was missing.
2328 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2332 (errcode_for_file_access(),
2333 errmsg("could not create file \"%s\": %m",
2338 /* If ftruncate fails, log it, but it's not a fatal problem */
2339 if (ftruncate(fileno(pfile), 0) != 0)
2341 (errcode_for_file_access(),
2342 errmsg("could not truncate file \"%s\": %m",
2348 shared_state->extent = 0;
2349 LWLockRelease(shared_state->lock);
2353 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2355 PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2359 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2361 text *short_plan = PG_GETARG_TEXT_P(0);
2362 char *cjson = text_to_cstring(short_plan);
2363 char *cshorten = pgsp_json_shorten(cjson);
2364 PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2368 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2370 text *short_plan = PG_GETARG_TEXT_P(0);
2371 char *cjson = text_to_cstring(short_plan);
2372 char *cnormalized = pgsp_json_normalize(cjson);
2373 PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2377 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2379 text *short_plan = PG_GETARG_TEXT_P(0);
2380 char *cshort = text_to_cstring(short_plan);
2381 char *cinflated = pgsp_json_inflate(cshort);
2382 PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2386 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2388 text *short_plan = PG_GETARG_TEXT_P(0);
2389 char *cshort = text_to_cstring(short_plan);
2390 char *ctextized = pgsp_json_textize(cshort);
2392 PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2396 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2398 text *short_plan = PG_GETARG_TEXT_P(0);
2399 char *cshort = text_to_cstring(short_plan);
2400 char *cyamlized = pgsp_json_yamlize(cshort);
2402 PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2406 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2408 text *short_plan = PG_GETARG_TEXT_P(0);
2409 char *cshort = text_to_cstring(short_plan);
2410 char *cxmlized = pgsp_json_xmlize(cshort);
2412 PG_RETURN_TEXT_P(cstring_to_text(cxmlized));