1 /*-------------------------------------------------------------------------
4 * Take statistics of plan selection across a whole database cluster.
6 * Execution costs are totaled for each distinct plan for each query,
7 * and plan and queryid are kept in a shared hashtable, each record in
8 * which is associated with a record in pg_stat_statements, if any, by
11 * For Postgres 9.3 or earlier does not expose query id so
12 * pg_store_plans needs to calculate it based on the given query
13 * string using different algorithm from pg_stat_statements, and later
14 * the id will be matched against the one made from query string
15 * stored in pg_stat_statements. For the reason, queryid matching in
16 * this way will fail if the query string kept in pg_stat_statements
17 * is truncated in the middle.
19 * Plans are identified by fingerprinting plan representations in
20 * "shortened" JSON format with constants and unstable values such as
21 * rows, width, loops ignored. Nevertheless, stored plan entries hold
22 * them of the latest execution. Entry eviction is done in the same
23 * way to pg_stat_statements.
25 * Copyright (c) 2008-2022, PostgreSQL Global Development Group
26 * Copyright (c) 2012-2022, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
29 * pg_store_plans/pg_store_plans.c
31 *-------------------------------------------------------------------------
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
59 #include "utils/timestamp.h"
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20221214;
75 static int max_plan_len = 5000;
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration) (1.0)
79 #define USAGE_INIT (1.0) /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
86 /* In PostgreSQL 11, queryid becomes a uint64 internally. */
87 #if PG_VERSION_NUM >= 110000
88 typedef uint64 queryid_t;
89 #define PGSP_NO_QUERYID UINT64CONST(0)
91 typedef uint32 queryid_t;
92 #define PGSP_NO_QUERYID 0
96 * Extension version number, for supporting older extension versions' objects
98 typedef enum pgspVersion
106 * Hashtable key that defines the identity of a hashtable entry. We separate
107 * queries by user and by database even if they are otherwise identical.
109 * Presently, the query encoding is fully determined by the source database
110 * and so we don't really need it to be in the key. But that might not always
111 * be true. Anyway it's notationally convenient to pass it as part of the key.
113 typedef struct pgspHashKey
115 Oid userid; /* user OID */
116 Oid dbid; /* database OID */
117 queryid_t queryid; /* query identifier */
118 uint32 planid; /* plan identifier */
122 * The actual stats counters kept within pgspEntry.
124 typedef struct Counters
126 int64 calls; /* # of times executed */
127 double total_time; /* total execution time, in msec */
128 double min_time; /* minimum execution time in msec */
129 double max_time; /* maximum execution time in msec */
130 double mean_time; /* mean execution time in msec */
131 double sum_var_time; /* sum of variances in execution time in msec */
132 int64 rows; /* total # of retrieved or affected rows */
133 int64 shared_blks_hit; /* # of shared buffer hits */
134 int64 shared_blks_read; /* # of shared disk blocks read */
135 int64 shared_blks_dirtied;/* # of shared disk blocks dirtied */
136 int64 shared_blks_written;/* # of shared disk blocks written */
137 int64 local_blks_hit; /* # of local buffer hits */
138 int64 local_blks_read; /* # of local disk blocks read */
139 int64 local_blks_dirtied; /* # of local disk blocks dirtied */
140 int64 local_blks_written; /* # of local disk blocks written */
141 int64 temp_blks_read; /* # of temp blocks read */
142 int64 temp_blks_written; /* # of temp blocks written */
143 double blk_read_time; /* time spent reading, in msec */
144 double blk_write_time; /* time spent writing, in msec */
145 double temp_blk_read_time; /* time spent reading temp blocks,
147 double temp_blk_write_time;/* time spent writing temp blocks,
149 TimestampTz first_call; /* timestamp of first call */
150 TimestampTz last_call; /* timestamp of last call */
151 double usage; /* usage factor */
155 * Global statistics for pg_store_plans
157 typedef struct pgspGlobalStats
159 int64 dealloc; /* # of times entries were deallocated */
160 TimestampTz stats_reset; /* timestamp with all stats reset */
164 * Statistics per plan
166 * NB: see the file read/write code before changing field order here.
168 typedef struct pgspEntry
170 pgspHashKey key; /* hash key of entry - MUST BE FIRST */
171 Counters counters; /* the statistics for this query */
172 Size plan_offset; /* plan text offset in extern file */
173 int plan_len; /* # of valid bytes in query string */
174 int encoding; /* query encoding */
175 slock_t mutex; /* protects the counters only */
179 * Global shared state
181 typedef struct pgspSharedState
183 LWLock *lock; /* protects hashtable search/modification */
184 int plan_size; /* max query length in bytes */
185 double cur_median_usage; /* current median usage in hashtable */
186 Size mean_plan_len; /* current mean entry text length */
187 slock_t mutex; /* protects following fields only: */
188 Size extent; /* current extent of plan file */
189 int n_writers; /* number of active writers to query file */
190 int gc_count; /* plan file garbage collection cycle count */
191 pgspGlobalStats stats; /* global statistics for pgsp */
194 /*---- Local variables ----*/
196 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
197 static int nested_level = 0;
199 /* Saved hook values in case of unload */
200 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
201 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
202 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
203 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
204 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
205 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
207 /* Links to shared memory state */
208 static pgspSharedState *shared_state = NULL;
209 static HTAB *hash_table = NULL;
211 /*---- GUC variables ----*/
215 TRACK_LEVEL_NONE, /* track no statements */
216 TRACK_LEVEL_TOP, /* only top level statements */
217 TRACK_LEVEL_ALL, /* all statements, including nested ones */
218 TRACK_LEVEL_VERBOSE /* all statements, including internal ones */
221 static const struct config_enum_entry track_options[] =
223 {"none", TRACK_LEVEL_NONE, false},
224 {"top", TRACK_LEVEL_TOP, false},
225 {"all", TRACK_LEVEL_ALL, false},
226 {"verbose", TRACK_LEVEL_VERBOSE, false},
232 PLAN_FORMAT_RAW, /* No conversion. Shorten JSON */
233 PLAN_FORMAT_TEXT, /* Traditional text representation */
234 PLAN_FORMAT_JSON, /* JSON representation */
235 PLAN_FORMAT_YAML, /* YAML */
236 PLAN_FORMAT_XML, /* XML */
239 static const struct config_enum_entry plan_formats[] =
241 {"raw" , PLAN_FORMAT_RAW , false},
242 {"text", PLAN_FORMAT_TEXT, false},
243 {"json", PLAN_FORMAT_JSON, false},
244 {"yaml", PLAN_FORMAT_YAML, false},
245 {"xml" , PLAN_FORMAT_XML , false},
249 /* options for plan storage */
252 PLAN_STORAGE_SHMEM, /* plan is stored as a part of hash entry */
253 PLAN_STORAGE_FILE /* plan is stored in a separate file */
256 static const struct config_enum_entry plan_storage_options[] =
258 {"shmem", PLAN_STORAGE_SHMEM, false},
259 {"file", PLAN_STORAGE_FILE, false},
263 static int store_size; /* max # statements to track */
264 static int track_level; /* tracking level */
265 static int min_duration; /* min duration to record */
266 static bool dump_on_shutdown; /* whether to save stats across shutdown */
267 static bool log_analyze; /* Similar to EXPLAIN (ANALYZE *) */
268 static bool log_verbose; /* Similar to EXPLAIN (VERBOSE *) */
269 static bool log_buffers; /* Similar to EXPLAIN (BUFFERS *) */
270 static bool log_timing; /* Similar to EXPLAIN (TIMING *) */
271 static bool log_triggers; /* whether to log trigger statistics */
272 static int plan_format; /* Plan representation style in
273 * pg_store_plans.plan */
274 static int plan_storage; /* Plan storage type */
277 /* disables tracking overriding track_level */
278 static bool force_disabled = false;
280 #if PG_VERSION_NUM >= 140000
282 * For pg14 and later, we rely on core queryid calculation. If
283 * it's not available it means that the admin explicitly refused to
284 * compute it, for performance reason or other. In that case, we
285 * will also consider that this extension is disabled.
287 #define pgsp_enabled(q) \
288 (!force_disabled && \
289 (track_level >= TRACK_LEVEL_ALL || \
290 (track_level == TRACK_LEVEL_TOP && nested_level == 0)) && \
291 (q != PGSP_NO_QUERYID))
293 #define pgsp_enabled(q) \
294 (!force_disabled && \
295 (track_level >= TRACK_LEVEL_ALL || \
296 (track_level == TRACK_LEVEL_TOP && nested_level == 0)))
299 #define SHMEM_PLAN_PTR(ent) (((char *) ent) + sizeof(pgspEntry))
301 /*---- Function declarations ----*/
306 Datum pg_store_plans_reset(PG_FUNCTION_ARGS);
307 Datum pg_store_plans_hash_query(PG_FUNCTION_ARGS);
308 Datum pg_store_plans(PG_FUNCTION_ARGS);
309 Datum pg_store_plans_shorten(PG_FUNCTION_ARGS);
310 Datum pg_store_plans_normalize(PG_FUNCTION_ARGS);
311 Datum pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
312 Datum pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
313 Datum pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
314 Datum pg_store_plans_textplan(PG_FUNCTION_ARGS);
315 Datum pg_store_plans_info(PG_FUNCTION_ARGS);
317 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
318 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
319 PG_FUNCTION_INFO_V1(pg_store_plans);
320 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
321 PG_FUNCTION_INFO_V1(pg_store_plans_1_7);
322 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
323 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
324 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
325 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
326 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
327 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
328 PG_FUNCTION_INFO_V1(pg_store_plans_info);
330 #if PG_VERSION_NUM < 130000
331 #define COMPTAG_TYPE char
333 #define COMPTAG_TYPE QueryCompletion
336 #if PG_VERSION_NUM < 140000
337 #define ROLE_PG_READ_ALL_STATS DEFAULT_ROLE_READ_ALL_STATS
340 #if PG_VERSION_NUM >= 150000
341 static shmem_request_hook_type prev_shmem_request_hook = NULL;
344 static void pgsp_shmem_request(void);
345 static void pgsp_shmem_startup(void);
346 static void pgsp_shmem_shutdown(int code, Datum arg);
347 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
348 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
349 ScanDirection direction,
350 uint64 count, bool execute_once);
351 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
352 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
353 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
354 #if PG_VERSION_NUM >= 140000
357 ProcessUtilityContext context, ParamListInfo params,
358 QueryEnvironment *queryEnv,
359 DestReceiver *dest, COMPTAG_TYPE *completionTag);
360 static uint32 hash_query(const char* query);
361 static void pgsp_store(char *plan, queryid_t queryId,
362 double total_time, uint64 rows,
363 const BufferUsage *bufusage);
364 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
365 pgspVersion api_version);
366 static Size shared_mem_size(void);
367 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
369 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
371 static char *ptext_load_file(Size *buffer_size);
372 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
374 static bool need_gc_ptexts(void);
375 static void gc_ptexts(void);
376 static void entry_dealloc(void);
377 static void entry_reset(void);
380 * Module load callback
386 * In order to create our shared memory area, we have to be loaded via
387 * shared_preload_libraries. If not, fall out without hooking into any of
388 * the main system. (We don't throw error here because it seems useful to
389 * allow the pg_stat_statements functions to be created even when the
390 * module isn't active. The functions must protect themselves against
391 * being called then, however.)
393 if (!process_shared_preload_libraries_in_progress)
395 #if PG_VERSION_NUM >= 140000
397 * Inform the postmaster that we want to enable query_id calculation if
398 * compute_query_id is set to auto.
404 * Define (or redefine) custom GUC variables.
406 DefineCustomIntVariable("pg_store_plans.max",
407 "Sets the maximum number of plans tracked by pg_store_plans.",
419 DefineCustomIntVariable("pg_store_plans.max_plan_length",
420 "Sets the maximum length of plans stored by pg_store_plans.",
432 DefineCustomEnumVariable("pg_store_plans.plan_storage",
433 "Selects where to store plan texts.",
437 plan_storage_options,
444 DefineCustomEnumVariable("pg_store_plans.track",
445 "Selects which plans are tracked by pg_store_plans.",
456 DefineCustomEnumVariable("pg_store_plans.plan_format",
457 "Selects which format to be appied for plan representation in pg_store_plans.",
468 DefineCustomIntVariable("pg_store_plans.min_duration",
469 "Minimum duration to record plan in milliseconds.",
481 DefineCustomBoolVariable("pg_store_plans.save",
482 "Save pg_store_plans statistics across server shutdowns.",
492 DefineCustomBoolVariable("pg_store_plans.log_analyze",
493 "Use EXPLAIN ANALYZE for plan logging.",
503 DefineCustomBoolVariable("pg_store_plans.log_buffers",
514 DefineCustomBoolVariable("pg_store_plans.log_timing",
525 DefineCustomBoolVariable("pg_store_plans.log_triggers",
526 "Log trigger trace.",
536 DefineCustomBoolVariable("pg_store_plans.log_verbose",
537 "Set VERBOSE for EXPLAIN on logging.",
547 EmitWarningsOnPlaceholders("pg_store_plans");
549 #if PG_VERSION_NUM < 150000
550 pgsp_shmem_request();
556 #if PG_VERSION_NUM >= 150000
557 prev_shmem_request_hook = shmem_request_hook;
558 shmem_request_hook = pgsp_shmem_request;
560 prev_shmem_startup_hook = shmem_startup_hook;
561 shmem_startup_hook = pgsp_shmem_startup;
562 prev_ExecutorStart = ExecutorStart_hook;
563 ExecutorStart_hook = pgsp_ExecutorStart;
564 prev_ExecutorRun = ExecutorRun_hook;
565 ExecutorRun_hook = pgsp_ExecutorRun;
566 prev_ExecutorFinish = ExecutorFinish_hook;
567 ExecutorFinish_hook = pgsp_ExecutorFinish;
568 prev_ExecutorEnd = ExecutorEnd_hook;
569 ExecutorEnd_hook = pgsp_ExecutorEnd;
570 prev_ProcessUtility = ProcessUtility_hook;
571 ProcessUtility_hook = pgsp_ProcessUtility;
575 * Module unload callback
580 /* Uninstall hooks. */
581 shmem_startup_hook = prev_shmem_startup_hook;
582 ExecutorStart_hook = prev_ExecutorStart;
583 ExecutorRun_hook = prev_ExecutorRun;
584 ExecutorFinish_hook = prev_ExecutorFinish;
585 ExecutorEnd_hook = prev_ExecutorEnd;
586 ProcessUtility_hook = prev_ProcessUtility;
590 * pgsp_shmem_request: request shared memory to the core.
591 * Called as a hook in PG15 or later, otherwise called from _PG_init().
594 pgsp_shmem_request(void)
596 #if PG_VERSION_NUM >= 150000
597 if (prev_shmem_request_hook)
598 prev_shmem_request_hook();
601 RequestAddinShmemSpace(shared_mem_size());
602 RequestNamedLWLockTranche("pg_store_plans", 1);
606 * shmem_startup hook: allocate or attach to shared memory,
607 * then load any pre-existing statistics from file.
610 pgsp_shmem_startup(void)
624 if (prev_shmem_startup_hook)
625 prev_shmem_startup_hook();
627 /* reset in case this is a restart within the postmaster */
632 * Create or attach to the shared memory state, including hash table
634 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
636 shared_state = ShmemInitStruct("pg_store_plans",
637 sizeof(pgspSharedState),
642 /* First time through ... */
643 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
644 shared_state->plan_size = max_plan_len;
645 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
646 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
647 SpinLockInit(&shared_state->mutex);
648 shared_state->extent = 0;
649 shared_state->n_writers = 0;
650 shared_state->gc_count = 0;
651 shared_state->stats.dealloc = 0;
652 shared_state->stats.stats_reset = GetCurrentTimestamp();
655 /* Be sure everyone agrees on the hash table entry size */
656 plan_size = shared_state->plan_size;
658 memset(&info, 0, sizeof(info));
659 info.keysize = sizeof(pgspHashKey);
660 info.entrysize = sizeof(pgspEntry);
661 if (plan_storage == PLAN_STORAGE_SHMEM)
662 info.entrysize += max_plan_len;
663 hash_table = ShmemInitHash("pg_store_plans hash",
664 store_size, store_size,
668 LWLockRelease(AddinShmemInitLock);
671 * If we're in the postmaster (or a standalone backend...), set up a shmem
672 * exit hook to dump the statistics to disk.
674 if (!IsUnderPostmaster)
675 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
678 * Done if some other process already completed our initialization.
684 * Note: we don't bother with locks here, because there should be no other
685 * processes running when this code is reached.
688 /* Unlink query text file possibly left over from crash */
689 unlink(PGSP_TEXT_FILE);
691 if (plan_storage == PLAN_STORAGE_FILE)
693 /* Allocate new query text temp file */
694 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
700 * If we were told not to load old statistics, we're done. (Note we do
701 * not try to unlink any old dump file in this case. This seems a bit
702 * questionable but it's the historical behavior.)
704 if (!dump_on_shutdown)
712 * Attempt to load old statistics from the dump file.
714 file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
717 /* ignore not-found error */
721 /* No existing persisted stats file, so we're done */
727 buffer_size = plan_size;
728 buffer = (char *) palloc(buffer_size);
730 if (fread(&header, sizeof(uint32), 1, file) != 1 ||
731 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
732 fread(&num, sizeof(int32), 1, file) != 1)
735 if (header != PGSP_FILE_HEADER ||
736 pgver != PGSP_PG_MAJOR_VERSION)
739 for (i = 0; i < num; i++)
743 Size plan_offset = 0;
745 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
748 /* Encoding is the only field we can easily sanity-check */
749 if (!PG_VALID_BE_ENCODING(temp.encoding))
752 /* Previous incarnation might have had a larger plan_size */
753 if (temp.plan_len >= buffer_size)
755 buffer = (char *) repalloc(buffer, temp.plan_len + 1);
756 buffer_size = temp.plan_len + 1;
759 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
762 /* Skip loading "sticky" entries */
763 if (temp.counters.calls == 0)
766 /* Clip to available length if needed */
767 if (temp.plan_len >= plan_size)
768 temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
773 buffer[temp.plan_len] = '\0';
775 if (plan_storage == PLAN_STORAGE_FILE)
777 /* Store the plan text */
778 plan_offset = shared_state->extent;
779 if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
782 shared_state->extent += temp.plan_len + 1;
785 /* make the hashtable entry (discards old entries if too many) */
786 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
788 if (plan_storage == PLAN_STORAGE_SHMEM)
789 memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
791 /* copy in the actual stats */
792 entry->counters = temp.counters;
802 * Remove the file so it's not included in backups/replication slaves,
803 * etc. A new file will be written on next shutdown.
805 unlink(PGSP_DUMP_FILE);
811 (errcode_for_file_access(),
812 errmsg("could not read file \"%s\": %m",
817 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
818 errmsg("ignoring invalid data in file \"%s\"",
823 (errcode_for_file_access(),
824 errmsg("could not write file \"%s\": %m",
833 /* If possible, throw away the bogus file; ignore any error */
834 unlink(PGSP_DUMP_FILE);
837 * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
838 * server is running with pg_stat_statements enabled
843 * shmem_shutdown hook: Dump statistics into file.
845 * Note: we don't bother with acquiring lock, because there should be no
846 * other processes running when this is called.
849 pgsp_shmem_shutdown(int code, Datum arg)
852 char *pbuffer = NULL;
853 Size pbuffer_size = 0;
854 HASH_SEQ_STATUS hash_seq;
858 /* Don't try to dump during a crash. */
862 /* Safety check ... shouldn't get here unless shmem is set up. */
863 if (!shared_state || !hash_table)
866 /* Don't dump if told not to. */
867 if (!dump_on_shutdown)
870 file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
874 if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
876 if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
878 num_entries = hash_get_num_entries(hash_table);
879 if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
882 if (plan_storage == PLAN_STORAGE_FILE)
884 pbuffer = ptext_load_file(&pbuffer_size);
889 hash_seq_init(&hash_seq, hash_table);
890 while ((entry = hash_seq_search(&hash_seq)) != NULL)
892 int len = entry->plan_len;
895 if (plan_storage == PLAN_STORAGE_FILE)
896 pstr = ptext_fetch(entry->plan_offset, len,
897 pbuffer, pbuffer_size);
899 pstr = SHMEM_PLAN_PTR(entry);
902 continue; /* Ignore any entries with bogus texts */
904 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
905 fwrite(pstr, 1, len + 1, file) != len + 1)
907 /* note: we assume hash_seq_term won't change errno */
908 hash_seq_term(&hash_seq);
920 * Rename file into place, so we atomically replace the old one.
922 if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
924 (errcode_for_file_access(),
925 errmsg("could not rename pg_store_plans file \"%s\": %m",
926 PGSP_DUMP_FILE ".tmp")));
928 /* Unlink query-texts file; it's not needed while shutdown */
929 unlink(PGSP_TEXT_FILE);
935 (errcode_for_file_access(),
936 errmsg("could not write pg_store_plans file \"%s\": %m",
937 PGSP_DUMP_FILE ".tmp")));
940 unlink(PGSP_DUMP_FILE ".tmp");
945 * ExecutorStart hook: start up tracking if needed
948 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
951 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
953 queryDesc->instrument_options |=
954 (log_timing ? INSTRUMENT_TIMER : 0)|
955 (log_timing ? 0: INSTRUMENT_ROWS)|
956 (log_buffers ? INSTRUMENT_BUFFERS : 0);
959 if (prev_ExecutorStart)
960 prev_ExecutorStart(queryDesc, eflags);
962 standard_ExecutorStart(queryDesc, eflags);
965 * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
966 * context so as to be free at ExecutorEnd.
968 if (queryDesc->totaltime == NULL &&
969 pgsp_enabled(queryDesc->plannedstmt->queryId))
971 MemoryContext oldcxt;
973 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
974 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
975 #if PG_VERSION_NUM >= 140000
979 MemoryContextSwitchTo(oldcxt);
985 * ExecutorRun hook: all we need do is track nesting depth
988 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
994 if (prev_ExecutorRun)
995 prev_ExecutorRun(queryDesc, direction, count, execute_once);
997 standard_ExecutorRun(queryDesc, direction, count, execute_once);
1009 * ExecutorFinish hook: all we need do is track nesting depth
1012 pgsp_ExecutorFinish(QueryDesc *queryDesc)
1017 if (prev_ExecutorFinish)
1018 prev_ExecutorFinish(queryDesc);
1020 standard_ExecutorFinish(queryDesc);
1032 * ExecutorEnd hook: store results if needed
1035 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1037 if (queryDesc->totaltime)
1040 * Make sure stats accumulation is done. (Note: it's okay if several
1041 * levels of hook all do this.)
1043 InstrEndLoop(queryDesc->totaltime);
1045 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1046 queryDesc->totaltime->total &&
1047 queryDesc->totaltime->total >=
1048 (double)min_duration / 1000.0)
1054 es = NewExplainState();
1057 es->analyze = queryDesc->instrument_options;
1058 es->verbose = log_verbose;
1059 es->buffers = (es->analyze && log_buffers);
1060 es->timing = (es->analyze && log_timing);
1061 es->format = EXPLAIN_FORMAT_JSON;
1063 ExplainBeginOutput(es);
1064 ExplainPrintPlan(es, queryDesc);
1066 pgspExplainTriggers(es, queryDesc);
1067 ExplainEndOutput(es);
1069 /* Remove last line break */
1070 if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1071 es_str->data[--es_str->len] = '\0';
1073 /* JSON outmost braces. */
1074 es_str->data[0] = '{';
1075 es_str->data[es_str->len - 1] = '}';
1077 queryid = queryDesc->plannedstmt->queryId;
1078 #if PG_VERSION_NUM < 140000
1080 * For versions before pg14, a queryid is only available if
1081 * pg_stat_statements extension (or similar) if configured. We
1082 * don't want a hard requirement for such an extension so fallback
1083 * to an internal queryid calculation in some case.
1084 * For pg14 and above, core postgres can compute a queryid so we
1087 if (queryid == PGSP_NO_QUERYID)
1088 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1090 Assert(queryid != PGSP_NO_QUERYID);
1093 pgsp_store(es_str->data,
1095 queryDesc->totaltime->total * 1000.0, /* convert to msec */
1096 queryDesc->estate->es_processed,
1097 &queryDesc->totaltime->bufusage);
1098 pfree(es_str->data);
1102 if (prev_ExecutorEnd)
1103 prev_ExecutorEnd(queryDesc);
1105 standard_ExecutorEnd(queryDesc);
1109 * ProcessUtility hook
1112 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1113 #if PG_VERSION_NUM >= 140000
1116 ProcessUtilityContext context, ParamListInfo params,
1117 QueryEnvironment *queryEnv,
1118 DestReceiver *dest, COMPTAG_TYPE *completionTag)
1120 int tag = nodeTag(pstmt->utilityStmt);
1121 queryid_t saved_queryId = pstmt->queryId;
1122 bool reset_force_disabled = false;
1124 if (pgsp_enabled(saved_queryId) &&
1125 (tag == T_CreateExtensionStmt || tag == T_AlterExtensionStmt) &&
1126 !force_disabled && track_level < TRACK_LEVEL_VERBOSE)
1128 force_disabled = true;
1129 reset_force_disabled = true;
1134 if (prev_ProcessUtility)
1136 prev_ProcessUtility(pstmt, queryString,
1137 #if PG_VERSION_NUM >= 140000
1140 context, params, queryEnv,
1141 dest, completionTag);
1144 standard_ProcessUtility(pstmt, queryString,
1145 #if PG_VERSION_NUM >= 140000
1148 context, params, queryEnv,
1149 dest, completionTag);
1151 if (reset_force_disabled)
1152 force_disabled = false;
1156 if (reset_force_disabled)
1157 force_disabled = false;
1164 * hash_query: calculate internal query ID for a query
1166 * As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1167 * queries to practical level. On the other hand pg_store_plans uses the
1168 * combination of query hash and plan hash values as the hash table key and
1169 * the resolution of the hash value effectively has the same degree so we
1170 * continue to use uint32 as internal queryid.
1172 * This may merge plans from different queries into single internal query id
1173 * but it is not a problem when pg_stat_statements is used together since the
1174 * extension gives enough resolution on queries.
1177 hash_query(const char* query)
1181 char *normquery = pstrdup(query);
1182 normalize_expr(normquery, false);
1183 queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1186 /* If we are unlucky enough to get a hash of zero, use 1 instead */
1195 * Store some statistics for a plan.
1197 * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1198 * value of the given plan, which is calculated in ths function.
1201 pgsp_store(char *plan, queryid_t queryId,
1202 double total_time, uint64 rows,
1203 const BufferUsage *bufusage)
1207 char *norm_query = NULL;
1209 char *normalized_plan = NULL;
1210 char *shorten_plan = NULL;
1211 volatile pgspEntry *e;
1212 Size plan_offset = 0;
1215 Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1217 /* Safety check... */
1218 if (!shared_state || !hash_table)
1221 /* Set up key for hashtable search */
1222 key.userid = GetUserId();
1223 key.dbid = MyDatabaseId;
1224 key.queryid = queryId;
1226 normalized_plan = pgsp_json_normalize(plan);
1227 shorten_plan = pgsp_json_shorten(plan);
1228 elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1229 elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1230 elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1231 plan_len = strlen(shorten_plan);
1233 key.planid = hash_any((const unsigned char *)normalized_plan,
1234 strlen(normalized_plan));
1235 pfree(normalized_plan);
1237 if (plan_len >= shared_state->plan_size)
1238 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1241 shared_state->plan_size - 1);
1244 /* Look up the hash table entry with shared lock. */
1245 LWLockAcquire(shared_state->lock, LW_SHARED);
1247 entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1249 /* Store the plan text, if the entry not present */
1250 if (!entry && plan_storage == PLAN_STORAGE_FILE)
1255 /* Append new plan text to file with only shared lock held */
1256 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1259 * Determine whether we need to garbage collect external query texts
1260 * while the shared lock is still held. This micro-optimization
1261 * avoids taking the time to decide this while holding exclusive lock.
1263 do_gc = need_gc_ptexts();
1265 /* Acquire exclusive lock as required by entry_alloc() */
1266 LWLockRelease(shared_state->lock);
1267 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1270 * A garbage collection may have occurred while we weren't holding the
1271 * lock. In the unlikely event that this happens, the plan text we
1272 * stored above will have been garbage collected, so write it again.
1273 * This should be infrequent enough that doing it while holding
1274 * exclusive lock isn't a performance problem.
1276 if (!stored || shared_state->gc_count != gc_count)
1277 stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1279 /* If we failed to write to the text file, give up */
1285 /* Create new entry, if not present */
1288 entry = entry_alloc(&key, plan_offset, plan_len, false);
1290 /* shorten_plan is terminated by NUL */
1291 if (plan_storage == PLAN_STORAGE_SHMEM)
1292 memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1295 /* If needed, perform garbage collection while exclusive lock held */
1300 /* Increment the counts, except when jstate is not NULL */
1303 * Grab the spinlock while updating the counters (see comment about
1304 * locking rules at the head of the file)
1307 e = (volatile pgspEntry *) entry;
1308 SpinLockAcquire(&e->mutex);
1310 /* "Unstick" entry if it was previously sticky */
1311 if (e->counters.calls == 0)
1313 e->counters.usage = USAGE_INIT;
1314 e->counters.first_call = GetCurrentTimestamp();
1317 e->counters.calls += 1;
1318 e->counters.total_time += total_time;
1319 if (e->counters.calls == 1)
1321 e->counters.min_time = total_time;
1322 e->counters.max_time = total_time;
1323 e->counters.mean_time = total_time;
1328 * Welford's method for accurately computing variance. See
1329 * <http://www.johndcook.com/blog/standard_deviation/>
1331 double old_mean = e->counters.mean_time;
1333 e->counters.mean_time +=
1334 (total_time - old_mean) / e->counters.calls;
1335 e->counters.sum_var_time +=
1336 (total_time - old_mean) * (total_time - e->counters.mean_time);
1338 /* calculate min and max time */
1339 if (e->counters.min_time > total_time)
1340 e->counters.min_time = total_time;
1341 if (e->counters.max_time < total_time)
1342 e->counters.max_time = total_time;
1345 e->counters.rows += rows;
1346 e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1347 e->counters.shared_blks_read += bufusage->shared_blks_read;
1348 e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1349 e->counters.shared_blks_written += bufusage->shared_blks_written;
1350 e->counters.local_blks_hit += bufusage->local_blks_hit;
1351 e->counters.local_blks_read += bufusage->local_blks_read;
1352 e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1353 e->counters.local_blks_written += bufusage->local_blks_written;
1354 e->counters.temp_blks_read += bufusage->temp_blks_read;
1355 e->counters.temp_blks_written += bufusage->temp_blks_written;
1357 e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1358 e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1359 e->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
1360 e->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
1362 e->counters.last_call = GetCurrentTimestamp();
1363 e->counters.usage += USAGE_EXEC(total_time);
1365 SpinLockRelease(&e->mutex);
1368 LWLockRelease(shared_state->lock);
1370 /* We postpone this pfree until we're out of the lock */
1376 * Reset all statement statistics.
1379 pg_store_plans_reset(PG_FUNCTION_ARGS)
1381 if (!shared_state || !hash_table)
1383 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1384 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1389 /* Number of output arguments (columns) for various API versions */
1390 #define PG_STORE_PLANS_COLS_V1_5 27
1391 #define PG_STORE_PLANS_COLS_V1_6 26
1392 #define PG_STORE_PLANS_COLS_V1_7 28
1393 #define PG_STORE_PLANS_COLS 28 /* maximum of above */
1396 * Retrieve statement statistics.
1399 pg_store_plans_1_7(PG_FUNCTION_ARGS)
1401 pg_store_plans_internal(fcinfo, PGSP_V1_7);
1407 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1409 pg_store_plans_internal(fcinfo, PGSP_V1_6);
1415 pg_store_plans(PG_FUNCTION_ARGS)
1417 pg_store_plans_internal(fcinfo, PGSP_V1_5);
1423 pg_store_plans_internal(FunctionCallInfo fcinfo,
1424 pgspVersion api_version)
1426 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1428 Tuplestorestate *tupstore;
1429 MemoryContext per_query_ctx;
1430 MemoryContext oldcontext;
1431 Oid userid = GetUserId();
1432 bool is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1434 char *pbuffer = NULL;
1435 Size pbuffer_size = 0;
1438 HASH_SEQ_STATUS hash_seq;
1441 if (!shared_state || !hash_table)
1443 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1444 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1446 /* check to see if caller supports us returning a tuplestore */
1447 if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1449 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1450 errmsg("set-valued function called in context that cannot accept a set")));
1451 if (!(rsinfo->allowedModes & SFRM_Materialize))
1453 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1454 errmsg("materialize mode required, but it is not " \
1455 "allowed in this context")));
1457 /* Build a tuple descriptor for our result type */
1458 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1459 elog(ERROR, "return type must be a row type");
1461 per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1462 oldcontext = MemoryContextSwitchTo(per_query_ctx);
1464 tupstore = tuplestore_begin_heap(true, false, work_mem);
1465 rsinfo->returnMode = SFRM_Materialize;
1466 rsinfo->setResult = tupstore;
1467 rsinfo->setDesc = tupdesc;
1469 MemoryContextSwitchTo(oldcontext);
1472 * We'd like to load the plan text file (if needed) while not holding any
1473 * lock on shared_state->lock. In the worst case we'll have to do this
1474 * again after we have the lock, but it's unlikely enough to make this a
1475 * win despite occasional duplicated work. We need to reload if anybody
1476 * writes to the file (either a retail ptext_store(), or a garbage
1477 * collection) between this point and where we've gotten shared lock. If a
1478 * ptext_store is actually in progress when we look, we might as well skip
1479 * the speculative load entirely.
1482 /* Take the mutex so we can examine variables */
1484 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1486 SpinLockAcquire(&s->mutex);
1488 n_writers = s->n_writers;
1489 gc_count = s->gc_count;
1490 SpinLockRelease(&s->mutex);
1493 /* No point in loading file now if there are active writers */
1494 if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1495 pbuffer = ptext_load_file(&pbuffer_size);
1498 * Get shared lock, load or reload the plan text file if we must, and
1499 * iterate over the hashtable entries.
1501 * With a large hash table, we might be holding the lock rather longer
1502 * than one could wish. However, this only blocks creation of new hash
1503 * table entries, and the larger the hash table the less likely that is to
1504 * be needed. So we can hope this is okay. Perhaps someday we'll decide
1505 * we need to partition the hash table to limit the time spent holding any
1508 LWLockAcquire(shared_state->lock, LW_SHARED);
1511 * Here it is safe to examine extent and gc_count without taking the mutex.
1512 * Note that although other processes might change shared_state->extent
1513 * just after we look at it, the strings they then write into the file
1514 * cannot yet be referenced in the hashtable, so we don't care whether we
1517 * If ptext_load_file fails, we just press on; we'll return NULL for every
1520 if (plan_storage == PLAN_STORAGE_FILE &&
1522 shared_state->extent != extent ||
1523 shared_state->gc_count != gc_count))
1527 pbuffer = ptext_load_file(&pbuffer_size);
1530 hash_seq_init(&hash_seq, hash_table);
1531 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1533 Datum values[PG_STORE_PLANS_COLS];
1534 bool nulls[PG_STORE_PLANS_COLS];
1536 int64 queryid = entry->key.queryid;
1537 int64 planid = entry->key.planid;
1541 memset(values, 0, sizeof(values));
1542 memset(nulls, 0, sizeof(nulls));
1544 values[i++] = ObjectIdGetDatum(entry->key.userid);
1545 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1546 if (is_allowed_role || entry->key.userid == userid)
1548 values[i++] = Int64GetDatumFast(queryid);
1549 values[i++] = Int64GetDatumFast(planid);
1551 /* fill queryid_stat_statements with the same value with queryid */
1552 if (api_version == PGSP_V1_5)
1553 values[i++] = Int64GetDatumFast(queryid);
1557 nulls[i++] = true; /* queryid */
1558 nulls[i++] = true; /* planid */
1560 /* queryid_stat_statemetns*/
1561 if (api_version == PGSP_V1_5)
1565 if (is_allowed_role || entry->key.userid == userid)
1567 char *pstr; /* Plan string */
1568 char *mstr; /* Modified plan string */
1569 char *estr; /* Encoded modified plan string */
1571 if (plan_storage == PLAN_STORAGE_FILE)
1572 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1573 pbuffer, pbuffer_size);
1575 pstr = SHMEM_PLAN_PTR(entry);
1577 switch (plan_format)
1579 case PLAN_FORMAT_TEXT:
1580 mstr = pgsp_json_textize(pstr);
1582 case PLAN_FORMAT_JSON:
1583 mstr = pgsp_json_inflate(pstr);
1585 case PLAN_FORMAT_YAML:
1586 mstr = pgsp_json_yamlize(pstr);
1588 case PLAN_FORMAT_XML:
1589 mstr = pgsp_json_xmlize(pstr);
1597 pg_do_encoding_conversion((unsigned char *) mstr,
1600 GetDatabaseEncoding());
1601 values[i++] = CStringGetTextDatum(estr);
1609 /* pstr is a pointer onto pbuffer */
1612 values[i++] = CStringGetTextDatum("<insufficient privilege>");
1614 /* copy counters to a local variable to keep locking time short */
1616 volatile pgspEntry *e = (volatile pgspEntry *) entry;
1618 SpinLockAcquire(&e->mutex);
1620 SpinLockRelease(&e->mutex);
1623 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1627 values[i++] = Int64GetDatumFast(tmp.calls);
1628 values[i++] = Float8GetDatumFast(tmp.total_time);
1629 values[i++] = Float8GetDatumFast(tmp.min_time);
1630 values[i++] = Float8GetDatumFast(tmp.max_time);
1631 values[i++] = Float8GetDatumFast(tmp.mean_time);
1634 * Note we are calculating the population variance here, not the
1635 * sample variance, as we have data for the whole population, so
1636 * Bessel's correction is not used, and we don't divide by
1640 stddev = sqrt(tmp.sum_var_time / tmp.calls);
1643 values[i++] = Float8GetDatumFast(stddev);
1645 values[i++] = Int64GetDatumFast(tmp.rows);
1646 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1647 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1648 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1649 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1650 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1651 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1652 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1653 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1654 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1655 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1656 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1657 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1659 if (api_version >= PGSP_V1_7)
1661 values[i++] = Float8GetDatumFast(tmp.temp_blk_read_time);
1662 values[i++] = Float8GetDatumFast(tmp.temp_blk_write_time);
1665 values[i++] = TimestampTzGetDatum(tmp.first_call);
1666 values[i++] = TimestampTzGetDatum(tmp.last_call);
1668 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1669 api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1670 api_version == PGSP_V1_7 ? PG_STORE_PLANS_COLS_V1_7 :
1671 -1 /* fail if you forget to update this assert */ ));
1673 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1676 LWLockRelease(shared_state->lock);
1678 /* clean up and return the tuplestore */
1679 tuplestore_donestoring(tupstore);
1682 /* Number of output arguments (columns) for pg_stat_statements_info */
1683 #define PG_STORE_PLANS_INFO_COLS 2
1686 * Return statistics of pg_stat_statements.
1689 pg_store_plans_info(PG_FUNCTION_ARGS)
1691 pgspGlobalStats stats;
1693 Datum values[PG_STORE_PLANS_INFO_COLS];
1694 bool nulls[PG_STORE_PLANS_INFO_COLS];
1696 if (!shared_state || !hash_table)
1698 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1699 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1701 /* Build a tuple descriptor for our result type */
1702 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1703 elog(ERROR, "return type must be a row type");
1705 MemSet(values, 0, sizeof(values));
1706 MemSet(nulls, 0, sizeof(nulls));
1708 /* Read global statistics for pg_stat_statements */
1710 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1712 SpinLockAcquire(&s->mutex);
1714 SpinLockRelease(&s->mutex);
1717 values[0] = Int64GetDatum(stats.dealloc);
1718 values[1] = TimestampTzGetDatum(stats.stats_reset);
1720 PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1724 * Estimate shared memory space needed.
1727 shared_mem_size(void)
1732 size = MAXALIGN(sizeof(pgspSharedState));
1733 entry_size = sizeof(pgspEntry);
1735 /* plan text is apppended to the struct body */
1736 if (plan_storage == PLAN_STORAGE_SHMEM)
1737 entry_size += max_plan_len;
1739 size = add_size(size, hash_estimate_size(store_size, entry_size));
1745 * Allocate a new hashtable entry.
1746 * caller must hold an exclusive lock on shared_state->lock
1748 * "plan" need not be null-terminated; we rely on plan_len instead
1750 * If "sticky" is true, make the new entry artificially sticky so that it will
1751 * probably still be there when the query finishes execution. We do this by
1752 * giving it a median usage value rather than the normal value. (Strictly
1753 * speaking, query strings are normalized on a best effort basis, though it
1754 * would be difficult to demonstrate this even under artificial conditions.)
1756 * Note: despite needing exclusive lock, it's not an error for the target
1757 * entry to already exist. This is because pgsp_store releases and
1758 * reacquires lock after failing to find a match; so someone else could
1759 * have made the entry while we waited to get exclusive lock.
1762 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1767 /* Make space if needed */
1768 while (hash_get_num_entries(hash_table) >= store_size)
1771 /* Find or create an entry with desired hash code */
1772 entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1776 /* New entry, initialize it */
1778 /* reset the statistics */
1779 memset(&entry->counters, 0, sizeof(Counters));
1780 /* set the appropriate initial usage count */
1781 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1782 /* re-initialize the mutex each time ... we assume no one using it */
1783 SpinLockInit(&entry->mutex);
1784 /* ... and don't forget the query text */
1785 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1786 entry->plan_offset = plan_offset;
1787 entry->plan_len = plan_len;
1788 entry->encoding = GetDatabaseEncoding();
1795 * qsort comparator for sorting into increasing usage order
1798 entry_cmp(const void *lhs, const void *rhs)
1800 double l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1801 double r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1803 if (l_usage < r_usage)
1805 else if (l_usage > r_usage)
1812 * Deallocate least used entries.
1813 * Caller must hold an exclusive lock on shared_state->lock.
1818 HASH_SEQ_STATUS hash_seq;
1819 pgspEntry **entries;
1827 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1828 * While we're scanning the table, apply the decay factor to the usage
1832 entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1838 hash_seq_init(&hash_seq, hash_table);
1839 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1841 entries[i++] = entry;
1842 /* "Sticky" entries get a different usage decay rate. */
1843 if (entry->counters.calls == 0)
1844 entry->counters.usage *= STICKY_DECREASE_FACTOR;
1846 entry->counters.usage *= USAGE_DECREASE_FACTOR;
1848 /* In the mean length computation, ignore dropped texts. */
1849 if (entry->plan_len >= 0)
1851 tottextlen += entry->plan_len + 1;
1856 qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1858 /* Also, record the (approximate) median usage */
1860 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1861 /* Record the mean plan length */
1862 if (nvalidtexts > 0)
1863 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1865 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1867 nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1868 nvictims = Min(nvictims, i);
1870 for (i = 0; i < nvictims; i++)
1872 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1877 /* Increment the number of times entries are deallocated */
1879 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1881 SpinLockAcquire(&s->mutex);
1882 s->stats.dealloc += 1;
1883 SpinLockRelease(&s->mutex);
1888 * Given a plan string (not necessarily null-terminated), allocate a new
1889 * entry in the external plan text file and store the string there.
1891 * If successful, returns true, and stores the new entry's offset in the file
1892 * into *plan_offset. Also, if gc_count isn't NULL, *gc_count is set to the
1893 * number of garbage collections that have occurred so far.
1895 * On failure, returns false.
1897 * At least a shared lock on shared_state->lock must be held by the caller, so
1898 * as to prevent a concurrent garbage collection. Share-lock-holding callers
1899 * should pass a gc_count pointer to obtain the number of garbage collections,
1900 * so that they can recheck the count after obtaining exclusive lock to detect
1901 * whether a garbage collection occurred (and removed this entry).
1904 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1909 Assert (plan_storage == PLAN_STORAGE_FILE);
1912 * We use a spinlock to protect extent/n_writers/gc_count, so that
1913 * multiple processes may execute this function concurrently.
1916 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1918 SpinLockAcquire(&s->mutex);
1920 s->extent += plan_len + 1;
1923 *gc_count = s->gc_count;
1924 SpinLockRelease(&s->mutex);
1929 /* Now write the data into the successfully-reserved part of the file */
1930 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1934 if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1936 if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1939 CloseTransientFile(fd);
1941 /* Mark our write complete */
1943 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1945 SpinLockAcquire(&s->mutex);
1947 SpinLockRelease(&s->mutex);
1954 (errcode_for_file_access(),
1955 errmsg("could not write file \"%s\": %m",
1959 CloseTransientFile(fd);
1961 /* Mark our write complete */
1963 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1965 SpinLockAcquire(&s->mutex);
1967 SpinLockRelease(&s->mutex);
1974 * Read the external plan text file into a malloc'd buffer.
1976 * Returns NULL (without throwing an error) if unable to read, eg
1977 * file not there or insufficient memory.
1979 * On success, the buffer size is also returned into *buffer_size.
1981 * This can be called without any lock on shared_state->lock, but in that case
1982 * the caller is responsible for verifying that the result is sane.
1985 ptext_load_file(Size *buffer_size)
1992 Assert (plan_storage == PLAN_STORAGE_FILE);
1994 fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1997 if (errno != ENOENT)
1999 (errcode_for_file_access(),
2000 errmsg("could not read file \"%s\": %m",
2005 /* Get file length */
2006 if (fstat(fd, &stat))
2009 (errcode_for_file_access(),
2010 errmsg("could not stat file \"%s\": %m",
2012 CloseTransientFile(fd);
2016 /* Allocate buffer; beware that off_t might be wider than size_t */
2017 if (stat.st_size <= MaxAllocHugeSize)
2018 buf = (char *) malloc(stat.st_size);
2024 (errcode(ERRCODE_OUT_OF_MEMORY),
2025 errmsg("out of memory"),
2026 errdetail("Could not allocate enough memory to read file \"%s\".",
2028 CloseTransientFile(fd);
2033 * OK, slurp in the file. Windows fails if we try to read more than
2034 * INT_MAX bytes at once, and other platforms might not like that either,
2035 * so read a very large file in 1GB segments.
2038 while (nread < stat.st_size)
2040 int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2043 * If we get a short read and errno doesn't get set, the reason is
2044 * probably that garbage collection truncated the file since we did
2045 * the fstat(), so we don't log a complaint --- but we don't return
2046 * the data, either, since it's most likely corrupt due to concurrent
2047 * writes from garbage collection.
2050 if (read(fd, buf + nread, toread) != toread)
2054 (errcode_for_file_access(),
2055 errmsg("could not read file \"%s\": %m",
2058 CloseTransientFile(fd);
2064 if (CloseTransientFile(fd) != 0)
2066 (errcode_for_file_access(),
2067 errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
2069 *buffer_size = nread;
2074 * Locate a plan text in the file image previously read by ptext_load_file().
2076 * We validate the given offset/length, and return NULL if bogus. Otherwise,
2077 * the result points to a null-terminated string within the buffer.
2080 ptext_fetch(Size plan_offset, int plan_len,
2081 char *buffer, Size buffer_size)
2083 Assert (plan_storage == PLAN_STORAGE_FILE);
2085 /* File read failed? */
2088 /* Bogus offset/length? */
2090 plan_offset + plan_len >= buffer_size)
2092 /* As a further sanity check, make sure there's a trailing null */
2093 if (buffer[plan_offset + plan_len] != '\0')
2096 return buffer + plan_offset;
2100 * Do we need to garbage-collect the external plan text file?
2102 * Caller should hold at least a shared lock on shared_state->lock.
2105 need_gc_ptexts(void)
2109 Assert (plan_storage == PLAN_STORAGE_FILE);
2111 /* Read shared extent pointer */
2113 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2115 SpinLockAcquire(&s->mutex);
2117 SpinLockRelease(&s->mutex);
2120 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2121 if (extent < 512 * store_size)
2125 * Don't proceed if file is less than about 50% bloat. Nothing can or
2126 * should be done in the event of unusually large query texts accounting
2127 * for file's large size. We go to the trouble of maintaining the mean
2128 * query length in order to prevent garbage collection from thrashing
2131 if (extent < shared_state->mean_plan_len * store_size * 2)
2138 * Garbage-collect orphaned plan texts in external file.
2140 * This won't be called often in the typical case, since it's likely that
2141 * there won't be too much churn, and besides, a similar compaction process
2142 * occurs when serializing to disk at shutdown or as part of resetting.
2143 * Despite this, it seems prudent to plan for the edge case where the file
2144 * becomes unreasonably large, with no other method of compaction likely to
2145 * occur in the foreseeable future.
2147 * The caller must hold an exclusive lock on shared_state->lock.
2149 * At the first sign of trouble we unlink the query text file to get a clean
2150 * slate (although existing statistics are retained), rather than risk
2151 * thrashing by allowing the same problem case to recur indefinitely.
2159 HASH_SEQ_STATUS hash_seq;
2164 Assert (plan_storage == PLAN_STORAGE_FILE);
2167 * When called from store_entry, some other session might have proceeded
2168 * with garbage collection in the no-lock-held interim of lock strength
2169 * escalation. Check once more that this is actually necessary.
2171 if (!need_gc_ptexts())
2175 * Load the old texts file. If we fail (out of memory, for instance),
2176 * invalidate query texts. Hopefully this is rare. It might seem better
2177 * to leave things alone on an OOM failure, but the problem is that the
2178 * file is only going to get bigger; hoping for a future non-OOM result is
2179 * risky and can easily lead to complete denial of service.
2181 pbuffer = ptext_load_file(&pbuffer_size);
2182 if (pbuffer == NULL)
2186 * We overwrite the plan texts file in place, so as to reduce the risk of
2187 * an out-of-disk-space failure. Since the file is guaranteed not to get
2188 * larger, this should always work on traditional filesystems; though we
2189 * could still lose on copy-on-write filesystems.
2191 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2195 (errcode_for_file_access(),
2196 errmsg("could not write file \"%s\": %m",
2204 hash_seq_init(&hash_seq, hash_table);
2205 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2207 int plan_len = entry->plan_len;
2208 char *plan = ptext_fetch(entry->plan_offset,
2215 /* Trouble ... drop the text */
2216 entry->plan_offset = 0;
2217 entry->plan_len = -1;
2218 /* entry will not be counted in mean plan length computation */
2222 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2225 (errcode_for_file_access(),
2226 errmsg("could not write file \"%s\": %m",
2228 hash_seq_term(&hash_seq);
2232 entry->plan_offset = extent;
2233 extent += plan_len + 1;
2238 * Truncate away any now-unused space. If this fails for some odd reason,
2239 * we log it, but there's no need to fail.
2241 if (ftruncate(fileno(pfile), extent) != 0)
2243 (errcode_for_file_access(),
2244 errmsg("could not truncate file \"%s\": %m",
2247 if (FreeFile(pfile))
2250 (errcode_for_file_access(),
2251 errmsg("could not write file \"%s\": %m",
2257 elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2258 shared_state->extent, extent);
2260 /* Reset the shared extent pointer */
2261 shared_state->extent = extent;
2264 * Also update the mean plan length, to be sure that need_gc_ptexts()
2265 * won't still think we have a problem.
2268 shared_state->mean_plan_len = extent / nentries;
2270 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2277 /* clean up resources */
2284 * Since the contents of the external file are now uncertain, mark all
2285 * hashtable entries as having invalid texts.
2287 hash_seq_init(&hash_seq, hash_table);
2288 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2290 entry->plan_offset = 0;
2291 entry->plan_len = -1;
2295 * Destroy the query text file and create a new, empty one
2297 (void) unlink(PGSP_TEXT_FILE);
2298 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2301 (errcode_for_file_access(),
2302 errmsg("could not recreate file \"%s\": %m",
2307 /* Reset the shared extent pointer */
2308 shared_state->extent = 0;
2310 /* Reset mean_plan_len to match the new state */
2311 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2315 * Release all entries.
2320 HASH_SEQ_STATUS hash_seq;
2324 if (!shared_state || !hash_table)
2326 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2327 errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2329 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2331 hash_seq_init(&hash_seq, hash_table);
2332 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2334 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2338 * Reset global statistics for pg_store_plans.
2341 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2342 TimestampTz stats_reset = GetCurrentTimestamp();
2344 SpinLockAcquire(&s->mutex);
2345 s->stats.dealloc = 0;
2346 s->stats.stats_reset = stats_reset;
2347 SpinLockRelease(&s->mutex);
2351 * Write new empty plan file, perhaps even creating a new one to recover
2352 * if the file was missing.
2354 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2358 (errcode_for_file_access(),
2359 errmsg("could not create file \"%s\": %m",
2364 /* If ftruncate fails, log it, but it's not a fatal problem */
2365 if (ftruncate(fileno(pfile), 0) != 0)
2367 (errcode_for_file_access(),
2368 errmsg("could not truncate file \"%s\": %m",
2374 shared_state->extent = 0;
2375 LWLockRelease(shared_state->lock);
2379 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2381 PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2385 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2387 text *short_plan = PG_GETARG_TEXT_P(0);
2388 char *cjson = text_to_cstring(short_plan);
2389 char *cshorten = pgsp_json_shorten(cjson);
2390 PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2394 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2396 text *short_plan = PG_GETARG_TEXT_P(0);
2397 char *cjson = text_to_cstring(short_plan);
2398 char *cnormalized = pgsp_json_normalize(cjson);
2399 PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2403 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2405 text *short_plan = PG_GETARG_TEXT_P(0);
2406 char *cshort = text_to_cstring(short_plan);
2407 char *cinflated = pgsp_json_inflate(cshort);
2408 PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2412 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2414 text *short_plan = PG_GETARG_TEXT_P(0);
2415 char *cshort = text_to_cstring(short_plan);
2416 char *ctextized = pgsp_json_textize(cshort);
2418 PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2422 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2424 text *short_plan = PG_GETARG_TEXT_P(0);
2425 char *cshort = text_to_cstring(short_plan);
2426 char *cyamlized = pgsp_json_yamlize(cshort);
2428 PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2432 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2434 text *short_plan = PG_GETARG_TEXT_P(0);
2435 char *cshort = text_to_cstring(short_plan);
2436 char *cxmlized = pgsp_json_xmlize(cshort);
2438 PG_RETURN_TEXT_P(cstring_to_text(cxmlized));