OSDN Git Service

Add new columns to pg_store_plans view
[pgstoreplans/pg_store_plans.git] / pg_store_plans.c
1 /*-------------------------------------------------------------------------
2  *
3  * pg_store_plans.c
4  *              Take statistics of plan selection across a whole database cluster.
5  *
6  * Execution costs are totaled for each distinct plan for each query,
7  * and plan and queryid are kept in a shared hashtable, each record in
8  * which is associated with a record in pg_stat_statements, if any, by
9  * the queryid.
10  *
11  * For Postgres 9.3 or earlier does not expose query id so
12  * pg_store_plans needs to calculate it based on the given query
13  * string using different algorithm from pg_stat_statements, and later
14  * the id will be matched against the one made from query string
15  * stored in pg_stat_statements. For the reason, queryid matching in
16  * this way will fail if the query string kept in pg_stat_statements
17  * is truncated in the middle.
18  *
19  * Plans are identified by fingerprinting plan representations in
20  * "shortened" JSON format with constants and unstable values such as
21  * rows, width, loops ignored. Nevertheless, stored plan entries hold
22  * them of the latest execution. Entry eviction is done in the same
23  * way to pg_stat_statements.
24  *
25  * Copyright (c) 2008-2022, PostgreSQL Global Development Group
26  * Copyright (c) 2012-2022, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
27  *
28  * IDENTIFICATION
29  *        pg_store_plans/pg_store_plans.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34
35 #include <sys/stat.h>
36 #include <unistd.h>
37 #include <dlfcn.h>
38 #include <math.h>
39
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
44 #include "funcapi.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
58 #endif
59 #include "utils/timestamp.h"
60
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
63
64 PG_MODULE_MAGIC;
65
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE  "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE  PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
69
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
72
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20221214;
75 static int max_plan_len = 5000;
76
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration)    (1.0)
79 #define USAGE_INIT                              (1.0)   /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT             (10.0)  /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT             1024    /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR   (0.99)  /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR  (0.50)  /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT   5               /* free this % of entries at once */
85
86 /* In PostgreSQL 11, queryid becomes a uint64 internally. */
87 #if PG_VERSION_NUM >= 110000
88 typedef uint64 queryid_t;
89 #define PGSP_NO_QUERYID         UINT64CONST(0)
90 #else
91 typedef uint32 queryid_t;
92 #define PGSP_NO_QUERYID         0
93 #endif
94
95 /*
96  * Extension version number, for supporting older extension versions' objects
97  */
98 typedef enum pgspVersion
99 {
100         PGSP_V1_5 = 0,
101         PGSP_V1_6,
102         PGSP_V1_7
103 } pgspVersion;
104
105 /*
106  * Hashtable key that defines the identity of a hashtable entry.  We separate
107  * queries by user and by database even if they are otherwise identical.
108  *
109  * Presently, the query encoding is fully determined by the source database
110  * and so we don't really need it to be in the key.  But that might not always
111  * be true. Anyway it's notationally convenient to pass it as part of the key.
112  */
113 typedef struct pgspHashKey
114 {
115         Oid                     userid;                 /* user OID */
116         Oid                     dbid;                   /* database OID */
117         queryid_t       queryid;                /* query identifier */
118         uint32          planid;                 /* plan identifier */
119 } pgspHashKey;
120
121 /*
122  * The actual stats counters kept within pgspEntry.
123  */
124 typedef struct Counters
125 {
126         int64           calls;                          /* # of times executed */
127         double          total_time;                     /* total execution time, in msec */
128         double          min_time;                       /* minimum execution time in msec */
129         double          max_time;                       /* maximum execution time in msec */
130         double          mean_time;                      /* mean execution time in msec */
131         double          sum_var_time;   /* sum of variances in execution time in msec */
132         int64           rows;                           /* total # of retrieved or affected rows */
133         int64           shared_blks_hit;        /* # of shared buffer hits */
134         int64           shared_blks_read;       /* # of shared disk blocks read */
135         int64           shared_blks_dirtied;/* # of shared disk blocks dirtied */
136         int64           shared_blks_written;/* # of shared disk blocks written */
137         int64           local_blks_hit;         /* # of local buffer hits */
138         int64           local_blks_read;        /* # of local disk blocks read */
139         int64           local_blks_dirtied;     /* # of local disk blocks dirtied */
140         int64           local_blks_written;     /* # of local disk blocks written */
141         int64           temp_blks_read;         /* # of temp blocks read */
142         int64           temp_blks_written;      /* # of temp blocks written */
143         double          blk_read_time;          /* time spent reading, in msec */
144         double          blk_write_time;         /* time spent writing, in msec */
145         double          temp_blk_read_time;     /* time spent reading temp blocks,
146                                                                            in msec */
147         double          temp_blk_write_time;/* time spent writing temp blocks,
148                                                                            in msec */
149         TimestampTz     first_call;                     /* timestamp of first call  */
150         TimestampTz     last_call;                      /* timestamp of last call  */
151         double          usage;                          /* usage factor */
152 } Counters;
153
154 /*
155  * Global statistics for pg_store_plans
156  */
157 typedef struct pgspGlobalStats
158 {
159         int64           dealloc;                /* # of times entries were deallocated */
160         TimestampTz stats_reset;        /* timestamp with all stats reset */
161 } pgspGlobalStats;
162
163 /*
164  * Statistics per plan
165  *
166  * NB: see the file read/write code before changing field order here.
167  */
168 typedef struct pgspEntry
169 {
170         pgspHashKey     key;                    /* hash key of entry - MUST BE FIRST */
171         Counters        counters;               /* the statistics for this query */
172         Size            plan_offset;    /* plan text offset in extern file */
173         int                     plan_len;               /* # of valid bytes in query string */
174         int                     encoding;               /* query encoding */
175         slock_t         mutex;                  /* protects the counters only */
176 } pgspEntry;
177
178 /*
179  * Global shared state
180  */
181 typedef struct pgspSharedState
182 {
183         LWLock     *lock;                       /* protects hashtable search/modification */
184         int                     plan_size;              /* max query length in bytes */
185         double          cur_median_usage;       /* current median usage in hashtable */
186         Size            mean_plan_len;  /* current mean entry text length */
187         slock_t         mutex;                  /* protects following fields only: */
188         Size            extent;                 /* current extent of plan file */
189         int                     n_writers;              /* number of active writers to query file */
190         int                     gc_count;               /* plan file garbage collection cycle count */
191         pgspGlobalStats stats;          /* global statistics for pgsp */
192 } pgspSharedState;
193
194 /*---- Local variables ----*/
195
196 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
197 static int      nested_level = 0;
198
199 /* Saved hook values in case of unload */
200 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
201 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
202 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
203 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
204 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
205 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
206
207 /* Links to shared memory state */
208 static pgspSharedState *shared_state = NULL;
209 static HTAB *hash_table = NULL;
210
211 /*---- GUC variables ----*/
212
213 typedef enum
214 {
215         TRACK_LEVEL_NONE,                       /* track no statements */
216         TRACK_LEVEL_TOP,                                /* only top level statements */
217         TRACK_LEVEL_ALL,                                /* all statements, including nested ones */
218         TRACK_LEVEL_VERBOSE                     /* all statements, including internal ones */
219 }       PGSPTrackLevel;
220
221 static const struct config_enum_entry track_options[] =
222 {
223         {"none", TRACK_LEVEL_NONE, false},
224         {"top", TRACK_LEVEL_TOP, false},
225         {"all", TRACK_LEVEL_ALL, false},
226         {"verbose", TRACK_LEVEL_VERBOSE, false},
227         {NULL, 0, false}
228 };
229
230 typedef enum
231 {
232         PLAN_FORMAT_RAW,                /* No conversion. Shorten JSON */
233         PLAN_FORMAT_TEXT,               /* Traditional text representation */
234         PLAN_FORMAT_JSON,               /* JSON representation */
235         PLAN_FORMAT_YAML,               /* YAML */
236         PLAN_FORMAT_XML,                /* XML  */
237 }       PGSPPlanFormats;
238
239 static const struct config_enum_entry plan_formats[] =
240 {
241         {"raw" , PLAN_FORMAT_RAW , false},
242         {"text", PLAN_FORMAT_TEXT, false},
243         {"json", PLAN_FORMAT_JSON, false},
244         {"yaml", PLAN_FORMAT_YAML, false},
245         {"xml" , PLAN_FORMAT_XML , false},
246         {NULL, 0, false}
247 };
248
249 /* options for plan storage */
250 typedef enum
251 {
252         PLAN_STORAGE_SHMEM,             /* plan is stored as a part of hash entry */
253         PLAN_STORAGE_FILE               /* plan is stored in a separate file */
254 }  pgspPlanStorage;
255
256 static const struct config_enum_entry plan_storage_options[] =
257 {
258         {"shmem", PLAN_STORAGE_SHMEM, false},
259         {"file", PLAN_STORAGE_FILE, false},
260         {NULL, 0, false}
261 };
262
263 static int      store_size;                     /* max # statements to track */
264 static int      track_level;            /* tracking level */
265 static int      min_duration;           /* min duration to record */
266 static bool dump_on_shutdown;   /* whether to save stats across shutdown */
267 static bool log_analyze;                /* Similar to EXPLAIN (ANALYZE *) */
268 static bool log_verbose;                /* Similar to EXPLAIN (VERBOSE *) */
269 static bool log_buffers;                /* Similar to EXPLAIN (BUFFERS *) */
270 static bool log_timing;                 /* Similar to EXPLAIN (TIMING *) */
271 static bool log_triggers;               /* whether to log trigger statistics  */
272 static int  plan_format;                /* Plan representation style in
273                                                                  * pg_store_plans.plan  */
274 static int  plan_storage;               /* Plan storage type */
275
276
277 /* disables tracking overriding track_level */
278 static bool force_disabled = false;
279
280 #if PG_VERSION_NUM >= 140000
281 /*
282  * For pg14 and later, we rely on core queryid calculation.  If
283  * it's not available it means that the admin explicitly refused to
284  * compute it, for performance reason or other.  In that case, we
285  * will also consider that this extension is disabled.
286  */
287 #define pgsp_enabled(q) \
288         (!force_disabled &&                                                                                       \
289          (track_level >= TRACK_LEVEL_ALL ||                                                       \
290           (track_level == TRACK_LEVEL_TOP && nested_level == 0)) &&       \
291          (q != PGSP_NO_QUERYID))
292 #else
293 #define pgsp_enabled(q) \
294         (!force_disabled &&                                                                                     \
295          (track_level >= TRACK_LEVEL_ALL ||                                                     \
296           (track_level == TRACK_LEVEL_TOP && nested_level == 0)))
297 #endif
298
299 #define SHMEM_PLAN_PTR(ent) (((char *) ent) + sizeof(pgspEntry))
300
301 /*---- Function declarations ----*/
302
303 void            _PG_init(void);
304 void            _PG_fini(void);
305
306 Datum           pg_store_plans_reset(PG_FUNCTION_ARGS);
307 Datum           pg_store_plans_hash_query(PG_FUNCTION_ARGS);
308 Datum           pg_store_plans(PG_FUNCTION_ARGS);
309 Datum           pg_store_plans_shorten(PG_FUNCTION_ARGS);
310 Datum           pg_store_plans_normalize(PG_FUNCTION_ARGS);
311 Datum           pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
312 Datum           pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
313 Datum           pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
314 Datum           pg_store_plans_textplan(PG_FUNCTION_ARGS);
315 Datum           pg_store_plans_info(PG_FUNCTION_ARGS);
316
317 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
318 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
319 PG_FUNCTION_INFO_V1(pg_store_plans);
320 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
321 PG_FUNCTION_INFO_V1(pg_store_plans_1_7);
322 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
323 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
324 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
325 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
326 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
327 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
328 PG_FUNCTION_INFO_V1(pg_store_plans_info);
329
330 #if PG_VERSION_NUM < 130000
331 #define COMPTAG_TYPE char
332 #else
333 #define COMPTAG_TYPE QueryCompletion
334 #endif
335
336 #if PG_VERSION_NUM < 140000
337 #define ROLE_PG_READ_ALL_STATS          DEFAULT_ROLE_READ_ALL_STATS
338 #endif
339
340 #if PG_VERSION_NUM >= 150000
341 static shmem_request_hook_type prev_shmem_request_hook = NULL;
342 #endif
343
344 static void pgsp_shmem_request(void);
345 static void pgsp_shmem_startup(void);
346 static void pgsp_shmem_shutdown(int code, Datum arg);
347 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
348 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
349                                  ScanDirection direction,
350                                                          uint64 count, bool execute_once);
351 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
352 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
353 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
354 #if PG_VERSION_NUM >= 140000
355                                         bool readOnlyTree,
356 #endif
357                                         ProcessUtilityContext context, ParamListInfo params,
358                                         QueryEnvironment *queryEnv,
359                                         DestReceiver *dest, COMPTAG_TYPE *completionTag);
360 static uint32 hash_query(const char* query);
361 static void pgsp_store(char *plan, queryid_t queryId,
362                    double total_time, uint64 rows,
363                    const BufferUsage *bufusage);
364 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
365                                                                         pgspVersion api_version);
366 static Size shared_mem_size(void);
367 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
368                                                           bool sticky);
369 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
370                                                 int *gc_count);
371 static char *ptext_load_file(Size *buffer_size);
372 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
373                                                  Size buffer_size);
374 static bool need_gc_ptexts(void);
375 static void gc_ptexts(void);
376 static void entry_dealloc(void);
377 static void entry_reset(void);
378
379 /*
380  * Module load callback
381  */
382 void
383 _PG_init(void)
384 {
385         /*
386          * In order to create our shared memory area, we have to be loaded via
387          * shared_preload_libraries.  If not, fall out without hooking into any of
388          * the main system.  (We don't throw error here because it seems useful to
389          * allow the pg_stat_statements functions to be created even when the
390          * module isn't active.  The functions must protect themselves against
391          * being called then, however.)
392          */
393         if (!process_shared_preload_libraries_in_progress)
394                 return;
395 #if PG_VERSION_NUM >= 140000
396         /*
397          * Inform the postmaster that we want to enable query_id calculation if
398          * compute_query_id is set to auto.
399          */
400         EnableQueryId();
401 #endif
402
403         /*
404          * Define (or redefine) custom GUC variables.
405          */
406         DefineCustomIntVariable("pg_store_plans.max",
407           "Sets the maximum number of plans tracked by pg_store_plans.",
408                                                         NULL,
409                                                         &store_size,
410                                                         1000,
411                                                         100,
412                                                         INT_MAX,
413                                                         PGC_POSTMASTER,
414                                                         0,
415                                                         NULL,
416                                                         NULL,
417                                                         NULL);
418
419         DefineCustomIntVariable("pg_store_plans.max_plan_length",
420           "Sets the maximum length of plans stored by pg_store_plans.",
421                                                         NULL,
422                                                         &max_plan_len,
423                                                         5000,
424                                                         100,
425                                                         INT32_MAX,
426                                                         PGC_POSTMASTER,
427                                                         0,
428                                                         NULL,
429                                                         NULL,
430                                                         NULL);
431
432         DefineCustomEnumVariable("pg_store_plans.plan_storage",
433                            "Selects where to store plan texts.",
434                                                          NULL,
435                                                          &plan_storage,
436                                                          PLAN_STORAGE_FILE,
437                                                          plan_storage_options,
438                                                          PGC_POSTMASTER,
439                                                          0,
440                                                          NULL,
441                                                          NULL,
442                                                          NULL);
443
444         DefineCustomEnumVariable("pg_store_plans.track",
445                            "Selects which plans are tracked by pg_store_plans.",
446                                                          NULL,
447                                                          &track_level,
448                                                          TRACK_LEVEL_TOP,
449                                                          track_options,
450                                                          PGC_SUSET,
451                                                          0,
452                                                          NULL,
453                                                          NULL,
454                                                          NULL);
455
456         DefineCustomEnumVariable("pg_store_plans.plan_format",
457                            "Selects which format to be appied for plan representation in pg_store_plans.",
458                                                          NULL,
459                                                          &plan_format,
460                                                          PLAN_FORMAT_TEXT,
461                                                          plan_formats,
462                                                          PGC_USERSET,
463                                                          0,
464                                                          NULL,
465                                                          NULL,
466                                                          NULL);
467
468         DefineCustomIntVariable("pg_store_plans.min_duration",
469                                         "Minimum duration to record plan in milliseconds.",
470                                                         NULL,
471                                                         &min_duration,
472                                                         0,
473                                                         0,
474                                                         INT_MAX,
475                                                         PGC_SUSET,
476                                                         0,
477                                                         NULL,
478                                                         NULL,
479                                                         NULL);
480
481         DefineCustomBoolVariable("pg_store_plans.save",
482                            "Save pg_store_plans statistics across server shutdowns.",
483                                                          NULL,
484                                                          &dump_on_shutdown,
485                                                          true,
486                                                          PGC_SIGHUP,
487                                                          0,
488                                                          NULL,
489                                                          NULL,
490                                                          NULL);
491
492         DefineCustomBoolVariable("pg_store_plans.log_analyze",
493                                                          "Use EXPLAIN ANALYZE for plan logging.",
494                                                          NULL,
495                                                          &log_analyze,
496                                                          false,
497                                                          PGC_SUSET,
498                                                          0,
499                                                          NULL,
500                                                          NULL,
501                                                          NULL);
502
503         DefineCustomBoolVariable("pg_store_plans.log_buffers",
504                                                          "Log buffer usage.",
505                                                          NULL,
506                                                          &log_buffers,
507                                                          false,
508                                                          PGC_SUSET,
509                                                          0,
510                                                          NULL,
511                                                          NULL,
512                                                          NULL);
513
514         DefineCustomBoolVariable("pg_store_plans.log_timing",
515                                                          "Log timings.",
516                                                          NULL,
517                                                          &log_timing,
518                                                          true,
519                                                          PGC_SUSET,
520                                                          0,
521                                                          NULL,
522                                                          NULL,
523                                                          NULL);
524
525         DefineCustomBoolVariable("pg_store_plans.log_triggers",
526                                                          "Log trigger trace.",
527                                                          NULL,
528                                                          &log_triggers,
529                                                          false,
530                                                          PGC_SUSET,
531                                                          0,
532                                                          NULL,
533                                                          NULL,
534                                                          NULL);
535
536         DefineCustomBoolVariable("pg_store_plans.log_verbose",
537                            "Set VERBOSE for EXPLAIN on logging.",
538                                                          NULL,
539                                                          &log_verbose,
540                                                          false,
541                                                          PGC_SUSET,
542                                                          0,
543                                                          NULL,
544                                                          NULL,
545                                                          NULL);
546
547         EmitWarningsOnPlaceholders("pg_store_plans");
548
549 #if PG_VERSION_NUM < 150000     
550         pgsp_shmem_request();
551 #endif
552
553         /*
554          * Install hooks.
555          */
556 #if PG_VERSION_NUM >= 150000
557         prev_shmem_request_hook = shmem_request_hook;
558         shmem_request_hook = pgsp_shmem_request;
559 #endif
560         prev_shmem_startup_hook = shmem_startup_hook;
561         shmem_startup_hook = pgsp_shmem_startup;
562         prev_ExecutorStart = ExecutorStart_hook;
563         ExecutorStart_hook = pgsp_ExecutorStart;
564         prev_ExecutorRun = ExecutorRun_hook;
565         ExecutorRun_hook = pgsp_ExecutorRun;
566         prev_ExecutorFinish = ExecutorFinish_hook;
567         ExecutorFinish_hook = pgsp_ExecutorFinish;
568         prev_ExecutorEnd = ExecutorEnd_hook;
569         ExecutorEnd_hook = pgsp_ExecutorEnd;
570         prev_ProcessUtility = ProcessUtility_hook;
571         ProcessUtility_hook = pgsp_ProcessUtility;
572 }
573
574 /*
575  * Module unload callback
576  */
577 void
578 _PG_fini(void)
579 {
580         /* Uninstall hooks. */
581         shmem_startup_hook = prev_shmem_startup_hook;
582         ExecutorStart_hook = prev_ExecutorStart;
583         ExecutorRun_hook = prev_ExecutorRun;
584         ExecutorFinish_hook = prev_ExecutorFinish;
585         ExecutorEnd_hook = prev_ExecutorEnd;
586         ProcessUtility_hook = prev_ProcessUtility;
587 }
588
589 /*
590  * pgsp_shmem_request: request shared memory to the core.
591  * Called as a hook in PG15 or later, otherwise called from _PG_init().
592  */
593 static void
594 pgsp_shmem_request(void)
595 {
596 #if PG_VERSION_NUM >= 150000
597         if (prev_shmem_request_hook)
598                 prev_shmem_request_hook();
599 #endif
600
601         RequestAddinShmemSpace(shared_mem_size());
602         RequestNamedLWLockTranche("pg_store_plans", 1);
603 }
604
605 /*
606  * shmem_startup hook: allocate or attach to shared memory,
607  * then load any pre-existing statistics from file.
608  */
609 static void
610 pgsp_shmem_startup(void)
611 {
612         bool            found;
613         HASHCTL         info;
614         FILE       *file = NULL;
615         FILE       *pfile = NULL;
616         uint32          header;
617         int32           num;
618         int32           pgver;
619         int32           i;
620         int                     plan_size;
621         int                     buffer_size;
622         char       *buffer = NULL;
623
624         if (prev_shmem_startup_hook)
625                 prev_shmem_startup_hook();
626
627         /* reset in case this is a restart within the postmaster */
628         shared_state = NULL;
629         hash_table = NULL;
630
631         /*
632          * Create or attach to the shared memory state, including hash table
633          */
634         LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
635
636         shared_state = ShmemInitStruct("pg_store_plans",
637                                                    sizeof(pgspSharedState),
638                                                    &found);
639
640         if (!found)
641         {
642                 /* First time through ... */
643                 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
644                 shared_state->plan_size = max_plan_len;
645                 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
646                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
647                 SpinLockInit(&shared_state->mutex);
648                 shared_state->extent = 0;
649                 shared_state->n_writers = 0;
650                 shared_state->gc_count = 0;
651                 shared_state->stats.dealloc = 0;
652                 shared_state->stats.stats_reset = GetCurrentTimestamp();
653         }
654
655         /* Be sure everyone agrees on the hash table entry size */
656         plan_size = shared_state->plan_size;
657
658         memset(&info, 0, sizeof(info));
659         info.keysize = sizeof(pgspHashKey);
660         info.entrysize = sizeof(pgspEntry);
661         if (plan_storage == PLAN_STORAGE_SHMEM)
662                 info.entrysize += max_plan_len;
663         hash_table = ShmemInitHash("pg_store_plans hash",
664                                                           store_size, store_size,
665                                                           &info, HASH_ELEM |
666                                                           HASH_BLOBS);
667
668         LWLockRelease(AddinShmemInitLock);
669
670         /*
671          * If we're in the postmaster (or a standalone backend...), set up a shmem
672          * exit hook to dump the statistics to disk.
673          */
674         if (!IsUnderPostmaster)
675                 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
676
677         /*
678          * Done if some other process already completed our initialization.
679          */
680         if (found)
681                 return;
682
683         /*
684          * Note: we don't bother with locks here, because there should be no other
685          * processes running when this code is reached.
686          */
687
688         /* Unlink query text file possibly left over from crash */
689         unlink(PGSP_TEXT_FILE);
690
691         if (plan_storage == PLAN_STORAGE_FILE)
692         {
693                 /* Allocate new query text temp file */
694                 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
695                 if (pfile == NULL)
696                         goto write_error;
697         }
698
699         /*
700          * If we were told not to load old statistics, we're done.  (Note we do
701          * not try to unlink any old dump file in this case.  This seems a bit
702          * questionable but it's the historical behavior.)
703          */
704         if (!dump_on_shutdown)
705         {
706                 if (pfile)
707                         FreeFile(pfile);
708                 return;
709         }
710
711         /*
712          * Attempt to load old statistics from the dump file.
713          */
714         file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
715         if (file == NULL)
716         {
717                 /* ignore not-found error */
718                 if (errno != ENOENT)
719                         goto read_error;
720
721                 /* No existing persisted stats file, so we're done */
722                 if (pfile)
723                         FreeFile(pfile);
724                 return;
725         }
726
727         buffer_size = plan_size;
728         buffer = (char *) palloc(buffer_size);
729
730         if (fread(&header, sizeof(uint32), 1, file) != 1 ||
731                 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
732                 fread(&num, sizeof(int32), 1, file) != 1)
733                 goto read_error;
734
735         if (header != PGSP_FILE_HEADER ||
736                 pgver != PGSP_PG_MAJOR_VERSION)
737                 goto data_error;
738
739         for (i = 0; i < num; i++)
740         {
741                 pgspEntry       temp;
742                 pgspEntry  *entry;
743                 Size            plan_offset = 0;
744
745                 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
746                         goto read_error;
747
748                 /* Encoding is the only field we can easily sanity-check */
749                 if (!PG_VALID_BE_ENCODING(temp.encoding))
750                         goto data_error;
751
752                 /* Previous incarnation might have had a larger plan_size */
753                 if (temp.plan_len >= buffer_size)
754                 {
755                         buffer = (char *) repalloc(buffer, temp.plan_len + 1);
756                         buffer_size = temp.plan_len + 1;
757                 }
758
759                 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
760                         goto read_error;
761
762                 /* Skip loading "sticky" entries */
763                 if (temp.counters.calls == 0)
764                         continue;
765
766                 /* Clip to available length if needed */
767                 if (temp.plan_len >= plan_size)
768                         temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
769                                                                                                    buffer,
770                                                                                                    temp.plan_len,
771                                                                                                    plan_size - 1);
772
773                 buffer[temp.plan_len] = '\0';
774
775                 if (plan_storage == PLAN_STORAGE_FILE)
776                 {
777                         /* Store the plan text */
778                         plan_offset = shared_state->extent;
779                         if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
780                                 temp.plan_len + 1)
781                                 goto write_error;
782                         shared_state->extent += temp.plan_len + 1;
783                 }
784
785                 /* make the hashtable entry (discards old entries if too many) */
786                 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
787
788                 if (plan_storage == PLAN_STORAGE_SHMEM)
789                         memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
790
791                 /* copy in the actual stats */
792                 entry->counters = temp.counters;
793         }
794
795         pfree(buffer);
796         FreeFile(file);
797
798         if (pfile)
799                 FreeFile(pfile);
800
801         /*
802          * Remove the file so it's not included in backups/replication slaves,
803          * etc. A new file will be written on next shutdown.
804          */
805         unlink(PGSP_DUMP_FILE);
806
807         return;
808
809 read_error:
810         ereport(LOG,
811                         (errcode_for_file_access(),
812                          errmsg("could not read file \"%s\": %m",
813                                         PGSP_DUMP_FILE)));
814         goto fail;
815 data_error:
816         ereport(LOG,
817                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
818                          errmsg("ignoring invalid data in file \"%s\"",
819                                         PGSP_DUMP_FILE)));
820         goto fail;
821 write_error:
822         ereport(LOG,
823                         (errcode_for_file_access(),
824                          errmsg("could not write file \"%s\": %m",
825                                         PGSP_TEXT_FILE)));
826 fail:
827         if (buffer)
828                 pfree(buffer);
829         if (file)
830                 FreeFile(file);
831         if (pfile)
832                 FreeFile(pfile);
833         /* If possible, throw away the bogus file; ignore any error */
834         unlink(PGSP_DUMP_FILE);
835
836         /*
837          * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
838          * server is running with pg_stat_statements enabled
839          */
840 }
841
842 /*
843  * shmem_shutdown hook: Dump statistics into file.
844  *
845  * Note: we don't bother with acquiring lock, because there should be no
846  * other processes running when this is called.
847  */
848 static void
849 pgsp_shmem_shutdown(int code, Datum arg)
850 {
851         FILE       *file;
852         char       *pbuffer = NULL;
853         Size            pbuffer_size = 0;
854         HASH_SEQ_STATUS hash_seq;
855         int32           num_entries;
856         pgspEntry  *entry;
857
858         /* Don't try to dump during a crash. */
859         if (code)
860                 return;
861
862         /* Safety check ... shouldn't get here unless shmem is set up. */
863         if (!shared_state || !hash_table)
864                 return;
865
866         /* Don't dump if told not to. */
867         if (!dump_on_shutdown)
868                 return;
869
870         file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
871         if (file == NULL)
872                 goto error;
873
874         if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
875                 goto error;
876         if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
877                 goto error;
878         num_entries = hash_get_num_entries(hash_table);
879         if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
880                 goto error;
881
882         if (plan_storage == PLAN_STORAGE_FILE)
883         {
884                 pbuffer = ptext_load_file(&pbuffer_size);
885                 if (pbuffer == NULL)
886                         goto error;
887         }
888
889         hash_seq_init(&hash_seq, hash_table);
890         while ((entry = hash_seq_search(&hash_seq)) != NULL)
891         {
892                 int                     len = entry->plan_len;
893                 char       *pstr;
894
895                 if (plan_storage == PLAN_STORAGE_FILE)
896                         pstr = ptext_fetch(entry->plan_offset, len,
897                                                            pbuffer, pbuffer_size);
898                 else
899                         pstr = SHMEM_PLAN_PTR(entry);
900
901                 if (pstr == NULL)
902                         continue;                       /* Ignore any entries with bogus texts */
903
904                 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
905                         fwrite(pstr, 1, len + 1, file) != len + 1)
906                 {
907                         /* note: we assume hash_seq_term won't change errno */
908                         hash_seq_term(&hash_seq);
909                         goto error;
910                 }
911         }
912
913         if (FreeFile(file))
914         {
915                 file = NULL;
916                 goto error;
917         }
918
919         /*
920          * Rename file into place, so we atomically replace the old one.
921          */
922         if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
923                 ereport(LOG,
924                                 (errcode_for_file_access(),
925                                  errmsg("could not rename pg_store_plans file \"%s\": %m",
926                                                 PGSP_DUMP_FILE ".tmp")));
927
928         /* Unlink query-texts file; it's not needed while shutdown */
929         unlink(PGSP_TEXT_FILE);
930
931         return;
932
933 error:
934         ereport(LOG,
935                         (errcode_for_file_access(),
936                          errmsg("could not write pg_store_plans file \"%s\": %m",
937                                         PGSP_DUMP_FILE ".tmp")));
938         if (file)
939                 FreeFile(file);
940         unlink(PGSP_DUMP_FILE ".tmp");
941 }
942
943
944 /*
945  * ExecutorStart hook: start up tracking if needed
946  */
947 static void
948 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
949 {
950         if (log_analyze &&
951                 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
952         {
953                 queryDesc->instrument_options |=
954                         (log_timing ? INSTRUMENT_TIMER : 0)|
955                         (log_timing ? 0: INSTRUMENT_ROWS)|
956                         (log_buffers ? INSTRUMENT_BUFFERS : 0);
957         }
958
959         if (prev_ExecutorStart)
960                 prev_ExecutorStart(queryDesc, eflags);
961         else
962                 standard_ExecutorStart(queryDesc, eflags);
963
964         /*
965          * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
966          * context so as to be free at ExecutorEnd.
967          */
968         if (queryDesc->totaltime == NULL &&
969                         pgsp_enabled(queryDesc->plannedstmt->queryId))
970         {
971                 MemoryContext oldcxt;
972
973                 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
974                 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
975 #if PG_VERSION_NUM >= 140000
976                                                                                   , false
977 #endif
978                                                                                  );
979                 MemoryContextSwitchTo(oldcxt);
980         }
981
982 }
983
984 /*
985  * ExecutorRun hook: all we need do is track nesting depth
986  */
987 static void
988 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
989                                  bool execute_once)
990 {
991         nested_level++;
992         PG_TRY();
993         {
994                 if (prev_ExecutorRun)
995                         prev_ExecutorRun(queryDesc, direction, count, execute_once);
996                 else
997                         standard_ExecutorRun(queryDesc, direction, count, execute_once);
998                 nested_level--;
999         }
1000         PG_CATCH();
1001         {
1002                 nested_level--;
1003                 PG_RE_THROW();
1004         }
1005         PG_END_TRY();
1006 }
1007
1008 /*
1009  * ExecutorFinish hook: all we need do is track nesting depth
1010  */
1011 static void
1012 pgsp_ExecutorFinish(QueryDesc *queryDesc)
1013 {
1014         nested_level++;
1015         PG_TRY();
1016         {
1017                 if (prev_ExecutorFinish)
1018                         prev_ExecutorFinish(queryDesc);
1019                 else
1020                         standard_ExecutorFinish(queryDesc);
1021                 nested_level--;
1022         }
1023         PG_CATCH();
1024         {
1025                 nested_level--;
1026                 PG_RE_THROW();
1027         }
1028         PG_END_TRY();
1029 }
1030
1031 /*
1032  * ExecutorEnd hook: store results if needed
1033  */
1034 static void
1035 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1036 {
1037         if (queryDesc->totaltime)
1038         {
1039                 /*
1040                  * Make sure stats accumulation is done.  (Note: it's okay if several
1041                  * levels of hook all do this.)
1042                  */
1043                 InstrEndLoop(queryDesc->totaltime);
1044
1045                 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1046                         queryDesc->totaltime->total &&
1047                         queryDesc->totaltime->total >=
1048                         (double)min_duration / 1000.0)
1049                 {
1050                         queryid_t         queryid;
1051                         ExplainState *es;
1052                         StringInfo        es_str;
1053
1054                         es = NewExplainState();
1055                         es_str = es->str;
1056
1057                         es->analyze = queryDesc->instrument_options;
1058                         es->verbose = log_verbose;
1059                         es->buffers = (es->analyze && log_buffers);
1060                         es->timing = (es->analyze && log_timing);
1061                         es->format = EXPLAIN_FORMAT_JSON;
1062
1063                         ExplainBeginOutput(es);
1064                         ExplainPrintPlan(es, queryDesc);
1065                         if (log_triggers)
1066                                 pgspExplainTriggers(es, queryDesc);
1067                         ExplainEndOutput(es);
1068
1069                         /* Remove last line break */
1070                         if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1071                                 es_str->data[--es_str->len] = '\0';
1072
1073                         /* JSON outmost braces. */
1074                         es_str->data[0] = '{';
1075                         es_str->data[es_str->len - 1] = '}';
1076
1077                         queryid = queryDesc->plannedstmt->queryId;
1078 #if PG_VERSION_NUM < 140000
1079                         /*
1080                          * For versions before pg14, a queryid is only available if
1081                          * pg_stat_statements extension (or similar) if configured.  We
1082                          * don't want a hard requirement for such an extension so fallback
1083                          * to an internal queryid calculation in some case.
1084                          * For pg14 and above, core postgres can compute a queryid so we
1085                          * will rely on it.
1086                          */
1087                         if (queryid == PGSP_NO_QUERYID)
1088                                 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1089 #else
1090                         Assert(queryid != PGSP_NO_QUERYID);
1091 #endif
1092
1093                         pgsp_store(es_str->data,
1094                                                 queryid,
1095                                                 queryDesc->totaltime->total * 1000.0,   /* convert to msec */
1096                                                 queryDesc->estate->es_processed,
1097                                                 &queryDesc->totaltime->bufusage);
1098                         pfree(es_str->data);
1099                 }
1100         }
1101
1102         if (prev_ExecutorEnd)
1103                 prev_ExecutorEnd(queryDesc);
1104         else
1105                 standard_ExecutorEnd(queryDesc);
1106 }
1107
1108 /*
1109  * ProcessUtility hook
1110  */
1111 static void
1112 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1113 #if PG_VERSION_NUM >= 140000
1114                                         bool readOnlyTree,
1115 #endif
1116                                         ProcessUtilityContext context, ParamListInfo params,
1117                                         QueryEnvironment *queryEnv,
1118                                         DestReceiver *dest, COMPTAG_TYPE *completionTag)
1119 {
1120         int                     tag = nodeTag(pstmt->utilityStmt);
1121         queryid_t       saved_queryId = pstmt->queryId;
1122         bool            reset_force_disabled = false;
1123
1124         if (pgsp_enabled(saved_queryId) &&
1125                 (tag == T_CreateExtensionStmt || tag == T_AlterExtensionStmt) &&
1126                 !force_disabled && track_level < TRACK_LEVEL_VERBOSE)
1127         {
1128                 force_disabled = true;
1129                 reset_force_disabled = true;
1130         }
1131
1132         PG_TRY();
1133         {
1134                 if (prev_ProcessUtility)
1135                 {
1136                         prev_ProcessUtility(pstmt, queryString,
1137 #if PG_VERSION_NUM >= 140000
1138                                                                 readOnlyTree,
1139 #endif
1140                                                                 context, params, queryEnv,
1141                                                                 dest, completionTag);
1142                 }
1143                 else
1144                         standard_ProcessUtility(pstmt, queryString,
1145 #if PG_VERSION_NUM >= 140000
1146                                                                         readOnlyTree,
1147 #endif
1148                                                                         context, params, queryEnv,
1149                                                                         dest, completionTag);
1150
1151                 if (reset_force_disabled)
1152                         force_disabled = false;
1153         }
1154         PG_CATCH();
1155         {
1156                 if (reset_force_disabled)
1157                         force_disabled = false;
1158                 PG_RE_THROW();
1159         }
1160         PG_END_TRY();
1161 }
1162
1163 /*
1164  * hash_query: calculate internal query ID for a query
1165  *
1166  *  As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1167  *  queries to practical level. On the other hand pg_store_plans uses the
1168  *  combination of query hash and plan hash values as the hash table key and
1169  *  the resolution of the hash value effectively has the same degree so we
1170  *  continue to use uint32 as internal queryid.
1171  *
1172  *  This may merge plans from different queries into single internal query id
1173  *  but it is not a problem when pg_stat_statements is used together since the
1174  *  extension gives enough resolution on queries.
1175  */
1176 static uint32
1177 hash_query(const char* query)
1178 {
1179         uint32 queryid;
1180
1181         char *normquery = pstrdup(query);
1182         normalize_expr(normquery, false);
1183         queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1184         pfree(normquery);
1185
1186         /* If we are unlucky enough to get a hash of zero, use 1 instead */
1187         if (queryid == 0)
1188                 queryid = 1;
1189
1190         return queryid;
1191 }
1192
1193
1194 /*
1195  * Store some statistics for a plan.
1196  *
1197  * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1198  * value of the given plan, which is calculated in ths function.
1199  */
1200 static void
1201 pgsp_store(char *plan, queryid_t queryId,
1202                    double total_time, uint64 rows,
1203                    const BufferUsage *bufusage)
1204 {
1205         pgspHashKey key;
1206         pgspEntry  *entry;
1207         char       *norm_query = NULL;
1208         int             plan_len;
1209         char       *normalized_plan = NULL;
1210         char       *shorten_plan = NULL;
1211         volatile pgspEntry *e;
1212         Size            plan_offset = 0;
1213         bool            do_gc = false;
1214
1215         Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1216
1217         /* Safety check... */
1218         if (!shared_state || !hash_table)
1219                 return;
1220
1221         /* Set up key for hashtable search */
1222         key.userid = GetUserId();
1223         key.dbid = MyDatabaseId;
1224         key.queryid = queryId;
1225
1226         normalized_plan = pgsp_json_normalize(plan);
1227         shorten_plan = pgsp_json_shorten(plan);
1228         elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1229         elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1230         elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1231         plan_len = strlen(shorten_plan);
1232
1233         key.planid = hash_any((const unsigned char *)normalized_plan,
1234                                                   strlen(normalized_plan));
1235         pfree(normalized_plan);
1236
1237         if (plan_len >= shared_state->plan_size)
1238                 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1239                                                                                  shorten_plan,
1240                                                                                  plan_len,
1241                                                                                  shared_state->plan_size - 1);
1242
1243
1244         /* Look up the hash table entry with shared lock. */
1245         LWLockAcquire(shared_state->lock, LW_SHARED);
1246
1247         entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1248
1249         /* Store the plan text, if the entry not present */
1250         if (!entry && plan_storage == PLAN_STORAGE_FILE)
1251         {
1252                 int             gc_count;
1253                 bool    stored;
1254
1255                 /* Append new plan text to file with only shared lock held */
1256                 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1257
1258                 /*
1259                  * Determine whether we need to garbage collect external query texts
1260                  * while the shared lock is still held.  This micro-optimization
1261                  * avoids taking the time to decide this while holding exclusive lock.
1262                  */
1263                 do_gc = need_gc_ptexts();
1264
1265                 /* Acquire exclusive lock as required by entry_alloc() */
1266                 LWLockRelease(shared_state->lock);
1267                 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1268
1269                 /*
1270                  * A garbage collection may have occurred while we weren't holding the
1271                  * lock.  In the unlikely event that this happens, the plan text we
1272                  * stored above will have been garbage collected, so write it again.
1273                  * This should be infrequent enough that doing it while holding
1274                  * exclusive lock isn't a performance problem.
1275                  */
1276                 if (!stored || shared_state->gc_count != gc_count)
1277                         stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1278
1279                 /* If we failed to write to the text file, give up */
1280                 if (!stored)
1281                         goto done;
1282
1283         }
1284
1285         /* Create new entry, if not present */
1286         if (!entry)
1287         {
1288                 entry = entry_alloc(&key, plan_offset, plan_len, false);
1289
1290                 /* shorten_plan is terminated by NUL */
1291                 if (plan_storage == PLAN_STORAGE_SHMEM)
1292                         memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1293                         
1294
1295                 /* If needed, perform garbage collection while exclusive lock held */
1296                 if (do_gc)
1297                         gc_ptexts();
1298         }
1299
1300         /* Increment the counts, except when jstate is not NULL */
1301
1302         /*
1303          * Grab the spinlock while updating the counters (see comment about
1304          * locking rules at the head of the file)
1305          */
1306
1307         e = (volatile pgspEntry *) entry;
1308         SpinLockAcquire(&e->mutex);
1309
1310         /* "Unstick" entry if it was previously sticky */
1311         if (e->counters.calls == 0)
1312         {
1313                 e->counters.usage = USAGE_INIT;
1314                 e->counters.first_call = GetCurrentTimestamp();
1315         }
1316
1317         e->counters.calls += 1;
1318         e->counters.total_time += total_time;
1319         if (e->counters.calls == 1)
1320         {
1321                 e->counters.min_time = total_time;
1322                 e->counters.max_time = total_time;
1323                 e->counters.mean_time = total_time;
1324         }
1325         else
1326         {
1327                 /*
1328                  * Welford's method for accurately computing variance. See
1329                  * <http://www.johndcook.com/blog/standard_deviation/>
1330                  */
1331                 double          old_mean = e->counters.mean_time;
1332
1333                 e->counters.mean_time +=
1334                         (total_time - old_mean) / e->counters.calls;
1335                 e->counters.sum_var_time +=
1336                         (total_time - old_mean) * (total_time - e->counters.mean_time);
1337
1338                 /* calculate min and max time */
1339                 if (e->counters.min_time > total_time)
1340                         e->counters.min_time = total_time;
1341                 if (e->counters.max_time < total_time)
1342                         e->counters.max_time = total_time;
1343         }
1344
1345         e->counters.rows += rows;
1346         e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1347         e->counters.shared_blks_read += bufusage->shared_blks_read;
1348         e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1349         e->counters.shared_blks_written += bufusage->shared_blks_written;
1350         e->counters.local_blks_hit += bufusage->local_blks_hit;
1351         e->counters.local_blks_read += bufusage->local_blks_read;
1352         e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1353         e->counters.local_blks_written += bufusage->local_blks_written;
1354         e->counters.temp_blks_read += bufusage->temp_blks_read;
1355         e->counters.temp_blks_written += bufusage->temp_blks_written;
1356
1357         e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1358         e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1359         e->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
1360         e->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
1361
1362         e->counters.last_call = GetCurrentTimestamp();
1363         e->counters.usage += USAGE_EXEC(total_time);
1364
1365         SpinLockRelease(&e->mutex);
1366
1367 done:
1368         LWLockRelease(shared_state->lock);
1369
1370         /* We postpone this pfree until we're out of the lock */
1371         if (norm_query)
1372                 pfree(norm_query);
1373 }
1374
1375 /*
1376  * Reset all statement statistics.
1377  */
1378 Datum
1379 pg_store_plans_reset(PG_FUNCTION_ARGS)
1380 {
1381         if (!shared_state || !hash_table)
1382                 ereport(ERROR,
1383                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1384                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1385         entry_reset();
1386         PG_RETURN_VOID();
1387 }
1388
1389 /* Number of output arguments (columns) for various API versions */
1390 #define PG_STORE_PLANS_COLS_V1_5        27
1391 #define PG_STORE_PLANS_COLS_V1_6        26
1392 #define PG_STORE_PLANS_COLS_V1_7        28
1393 #define PG_STORE_PLANS_COLS                     28      /* maximum of above */
1394
1395 /*
1396  * Retrieve statement statistics.
1397  */
1398 Datum
1399 pg_store_plans_1_7(PG_FUNCTION_ARGS)
1400 {
1401         pg_store_plans_internal(fcinfo, PGSP_V1_7);
1402
1403         return (Datum) 0;
1404 }
1405
1406 Datum
1407 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1408 {
1409         pg_store_plans_internal(fcinfo, PGSP_V1_6);
1410
1411         return (Datum) 0;
1412 }
1413
1414 Datum
1415 pg_store_plans(PG_FUNCTION_ARGS)
1416 {
1417         pg_store_plans_internal(fcinfo, PGSP_V1_5);
1418
1419         return (Datum) 0;
1420 }
1421
1422 static void
1423 pg_store_plans_internal(FunctionCallInfo fcinfo,
1424                                                 pgspVersion api_version)
1425 {
1426         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1427         TupleDesc       tupdesc;
1428         Tuplestorestate *tupstore;
1429         MemoryContext per_query_ctx;
1430         MemoryContext oldcontext;
1431         Oid                     userid = GetUserId();
1432         bool            is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1433         int                     n_writers;
1434         char       *pbuffer = NULL;
1435         Size            pbuffer_size = 0;
1436         Size            extent = 0;
1437         int                     gc_count = 0;
1438         HASH_SEQ_STATUS hash_seq;
1439         pgspEntry  *entry;
1440
1441         if (!shared_state || !hash_table)
1442                 ereport(ERROR,
1443                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1444                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1445
1446         /* check to see if caller supports us returning a tuplestore */
1447         if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1448                 ereport(ERROR,
1449                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1450                                  errmsg("set-valued function called in context that cannot accept a set")));
1451         if (!(rsinfo->allowedModes & SFRM_Materialize))
1452                 ereport(ERROR,
1453                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1454                                  errmsg("materialize mode required, but it is not " \
1455                                                 "allowed in this context")));
1456
1457         /* Build a tuple descriptor for our result type */
1458         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1459                 elog(ERROR, "return type must be a row type");
1460
1461         per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1462         oldcontext = MemoryContextSwitchTo(per_query_ctx);
1463
1464         tupstore = tuplestore_begin_heap(true, false, work_mem);
1465         rsinfo->returnMode = SFRM_Materialize;
1466         rsinfo->setResult = tupstore;
1467         rsinfo->setDesc = tupdesc;
1468
1469         MemoryContextSwitchTo(oldcontext);
1470
1471         /*
1472          * We'd like to load the plan text file (if needed) while not holding any
1473          * lock on shared_state->lock.  In the worst case we'll have to do this
1474          * again after we have the lock, but it's unlikely enough to make this a
1475          * win despite occasional duplicated work.  We need to reload if anybody
1476          * writes to the file (either a retail ptext_store(), or a garbage
1477          * collection) between this point and where we've gotten shared lock.  If a
1478          * ptext_store is actually in progress when we look, we might as well skip
1479          * the speculative load entirely.
1480          */
1481
1482         /* Take the mutex so we can examine variables */
1483         {
1484                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1485
1486                 SpinLockAcquire(&s->mutex);
1487                 extent = s->extent;
1488                 n_writers = s->n_writers;
1489                 gc_count = s->gc_count;
1490                 SpinLockRelease(&s->mutex);
1491         }
1492
1493         /* No point in loading file now if there are active writers */
1494         if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1495                 pbuffer = ptext_load_file(&pbuffer_size);
1496
1497         /*
1498          * Get shared lock, load or reload the plan text file if we must, and
1499          * iterate over the hashtable entries.
1500          *
1501          * With a large hash table, we might be holding the lock rather longer
1502          * than one could wish.  However, this only blocks creation of new hash
1503          * table entries, and the larger the hash table the less likely that is to
1504          * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
1505          * we need to partition the hash table to limit the time spent holding any
1506          * one lock.
1507          */
1508         LWLockAcquire(shared_state->lock, LW_SHARED);
1509
1510         /*
1511          * Here it is safe to examine extent and gc_count without taking the mutex.
1512          * Note that although other processes might change shared_state->extent
1513          * just after we look at it, the strings they then write into the file
1514          * cannot yet be referenced in the hashtable, so we don't care whether we
1515          * see them or not.
1516          *
1517          * If ptext_load_file fails, we just press on; we'll return NULL for every
1518          * plan text.
1519          */
1520         if (plan_storage == PLAN_STORAGE_FILE &&
1521                 (pbuffer == NULL ||
1522                  shared_state->extent != extent ||
1523                  shared_state->gc_count != gc_count))
1524         {
1525                 if (pbuffer)
1526                         free(pbuffer);
1527                 pbuffer = ptext_load_file(&pbuffer_size);
1528         }
1529
1530         hash_seq_init(&hash_seq, hash_table);
1531         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1532         {
1533                 Datum           values[PG_STORE_PLANS_COLS];
1534                 bool            nulls[PG_STORE_PLANS_COLS];
1535                 int                     i = 0;
1536                 int64           queryid      = entry->key.queryid;
1537                 int64           planid       = entry->key.planid;
1538                 Counters        tmp;
1539                 double          stddev;
1540
1541                 memset(values, 0, sizeof(values));
1542                 memset(nulls, 0, sizeof(nulls));
1543
1544                 values[i++] = ObjectIdGetDatum(entry->key.userid);
1545                 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1546                 if (is_allowed_role || entry->key.userid == userid)
1547                 {
1548                         values[i++] = Int64GetDatumFast(queryid);
1549                         values[i++] = Int64GetDatumFast(planid);
1550
1551                         /* fill queryid_stat_statements with the same value with queryid */
1552                         if (api_version == PGSP_V1_5)
1553                                 values[i++] = Int64GetDatumFast(queryid);
1554                 }
1555                 else
1556                 {
1557                         nulls[i++] = true;      /* queryid */
1558                         nulls[i++] = true;      /* planid */
1559
1560                         /* queryid_stat_statemetns*/
1561                         if (api_version == PGSP_V1_5)
1562                                 nulls[i++] = true;
1563                 }
1564
1565                 if (is_allowed_role || entry->key.userid == userid)
1566                 {
1567                         char       *pstr; /* Plan string */
1568                         char       *mstr; /* Modified plan string */
1569                         char       *estr; /* Encoded modified plan string */
1570
1571                         if (plan_storage == PLAN_STORAGE_FILE)
1572                                 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1573                                                                    pbuffer, pbuffer_size);
1574                         else
1575                                 pstr = SHMEM_PLAN_PTR(entry);
1576
1577                         switch (plan_format)
1578                         {
1579                                 case PLAN_FORMAT_TEXT:
1580                                         mstr = pgsp_json_textize(pstr);
1581                                         break;
1582                                 case PLAN_FORMAT_JSON:
1583                                         mstr = pgsp_json_inflate(pstr);
1584                                         break;
1585                                 case PLAN_FORMAT_YAML:
1586                                         mstr = pgsp_json_yamlize(pstr);
1587                                         break;
1588                                 case PLAN_FORMAT_XML:
1589                                         mstr = pgsp_json_xmlize(pstr);
1590                                         break;
1591                                 default:
1592                                         mstr = pstr;
1593                                         break;
1594                         }
1595
1596                         estr = (char *)
1597                                 pg_do_encoding_conversion((unsigned char *) mstr,
1598                                                                                   strlen(mstr),
1599                                                                                   entry->encoding,
1600                                                                                   GetDatabaseEncoding());
1601                         values[i++] = CStringGetTextDatum(estr);
1602
1603                         if (estr != mstr)
1604                                 pfree(estr);
1605
1606                         if (mstr != pstr)
1607                                 pfree(mstr);
1608
1609                         /* pstr is a pointer onto pbuffer */
1610                 }
1611                 else
1612                         values[i++] = CStringGetTextDatum("<insufficient privilege>");
1613
1614                 /* copy counters to a local variable to keep locking time short */
1615                 {
1616                         volatile pgspEntry *e = (volatile pgspEntry *) entry;
1617
1618                         SpinLockAcquire(&e->mutex);
1619                         tmp = e->counters;
1620                         SpinLockRelease(&e->mutex);
1621                 }
1622
1623                 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1624                 if (tmp.calls == 0)
1625                         continue;
1626
1627                 values[i++] = Int64GetDatumFast(tmp.calls);
1628                 values[i++] = Float8GetDatumFast(tmp.total_time);
1629                 values[i++] = Float8GetDatumFast(tmp.min_time);
1630                 values[i++] = Float8GetDatumFast(tmp.max_time);
1631                 values[i++] = Float8GetDatumFast(tmp.mean_time);
1632
1633                 /*
1634                  * Note we are calculating the population variance here, not the
1635                  * sample variance, as we have data for the whole population, so
1636                  * Bessel's correction is not used, and we don't divide by
1637                  * tmp.calls - 1.
1638                  */
1639                 if (tmp.calls > 1)
1640                         stddev = sqrt(tmp.sum_var_time / tmp.calls);
1641                 else
1642                         stddev = 0.0;
1643                 values[i++] = Float8GetDatumFast(stddev);
1644
1645                 values[i++] = Int64GetDatumFast(tmp.rows);
1646                 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1647                 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1648                 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1649                 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1650                 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1651                 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1652                 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1653                 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1654                 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1655                 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1656                 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1657                 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1658
1659                 if (api_version >= PGSP_V1_7)
1660                 {
1661                         values[i++] = Float8GetDatumFast(tmp.temp_blk_read_time);
1662                         values[i++] = Float8GetDatumFast(tmp.temp_blk_write_time);
1663                 }
1664
1665                 values[i++] = TimestampTzGetDatum(tmp.first_call);
1666                 values[i++] = TimestampTzGetDatum(tmp.last_call);
1667
1668                 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1669                                          api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1670                                          api_version == PGSP_V1_7 ? PG_STORE_PLANS_COLS_V1_7 :
1671                                          -1 /* fail if you forget to update this assert */ ));
1672
1673                 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1674         }
1675
1676         LWLockRelease(shared_state->lock);
1677
1678         /* clean up and return the tuplestore */
1679         tuplestore_donestoring(tupstore);
1680 }
1681
1682 /* Number of output arguments (columns) for pg_stat_statements_info */
1683 #define PG_STORE_PLANS_INFO_COLS        2
1684
1685 /*
1686  * Return statistics of pg_stat_statements.
1687  */
1688 Datum
1689 pg_store_plans_info(PG_FUNCTION_ARGS)
1690 {
1691         pgspGlobalStats stats;
1692         TupleDesc       tupdesc;
1693         Datum           values[PG_STORE_PLANS_INFO_COLS];
1694         bool            nulls[PG_STORE_PLANS_INFO_COLS];
1695
1696         if (!shared_state || !hash_table)
1697                 ereport(ERROR,
1698                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1699                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1700
1701         /* Build a tuple descriptor for our result type */
1702         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1703                 elog(ERROR, "return type must be a row type");
1704
1705         MemSet(values, 0, sizeof(values));
1706         MemSet(nulls, 0, sizeof(nulls));
1707
1708         /* Read global statistics for pg_stat_statements */
1709         {
1710                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1711
1712                 SpinLockAcquire(&s->mutex);
1713                 stats = s->stats;
1714                 SpinLockRelease(&s->mutex);
1715         }
1716
1717         values[0] = Int64GetDatum(stats.dealloc);
1718         values[1] = TimestampTzGetDatum(stats.stats_reset);
1719
1720         PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1721 }
1722
1723 /*
1724  * Estimate shared memory space needed.
1725  */
1726 static Size
1727 shared_mem_size(void)
1728 {
1729         Size            size;
1730         int                     entry_size;
1731
1732         size = MAXALIGN(sizeof(pgspSharedState));
1733         entry_size = sizeof(pgspEntry);
1734
1735         /* plan text is apppended to the struct body */
1736         if (plan_storage == PLAN_STORAGE_SHMEM)
1737                 entry_size += max_plan_len;
1738
1739         size = add_size(size, hash_estimate_size(store_size, entry_size));
1740
1741         return size;
1742 }
1743
1744 /*
1745  * Allocate a new hashtable entry.
1746  * caller must hold an exclusive lock on shared_state->lock
1747  *
1748  * "plan" need not be null-terminated; we rely on plan_len instead
1749  *
1750  * If "sticky" is true, make the new entry artificially sticky so that it will
1751  * probably still be there when the query finishes execution.  We do this by
1752  * giving it a median usage value rather than the normal value.  (Strictly
1753  * speaking, query strings are normalized on a best effort basis, though it
1754  * would be difficult to demonstrate this even under artificial conditions.)
1755  *
1756  * Note: despite needing exclusive lock, it's not an error for the target
1757  * entry to already exist.      This is because pgsp_store releases and
1758  * reacquires lock after failing to find a match; so someone else could
1759  * have made the entry while we waited to get exclusive lock.
1760  */
1761 static pgspEntry *
1762 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1763 {
1764         pgspEntry  *entry;
1765         bool            found;
1766
1767         /* Make space if needed */
1768         while (hash_get_num_entries(hash_table) >= store_size)
1769                 entry_dealloc();
1770
1771         /* Find or create an entry with desired hash code */
1772         entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1773
1774         if (!found)
1775         {
1776                 /* New entry, initialize it */
1777
1778                 /* reset the statistics */
1779                 memset(&entry->counters, 0, sizeof(Counters));
1780                 /* set the appropriate initial usage count */
1781                 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1782                 /* re-initialize the mutex each time ... we assume no one using it */
1783                 SpinLockInit(&entry->mutex);
1784                 /* ... and don't forget the query text */
1785                 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1786                 entry->plan_offset = plan_offset;
1787                 entry->plan_len = plan_len;
1788                 entry->encoding = GetDatabaseEncoding();
1789         }
1790
1791         return entry;
1792 }
1793
1794 /*
1795  * qsort comparator for sorting into increasing usage order
1796  */
1797 static int
1798 entry_cmp(const void *lhs, const void *rhs)
1799 {
1800         double          l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1801         double          r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1802
1803         if (l_usage < r_usage)
1804                 return -1;
1805         else if (l_usage > r_usage)
1806                 return +1;
1807         else
1808                 return 0;
1809 }
1810
1811 /*
1812  * Deallocate least used entries.
1813  * Caller must hold an exclusive lock on shared_state->lock.
1814  */
1815 static void
1816 entry_dealloc(void)
1817 {
1818         HASH_SEQ_STATUS hash_seq;
1819         pgspEntry **entries;
1820         pgspEntry  *entry;
1821         int                     nvictims;
1822         int                     i;
1823         Size            tottextlen;
1824         int                     nvalidtexts;
1825
1826         /*
1827          * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1828          * While we're scanning the table, apply the decay factor to the usage
1829          * values.
1830          */
1831
1832         entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1833
1834         i = 0;
1835         tottextlen = 0;
1836         nvalidtexts = 0;
1837
1838         hash_seq_init(&hash_seq, hash_table);
1839         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1840         {
1841                 entries[i++] = entry;
1842                 /* "Sticky" entries get a different usage decay rate. */
1843                 if (entry->counters.calls == 0)
1844                         entry->counters.usage *= STICKY_DECREASE_FACTOR;
1845                 else
1846                         entry->counters.usage *= USAGE_DECREASE_FACTOR;
1847
1848                 /* In the mean length computation, ignore dropped texts. */
1849                 if (entry->plan_len >= 0)
1850                 {
1851                         tottextlen += entry->plan_len + 1;
1852                         nvalidtexts++;
1853                 }
1854         }
1855
1856         qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1857
1858         /* Also, record the (approximate) median usage */
1859         if (i > 0)
1860                 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1861         /* Record the mean plan length */
1862         if (nvalidtexts > 0)
1863                 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1864         else
1865                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1866
1867         nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1868         nvictims = Min(nvictims, i);
1869
1870         for (i = 0; i < nvictims; i++)
1871         {
1872                 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1873         }
1874
1875         pfree(entries);
1876
1877         /* Increment the number of times entries are deallocated */
1878         {
1879                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1880
1881                 SpinLockAcquire(&s->mutex);
1882                 s->stats.dealloc += 1;
1883                 SpinLockRelease(&s->mutex);
1884         }
1885 }
1886
1887 /*
1888  * Given a plan string (not necessarily null-terminated), allocate a new
1889  * entry in the external plan text file and store the string there.
1890  *
1891  * If successful, returns true, and stores the new entry's offset in the file
1892  * into *plan_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
1893  * number of garbage collections that have occurred so far.
1894  *
1895  * On failure, returns false.
1896  *
1897  * At least a shared lock on shared_state->lock must be held by the caller, so
1898  * as to prevent a concurrent garbage collection.  Share-lock-holding callers
1899  * should pass a gc_count pointer to obtain the number of garbage collections,
1900  * so that they can recheck the count after obtaining exclusive lock to detect
1901  * whether a garbage collection occurred (and removed this entry).
1902  */
1903 static bool
1904 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1905 {
1906         Size            off;
1907         int                     fd;
1908
1909         Assert (plan_storage == PLAN_STORAGE_FILE);
1910
1911         /*
1912          * We use a spinlock to protect extent/n_writers/gc_count, so that
1913          * multiple processes may execute this function concurrently.
1914          */
1915         {
1916                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1917
1918                 SpinLockAcquire(&s->mutex);
1919                 off = s->extent;
1920                 s->extent += plan_len + 1;
1921                 s->n_writers++;
1922                 if (gc_count)
1923                         *gc_count = s->gc_count;
1924                 SpinLockRelease(&s->mutex);
1925         }
1926
1927         *plan_offset = off;
1928
1929         /* Now write the data into the successfully-reserved part of the file */
1930         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1931         if (fd < 0)
1932                 goto error;
1933
1934         if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1935                 goto error;
1936         if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1937                 goto error;
1938
1939         CloseTransientFile(fd);
1940
1941         /* Mark our write complete */
1942         {
1943                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1944
1945                 SpinLockAcquire(&s->mutex);
1946                 s->n_writers--;
1947                 SpinLockRelease(&s->mutex);
1948         }
1949
1950         return true;
1951
1952 error:
1953         ereport(LOG,
1954                         (errcode_for_file_access(),
1955                          errmsg("could not write file \"%s\": %m",
1956                                         PGSP_TEXT_FILE)));
1957
1958         if (fd >= 0)
1959                 CloseTransientFile(fd);
1960
1961         /* Mark our write complete */
1962         {
1963                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1964
1965                 SpinLockAcquire(&s->mutex);
1966                 s->n_writers--;
1967                 SpinLockRelease(&s->mutex);
1968         }
1969
1970         return false;
1971 }
1972
1973 /*
1974  * Read the external plan text file into a malloc'd buffer.
1975  *
1976  * Returns NULL (without throwing an error) if unable to read, eg
1977  * file not there or insufficient memory.
1978  *
1979  * On success, the buffer size is also returned into *buffer_size.
1980  *
1981  * This can be called without any lock on shared_state->lock, but in that case
1982  * the caller is responsible for verifying that the result is sane.
1983  */
1984 static char *
1985 ptext_load_file(Size *buffer_size)
1986 {
1987         char       *buf;
1988         int                     fd;
1989         struct stat stat;
1990         Size            nread;
1991
1992         Assert (plan_storage == PLAN_STORAGE_FILE);
1993
1994         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1995         if (fd < 0)
1996         {
1997                 if (errno != ENOENT)
1998                         ereport(LOG,
1999                                         (errcode_for_file_access(),
2000                                          errmsg("could not read file \"%s\": %m",
2001                                                         PGSP_TEXT_FILE)));
2002                 return NULL;
2003         }
2004
2005         /* Get file length */
2006         if (fstat(fd, &stat))
2007         {
2008                 ereport(LOG,
2009                                 (errcode_for_file_access(),
2010                                  errmsg("could not stat file \"%s\": %m",
2011                                                 PGSP_TEXT_FILE)));
2012                 CloseTransientFile(fd);
2013                 return NULL;
2014         }
2015
2016         /* Allocate buffer; beware that off_t might be wider than size_t */
2017         if (stat.st_size <= MaxAllocHugeSize)
2018                 buf = (char *) malloc(stat.st_size);
2019         else
2020                 buf = NULL;
2021         if (buf == NULL)
2022         {
2023                 ereport(LOG,
2024                                 (errcode(ERRCODE_OUT_OF_MEMORY),
2025                                  errmsg("out of memory"),
2026                                  errdetail("Could not allocate enough memory to read file \"%s\".",
2027                                                    PGSP_TEXT_FILE)));
2028                 CloseTransientFile(fd);
2029                 return NULL;
2030         }
2031
2032         /*
2033          * OK, slurp in the file.  Windows fails if we try to read more than
2034          * INT_MAX bytes at once, and other platforms might not like that either,
2035          * so read a very large file in 1GB segments.
2036          */
2037         nread = 0;
2038         while (nread < stat.st_size)
2039         {
2040                 int                     toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2041
2042                 /*
2043                  * If we get a short read and errno doesn't get set, the reason is
2044                  * probably that garbage collection truncated the file since we did
2045                  * the fstat(), so we don't log a complaint --- but we don't return
2046                  * the data, either, since it's most likely corrupt due to concurrent
2047                  * writes from garbage collection.
2048                  */
2049                 errno = 0;
2050                 if (read(fd, buf + nread, toread) != toread)
2051                 {
2052                         if (errno)
2053                                 ereport(LOG,
2054                                                 (errcode_for_file_access(),
2055                                                  errmsg("could not read file \"%s\": %m",
2056                                                                 PGSP_TEXT_FILE)));
2057                         free(buf);
2058                         CloseTransientFile(fd);
2059                         return NULL;
2060                 }
2061                 nread += toread;
2062         }
2063
2064         if (CloseTransientFile(fd) != 0)
2065                 ereport(LOG,
2066                                 (errcode_for_file_access(),
2067                                  errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
2068
2069         *buffer_size = nread;
2070         return buf;
2071 }
2072
2073 /*
2074  * Locate a plan text in the file image previously read by ptext_load_file().
2075  *
2076  * We validate the given offset/length, and return NULL if bogus.  Otherwise,
2077  * the result points to a null-terminated string within the buffer.
2078  */
2079 static char *
2080 ptext_fetch(Size plan_offset, int plan_len,
2081                         char *buffer, Size buffer_size)
2082 {
2083         Assert (plan_storage == PLAN_STORAGE_FILE);
2084
2085         /* File read failed? */
2086         if (buffer == NULL)
2087                 return NULL;
2088         /* Bogus offset/length? */
2089         if (plan_len < 0 ||
2090                 plan_offset + plan_len >= buffer_size)
2091                 return NULL;
2092         /* As a further sanity check, make sure there's a trailing null */
2093         if (buffer[plan_offset + plan_len] != '\0')
2094                 return NULL;
2095         /* Looks OK */
2096         return buffer + plan_offset;
2097 }
2098
2099 /*
2100  * Do we need to garbage-collect the external plan text file?
2101  *
2102  * Caller should hold at least a shared lock on shared_state->lock.
2103  */
2104 static bool
2105 need_gc_ptexts(void)
2106 {
2107         Size            extent;
2108
2109         Assert (plan_storage == PLAN_STORAGE_FILE);
2110
2111         /* Read shared extent pointer */
2112         {
2113                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2114
2115                 SpinLockAcquire(&s->mutex);
2116                 extent = s->extent;
2117                 SpinLockRelease(&s->mutex);
2118         }
2119
2120         /* Don't proceed if file does not exceed 512 bytes per possible entry */
2121         if (extent < 512 * store_size)
2122                 return false;
2123
2124         /*
2125          * Don't proceed if file is less than about 50% bloat.  Nothing can or
2126          * should be done in the event of unusually large query texts accounting
2127          * for file's large size.  We go to the trouble of maintaining the mean
2128          * query length in order to prevent garbage collection from thrashing
2129          * uselessly.
2130          */
2131         if (extent < shared_state->mean_plan_len * store_size * 2)
2132                 return false;
2133
2134         return true;
2135 }
2136
2137 /*
2138  * Garbage-collect orphaned plan texts in external file.
2139  *
2140  * This won't be called often in the typical case, since it's likely that
2141  * there won't be too much churn, and besides, a similar compaction process
2142  * occurs when serializing to disk at shutdown or as part of resetting.
2143  * Despite this, it seems prudent to plan for the edge case where the file
2144  * becomes unreasonably large, with no other method of compaction likely to
2145  * occur in the foreseeable future.
2146  *
2147  * The caller must hold an exclusive lock on shared_state->lock.
2148  *
2149  * At the first sign of trouble we unlink the query text file to get a clean
2150  * slate (although existing statistics are retained), rather than risk
2151  * thrashing by allowing the same problem case to recur indefinitely.
2152  */
2153 static void
2154 gc_ptexts(void)
2155 {
2156         char       *pbuffer;
2157         Size            pbuffer_size;
2158         FILE       *pfile = NULL;
2159         HASH_SEQ_STATUS hash_seq;
2160         pgspEntry  *entry;
2161         Size            extent;
2162         int                     nentries;
2163
2164         Assert (plan_storage == PLAN_STORAGE_FILE);
2165
2166         /*
2167          * When called from store_entry, some other session might have proceeded
2168          * with garbage collection in the no-lock-held interim of lock strength
2169          * escalation.  Check once more that this is actually necessary.
2170          */
2171         if (!need_gc_ptexts())
2172                 return;
2173
2174         /*
2175          * Load the old texts file.  If we fail (out of memory, for instance),
2176          * invalidate query texts.  Hopefully this is rare.  It might seem better
2177          * to leave things alone on an OOM failure, but the problem is that the
2178          * file is only going to get bigger; hoping for a future non-OOM result is
2179          * risky and can easily lead to complete denial of service.
2180          */
2181         pbuffer = ptext_load_file(&pbuffer_size);
2182         if (pbuffer == NULL)
2183                 goto gc_fail;
2184
2185         /*
2186          * We overwrite the plan texts file in place, so as to reduce the risk of
2187          * an out-of-disk-space failure.  Since the file is guaranteed not to get
2188          * larger, this should always work on traditional filesystems; though we
2189          * could still lose on copy-on-write filesystems.
2190          */
2191         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2192         if (pfile == NULL)
2193         {
2194                 ereport(LOG,
2195                                 (errcode_for_file_access(),
2196                                  errmsg("could not write file \"%s\": %m",
2197                                                 PGSP_TEXT_FILE)));
2198                 goto gc_fail;
2199         }
2200
2201         extent = 0;
2202         nentries = 0;
2203
2204         hash_seq_init(&hash_seq, hash_table);
2205         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2206         {
2207                 int                     plan_len = entry->plan_len;
2208                 char       *plan = ptext_fetch(entry->plan_offset,
2209                                                                            plan_len,
2210                                                                            pbuffer,
2211                                                                            pbuffer_size);
2212
2213                 if (plan == NULL)
2214                 {
2215                         /* Trouble ... drop the text */
2216                         entry->plan_offset = 0;
2217                         entry->plan_len = -1;
2218                         /* entry will not be counted in mean plan length computation */
2219                         continue;
2220                 }
2221
2222                 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2223                 {
2224                         ereport(LOG,
2225                                         (errcode_for_file_access(),
2226                                          errmsg("could not write file \"%s\": %m",
2227                                                         PGSP_TEXT_FILE)));
2228                         hash_seq_term(&hash_seq);
2229                         goto gc_fail;
2230                 }
2231
2232                 entry->plan_offset = extent;
2233                 extent += plan_len + 1;
2234                 nentries++;
2235         }
2236
2237         /*
2238          * Truncate away any now-unused space.  If this fails for some odd reason,
2239          * we log it, but there's no need to fail.
2240          */
2241         if (ftruncate(fileno(pfile), extent) != 0)
2242                 ereport(LOG,
2243                                 (errcode_for_file_access(),
2244                                  errmsg("could not truncate file \"%s\": %m",
2245                                                 PGSP_TEXT_FILE)));
2246
2247         if (FreeFile(pfile))
2248         {
2249                 ereport(LOG,
2250                                 (errcode_for_file_access(),
2251                                  errmsg("could not write file \"%s\": %m",
2252                                                 PGSP_TEXT_FILE)));
2253                 pfile = NULL;
2254                 goto gc_fail;
2255         }
2256
2257         elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2258                  shared_state->extent, extent);
2259
2260         /* Reset the shared extent pointer */
2261         shared_state->extent = extent;
2262
2263         /*
2264          * Also update the mean plan length, to be sure that need_gc_ptexts()
2265          * won't still think we have a problem.
2266          */
2267         if (nentries > 0)
2268                 shared_state->mean_plan_len = extent / nentries;
2269         else
2270                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2271
2272         free(pbuffer);
2273
2274         return;
2275
2276 gc_fail:
2277         /* clean up resources */
2278         if (pfile)
2279                 FreeFile(pfile);
2280         if (pbuffer)
2281                 free(pbuffer);
2282
2283         /*
2284          * Since the contents of the external file are now uncertain, mark all
2285          * hashtable entries as having invalid texts.
2286          */
2287         hash_seq_init(&hash_seq, hash_table);
2288         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2289         {
2290                 entry->plan_offset = 0;
2291                 entry->plan_len = -1;
2292         }
2293
2294         /*
2295          * Destroy the query text file and create a new, empty one
2296          */
2297         (void) unlink(PGSP_TEXT_FILE);
2298         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2299         if (pfile == NULL)
2300                 ereport(LOG,
2301                                 (errcode_for_file_access(),
2302                                  errmsg("could not recreate file \"%s\": %m",
2303                                                 PGSP_TEXT_FILE)));
2304         else
2305                 FreeFile(pfile);
2306
2307         /* Reset the shared extent pointer */
2308         shared_state->extent = 0;
2309
2310         /* Reset mean_plan_len to match the new state */
2311         shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2312 }
2313
2314 /*
2315  * Release all entries.
2316  */
2317 static void
2318 entry_reset(void)
2319 {
2320         HASH_SEQ_STATUS hash_seq;
2321         pgspEntry  *entry;
2322         FILE       *pfile;
2323
2324         if (!shared_state || !hash_table)
2325                 ereport(ERROR,
2326                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2327                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2328
2329         LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2330
2331         hash_seq_init(&hash_seq, hash_table);
2332         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2333         {
2334                 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2335         }
2336
2337         /*
2338          * Reset global statistics for pg_store_plans.
2339          */
2340         {
2341                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2342                 TimestampTz stats_reset = GetCurrentTimestamp();
2343
2344                 SpinLockAcquire(&s->mutex);
2345                 s->stats.dealloc = 0;
2346                 s->stats.stats_reset = stats_reset;
2347                 SpinLockRelease(&s->mutex);
2348         }
2349
2350         /*
2351          * Write new empty plan file, perhaps even creating a new one to recover
2352          * if the file was missing.
2353          */
2354         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2355         if (pfile == NULL)
2356         {
2357                 ereport(LOG,
2358                                 (errcode_for_file_access(),
2359                                  errmsg("could not create file \"%s\": %m",
2360                                                 PGSP_TEXT_FILE)));
2361                 goto done;
2362         }
2363
2364         /* If ftruncate fails, log it, but it's not a fatal problem */
2365         if (ftruncate(fileno(pfile), 0) != 0)
2366                 ereport(LOG,
2367                                 (errcode_for_file_access(),
2368                                  errmsg("could not truncate file \"%s\": %m",
2369                                                 PGSP_TEXT_FILE)));
2370
2371         FreeFile(pfile);
2372
2373 done:
2374         shared_state->extent = 0;
2375         LWLockRelease(shared_state->lock);
2376 }
2377
2378 Datum
2379 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2380 {
2381         PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2382 }
2383
2384 Datum
2385 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2386 {
2387         text *short_plan = PG_GETARG_TEXT_P(0);
2388         char *cjson = text_to_cstring(short_plan);
2389         char *cshorten = pgsp_json_shorten(cjson);
2390         PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2391 }
2392
2393 Datum
2394 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2395 {
2396         text *short_plan = PG_GETARG_TEXT_P(0);
2397         char *cjson = text_to_cstring(short_plan);
2398         char *cnormalized = pgsp_json_normalize(cjson);
2399         PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2400 }
2401
2402 Datum
2403 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2404 {
2405         text *short_plan = PG_GETARG_TEXT_P(0);
2406         char *cshort = text_to_cstring(short_plan);
2407         char *cinflated = pgsp_json_inflate(cshort);
2408         PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2409 }
2410
2411 Datum
2412 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2413 {
2414         text *short_plan = PG_GETARG_TEXT_P(0);
2415         char *cshort = text_to_cstring(short_plan);
2416         char *ctextized = pgsp_json_textize(cshort);
2417
2418         PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2419 }
2420
2421 Datum
2422 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2423 {
2424         text *short_plan = PG_GETARG_TEXT_P(0);
2425         char *cshort = text_to_cstring(short_plan);
2426         char *cyamlized = pgsp_json_yamlize(cshort);
2427
2428         PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2429 }
2430
2431 Datum
2432 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2433 {
2434         text *short_plan = PG_GETARG_TEXT_P(0);
2435         char *cshort = text_to_cstring(short_plan);
2436         char *cxmlized = pgsp_json_xmlize(cshort);
2437
2438         PG_RETURN_TEXT_P(cstring_to_text(cxmlized));
2439 }