OSDN Git Service

Add statistics of pg_store_plans
[pgstoreplans/pg_store_plans.git] / pg_store_plans.c
/*-------------------------------------------------------------------------
 *
 * pg_store_plans.c
 *              Take statistics of plan selection across a whole database cluster.
 *
 * Execution costs are totaled for each distinct plan for each query,
 * and plan and queryid are kept in a shared hashtable, each record in
 * which is associated with a record in pg_stat_statements, if any, by
 * the queryid.
 *
 * Postgres 9.3 or earlier does not expose the query id, so
 * pg_store_plans needs to calculate it based on the given query
 * string using a different algorithm from pg_stat_statements, and later
 * the id will be matched against the one made from the query string
 * stored in pg_stat_statements. For that reason, queryid matching in
 * this way will fail if the query string kept in pg_stat_statements
 * is truncated in the middle.
 *
 * Plans are identified by fingerprinting plan representations in
 * "shortened" JSON format with constants and unstable values such as
 * rows, width, loops ignored. Nevertheless, stored plan entries hold
 * those values from the latest execution. Entry eviction is done in the
 * same way as in pg_stat_statements.
 *
 * Copyright (c) 2008-2020, PostgreSQL Global Development Group
 * Copyright (c) 2012-2021, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
 *
 * IDENTIFICATION
 *        pg_store_plans/pg_store_plans.c
 *
 *-------------------------------------------------------------------------
 */
33 #include "postgres.h"
34
35 #include <sys/stat.h>
36 #include <unistd.h>
37 #include <dlfcn.h>
38 #include <math.h>
39
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
44 #include "funcapi.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
58 #endif
59 #include "utils/timestamp.h"
60
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
63
64 PG_MODULE_MAGIC;
65
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE  "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE  PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
69
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
72
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
75 static int max_plan_len = 5000;
76
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration)    (1.0)
79 #define USAGE_INIT                              (1.0)   /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT             (10.0)  /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT             1024    /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR   (0.99)  /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR  (0.50)  /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT   5               /* free this % of entries at once */
85
86 /* In PostgreSQL 11, queryid becomes a uint64 internally.
87  */
88 #if PG_VERSION_NUM >= 110000
89 typedef uint64 queryid_t;
90 #define PGSP_NO_QUERYID         UINT64CONST(0)
91 #else
92 typedef uint32 queryid_t;
93 #define PGSP_NO_QUERYID         0
94 #endif
95
/*
 * Version of the extension's SQL-level API, kept so that objects created by
 * older extension versions can still be served.
 */
typedef enum pgspVersion
{
    PGSP_V1_5 = 0,
    PGSP_V1_6 = 1
} pgspVersion;
104
105 /*
106  * Hashtable key that defines the identity of a hashtable entry.  We separate
107  * queries by user and by database even if they are otherwise identical.
108  *
109  * Presently, the query encoding is fully determined by the source database
110  * and so we don't really need it to be in the key.  But that might not always
111  * be true. Anyway it's notationally convenient to pass it as part of the key.
112  */
113 typedef struct pgspHashKey
114 {
115         Oid                     userid;                 /* user OID */
116         Oid                     dbid;                   /* database OID */
117         queryid_t       queryid;                /* query identifier */
118         uint32          planid;                 /* plan identifier */
119 } pgspHashKey;
120
121 /*
122  * The actual stats counters kept within pgspEntry.
123  */
124 typedef struct Counters
125 {
126         int64           calls;                          /* # of times executed */
127         double          total_time;                     /* total execution time, in msec */
128         double          min_time;                       /* minimum execution time in msec */
129         double          max_time;                       /* maximum execution time in msec */
130         double          mean_time;                      /* mean execution time in msec */
131         double          sum_var_time;   /* sum of variances in execution time in msec */
132         int64           rows;                           /* total # of retrieved or affected rows */
133         int64           shared_blks_hit;        /* # of shared buffer hits */
134         int64           shared_blks_read;       /* # of shared disk blocks read */
135         int64           shared_blks_dirtied;/* # of shared disk blocks dirtied */
136         int64           shared_blks_written;/* # of shared disk blocks written */
137         int64           local_blks_hit;         /* # of local buffer hits */
138         int64           local_blks_read;        /* # of local disk blocks read */
139         int64           local_blks_dirtied;     /* # of local disk blocks dirtied */
140         int64           local_blks_written;     /* # of local disk blocks written */
141         int64           temp_blks_read;         /* # of temp blocks read */
142         int64           temp_blks_written;      /* # of temp blocks written */
143         double          blk_read_time;          /* time spent reading, in msec */
144         double          blk_write_time;         /* time spent writing, in msec */
145         TimestampTz     first_call;                     /* timestamp of first call  */
146         TimestampTz     last_call;                      /* timestamp of last call  */
147         double          usage;                          /* usage factor */
148 } Counters;
149
150 /*
151  * Global statistics for pg_store_plans
152  */
153 typedef struct pgspGlobalStats
154 {
155         int64           dealloc;                /* # of times entries were deallocated */
156         TimestampTz stats_reset;        /* timestamp with all stats reset */
157 } pgspGlobalStats;
158
159 /*
160  * Statistics per plan
161  *
162  * NB: see the file read/write code before changing field order here.
163  */
164 typedef struct pgspEntry
165 {
166         pgspHashKey     key;                    /* hash key of entry - MUST BE FIRST */
167         Counters        counters;               /* the statistics for this query */
168         Size            plan_offset;    /* plan text offset in extern file */
169         int                     plan_len;               /* # of valid bytes in query string */
170         int                     encoding;               /* query encoding */
171         slock_t         mutex;                  /* protects the counters only */
172 } pgspEntry;
173
174 /*
175  * Global shared state
176  */
177 typedef struct pgspSharedState
178 {
179         LWLock     *lock;                       /* protects hashtable search/modification */
180         int                     plan_size;              /* max query length in bytes */
181         double          cur_median_usage;       /* current median usage in hashtable */
182         Size            mean_plan_len;  /* current mean entry text length */
183         slock_t         mutex;                  /* protects following fields only: */
184         Size            extent;                 /* current extent of plan file */
185         int                     n_writers;              /* number of active writers to query file */
186         int                     gc_count;               /* plan file garbage collection cycle count */
187         pgspGlobalStats stats;          /* global statistics for pgsp */
188 } pgspSharedState;
189
190 /*---- Local variables ----*/
191
192 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
193 static int      nested_level = 0;
194
195 /* Saved hook values in case of unload */
196 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
197 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
198 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
199 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
200 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
201 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
202
203 /* Links to shared memory state */
204 static pgspSharedState *shared_state = NULL;
205 static HTAB *hash_table = NULL;
206
207 /*---- GUC variables ----*/
208
209 typedef enum
210 {
211         TRACK_LEVEL_NONE,                       /* track no statements */
212         TRACK_LEVEL_TOP,                                /* only top level statements */
213         TRACK_LEVEL_ALL,                                /* all statements, including nested ones */
214         TRACK_LEVEL_FORCE                       /* all statements, including nested ones */
215 }       PGSPTrackLevel;
216
217 static const struct config_enum_entry track_options[] =
218 {
219         {"none", TRACK_LEVEL_NONE, false},
220         {"top", TRACK_LEVEL_TOP, false},
221         {"all", TRACK_LEVEL_ALL, false},
222         {NULL, 0, false}
223 };
224
225 typedef enum
226 {
227         PLAN_FORMAT_RAW,                /* No conversion. Shorten JSON */
228         PLAN_FORMAT_TEXT,               /* Traditional text representation */
229         PLAN_FORMAT_JSON,               /* JSON representation */
230         PLAN_FORMAT_YAML,               /* YAML */
231         PLAN_FORMAT_XML,                /* XML  */
232 }       PGSPPlanFormats;
233
234 static const struct config_enum_entry plan_formats[] =
235 {
236         {"raw" , PLAN_FORMAT_RAW , false},
237         {"text", PLAN_FORMAT_TEXT, false},
238         {"json", PLAN_FORMAT_JSON, false},
239         {"yaml", PLAN_FORMAT_YAML, false},
240         {"xml" , PLAN_FORMAT_XML , false},
241         {NULL, 0, false}
242 };
243
244 /* options for plan storage */
245 typedef enum
246 {
247         PLAN_STORAGE_SHMEM,             /* plan is stored as a part of hash entry */
248         PLAN_STORAGE_FILE               /* plan is stored in a separate file */
249 }  pgspPlanStorage;
250
251 static const struct config_enum_entry plan_storage_options[] =
252 {
253         {"shmem", PLAN_STORAGE_SHMEM, false},
254         {"file", PLAN_STORAGE_FILE, false},
255         {NULL, 0, false}
256 };
257
258 static int      store_size;                     /* max # statements to track */
259 static int      track_level;            /* tracking level */
260 static int      min_duration;           /* min duration to record */
261 static bool dump_on_shutdown;   /* whether to save stats across shutdown */
262 static bool log_analyze;                /* Similar to EXPLAIN (ANALYZE *) */
263 static bool log_verbose;                /* Similar to EXPLAIN (VERBOSE *) */
264 static bool log_buffers;                /* Similar to EXPLAIN (BUFFERS *) */
265 static bool log_timing;                 /* Similar to EXPLAIN (TIMING *) */
266 static bool log_triggers;               /* whether to log trigger statistics  */
267 static int  plan_format;                /* Plan representation style in
268                                                                  * pg_store_plans.plan  */
269 static int  plan_storage;               /* Plan storage type */
270
/*
 * pgsp_enabled(q): true when a statement with query id "q" should be tracked
 * at the current nesting level.
 *
 * Tracking applies when track_level is TRACK_LEVEL_ALL, or when it is
 * TRACK_LEVEL_TOP and we are not inside a nested statement.
 *
 * For pg14 and later, we rely on core queryid calculation.  If it's not
 * available it means that the admin explicitly refused to compute it, for
 * performance reason or other.  In that case, we will also consider that
 * this extension is disabled.  On older versions "q" is not examined.
 *
 * The argument is parenthesized so that any expression can be passed safely.
 */
#if PG_VERSION_NUM >= 140000
#define pgsp_enabled(q) \
    ((track_level == TRACK_LEVEL_ALL || \
      (track_level == TRACK_LEVEL_TOP && nested_level == 0)) && \
     ((q) != PGSP_NO_QUERYID))
#else
#define pgsp_enabled(q) \
    (track_level == TRACK_LEVEL_ALL || \
     (track_level == TRACK_LEVEL_TOP && nested_level == 0))
#endif
287
/*
 * Address of the plan text that follows a hash entry directly in memory
 * (entries get max_plan_len extra bytes when plan_storage is shmem).
 * The argument is parenthesized so that pointer expressions such as
 * "ent + 1" are evaluated before the cast to char *.
 */
#define SHMEM_PLAN_PTR(ent) (((char *) (ent)) + sizeof(pgspEntry))
289
290 /*---- Function declarations ----*/
291
292 void            _PG_init(void);
293 void            _PG_fini(void);
294
295 Datum           pg_store_plans_reset(PG_FUNCTION_ARGS);
296 Datum           pg_store_plans_hash_query(PG_FUNCTION_ARGS);
297 Datum           pg_store_plans(PG_FUNCTION_ARGS);
298 Datum           pg_store_plans_shorten(PG_FUNCTION_ARGS);
299 Datum           pg_store_plans_normalize(PG_FUNCTION_ARGS);
300 Datum           pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
301 Datum           pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
302 Datum           pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
303 Datum           pg_store_plans_textplan(PG_FUNCTION_ARGS);
304 Datum           pg_store_plans_info(PG_FUNCTION_ARGS);
305
306 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
307 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
308 PG_FUNCTION_INFO_V1(pg_store_plans);
309 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
310 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
311 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
312 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
313 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
314 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
315 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
316 PG_FUNCTION_INFO_V1(pg_store_plans_info);
317
/*
 * Type of the completion-tag argument of the ProcessUtility hook:
 * QueryCompletion from PostgreSQL 13 on, plain char before that.
 */
#if PG_VERSION_NUM >= 130000
#define COMPTAG_TYPE QueryCompletion
#else
#define COMPTAG_TYPE char
#endif

/* Before PostgreSQL 14, map ROLE_PG_READ_ALL_STATS to the older name */
#if PG_VERSION_NUM < 140000
#define ROLE_PG_READ_ALL_STATS          DEFAULT_ROLE_READ_ALL_STATS
#endif
327
328 static void pgsp_shmem_startup(void);
329 static void pgsp_shmem_shutdown(int code, Datum arg);
330 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
331 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
332                                  ScanDirection direction,
333                                                          uint64 count, bool execute_once);
334 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
335 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
336 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
337 #if PG_VERSION_NUM >= 140000
338                                         bool readOnlyTree,
339 #endif
340                                         ProcessUtilityContext context, ParamListInfo params,
341                                         QueryEnvironment *queryEnv,
342                                         DestReceiver *dest, COMPTAG_TYPE *completionTag);
343 static uint32 hash_query(const char* query);
344 static void pgsp_store(char *plan, queryid_t queryId,
345                    double total_time, uint64 rows,
346                    const BufferUsage *bufusage);
347 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
348                                                                         pgspVersion api_version);
349 static Size shared_mem_size(void);
350 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
351                                                           bool sticky);
352 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
353                                                 int *gc_count);
354 static char *ptext_load_file(Size *buffer_size);
355 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
356                                                  Size buffer_size);
357 static bool need_gc_ptexts(void);
358 static void gc_ptexts(void);
359 static void entry_dealloc(void);
360 static void entry_reset(void);
361
362 /*
363  * Module load callback
364  */
365 void
366 _PG_init(void)
367 {
368         /*
369          * In order to create our shared memory area, we have to be loaded via
370          * shared_preload_libraries.  If not, fall out without hooking into any of
371          * the main system.  (We don't throw error here because it seems useful to
372          * allow the pg_stat_statements functions to be created even when the
373          * module isn't active.  The functions must protect themselves against
374          * being called then, however.)
375          */
376         if (!process_shared_preload_libraries_in_progress)
377                 return;
378
379 #if PG_VERSION_NUM >= 140000
380         /*
381          * Inform the postmaster that we want to enable query_id calculation if
382          * compute_query_id is set to auto.
383          */
384         EnableQueryId();
385 #endif
386
387         /*
388          * Define (or redefine) custom GUC variables.
389          */
390         DefineCustomIntVariable("pg_store_plans.max",
391           "Sets the maximum number of plans tracked by pg_store_plans.",
392                                                         NULL,
393                                                         &store_size,
394                                                         1000,
395                                                         100,
396                                                         INT_MAX,
397                                                         PGC_POSTMASTER,
398                                                         0,
399                                                         NULL,
400                                                         NULL,
401                                                         NULL);
402
403         DefineCustomIntVariable("pg_store_plans.max_plan_length",
404           "Sets the maximum length of plans stored by pg_store_plans.",
405                                                         NULL,
406                                                         &max_plan_len,
407                                                         5000,
408                                                         100,
409                                                         INT32_MAX,
410                                                         PGC_POSTMASTER,
411                                                         0,
412                                                         NULL,
413                                                         NULL,
414                                                         NULL);
415
416         DefineCustomEnumVariable("pg_store_plans.plan_storage",
417                            "Selects where to store plan texts.",
418                                                          NULL,
419                                                          &plan_storage,
420                                                          PLAN_STORAGE_FILE,
421                                                          plan_storage_options,
422                                                          PGC_USERSET,
423                                                          0,
424                                                          NULL,
425                                                          NULL,
426                                                          NULL);
427
428         DefineCustomEnumVariable("pg_store_plans.track",
429                            "Selects which plans are tracked by pg_store_plans.",
430                                                          NULL,
431                                                          &track_level,
432                                                          TRACK_LEVEL_TOP,
433                                                          track_options,
434                                                          PGC_SUSET,
435                                                          0,
436                                                          NULL,
437                                                          NULL,
438                                                          NULL);
439
440         DefineCustomEnumVariable("pg_store_plans.plan_format",
441                            "Selects which format to be appied for plan representation in pg_store_plans.",
442                                                          NULL,
443                                                          &plan_format,
444                                                          PLAN_FORMAT_TEXT,
445                                                          plan_formats,
446                                                          PGC_USERSET,
447                                                          0,
448                                                          NULL,
449                                                          NULL,
450                                                          NULL);
451
452         DefineCustomIntVariable("pg_store_plans.min_duration",
453                                         "Minimum duration to record plan in milliseconds.",
454                                                         NULL,
455                                                         &min_duration,
456                                                         0,
457                                                         0,
458                                                         INT_MAX,
459                                                         PGC_SUSET,
460                                                         0,
461                                                         NULL,
462                                                         NULL,
463                                                         NULL);
464
465         DefineCustomBoolVariable("pg_store_plans.save",
466                            "Save pg_store_plans statistics across server shutdowns.",
467                                                          NULL,
468                                                          &dump_on_shutdown,
469                                                          true,
470                                                          PGC_SIGHUP,
471                                                          0,
472                                                          NULL,
473                                                          NULL,
474                                                          NULL);
475
476         DefineCustomBoolVariable("pg_store_plans.log_analyze",
477                                                          "Use EXPLAIN ANALYZE for plan logging.",
478                                                          NULL,
479                                                          &log_analyze,
480                                                          false,
481                                                          PGC_SUSET,
482                                                          0,
483                                                          NULL,
484                                                          NULL,
485                                                          NULL);
486
487         DefineCustomBoolVariable("pg_store_plans.log_buffers",
488                                                          "Log buffer usage.",
489                                                          NULL,
490                                                          &log_buffers,
491                                                          false,
492                                                          PGC_SUSET,
493                                                          0,
494                                                          NULL,
495                                                          NULL,
496                                                          NULL);
497
498         DefineCustomBoolVariable("pg_store_plans.log_timing",
499                                                          "Log timings.",
500                                                          NULL,
501                                                          &log_timing,
502                                                          true,
503                                                          PGC_SUSET,
504                                                          0,
505                                                          NULL,
506                                                          NULL,
507                                                          NULL);
508
509         DefineCustomBoolVariable("pg_store_plans.log_triggers",
510                                                          "Log trigger trace.",
511                                                          NULL,
512                                                          &log_triggers,
513                                                          false,
514                                                          PGC_SUSET,
515                                                          0,
516                                                          NULL,
517                                                          NULL,
518                                                          NULL);
519
520         DefineCustomBoolVariable("pg_store_plans.log_verbose",
521                            "Set VERBOSE for EXPLAIN on logging.",
522                                                          NULL,
523                                                          &log_verbose,
524                                                          false,
525                                                          PGC_SUSET,
526                                                          0,
527                                                          NULL,
528                                                          NULL,
529                                                          NULL);
530
531         EmitWarningsOnPlaceholders("pg_store_plans");
532
533         /*
534          * Request additional shared resources.  (These are no-ops if we're not in
535          * the postmaster process.)  We'll allocate or attach to the shared
536          * resources in pgsp_shmem_startup().
537          */
538         RequestAddinShmemSpace(shared_mem_size());
539         RequestNamedLWLockTranche("pg_store_plans", 1);
540
541         /*
542          * Install hooks.
543          */
544         prev_shmem_startup_hook = shmem_startup_hook;
545         shmem_startup_hook = pgsp_shmem_startup;
546         prev_ExecutorStart = ExecutorStart_hook;
547         ExecutorStart_hook = pgsp_ExecutorStart;
548         prev_ExecutorRun = ExecutorRun_hook;
549         ExecutorRun_hook = pgsp_ExecutorRun;
550         prev_ExecutorFinish = ExecutorFinish_hook;
551         ExecutorFinish_hook = pgsp_ExecutorFinish;
552         prev_ExecutorEnd = ExecutorEnd_hook;
553         ExecutorEnd_hook = pgsp_ExecutorEnd;
554         prev_ProcessUtility = ProcessUtility_hook;
555         ProcessUtility_hook = pgsp_ProcessUtility;
556 }
557
558 /*
559  * Module unload callback
560  */
561 void
562 _PG_fini(void)
563 {
564         /* Uninstall hooks. */
565         shmem_startup_hook = prev_shmem_startup_hook;
566         ExecutorStart_hook = prev_ExecutorStart;
567         ExecutorRun_hook = prev_ExecutorRun;
568         ExecutorFinish_hook = prev_ExecutorFinish;
569         ExecutorEnd_hook = prev_ExecutorEnd;
570         ProcessUtility_hook = prev_ProcessUtility;
571 }
572
573 /*
574  * shmem_startup hook: allocate or attach to shared memory,
575  * then load any pre-existing statistics from file.
576  */
577 static void
578 pgsp_shmem_startup(void)
579 {
580         bool            found;
581         HASHCTL         info;
582         FILE       *file;
583         FILE       *pfile = NULL;
584         uint32          header;
585         int32           num;
586         int32           pgver;
587         int32           i;
588         int                     plan_size;
589         int                     buffer_size;
590         char       *buffer = NULL;
591
592         if (prev_shmem_startup_hook)
593                 prev_shmem_startup_hook();
594
595         /* reset in case this is a restart within the postmaster */
596         shared_state = NULL;
597         hash_table = NULL;
598
599         /*
600          * Create or attach to the shared memory state, including hash table
601          */
602         LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
603
604         shared_state = ShmemInitStruct("pg_store_plans",
605                                                    sizeof(pgspSharedState),
606                                                    &found);
607
608         if (!found)
609         {
610                 /* First time through ... */
611                 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
612                 shared_state->plan_size = max_plan_len;
613                 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
614                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
615                 SpinLockInit(&shared_state->mutex);
616                 shared_state->extent = 0;
617                 shared_state->n_writers = 0;
618                 shared_state->gc_count = 0;
619                 shared_state->stats.dealloc = 0;
620                 shared_state->stats.stats_reset = GetCurrentTimestamp();
621         }
622
623         /* Be sure everyone agrees on the hash table entry size */
624         plan_size = shared_state->plan_size;
625
626         memset(&info, 0, sizeof(info));
627         info.keysize = sizeof(pgspHashKey);
628         info.entrysize = sizeof(pgspEntry);
629         if (plan_storage == PLAN_STORAGE_SHMEM)
630                 info.entrysize += max_plan_len;
631         hash_table = ShmemInitHash("pg_store_plans hash",
632                                                           store_size, store_size,
633                                                           &info, HASH_ELEM |
634                                                           HASH_BLOBS);
635
636         LWLockRelease(AddinShmemInitLock);
637
638         /*
639          * If we're in the postmaster (or a standalone backend...), set up a shmem
640          * exit hook to dump the statistics to disk.
641          */
642         if (!IsUnderPostmaster)
643                 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
644
645         /*
646          * Done if some other process already completed our initialization.
647          */
648         if (found)
649                 return;
650
651         /*
652          * Note: we don't bother with locks here, because there should be no other
653          * processes running when this code is reached.
654          */
655
656         /* Unlink query text file possibly left over from crash */
657         unlink(PGSP_TEXT_FILE);
658
659         if (plan_storage == PLAN_STORAGE_FILE)
660         {
661                 /* Allocate new query text temp file */
662                 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
663                 if (pfile == NULL)
664                         goto write_error;
665         }
666
667         /*
668          * If we were told not to load old statistics, we're done.  (Note we do
669          * not try to unlink any old dump file in this case.  This seems a bit
670          * questionable but it's the historical behavior.)
671          */
672         if (!dump_on_shutdown)
673         {
674                 if (pfile)
675                         FreeFile(pfile);
676                 return;
677         }
678
679         /*
680          * Attempt to load old statistics from the dump file.
681          */
682         file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
683         if (file == NULL)
684         {
685                 if (errno == ENOENT)
686                         return;                         /* ignore not-found error */
687                 /* No existing persisted stats file, so we're done */
688                 goto read_error;
689         }
690
691         buffer_size = plan_size;
692         buffer = (char *) palloc(buffer_size);
693
694         if (fread(&header, sizeof(uint32), 1, file) != 1 ||
695                 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
696                 fread(&num, sizeof(int32), 1, file) != 1)
697                 goto read_error;
698
699         if (header != PGSP_FILE_HEADER ||
700                 pgver != PGSP_PG_MAJOR_VERSION)
701                 goto data_error;
702
703         for (i = 0; i < num; i++)
704         {
705                 pgspEntry       temp;
706                 pgspEntry  *entry;
707                 Size            plan_offset = 0;
708
709                 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
710                         goto read_error;
711
712                 /* Encoding is the only field we can easily sanity-check */
713                 if (!PG_VALID_BE_ENCODING(temp.encoding))
714                         goto data_error;
715
716                 /* Previous incarnation might have had a larger plan_size */
717                 if (temp.plan_len >= buffer_size)
718                 {
719                         buffer = (char *) repalloc(buffer, temp.plan_len + 1);
720                         buffer_size = temp.plan_len + 1;
721                 }
722
723                 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
724                         goto read_error;
725
726                 /* Skip loading "sticky" entries */
727                 if (temp.counters.calls == 0)
728                         continue;
729
730                 /* Clip to available length if needed */
731                 if (temp.plan_len >= plan_size)
732                         temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
733                                                                                                    buffer,
734                                                                                                    temp.plan_len,
735                                                                                                    plan_size - 1);
736
737                 buffer[temp.plan_len] = '\0';
738
739                 if (plan_storage == PLAN_STORAGE_FILE)
740                 {
741                         /* Store the plan text */
742                         plan_offset = shared_state->extent;
743                         if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
744                                 temp.plan_len + 1)
745                                 goto write_error;
746                         shared_state->extent += temp.plan_len + 1;
747                 }
748
749                 /* make the hashtable entry (discards old entries if too many) */
750                 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
751
752                 if (plan_storage == PLAN_STORAGE_SHMEM)
753                         memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
754
755                 /* copy in the actual stats */
756                 entry->counters = temp.counters;
757         }
758
759         pfree(buffer);
760         FreeFile(file);
761
762         if (pfile)
763                 FreeFile(pfile);
764
765         /*
766          * Remove the file so it's not included in backups/replication slaves,
767          * etc. A new file will be written on next shutdown.
768          */
769         unlink(PGSP_DUMP_FILE);
770
771         return;
772
773 read_error:
774         ereport(LOG,
775                         (errcode_for_file_access(),
776                          errmsg("could not read file \"%s\": %m",
777                                         PGSP_DUMP_FILE)));
778         goto fail;
779 data_error:
780         ereport(LOG,
781                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
782                          errmsg("ignoring invalid data in file \"%s\"",
783                                         PGSP_DUMP_FILE)));
784         goto fail;
785 write_error:
786         ereport(LOG,
787                         (errcode_for_file_access(),
788                          errmsg("could not write file \"%s\": %m",
789                                         PGSP_TEXT_FILE)));
790 fail:
791         if (buffer)
792                 pfree(buffer);
793         if (file)
794                 FreeFile(file);
795         if (pfile)
796                 FreeFile(pfile);
797         /* If possible, throw away the bogus file; ignore any error */
798         unlink(PGSP_DUMP_FILE);
799
800         /*
801          * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
802          * server is running with pg_stat_statements enabled
803          */
804 }
805
806 /*
807  * shmem_shutdown hook: Dump statistics into file.
808  *
809  * Note: we don't bother with acquiring lock, because there should be no
810  * other processes running when this is called.
811  */
812 static void
813 pgsp_shmem_shutdown(int code, Datum arg)
814 {
815         FILE       *file;
816         char       *pbuffer = NULL;
817         Size            pbuffer_size = 0;
818         HASH_SEQ_STATUS hash_seq;
819         int32           num_entries;
820         pgspEntry  *entry;
821
822         /* Don't try to dump during a crash. */
823         if (code)
824                 return;
825
826         /* Safety check ... shouldn't get here unless shmem is set up. */
827         if (!shared_state || !hash_table)
828                 return;
829
830         /* Don't dump if told not to. */
831         if (!dump_on_shutdown)
832                 return;
833
834         file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
835         if (file == NULL)
836                 goto error;
837
838         if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
839                 goto error;
840         if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
841                 goto error;
842         num_entries = hash_get_num_entries(hash_table);
843         if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
844                 goto error;
845
846         if (plan_storage == PLAN_STORAGE_FILE)
847         {
848                 pbuffer = ptext_load_file(&pbuffer_size);
849                 if (pbuffer == NULL)
850                         goto error;
851         }
852
853         hash_seq_init(&hash_seq, hash_table);
854         while ((entry = hash_seq_search(&hash_seq)) != NULL)
855         {
856                 int                     len = entry->plan_len;
857                 char       *pstr;
858
859                 if (plan_storage == PLAN_STORAGE_FILE)
860                         pstr = ptext_fetch(entry->plan_offset, len,
861                                                            pbuffer, pbuffer_size);
862                 else
863                         pstr = SHMEM_PLAN_PTR(entry);
864
865                 if (pstr == NULL)
866                         continue;                       /* Ignore any entries with bogus texts */
867
868                 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
869                         fwrite(pstr, 1, len + 1, file) != len + 1)
870                 {
871                         /* note: we assume hash_seq_term won't change errno */
872                         hash_seq_term(&hash_seq);
873                         goto error;
874                 }
875         }
876
877         if (FreeFile(file))
878         {
879                 file = NULL;
880                 goto error;
881         }
882
883         /*
884          * Rename file into place, so we atomically replace the old one.
885          */
886         if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
887                 ereport(LOG,
888                                 (errcode_for_file_access(),
889                                  errmsg("could not rename pg_store_plans file \"%s\": %m",
890                                                 PGSP_DUMP_FILE ".tmp")));
891
892         /* Unlink query-texts file; it's not needed while shutdown */
893         unlink(PGSP_TEXT_FILE);
894
895         return;
896
897 error:
898         ereport(LOG,
899                         (errcode_for_file_access(),
900                          errmsg("could not write pg_store_plans file \"%s\": %m",
901                                         PGSP_DUMP_FILE ".tmp")));
902         if (file)
903                 FreeFile(file);
904         unlink(PGSP_DUMP_FILE ".tmp");
905 }
906
907
908 /*
909  * ExecutorStart hook: start up tracking if needed
910  */
911 static void
912 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
913 {
914         if (log_analyze &&
915                 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
916         {
917                 queryDesc->instrument_options |=
918                         (log_timing ? INSTRUMENT_TIMER : 0)|
919                         (log_timing ? 0: INSTRUMENT_ROWS)|
920                         (log_buffers ? INSTRUMENT_BUFFERS : 0);
921         }
922
923         if (prev_ExecutorStart)
924                 prev_ExecutorStart(queryDesc, eflags);
925         else
926                 standard_ExecutorStart(queryDesc, eflags);
927
928         /*
929          * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
930          * context so as to be free at ExecutorEnd.
931          */
932         if (queryDesc->totaltime == NULL &&
933                         pgsp_enabled(queryDesc->plannedstmt->queryId))
934         {
935                 MemoryContext oldcxt;
936
937                 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
938                 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
939 #if PG_VERSION_NUM >= 140000
940                                                                                   , false
941 #endif
942                                                                                  );
943                 MemoryContextSwitchTo(oldcxt);
944         }
945
946 }
947
948 /*
949  * ExecutorRun hook: all we need do is track nesting depth
950  */
951 static void
952 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
953                                  bool execute_once)
954 {
955         nested_level++;
956         PG_TRY();
957         {
958                 if (prev_ExecutorRun)
959                         prev_ExecutorRun(queryDesc, direction, count, execute_once);
960                 else
961                         standard_ExecutorRun(queryDesc, direction, count, execute_once);
962                 nested_level--;
963         }
964         PG_CATCH();
965         {
966                 nested_level--;
967                 PG_RE_THROW();
968         }
969         PG_END_TRY();
970 }
971
972 /*
973  * ExecutorFinish hook: all we need do is track nesting depth
974  */
975 static void
976 pgsp_ExecutorFinish(QueryDesc *queryDesc)
977 {
978         nested_level++;
979         PG_TRY();
980         {
981                 if (prev_ExecutorFinish)
982                         prev_ExecutorFinish(queryDesc);
983                 else
984                         standard_ExecutorFinish(queryDesc);
985                 nested_level--;
986         }
987         PG_CATCH();
988         {
989                 nested_level--;
990                 PG_RE_THROW();
991         }
992         PG_END_TRY();
993 }
994
995 /*
996  * ExecutorEnd hook: store results if needed
997  */
998 static void
999 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1000 {
1001         if (queryDesc->totaltime)
1002         {
1003                 /*
1004                  * Make sure stats accumulation is done.  (Note: it's okay if several
1005                  * levels of hook all do this.)
1006                  */
1007                 InstrEndLoop(queryDesc->totaltime);
1008
1009                 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1010                         queryDesc->totaltime->total &&
1011                         queryDesc->totaltime->total >=
1012                         (double)min_duration / 1000.0)
1013                 {
1014                         queryid_t         queryid;
1015                         ExplainState *es;
1016                         StringInfo        es_str;
1017
1018                         es = NewExplainState();
1019                         es_str = es->str;
1020
1021                         es->analyze = queryDesc->instrument_options;
1022                         es->verbose = log_verbose;
1023                         es->buffers = (es->analyze && log_buffers);
1024                         es->timing = (es->analyze && log_timing);
1025                         es->format = EXPLAIN_FORMAT_JSON;
1026
1027                         ExplainBeginOutput(es);
1028                         ExplainPrintPlan(es, queryDesc);
1029                         if (log_triggers)
1030                                 pgspExplainTriggers(es, queryDesc);
1031                         ExplainEndOutput(es);
1032
1033                         /* Remove last line break */
1034                         if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1035                                 es_str->data[--es_str->len] = '\0';
1036
1037                         /* JSON outmost braces. */
1038                         es_str->data[0] = '{';
1039                         es_str->data[es_str->len - 1] = '}';
1040
1041                         queryid = queryDesc->plannedstmt->queryId;
1042 #if PG_VERSION_NUM < 140000
1043                         /*
1044                          * For versions before pg14, a queryid is only available if
1045                          * pg_stat_statements extension (or similar) if configured.  We
1046                          * don't want a hard requirement for such an extension so fallback
1047                          * to an internal queryid calculation in some case.
1048                          * For pg14 and above, core postgres can compute a queryid so we
1049                          * will rely on it.
1050                          */
1051                         if (queryid == PGSP_NO_QUERYID)
1052                                 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1053 #else
1054                         Assert(queryid != PGSP_NO_QUERYID);
1055 #endif
1056
1057                         pgsp_store(es_str->data,
1058                                                 queryid,
1059                                                 queryDesc->totaltime->total * 1000.0,   /* convert to msec */
1060                                                 queryDesc->estate->es_processed,
1061                                                 &queryDesc->totaltime->bufusage);
1062                         pfree(es_str->data);
1063                 }
1064         }
1065
1066         if (prev_ExecutorEnd)
1067                 prev_ExecutorEnd(queryDesc);
1068         else
1069                 standard_ExecutorEnd(queryDesc);
1070 }
1071
1072 /*
1073  * ProcessUtility hook
1074  */
1075 static void
1076 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1077 #if PG_VERSION_NUM >= 140000
1078                                         bool readOnlyTree,
1079 #endif
1080                                         ProcessUtilityContext context, ParamListInfo params,
1081                                         QueryEnvironment *queryEnv,
1082                                         DestReceiver *dest, COMPTAG_TYPE *completionTag)
1083 {
1084         if (prev_ProcessUtility)
1085                 prev_ProcessUtility(pstmt, queryString,
1086 #if PG_VERSION_NUM >= 140000
1087                                                         readOnlyTree,
1088 #endif
1089                                                         context, params, queryEnv,
1090                                                         dest, completionTag);
1091         else
1092                 standard_ProcessUtility(pstmt, queryString,
1093 #if PG_VERSION_NUM >= 140000
1094                                                                 readOnlyTree,
1095 #endif
1096                                                                 context, params, queryEnv,
1097                                                                 dest, completionTag);
1098 }
1099
1100 /*
1101  * hash_query: calculate internal query ID for a query
1102  *
1103  *  As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1104  *  queries to practical level. On the other hand pg_store_plans uses the
1105  *  combination of query hash and plan hash values as the hash table key and
1106  *  the resolution of the hash value effectively has the same degree so we
1107  *  continue to use uint32 as internal queryid.
1108  *
1109  *  This may merge plans from different queries into single internal query id
1110  *  but it is not a problem when pg_stat_statements is used together since the
1111  *  extension gives enough resolution on queries.
1112  */
1113 static uint32
1114 hash_query(const char* query)
1115 {
1116         uint32 queryid;
1117
1118         char *normquery = pstrdup(query);
1119         normalize_expr(normquery, false);
1120         queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1121         pfree(normquery);
1122
1123         /* If we are unlucky enough to get a hash of zero, use 1 instead */
1124         if (queryid == 0)
1125                 queryid = 1;
1126
1127         return queryid;
1128 }
1129
1130
1131 /*
1132  * Store some statistics for a plan.
1133  *
1134  * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1135  * value of the given plan, which is calculated in ths function.
1136  */
1137 static void
1138 pgsp_store(char *plan, queryid_t queryId,
1139                    double total_time, uint64 rows,
1140                    const BufferUsage *bufusage)
1141 {
1142         pgspHashKey key;
1143         pgspEntry  *entry;
1144         char       *norm_query = NULL;
1145         int             plan_len;
1146         char       *normalized_plan = NULL;
1147         char       *shorten_plan = NULL;
1148         volatile pgspEntry *e;
1149         Size            plan_offset = 0;
1150         bool            do_gc = false;
1151
1152         Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1153
1154         /* Safety check... */
1155         if (!shared_state || !hash_table)
1156                 return;
1157
1158         /* Set up key for hashtable search */
1159         key.userid = GetUserId();
1160         key.dbid = MyDatabaseId;
1161         key.queryid = queryId;
1162
1163         normalized_plan = pgsp_json_normalize(plan);
1164         shorten_plan = pgsp_json_shorten(plan);
1165         elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1166         elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1167         elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1168         plan_len = strlen(shorten_plan);
1169
1170         key.planid = hash_any((const unsigned char *)normalized_plan,
1171                                                   strlen(normalized_plan));
1172         pfree(normalized_plan);
1173
1174         if (plan_len >= shared_state->plan_size)
1175                 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1176                                                                                  shorten_plan,
1177                                                                                  plan_len,
1178                                                                                  shared_state->plan_size - 1);
1179
1180
1181         /* Look up the hash table entry with shared lock. */
1182         LWLockAcquire(shared_state->lock, LW_SHARED);
1183
1184         entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1185
1186         /* Store the plan text, if the entry not present */
1187         if (!entry && plan_storage == PLAN_STORAGE_FILE)
1188         {
1189                 int             gc_count;
1190                 bool    stored;
1191
1192                 /* Append new plan text to file with only shared lock held */
1193                 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1194
1195                 /*
1196                  * Determine whether we need to garbage collect external query texts
1197                  * while the shared lock is still held.  This micro-optimization
1198                  * avoids taking the time to decide this while holding exclusive lock.
1199                  */
1200                 do_gc = need_gc_ptexts();
1201
1202                 /* Acquire exclusive lock as required by entry_alloc() */
1203                 LWLockRelease(shared_state->lock);
1204                 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1205
1206                 /*
1207                  * A garbage collection may have occurred while we weren't holding the
1208                  * lock.  In the unlikely event that this happens, the plan text we
1209                  * stored above will have been garbage collected, so write it again.
1210                  * This should be infrequent enough that doing it while holding
1211                  * exclusive lock isn't a performance problem.
1212                  */
1213                 if (!stored || shared_state->gc_count != gc_count)
1214                         stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1215
1216                 /* If we failed to write to the text file, give up */
1217                 if (!stored)
1218                         goto done;
1219
1220         }
1221
1222         /* Create new entry, if not present */
1223         if (!entry)
1224         {
1225                 entry = entry_alloc(&key, plan_offset, plan_len, false);
1226
1227                 /* shorten_plan is terminated by NUL */
1228                 if (plan_storage == PLAN_STORAGE_SHMEM)
1229                         memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1230                         
1231
1232                 /* If needed, perform garbage collection while exclusive lock held */
1233                 if (do_gc)
1234                         gc_ptexts();
1235         }
1236
1237         /* Increment the counts, except when jstate is not NULL */
1238
1239         /*
1240          * Grab the spinlock while updating the counters (see comment about
1241          * locking rules at the head of the file)
1242          */
1243
1244         e = (volatile pgspEntry *) entry;
1245         SpinLockAcquire(&e->mutex);
1246
1247         /* "Unstick" entry if it was previously sticky */
1248         if (e->counters.calls == 0)
1249         {
1250                 e->counters.usage = USAGE_INIT;
1251                 e->counters.first_call = GetCurrentTimestamp();
1252         }
1253
1254         e->counters.calls += 1;
1255         e->counters.total_time += total_time;
1256         if (e->counters.calls == 1)
1257         {
1258                 e->counters.min_time = total_time;
1259                 e->counters.max_time = total_time;
1260                 e->counters.mean_time = total_time;
1261         }
1262         else
1263         {
1264                 /*
1265                  * Welford's method for accurately computing variance. See
1266                  * <http://www.johndcook.com/blog/standard_deviation/>
1267                  */
1268                 double          old_mean = e->counters.mean_time;
1269
1270                 e->counters.mean_time +=
1271                         (total_time - old_mean) / e->counters.calls;
1272                 e->counters.sum_var_time +=
1273                         (total_time - old_mean) * (total_time - e->counters.mean_time);
1274
1275                 /* calculate min and max time */
1276                 if (e->counters.min_time > total_time)
1277                         e->counters.min_time = total_time;
1278                 if (e->counters.max_time < total_time)
1279                         e->counters.max_time = total_time;
1280         }
1281
1282         e->counters.rows += rows;
1283         e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1284         e->counters.shared_blks_read += bufusage->shared_blks_read;
1285         e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1286         e->counters.shared_blks_written += bufusage->shared_blks_written;
1287         e->counters.local_blks_hit += bufusage->local_blks_hit;
1288         e->counters.local_blks_read += bufusage->local_blks_read;
1289         e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1290         e->counters.local_blks_written += bufusage->local_blks_written;
1291         e->counters.temp_blks_read += bufusage->temp_blks_read;
1292         e->counters.temp_blks_written += bufusage->temp_blks_written;
1293         e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1294         e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1295         e->counters.last_call = GetCurrentTimestamp();
1296         e->counters.usage += USAGE_EXEC(total_time);
1297
1298         SpinLockRelease(&e->mutex);
1299
1300 done:
1301         LWLockRelease(shared_state->lock);
1302
1303         /* We postpone this pfree until we're out of the lock */
1304         if (norm_query)
1305                 pfree(norm_query);
1306 }
1307
1308 /*
1309  * Reset all statement statistics.
1310  */
1311 Datum
1312 pg_store_plans_reset(PG_FUNCTION_ARGS)
1313 {
1314         if (!shared_state || !hash_table)
1315                 ereport(ERROR,
1316                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1317                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1318         entry_reset();
1319         PG_RETURN_VOID();
1320 }
1321
/*
 * Number of output arguments (columns) for various API versions.
 *
 * PG_STORE_PLANS_COLS must be at least as large as every per-version count:
 * it sizes the values[]/nulls[] arrays that pg_store_plans_internal() fills
 * for each result row.
 */
#define PG_STORE_PLANS_COLS_V1_5        27
#define PG_STORE_PLANS_COLS_V1_6        26
#define PG_STORE_PLANS_COLS                     27      /* maximum of above */
1326
/*
 * Retrieve statement statistics.
 *
 * SQL-callable entry point for the 1.6 API: delegates to
 * pg_store_plans_internal() with PGSP_V1_6.  The rows are handed back
 * through the tuplestore that pg_store_plans_internal() attaches to the
 * caller's ReturnSetInfo, so the Datum returned here is just a placeholder.
 */
Datum
pg_store_plans_1_6(PG_FUNCTION_ARGS)
{
        pg_store_plans_internal(fcinfo, PGSP_V1_6);

        /* Result set is delivered via the tuplestore, not the return value */
        return (Datum) 0;
}
1337
/*
 * SQL-callable entry point for the 1.5 API: delegates to
 * pg_store_plans_internal() with PGSP_V1_5.  The rows are handed back
 * through the tuplestore that pg_store_plans_internal() attaches to the
 * caller's ReturnSetInfo, so the Datum returned here is just a placeholder.
 */
Datum
pg_store_plans(PG_FUNCTION_ARGS)
{
        pg_store_plans_internal(fcinfo, PGSP_V1_5);

        /* Result set is delivered via the tuplestore, not the return value */
        return (Datum) 0;
}
1345
1346 static void
1347 pg_store_plans_internal(FunctionCallInfo fcinfo,
1348                                                 pgspVersion api_version)
1349 {
1350         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1351         TupleDesc       tupdesc;
1352         Tuplestorestate *tupstore;
1353         MemoryContext per_query_ctx;
1354         MemoryContext oldcontext;
1355         Oid                     userid = GetUserId();
1356         bool            is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1357         int                     n_writers;
1358         char       *pbuffer = NULL;
1359         Size            pbuffer_size = 0;
1360         Size            extent = 0;
1361         int                     gc_count = 0;
1362         HASH_SEQ_STATUS hash_seq;
1363         pgspEntry  *entry;
1364
1365         if (!shared_state || !hash_table)
1366                 ereport(ERROR,
1367                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1368                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1369
1370         /* check to see if caller supports us returning a tuplestore */
1371         if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1372                 ereport(ERROR,
1373                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1374                                  errmsg("set-valued function called in context that cannot accept a set")));
1375         if (!(rsinfo->allowedModes & SFRM_Materialize))
1376                 ereport(ERROR,
1377                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1378                                  errmsg("materialize mode required, but it is not " \
1379                                                 "allowed in this context")));
1380
1381         /* Build a tuple descriptor for our result type */
1382         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1383                 elog(ERROR, "return type must be a row type");
1384
1385         per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1386         oldcontext = MemoryContextSwitchTo(per_query_ctx);
1387
1388         tupstore = tuplestore_begin_heap(true, false, work_mem);
1389         rsinfo->returnMode = SFRM_Materialize;
1390         rsinfo->setResult = tupstore;
1391         rsinfo->setDesc = tupdesc;
1392
1393         MemoryContextSwitchTo(oldcontext);
1394
1395         /*
1396          * We'd like to load the plan text file (if needed) while not holding any
1397          * lock on shared_state->lock.  In the worst case we'll have to do this
1398          * again after we have the lock, but it's unlikely enough to make this a
1399          * win despite occasional duplicated work.  We need to reload if anybody
1400          * writes to the file (either a retail ptext_store(), or a garbage
1401          * collection) between this point and where we've gotten shared lock.  If a
1402          * ptext_store is actually in progress when we look, we might as well skip
1403          * the speculative load entirely.
1404          */
1405
1406         /* Take the mutex so we can examine variables */
1407         {
1408                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1409
1410                 SpinLockAcquire(&s->mutex);
1411                 extent = s->extent;
1412                 n_writers = s->n_writers;
1413                 gc_count = s->gc_count;
1414                 SpinLockRelease(&s->mutex);
1415         }
1416
1417         /* No point in loading file now if there are active writers */
1418         if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1419                 pbuffer = ptext_load_file(&pbuffer_size);
1420
1421         /*
1422          * Get shared lock, load or reload the plan text file if we must, and
1423          * iterate over the hashtable entries.
1424          *
1425          * With a large hash table, we might be holding the lock rather longer
1426          * than one could wish.  However, this only blocks creation of new hash
1427          * table entries, and the larger the hash table the less likely that is to
1428          * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
1429          * we need to partition the hash table to limit the time spent holding any
1430          * one lock.
1431          */
1432         LWLockAcquire(shared_state->lock, LW_SHARED);
1433
1434         /*
1435          * Here it is safe to examine extent and gc_count without taking the mutex.
1436          * Note that although other processes might change shared_state->extent
1437          * just after we look at it, the strings they then write into the file
1438          * cannot yet be referenced in the hashtable, so we don't care whether we
1439          * see them or not.
1440          *
1441          * If ptext_load_file fails, we just press on; we'll return NULL for every
1442          * plan text.
1443          */
1444         if (plan_storage == PLAN_STORAGE_FILE &&
1445                 (pbuffer == NULL ||
1446                  shared_state->extent != extent ||
1447                  shared_state->gc_count != gc_count))
1448         {
1449                 if (pbuffer)
1450                         free(pbuffer);
1451                 pbuffer = ptext_load_file(&pbuffer_size);
1452         }
1453
1454         hash_seq_init(&hash_seq, hash_table);
1455         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1456         {
1457                 Datum           values[PG_STORE_PLANS_COLS];
1458                 bool            nulls[PG_STORE_PLANS_COLS];
1459                 int                     i = 0;
1460                 int64           queryid      = entry->key.queryid;
1461                 int64           planid       = entry->key.planid;
1462                 Counters        tmp;
1463                 double          stddev;
1464
1465                 memset(values, 0, sizeof(values));
1466                 memset(nulls, 0, sizeof(nulls));
1467
1468                 values[i++] = ObjectIdGetDatum(entry->key.userid);
1469                 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1470                 if (is_allowed_role || entry->key.userid == userid)
1471                 {
1472                         values[i++] = Int64GetDatumFast(queryid);
1473                         values[i++] = Int64GetDatumFast(planid);
1474                         if (api_version == PGSP_V1_5)
1475                                 values[i++] = ObjectIdGetDatum(queryid);
1476                 }
1477                 else
1478                 {
1479                         values[i++] = Int64GetDatumFast(0);
1480                         values[i++] = Int64GetDatumFast(0);
1481                         if (api_version == PGSP_V1_5)
1482                                 values[i++] = Int64GetDatumFast(0);
1483                 }
1484
1485                 if (is_allowed_role || entry->key.userid == userid)
1486                 {
1487                         char       *pstr; /* Plan string */
1488                         char       *mstr; /* Modified plan string */
1489                         char       *estr; /* Encoded modified plan string */
1490
1491                         if (plan_storage == PLAN_STORAGE_FILE)
1492                                 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1493                                                                    pbuffer, pbuffer_size);
1494                         else
1495                                 pstr = SHMEM_PLAN_PTR(entry);
1496
1497                         switch (plan_format)
1498                         {
1499                                 case PLAN_FORMAT_TEXT:
1500                                         mstr = pgsp_json_textize(pstr);
1501                                         break;
1502                                 case PLAN_FORMAT_JSON:
1503                                         mstr = pgsp_json_inflate(pstr);
1504                                         break;
1505                                 case PLAN_FORMAT_YAML:
1506                                         mstr = pgsp_json_yamlize(pstr);
1507                                         break;
1508                                 case PLAN_FORMAT_XML:
1509                                         mstr = pgsp_json_xmlize(pstr);
1510                                         break;
1511                                 default:
1512                                         break;
1513                         }
1514
1515                         estr = (char *)
1516                                 pg_do_encoding_conversion((unsigned char *) mstr,
1517                                                                                   strlen(mstr),
1518                                                                                   entry->encoding,
1519                                                                                   GetDatabaseEncoding());
1520                         values[i++] = CStringGetTextDatum(estr);
1521
1522                         if (estr != mstr)
1523                                 pfree(estr);
1524
1525                         if (mstr != pstr)
1526                                 pfree(mstr);
1527
1528                         /* pstr is a pointer onto pbuffer */
1529                 }
1530                 else
1531                         values[i++] = CStringGetTextDatum("<insufficient privilege>");
1532
1533                 /* copy counters to a local variable to keep locking time short */
1534                 {
1535                         volatile pgspEntry *e = (volatile pgspEntry *) entry;
1536
1537                         SpinLockAcquire(&e->mutex);
1538                         tmp = e->counters;
1539                         SpinLockRelease(&e->mutex);
1540                 }
1541
1542                 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1543                 if (tmp.calls == 0)
1544                         continue;
1545
1546                 values[i++] = Int64GetDatumFast(tmp.calls);
1547                 values[i++] = Float8GetDatumFast(tmp.total_time);
1548                 values[i++] = Float8GetDatumFast(tmp.min_time);
1549                 values[i++] = Float8GetDatumFast(tmp.max_time);
1550                 values[i++] = Float8GetDatumFast(tmp.mean_time);
1551
1552                 /*
1553                  * Note we are calculating the population variance here, not the
1554                  * sample variance, as we have data for the whole population, so
1555                  * Bessel's correction is not used, and we don't divide by
1556                  * tmp.calls - 1.
1557                  */
1558                 if (tmp.calls > 1)
1559                         stddev = sqrt(tmp.sum_var_time / tmp.calls);
1560                 else
1561                         stddev = 0.0;
1562                 values[i++] = Float8GetDatumFast(stddev);
1563
1564                 values[i++] = Int64GetDatumFast(tmp.rows);
1565                 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1566                 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1567                 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1568                 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1569                 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1570                 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1571                 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1572                 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1573                 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1574                 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1575                 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1576                 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1577                 values[i++] = TimestampTzGetDatum(tmp.first_call);
1578                 values[i++] = TimestampTzGetDatum(tmp.last_call);
1579
1580                 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1581                                          api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1582                                          -1 /* fail if you forget to update this assert */ ));
1583
1584                 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1585         }
1586
1587         LWLockRelease(shared_state->lock);
1588
1589         /* clean up and return the tuplestore */
1590         tuplestore_donestoring(tupstore);
1591 }
1592
/* Number of output arguments (columns) for pg_store_plans_info */
1594 #define PG_STORE_PLANS_INFO_COLS        2
1595
1596 /*
1597  * Return statistics of pg_stat_statements.
1598  */
1599 Datum
1600 pg_store_plans_info(PG_FUNCTION_ARGS)
1601 {
1602         pgspGlobalStats stats;
1603         TupleDesc       tupdesc;
1604         Datum           values[PG_STORE_PLANS_INFO_COLS];
1605         bool            nulls[PG_STORE_PLANS_INFO_COLS];
1606
1607         if (!shared_state || !hash_table)
1608                 ereport(ERROR,
1609                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1610                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1611
1612         /* Build a tuple descriptor for our result type */
1613         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1614                 elog(ERROR, "return type must be a row type");
1615
1616         MemSet(values, 0, sizeof(values));
1617         MemSet(nulls, 0, sizeof(nulls));
1618
1619         /* Read global statistics for pg_stat_statements */
1620         {
1621                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1622
1623                 SpinLockAcquire(&s->mutex);
1624                 stats = s->stats;
1625                 SpinLockRelease(&s->mutex);
1626         }
1627
1628         values[0] = Int64GetDatum(stats.dealloc);
1629         values[1] = TimestampTzGetDatum(stats.stats_reset);
1630
1631         PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1632 }
1633
1634 /*
1635  * Estimate shared memory space needed.
1636  */
1637 static Size
1638 shared_mem_size(void)
1639 {
1640         Size            size;
1641         int                     entry_size;
1642
1643         size = MAXALIGN(sizeof(pgspSharedState));
1644         entry_size = sizeof(pgspEntry);
1645
1646         /* plan text is apppended to the struct body */
1647         if (plan_storage == PLAN_STORAGE_SHMEM)
1648                 entry_size += max_plan_len;
1649
1650         size = add_size(size, hash_estimate_size(store_size, entry_size));
1651
1652         return size;
1653 }
1654
1655 /*
1656  * Allocate a new hashtable entry.
1657  * caller must hold an exclusive lock on shared_state->lock
1658  *
1659  * "plan" need not be null-terminated; we rely on plan_len instead
1660  *
1661  * If "sticky" is true, make the new entry artificially sticky so that it will
1662  * probably still be there when the query finishes execution.  We do this by
1663  * giving it a median usage value rather than the normal value.  (Strictly
1664  * speaking, query strings are normalized on a best effort basis, though it
1665  * would be difficult to demonstrate this even under artificial conditions.)
1666  *
1667  * Note: despite needing exclusive lock, it's not an error for the target
1668  * entry to already exist.      This is because pgsp_store releases and
1669  * reacquires lock after failing to find a match; so someone else could
1670  * have made the entry while we waited to get exclusive lock.
1671  */
1672 static pgspEntry *
1673 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1674 {
1675         pgspEntry  *entry;
1676         bool            found;
1677
1678         /* Make space if needed */
1679         while (hash_get_num_entries(hash_table) >= store_size)
1680                 entry_dealloc();
1681
1682         /* Find or create an entry with desired hash code */
1683         entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1684
1685         if (!found)
1686         {
1687                 /* New entry, initialize it */
1688
1689                 /* reset the statistics */
1690                 memset(&entry->counters, 0, sizeof(Counters));
1691                 /* set the appropriate initial usage count */
1692                 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1693                 /* re-initialize the mutex each time ... we assume no one using it */
1694                 SpinLockInit(&entry->mutex);
1695                 /* ... and don't forget the query text */
1696                 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1697                 entry->plan_offset = plan_offset;
1698                 entry->plan_len = plan_len;
1699                 entry->encoding = GetDatabaseEncoding();
1700         }
1701
1702         return entry;
1703 }
1704
1705 /*
1706  * qsort comparator for sorting into increasing usage order
1707  */
1708 static int
1709 entry_cmp(const void *lhs, const void *rhs)
1710 {
1711         double          l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1712         double          r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1713
1714         if (l_usage < r_usage)
1715                 return -1;
1716         else if (l_usage > r_usage)
1717                 return +1;
1718         else
1719                 return 0;
1720 }
1721
1722 /*
1723  * Deallocate least used entries.
1724  * Caller must hold an exclusive lock on shared_state->lock.
1725  */
1726 static void
1727 entry_dealloc(void)
1728 {
1729         HASH_SEQ_STATUS hash_seq;
1730         pgspEntry **entries;
1731         pgspEntry  *entry;
1732         int                     nvictims;
1733         int                     i;
1734         Size            tottextlen;
1735         int                     nvalidtexts;
1736
1737         /*
1738          * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1739          * While we're scanning the table, apply the decay factor to the usage
1740          * values.
1741          */
1742
1743         entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1744
1745         i = 0;
1746         hash_seq_init(&hash_seq, hash_table);
1747         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1748         {
1749                 entries[i++] = entry;
1750                 /* "Sticky" entries get a different usage decay rate. */
1751                 if (entry->counters.calls == 0)
1752                         entry->counters.usage *= STICKY_DECREASE_FACTOR;
1753                 else
1754                         entry->counters.usage *= USAGE_DECREASE_FACTOR;
1755
1756                 /* In the mean length computation, ignore dropped texts. */
1757                 if (entry->plan_len >= 0)
1758                 {
1759                         tottextlen += entry->plan_len + 1;
1760                         nvalidtexts++;
1761                 }
1762         }
1763
1764         qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1765
1766         /* Also, record the (approximate) median usage */
1767         if (i > 0)
1768                 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1769         /* Record the mean plan length */
1770         if (nvalidtexts > 0)
1771                 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1772         else
1773                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1774
1775         nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1776         nvictims = Min(nvictims, i);
1777
1778         for (i = 0; i < nvictims; i++)
1779         {
1780                 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1781         }
1782
1783         pfree(entries);
1784
1785         /* Increment the number of times entries are deallocated */
1786         {
1787                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1788
1789                 SpinLockAcquire(&s->mutex);
1790                 s->stats.dealloc += 1;
1791                 SpinLockRelease(&s->mutex);
1792         }
1793 }
1794
1795 /*
1796  * Given a plan string (not necessarily null-terminated), allocate a new
1797  * entry in the external plan text file and store the string there.
1798  *
1799  * If successful, returns true, and stores the new entry's offset in the file
1800  * into *plan_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
1801  * number of garbage collections that have occurred so far.
1802  *
1803  * On failure, returns false.
1804  *
1805  * At least a shared lock on shared_state->lock must be held by the caller, so
1806  * as to prevent a concurrent garbage collection.  Share-lock-holding callers
1807  * should pass a gc_count pointer to obtain the number of garbage collections,
1808  * so that they can recheck the count after obtaining exclusive lock to detect
1809  * whether a garbage collection occurred (and removed this entry).
1810  */
1811 static bool
1812 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1813 {
1814         Size            off;
1815         int                     fd;
1816
1817         Assert (plan_storage == PLAN_STORAGE_FILE);
1818
1819         /*
1820          * We use a spinlock to protect extent/n_writers/gc_count, so that
1821          * multiple processes may execute this function concurrently.
1822          */
1823         {
1824                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1825
1826                 SpinLockAcquire(&s->mutex);
1827                 off = s->extent;
1828                 s->extent += plan_len + 1;
1829                 s->n_writers++;
1830                 if (gc_count)
1831                         *gc_count = s->gc_count;
1832                 SpinLockRelease(&s->mutex);
1833         }
1834
1835         *plan_offset = off;
1836
1837         /* Now write the data into the successfully-reserved part of the file */
1838         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1839         if (fd < 0)
1840                 goto error;
1841
1842         if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1843                 goto error;
1844         if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1845                 goto error;
1846
1847         CloseTransientFile(fd);
1848
1849         /* Mark our write complete */
1850         {
1851                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1852
1853                 SpinLockAcquire(&s->mutex);
1854                 s->n_writers--;
1855                 SpinLockRelease(&s->mutex);
1856         }
1857
1858         return true;
1859
1860 error:
1861         ereport(LOG,
1862                         (errcode_for_file_access(),
1863                          errmsg("could not write file \"%s\": %m",
1864                                         PGSP_TEXT_FILE)));
1865
1866         if (fd >= 0)
1867                 CloseTransientFile(fd);
1868
1869         /* Mark our write complete */
1870         {
1871                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1872
1873                 SpinLockAcquire(&s->mutex);
1874                 s->n_writers--;
1875                 SpinLockRelease(&s->mutex);
1876         }
1877
1878         return false;
1879 }
1880
1881 /*
1882  * Read the external plan text file into a malloc'd buffer.
1883  *
1884  * Returns NULL (without throwing an error) if unable to read, eg
1885  * file not there or insufficient memory.
1886  *
1887  * On success, the buffer size is also returned into *buffer_size.
1888  *
1889  * This can be called without any lock on shared_state->lock, but in that case
1890  * the caller is responsible for verifying that the result is sane.
1891  */
1892 static char *
1893 ptext_load_file(Size *buffer_size)
1894 {
1895         char       *buf;
1896         int                     fd;
1897         struct stat stat;
1898         Size            nread;
1899
1900         Assert (plan_storage == PLAN_STORAGE_FILE);
1901
1902         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1903         if (fd < 0)
1904         {
1905                 if (errno != ENOENT)
1906                         ereport(LOG,
1907                                         (errcode_for_file_access(),
1908                                          errmsg("could not read file \"%s\": %m",
1909                                                         PGSP_TEXT_FILE)));
1910                 return NULL;
1911         }
1912
1913         /* Get file length */
1914         if (fstat(fd, &stat))
1915         {
1916                 ereport(LOG,
1917                                 (errcode_for_file_access(),
1918                                  errmsg("could not stat file \"%s\": %m",
1919                                                 PGSP_TEXT_FILE)));
1920                 CloseTransientFile(fd);
1921                 return NULL;
1922         }
1923
1924         /* Allocate buffer; beware that off_t might be wider than size_t */
1925         if (stat.st_size <= MaxAllocHugeSize)
1926                 buf = (char *) malloc(stat.st_size);
1927         else
1928                 buf = NULL;
1929         if (buf == NULL)
1930         {
1931                 ereport(LOG,
1932                                 (errcode(ERRCODE_OUT_OF_MEMORY),
1933                                  errmsg("out of memory"),
1934                                  errdetail("Could not allocate enough memory to read file \"%s\".",
1935                                                    PGSP_TEXT_FILE)));
1936                 CloseTransientFile(fd);
1937                 return NULL;
1938         }
1939
1940         /*
1941          * OK, slurp in the file.  Windows fails if we try to read more than
1942          * INT_MAX bytes at once, and other platforms might not like that either,
1943          * so read a very large file in 1GB segments.
1944          */
1945         nread = 0;
1946         while (nread < stat.st_size)
1947         {
1948                 int                     toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
1949
1950                 /*
1951                  * If we get a short read and errno doesn't get set, the reason is
1952                  * probably that garbage collection truncated the file since we did
1953                  * the fstat(), so we don't log a complaint --- but we don't return
1954                  * the data, either, since it's most likely corrupt due to concurrent
1955                  * writes from garbage collection.
1956                  */
1957                 errno = 0;
1958                 if (read(fd, buf + nread, toread) != toread)
1959                 {
1960                         if (errno)
1961                                 ereport(LOG,
1962                                                 (errcode_for_file_access(),
1963                                                  errmsg("could not read file \"%s\": %m",
1964                                                                 PGSP_TEXT_FILE)));
1965                         free(buf);
1966                         CloseTransientFile(fd);
1967                         return NULL;
1968                 }
1969                 nread += toread;
1970         }
1971
1972         if (CloseTransientFile(fd) != 0)
1973                 ereport(LOG,
1974                                 (errcode_for_file_access(),
1975                                  errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
1976
1977         *buffer_size = nread;
1978         return buf;
1979 }
1980
1981 /*
1982  * Locate a plan text in the file image previously read by ptext_load_file().
1983  *
1984  * We validate the given offset/length, and return NULL if bogus.  Otherwise,
1985  * the result points to a null-terminated string within the buffer.
1986  */
1987 static char *
1988 ptext_fetch(Size plan_offset, int plan_len,
1989                         char *buffer, Size buffer_size)
1990 {
1991         Assert (plan_storage == PLAN_STORAGE_FILE);
1992
1993         /* File read failed? */
1994         if (buffer == NULL)
1995                 return NULL;
1996         /* Bogus offset/length? */
1997         if (plan_len < 0 ||
1998                 plan_offset + plan_len >= buffer_size)
1999                 return NULL;
2000         /* As a further sanity check, make sure there's a trailing null */
2001         if (buffer[plan_offset + plan_len] != '\0')
2002                 return NULL;
2003         /* Looks OK */
2004         return buffer + plan_offset;
2005 }
2006
2007 /*
2008  * Do we need to garbage-collect the external plan text file?
2009  *
2010  * Caller should hold at least a shared lock on shared_state->lock.
2011  */
2012 static bool
2013 need_gc_ptexts(void)
2014 {
2015         Size            extent;
2016
2017         Assert (plan_storage == PLAN_STORAGE_FILE);
2018
2019         /* Read shared extent pointer */
2020         {
2021                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2022
2023                 SpinLockAcquire(&s->mutex);
2024                 extent = s->extent;
2025                 SpinLockRelease(&s->mutex);
2026         }
2027
2028         /* Don't proceed if file does not exceed 512 bytes per possible entry */
2029         if (extent < 512 * store_size)
2030                 return false;
2031
2032         /*
2033          * Don't proceed if file is less than about 50% bloat.  Nothing can or
2034          * should be done in the event of unusually large query texts accounting
2035          * for file's large size.  We go to the trouble of maintaining the mean
2036          * query length in order to prevent garbage collection from thrashing
2037          * uselessly.
2038          */
2039         if (extent < shared_state->mean_plan_len * store_size * 2)
2040                 return false;
2041
2042         return true;
2043 }
2044
2045 /*
2046  * Garbage-collect orphaned plan texts in external file.
2047  *
2048  * This won't be called often in the typical case, since it's likely that
2049  * there won't be too much churn, and besides, a similar compaction process
2050  * occurs when serializing to disk at shutdown or as part of resetting.
2051  * Despite this, it seems prudent to plan for the edge case where the file
2052  * becomes unreasonably large, with no other method of compaction likely to
2053  * occur in the foreseeable future.
2054  *
2055  * The caller must hold an exclusive lock on shared_state->lock.
2056  *
2057  * At the first sign of trouble we unlink the query text file to get a clean
2058  * slate (although existing statistics are retained), rather than risk
2059  * thrashing by allowing the same problem case to recur indefinitely.
2060  */
2061 static void
2062 gc_ptexts(void)
2063 {
2064         char       *pbuffer;
2065         Size            pbuffer_size;
2066         FILE       *pfile = NULL;
2067         HASH_SEQ_STATUS hash_seq;
2068         pgspEntry  *entry;
2069         Size            extent;
2070         int                     nentries;
2071
2072         Assert (plan_storage == PLAN_STORAGE_FILE);
2073
2074         /*
2075          * When called from store_entry, some other session might have proceeded
2076          * with garbage collection in the no-lock-held interim of lock strength
2077          * escalation.  Check once more that this is actually necessary.
2078          */
2079         if (!need_gc_ptexts())
2080                 return;
2081
2082         /*
2083          * Load the old texts file.  If we fail (out of memory, for instance),
2084          * invalidate query texts.  Hopefully this is rare.  It might seem better
2085          * to leave things alone on an OOM failure, but the problem is that the
2086          * file is only going to get bigger; hoping for a future non-OOM result is
2087          * risky and can easily lead to complete denial of service.
2088          */
2089         pbuffer = ptext_load_file(&pbuffer_size);
2090         if (pbuffer == NULL)
2091                 goto gc_fail;
2092
2093         /*
2094          * We overwrite the plan texts file in place, so as to reduce the risk of
2095          * an out-of-disk-space failure.  Since the file is guaranteed not to get
2096          * larger, this should always work on traditional filesystems; though we
2097          * could still lose on copy-on-write filesystems.
2098          */
2099         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2100         if (pfile == NULL)
2101         {
2102                 ereport(LOG,
2103                                 (errcode_for_file_access(),
2104                                  errmsg("could not write file \"%s\": %m",
2105                                                 PGSP_TEXT_FILE)));
2106                 goto gc_fail;
2107         }
2108
2109         extent = 0;
2110         nentries = 0;
2111
2112         hash_seq_init(&hash_seq, hash_table);
2113         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2114         {
2115                 int                     plan_len = entry->plan_len;
2116                 char       *plan = ptext_fetch(entry->plan_offset,
2117                                                                            plan_len,
2118                                                                            pbuffer,
2119                                                                            pbuffer_size);
2120
2121                 if (plan == NULL)
2122                 {
2123                         /* Trouble ... drop the text */
2124                         entry->plan_offset = 0;
2125                         entry->plan_len = -1;
2126                         /* entry will not be counted in mean plan length computation */
2127                         continue;
2128                 }
2129
2130                 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2131                 {
2132                         ereport(LOG,
2133                                         (errcode_for_file_access(),
2134                                          errmsg("could not write file \"%s\": %m",
2135                                                         PGSP_TEXT_FILE)));
2136                         hash_seq_term(&hash_seq);
2137                         goto gc_fail;
2138                 }
2139
2140                 entry->plan_offset = extent;
2141                 extent += plan_len + 1;
2142                 nentries++;
2143         }
2144
2145         /*
2146          * Truncate away any now-unused space.  If this fails for some odd reason,
2147          * we log it, but there's no need to fail.
2148          */
2149         if (ftruncate(fileno(pfile), extent) != 0)
2150                 ereport(LOG,
2151                                 (errcode_for_file_access(),
2152                                  errmsg("could not truncate file \"%s\": %m",
2153                                                 PGSP_TEXT_FILE)));
2154
2155         if (FreeFile(pfile))
2156         {
2157                 ereport(LOG,
2158                                 (errcode_for_file_access(),
2159                                  errmsg("could not write file \"%s\": %m",
2160                                                 PGSP_TEXT_FILE)));
2161                 pfile = NULL;
2162                 goto gc_fail;
2163         }
2164
2165         elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2166                  shared_state->extent, extent);
2167
2168         /* Reset the shared extent pointer */
2169         shared_state->extent = extent;
2170
2171         /*
2172          * Also update the mean plan length, to be sure that need_gc_ptexts()
2173          * won't still think we have a problem.
2174          */
2175         if (nentries > 0)
2176                 shared_state->mean_plan_len = extent / nentries;
2177         else
2178                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2179
2180         free(pbuffer);
2181
2182         return;
2183
2184 gc_fail:
2185         /* clean up resources */
2186         if (pfile)
2187                 FreeFile(pfile);
2188         if (pbuffer)
2189                 free(pbuffer);
2190
2191         /*
2192          * Since the contents of the external file are now uncertain, mark all
2193          * hashtable entries as having invalid texts.
2194          */
2195         hash_seq_init(&hash_seq, hash_table);
2196         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2197         {
2198                 entry->plan_offset = 0;
2199                 entry->plan_len = -1;
2200         }
2201
2202         /*
2203          * Destroy the query text file and create a new, empty one
2204          */
2205         (void) unlink(PGSP_TEXT_FILE);
2206         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2207         if (pfile == NULL)
2208                 ereport(LOG,
2209                                 (errcode_for_file_access(),
2210                                  errmsg("could not recreate file \"%s\": %m",
2211                                                 PGSP_TEXT_FILE)));
2212         else
2213                 FreeFile(pfile);
2214
2215         /* Reset the shared extent pointer */
2216         shared_state->extent = 0;
2217
2218         /* Reset mean_plan_len to match the new state */
2219         shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2220 }
2221
2222 /*
2223  * Release all entries.
2224  */
2225 static void
2226 entry_reset(void)
2227 {
2228         HASH_SEQ_STATUS hash_seq;
2229         pgspEntry  *entry;
2230         FILE       *pfile;
2231
2232         if (!shared_state || !hash_table)
2233                 ereport(ERROR,
2234                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2235                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2236
2237         LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2238
2239         hash_seq_init(&hash_seq, hash_table);
2240         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2241         {
2242                 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2243         }
2244
2245         /*
2246          * Reset global statistics for pg_store_plans.
2247          */
2248         {
2249                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2250                 TimestampTz stats_reset = GetCurrentTimestamp();
2251
2252                 SpinLockAcquire(&s->mutex);
2253                 s->stats.dealloc = 0;
2254                 s->stats.stats_reset = stats_reset;
2255                 SpinLockRelease(&s->mutex);
2256         }
2257
2258         /*
2259          * Write new empty plan file, perhaps even creating a new one to recover
2260          * if the file was missing.
2261          */
2262         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2263         if (pfile == NULL)
2264         {
2265                 ereport(LOG,
2266                                 (errcode_for_file_access(),
2267                                  errmsg("could not create file \"%s\": %m",
2268                                                 PGSP_TEXT_FILE)));
2269                 goto done;
2270         }
2271
2272         /* If ftruncate fails, log it, but it's not a fatal problem */
2273         if (ftruncate(fileno(pfile), 0) != 0)
2274                 ereport(LOG,
2275                                 (errcode_for_file_access(),
2276                                  errmsg("could not truncate file \"%s\": %m",
2277                                                 PGSP_TEXT_FILE)));
2278
2279         FreeFile(pfile);
2280
2281 done:
2282         shared_state->extent = 0;
2283         LWLockRelease(shared_state->lock);
2284 }
2285
2286 Datum
2287 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2288 {
2289         PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2290 }
2291
2292 Datum
2293 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2294 {
2295         text *short_plan = PG_GETARG_TEXT_P(0);
2296         char *cjson = text_to_cstring(short_plan);
2297         char *cshorten = pgsp_json_shorten(cjson);
2298         PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2299 }
2300
2301 Datum
2302 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2303 {
2304         text *short_plan = PG_GETARG_TEXT_P(0);
2305         char *cjson = text_to_cstring(short_plan);
2306         char *cnormalized = pgsp_json_normalize(cjson);
2307         PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2308 }
2309
2310 Datum
2311 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2312 {
2313         text *short_plan = PG_GETARG_TEXT_P(0);
2314         char *cshort = text_to_cstring(short_plan);
2315         char *cinflated = pgsp_json_inflate(cshort);
2316         PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2317 }
2318
2319 Datum
2320 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2321 {
2322         text *short_plan = PG_GETARG_TEXT_P(0);
2323         char *cshort = text_to_cstring(short_plan);
2324         char *ctextized = pgsp_json_textize(cshort);
2325
2326         PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2327 }
2328
2329 Datum
2330 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2331 {
2332         text *short_plan = PG_GETARG_TEXT_P(0);
2333         char *cshort = text_to_cstring(short_plan);
2334         char *cyamlized = pgsp_json_yamlize(cshort);
2335
2336         PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2337 }
2338
2339 Datum
2340 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2341 {
2342         text *short_plan = PG_GETARG_TEXT_P(0);
2343         char *cshort = text_to_cstring(short_plan);
2344         char *cxmlized = pgsp_json_xmlize(cshort);
2345
2346         PG_RETURN_TEXT_P(cstring_to_text(cxmlized));
2347 }