OSDN Git Service

Return null in queryid/planid when privileges are insufficient
[pgstoreplans/pg_store_plans.git] / pg_store_plans.c
1 /*-------------------------------------------------------------------------
2  *
3  * pg_store_plans.c
4  *              Take statistics of plan selection across a whole database cluster.
5  *
6  * Execution costs are totaled for each distinct plan for each query,
7  * and plan and queryid are kept in a shared hashtable, each record in
8  * which is associated with a record in pg_stat_statements, if any, by
9  * the queryid.
10  *
11  * Postgres 9.3 and earlier do not expose the query id, so
12  * pg_store_plans needs to calculate it from the given query string
13  * using an algorithm different from that of pg_stat_statements; later
14  * the id is matched against the one computed from the query string
15  * stored in pg_stat_statements. For this reason, queryid matching done
16  * this way fails if the query string kept in pg_stat_statements
17  * is truncated in the middle.
18  *
19  * Plans are identified by fingerprinting plan representations in
20  * "shortened" JSON format with constants and unstable values such as
21  * rows, width, loops ignored. Nevertheless, stored plan entries hold
22  * the values from the latest execution. Entry eviction is done in the
23  * same way as in pg_stat_statements.
24  *
25  * Copyright (c) 2008-2020, PostgreSQL Global Development Group
26  * Copyright (c) 2012-2021, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
27  *
28  * IDENTIFICATION
29  *        pg_store_plans/pg_store_plans.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34
35 #include <sys/stat.h>
36 #include <unistd.h>
37 #include <dlfcn.h>
38 #include <math.h>
39
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
44 #include "funcapi.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
58 #endif
59 #include "utils/timestamp.h"
60
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
63
64 PG_MODULE_MAGIC;
65
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE  "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE  PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
69
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
72
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
75 static int max_plan_len = 5000;
76
/*
 * Tuning constants for the usage-based bookkeeping of entries.
 *
 * XXX: Should USAGE_EXEC reflect execution time and/or buffer usage?
 */
#define USAGE_EXEC(duration)	(1.0)

/* usage credited at entry creation, including initial planning */
#define USAGE_INIT				(1.0)

/* median usage assumed before any real value has been computed */
#define ASSUMED_MEDIAN_INIT		(10.0)

/* mean plan text length assumed before any real value has been computed */
#define ASSUMED_LENGTH_INIT		1024

/* decrease applied at every entry_dealloc */
#define USAGE_DECREASE_FACTOR	(0.99)

/* decrease factor for sticky entries */
#define STICKY_DECREASE_FACTOR	(0.50)

/* percentage of entries freed in one go */
#define USAGE_DEALLOC_PERCENT	5
85
86 /* In PostgreSQL 11, queryid becomes a uint64 internally.
87  */
88 #if PG_VERSION_NUM >= 110000
89 typedef uint64 queryid_t;
90 #define PGSP_NO_QUERYID         UINT64CONST(0)
91 #else
92 typedef uint32 queryid_t;
93 #define PGSP_NO_QUERYID         0
94 #endif
95
/*
 * API version of the extension's SQL objects.  Lets the C code keep serving
 * objects created by an older version of the extension.
 */
typedef enum pgspVersion
{
	PGSP_V1_5 = 0,
	PGSP_V1_6 = 1
} pgspVersion;
104
105 /*
106  * Hashtable key that defines the identity of a hashtable entry.  We separate
107  * queries by user and by database even if they are otherwise identical.
108  *
109  * Presently, the query encoding is fully determined by the source database
110  * and so we don't really need it to be in the key.  But that might not always
111  * be true. Anyway it's notationally convenient to pass it as part of the key.
112  */
113 typedef struct pgspHashKey
114 {
115         Oid                     userid;                 /* user OID */
116         Oid                     dbid;                   /* database OID */
117         queryid_t       queryid;                /* query identifier */
118         uint32          planid;                 /* plan identifier */
119 } pgspHashKey;
120
121 /*
122  * The actual stats counters kept within pgspEntry.
123  */
124 typedef struct Counters
125 {
126         int64           calls;                          /* # of times executed */
127         double          total_time;                     /* total execution time, in msec */
128         double          min_time;                       /* minimum execution time in msec */
129         double          max_time;                       /* maximum execution time in msec */
130         double          mean_time;                      /* mean execution time in msec */
131         double          sum_var_time;   /* sum of variances in execution time in msec */
132         int64           rows;                           /* total # of retrieved or affected rows */
133         int64           shared_blks_hit;        /* # of shared buffer hits */
134         int64           shared_blks_read;       /* # of shared disk blocks read */
135         int64           shared_blks_dirtied;/* # of shared disk blocks dirtied */
136         int64           shared_blks_written;/* # of shared disk blocks written */
137         int64           local_blks_hit;         /* # of local buffer hits */
138         int64           local_blks_read;        /* # of local disk blocks read */
139         int64           local_blks_dirtied;     /* # of local disk blocks dirtied */
140         int64           local_blks_written;     /* # of local disk blocks written */
141         int64           temp_blks_read;         /* # of temp blocks read */
142         int64           temp_blks_written;      /* # of temp blocks written */
143         double          blk_read_time;          /* time spent reading, in msec */
144         double          blk_write_time;         /* time spent writing, in msec */
145         TimestampTz     first_call;                     /* timestamp of first call  */
146         TimestampTz     last_call;                      /* timestamp of last call  */
147         double          usage;                          /* usage factor */
148 } Counters;
149
150 /*
151  * Global statistics for pg_store_plans
152  */
153 typedef struct pgspGlobalStats
154 {
155         int64           dealloc;                /* # of times entries were deallocated */
156         TimestampTz stats_reset;        /* timestamp with all stats reset */
157 } pgspGlobalStats;
158
159 /*
160  * Statistics per plan
161  *
162  * NB: see the file read/write code before changing field order here.
163  */
164 typedef struct pgspEntry
165 {
166         pgspHashKey     key;                    /* hash key of entry - MUST BE FIRST */
167         Counters        counters;               /* the statistics for this query */
168         Size            plan_offset;    /* plan text offset in extern file */
169         int                     plan_len;               /* # of valid bytes in query string */
170         int                     encoding;               /* query encoding */
171         slock_t         mutex;                  /* protects the counters only */
172 } pgspEntry;
173
174 /*
175  * Global shared state
176  */
177 typedef struct pgspSharedState
178 {
179         LWLock     *lock;                       /* protects hashtable search/modification */
180         int                     plan_size;              /* max query length in bytes */
181         double          cur_median_usage;       /* current median usage in hashtable */
182         Size            mean_plan_len;  /* current mean entry text length */
183         slock_t         mutex;                  /* protects following fields only: */
184         Size            extent;                 /* current extent of plan file */
185         int                     n_writers;              /* number of active writers to query file */
186         int                     gc_count;               /* plan file garbage collection cycle count */
187         pgspGlobalStats stats;          /* global statistics for pgsp */
188 } pgspSharedState;
189
190 /*---- Local variables ----*/
191
192 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
193 static int      nested_level = 0;
194
195 /* Saved hook values in case of unload */
196 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
197 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
198 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
199 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
200 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
201 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
202
203 /* Links to shared memory state */
204 static pgspSharedState *shared_state = NULL;
205 static HTAB *hash_table = NULL;
206
207 /*---- GUC variables ----*/
208
/*
 * Values of the pg_store_plans.track setting.
 *
 * NOTE(review): TRACK_LEVEL_FORCE has no entry in track_options, so it does
 * not appear to be selectable through the GUC - confirm whether that is
 * intended.
 */
typedef enum
{
	TRACK_LEVEL_NONE,		/* no statement is tracked */
	TRACK_LEVEL_TOP,		/* top level statements only */
	TRACK_LEVEL_ALL,		/* every statement, nested ones included */
	TRACK_LEVEL_FORCE		/* every statement, nested ones included */
} PGSPTrackLevel;
216
217 static const struct config_enum_entry track_options[] =
218 {
219         {"none", TRACK_LEVEL_NONE, false},
220         {"top", TRACK_LEVEL_TOP, false},
221         {"all", TRACK_LEVEL_ALL, false},
222         {NULL, 0, false}
223 };
224
/* Values of the pg_store_plans.plan_format setting */
typedef enum
{
	PLAN_FORMAT_RAW,		/* no conversion: shortened JSON */
	PLAN_FORMAT_TEXT,		/* traditional text representation */
	PLAN_FORMAT_JSON,		/* JSON representation */
	PLAN_FORMAT_YAML,		/* YAML representation */
	PLAN_FORMAT_XML			/* XML representation */
} PGSPPlanFormats;
233
234 static const struct config_enum_entry plan_formats[] =
235 {
236         {"raw" , PLAN_FORMAT_RAW , false},
237         {"text", PLAN_FORMAT_TEXT, false},
238         {"json", PLAN_FORMAT_JSON, false},
239         {"yaml", PLAN_FORMAT_YAML, false},
240         {"xml" , PLAN_FORMAT_XML , false},
241         {NULL, 0, false}
242 };
243
/* Values of the pg_store_plans.plan_storage setting */
typedef enum
{
	PLAN_STORAGE_SHMEM,		/* plan text is stored as part of the hash entry */
	PLAN_STORAGE_FILE		/* plan text is stored in a separate file */
} pgspPlanStorage;
250
251 static const struct config_enum_entry plan_storage_options[] =
252 {
253         {"shmem", PLAN_STORAGE_SHMEM, false},
254         {"file", PLAN_STORAGE_FILE, false},
255         {NULL, 0, false}
256 };
257
/* Backing variables of the pg_store_plans.* settings defined in _PG_init() */
static int	store_size;			/* pg_store_plans.max: max # of plans tracked */
static int	track_level;		/* pg_store_plans.track: tracking level */
static int	min_duration;		/* pg_store_plans.min_duration: minimum
								 * duration to record, in milliseconds */
static bool dump_on_shutdown;	/* pg_store_plans.save: keep statistics across
								 * server shutdowns? */
static bool log_analyze;		/* like EXPLAIN (ANALYZE) */
static bool log_verbose;		/* like EXPLAIN (VERBOSE) */
static bool log_buffers;		/* like EXPLAIN (BUFFERS) */
static bool log_timing;			/* like EXPLAIN (TIMING) */
static bool log_triggers;		/* log trigger statistics? */
static int	plan_format;		/* representation used for
								 * pg_store_plans.plan */
static int	plan_storage;		/* where plan texts are stored */
270
/*
 * pgsp_enabled(q)
 *
 * True when the current statement should be tracked: track_level is
 * TRACK_LEVEL_ALL, or it is TRACK_LEVEL_TOP and we are not inside a nested
 * statement.
 *
 * For pg14 and later, we rely on core queryid calculation.  If it's not
 * available (q equals PGSP_NO_QUERYID) it means that the admin explicitly
 * refused to compute it, for performance reason or other.  In that case, we
 * also consider this extension disabled.  Before pg14 the argument is not
 * evaluated at all.
 *
 * The argument is parenthesized in the expansion so that any expression can
 * be passed safely.
 */
#if PG_VERSION_NUM >= 140000
#define pgsp_enabled(q) \
	((track_level == TRACK_LEVEL_ALL || \
	  (track_level == TRACK_LEVEL_TOP && nested_level == 0)) && \
	 ((q) != PGSP_NO_QUERYID))
#else
#define pgsp_enabled(q) \
	(track_level == TRACK_LEVEL_ALL || \
	 (track_level == TRACK_LEVEL_TOP && nested_level == 0))
#endif
287
/*
 * Address of the bytes that directly follow a pgspEntry, i.e. the entry
 * address advanced by sizeof(pgspEntry).  The hash entry size is enlarged by
 * max_plan_len when plan_storage is "shmem", which provides room there.
 *
 * The argument is parenthesized before the cast so that an expression such
 * as "ent + 1" is evaluated as pgspEntry pointer arithmetic first.
 */
#define SHMEM_PLAN_PTR(ent) (((char *) (ent)) + sizeof(pgspEntry))
289
290 /*---- Function declarations ----*/
291
292 void            _PG_init(void);
293 void            _PG_fini(void);
294
295 Datum           pg_store_plans_reset(PG_FUNCTION_ARGS);
296 Datum           pg_store_plans_hash_query(PG_FUNCTION_ARGS);
297 Datum           pg_store_plans(PG_FUNCTION_ARGS);
298 Datum           pg_store_plans_shorten(PG_FUNCTION_ARGS);
299 Datum           pg_store_plans_normalize(PG_FUNCTION_ARGS);
300 Datum           pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
301 Datum           pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
302 Datum           pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
303 Datum           pg_store_plans_textplan(PG_FUNCTION_ARGS);
304 Datum           pg_store_plans_info(PG_FUNCTION_ARGS);
305
306 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
307 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
308 PG_FUNCTION_INFO_V1(pg_store_plans);
309 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
310 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
311 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
312 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
313 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
314 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
315 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
316 PG_FUNCTION_INFO_V1(pg_store_plans_info);
317
/*
 * Type the completionTag parameter of pgsp_ProcessUtility points to:
 * QueryCompletion from PostgreSQL 13 on, char before that.
 */
#if PG_VERSION_NUM >= 130000
#define COMPTAG_TYPE QueryCompletion
#else
#define COMPTAG_TYPE char
#endif

/* Before PostgreSQL 14, map ROLE_PG_READ_ALL_STATS to the older name */
#if PG_VERSION_NUM < 140000
#define ROLE_PG_READ_ALL_STATS		DEFAULT_ROLE_READ_ALL_STATS
#endif
327
328 static void pgsp_shmem_startup(void);
329 static void pgsp_shmem_shutdown(int code, Datum arg);
330 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
331 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
332                                  ScanDirection direction,
333                                                          uint64 count, bool execute_once);
334 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
335 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
336 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
337 #if PG_VERSION_NUM >= 140000
338                                         bool readOnlyTree,
339 #endif
340                                         ProcessUtilityContext context, ParamListInfo params,
341                                         QueryEnvironment *queryEnv,
342                                         DestReceiver *dest, COMPTAG_TYPE *completionTag);
343 static uint32 hash_query(const char* query);
344 static void pgsp_store(char *plan, queryid_t queryId,
345                    double total_time, uint64 rows,
346                    const BufferUsage *bufusage);
347 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
348                                                                         pgspVersion api_version);
349 static Size shared_mem_size(void);
350 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
351                                                           bool sticky);
352 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
353                                                 int *gc_count);
354 static char *ptext_load_file(Size *buffer_size);
355 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
356                                                  Size buffer_size);
357 static bool need_gc_ptexts(void);
358 static void gc_ptexts(void);
359 static void entry_dealloc(void);
360 static void entry_reset(void);
361
362 /*
363  * Module load callback
364  */
365 void
366 _PG_init(void)
367 {
368         /*
369          * In order to create our shared memory area, we have to be loaded via
370          * shared_preload_libraries.  If not, fall out without hooking into any of
371          * the main system.  (We don't throw error here because it seems useful to
372          * allow the pg_stat_statements functions to be created even when the
373          * module isn't active.  The functions must protect themselves against
374          * being called then, however.)
375          */
376         if (!process_shared_preload_libraries_in_progress)
377                 return;
378
379 #if PG_VERSION_NUM >= 140000
380         /*
381          * Inform the postmaster that we want to enable query_id calculation if
382          * compute_query_id is set to auto.
383          */
384         EnableQueryId();
385 #endif
386
387         /*
388          * Define (or redefine) custom GUC variables.
389          */
390         DefineCustomIntVariable("pg_store_plans.max",
391           "Sets the maximum number of plans tracked by pg_store_plans.",
392                                                         NULL,
393                                                         &store_size,
394                                                         1000,
395                                                         100,
396                                                         INT_MAX,
397                                                         PGC_POSTMASTER,
398                                                         0,
399                                                         NULL,
400                                                         NULL,
401                                                         NULL);
402
403         DefineCustomIntVariable("pg_store_plans.max_plan_length",
404           "Sets the maximum length of plans stored by pg_store_plans.",
405                                                         NULL,
406                                                         &max_plan_len,
407                                                         5000,
408                                                         100,
409                                                         INT32_MAX,
410                                                         PGC_POSTMASTER,
411                                                         0,
412                                                         NULL,
413                                                         NULL,
414                                                         NULL);
415
416         DefineCustomEnumVariable("pg_store_plans.plan_storage",
417                            "Selects where to store plan texts.",
418                                                          NULL,
419                                                          &plan_storage,
420                                                          PLAN_STORAGE_FILE,
421                                                          plan_storage_options,
422                                                          PGC_POSTMASTER,
423                                                          0,
424                                                          NULL,
425                                                          NULL,
426                                                          NULL);
427
428         DefineCustomEnumVariable("pg_store_plans.track",
429                            "Selects which plans are tracked by pg_store_plans.",
430                                                          NULL,
431                                                          &track_level,
432                                                          TRACK_LEVEL_TOP,
433                                                          track_options,
434                                                          PGC_SUSET,
435                                                          0,
436                                                          NULL,
437                                                          NULL,
438                                                          NULL);
439
440         DefineCustomEnumVariable("pg_store_plans.plan_format",
441                            "Selects which format to be appied for plan representation in pg_store_plans.",
442                                                          NULL,
443                                                          &plan_format,
444                                                          PLAN_FORMAT_TEXT,
445                                                          plan_formats,
446                                                          PGC_USERSET,
447                                                          0,
448                                                          NULL,
449                                                          NULL,
450                                                          NULL);
451
452         DefineCustomIntVariable("pg_store_plans.min_duration",
453                                         "Minimum duration to record plan in milliseconds.",
454                                                         NULL,
455                                                         &min_duration,
456                                                         0,
457                                                         0,
458                                                         INT_MAX,
459                                                         PGC_SUSET,
460                                                         0,
461                                                         NULL,
462                                                         NULL,
463                                                         NULL);
464
465         DefineCustomBoolVariable("pg_store_plans.save",
466                            "Save pg_store_plans statistics across server shutdowns.",
467                                                          NULL,
468                                                          &dump_on_shutdown,
469                                                          true,
470                                                          PGC_SIGHUP,
471                                                          0,
472                                                          NULL,
473                                                          NULL,
474                                                          NULL);
475
476         DefineCustomBoolVariable("pg_store_plans.log_analyze",
477                                                          "Use EXPLAIN ANALYZE for plan logging.",
478                                                          NULL,
479                                                          &log_analyze,
480                                                          false,
481                                                          PGC_SUSET,
482                                                          0,
483                                                          NULL,
484                                                          NULL,
485                                                          NULL);
486
487         DefineCustomBoolVariable("pg_store_plans.log_buffers",
488                                                          "Log buffer usage.",
489                                                          NULL,
490                                                          &log_buffers,
491                                                          false,
492                                                          PGC_SUSET,
493                                                          0,
494                                                          NULL,
495                                                          NULL,
496                                                          NULL);
497
498         DefineCustomBoolVariable("pg_store_plans.log_timing",
499                                                          "Log timings.",
500                                                          NULL,
501                                                          &log_timing,
502                                                          true,
503                                                          PGC_SUSET,
504                                                          0,
505                                                          NULL,
506                                                          NULL,
507                                                          NULL);
508
509         DefineCustomBoolVariable("pg_store_plans.log_triggers",
510                                                          "Log trigger trace.",
511                                                          NULL,
512                                                          &log_triggers,
513                                                          false,
514                                                          PGC_SUSET,
515                                                          0,
516                                                          NULL,
517                                                          NULL,
518                                                          NULL);
519
520         DefineCustomBoolVariable("pg_store_plans.log_verbose",
521                            "Set VERBOSE for EXPLAIN on logging.",
522                                                          NULL,
523                                                          &log_verbose,
524                                                          false,
525                                                          PGC_SUSET,
526                                                          0,
527                                                          NULL,
528                                                          NULL,
529                                                          NULL);
530
531         EmitWarningsOnPlaceholders("pg_store_plans");
532
533         /*
534          * Request additional shared resources.  (These are no-ops if we're not in
535          * the postmaster process.)  We'll allocate or attach to the shared
536          * resources in pgsp_shmem_startup().
537          */
538         RequestAddinShmemSpace(shared_mem_size());
539         RequestNamedLWLockTranche("pg_store_plans", 1);
540
541         /*
542          * Install hooks.
543          */
544         prev_shmem_startup_hook = shmem_startup_hook;
545         shmem_startup_hook = pgsp_shmem_startup;
546         prev_ExecutorStart = ExecutorStart_hook;
547         ExecutorStart_hook = pgsp_ExecutorStart;
548         prev_ExecutorRun = ExecutorRun_hook;
549         ExecutorRun_hook = pgsp_ExecutorRun;
550         prev_ExecutorFinish = ExecutorFinish_hook;
551         ExecutorFinish_hook = pgsp_ExecutorFinish;
552         prev_ExecutorEnd = ExecutorEnd_hook;
553         ExecutorEnd_hook = pgsp_ExecutorEnd;
554         prev_ProcessUtility = ProcessUtility_hook;
555         ProcessUtility_hook = pgsp_ProcessUtility;
556 }
557
558 /*
559  * Module unload callback
560  */
561 void
562 _PG_fini(void)
563 {
564         /* Uninstall hooks. */
565         shmem_startup_hook = prev_shmem_startup_hook;
566         ExecutorStart_hook = prev_ExecutorStart;
567         ExecutorRun_hook = prev_ExecutorRun;
568         ExecutorFinish_hook = prev_ExecutorFinish;
569         ExecutorEnd_hook = prev_ExecutorEnd;
570         ProcessUtility_hook = prev_ProcessUtility;
571 }
572
573 /*
574  * shmem_startup hook: allocate or attach to shared memory,
575  * then load any pre-existing statistics from file.
576  */
577 static void
578 pgsp_shmem_startup(void)
579 {
580         bool            found;
581         HASHCTL         info;
582         FILE       *file;
583         FILE       *pfile = NULL;
584         uint32          header;
585         int32           num;
586         int32           pgver;
587         int32           i;
588         int                     plan_size;
589         int                     buffer_size;
590         char       *buffer = NULL;
591
592         if (prev_shmem_startup_hook)
593                 prev_shmem_startup_hook();
594
595         /* reset in case this is a restart within the postmaster */
596         shared_state = NULL;
597         hash_table = NULL;
598
599         /*
600          * Create or attach to the shared memory state, including hash table
601          */
602         LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
603
604         shared_state = ShmemInitStruct("pg_store_plans",
605                                                    sizeof(pgspSharedState),
606                                                    &found);
607
608         if (!found)
609         {
610                 /* First time through ... */
611                 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
612                 shared_state->plan_size = max_plan_len;
613                 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
614                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
615                 SpinLockInit(&shared_state->mutex);
616                 shared_state->extent = 0;
617                 shared_state->n_writers = 0;
618                 shared_state->gc_count = 0;
619                 shared_state->stats.dealloc = 0;
620                 shared_state->stats.stats_reset = GetCurrentTimestamp();
621         }
622
623         /* Be sure everyone agrees on the hash table entry size */
624         plan_size = shared_state->plan_size;
625
626         memset(&info, 0, sizeof(info));
627         info.keysize = sizeof(pgspHashKey);
628         info.entrysize = sizeof(pgspEntry);
629         if (plan_storage == PLAN_STORAGE_SHMEM)
630                 info.entrysize += max_plan_len;
631         hash_table = ShmemInitHash("pg_store_plans hash",
632                                                           store_size, store_size,
633                                                           &info, HASH_ELEM |
634                                                           HASH_BLOBS);
635
636         LWLockRelease(AddinShmemInitLock);
637
638         /*
639          * If we're in the postmaster (or a standalone backend...), set up a shmem
640          * exit hook to dump the statistics to disk.
641          */
642         if (!IsUnderPostmaster)
643                 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
644
645         /*
646          * Done if some other process already completed our initialization.
647          */
648         if (found)
649                 return;
650
651         /*
652          * Note: we don't bother with locks here, because there should be no other
653          * processes running when this code is reached.
654          */
655
656         /* Unlink query text file possibly left over from crash */
657         unlink(PGSP_TEXT_FILE);
658
659         if (plan_storage == PLAN_STORAGE_FILE)
660         {
661                 /* Allocate new query text temp file */
662                 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
663                 if (pfile == NULL)
664                         goto write_error;
665         }
666
667         /*
668          * If we were told not to load old statistics, we're done.  (Note we do
669          * not try to unlink any old dump file in this case.  This seems a bit
670          * questionable but it's the historical behavior.)
671          */
672         if (!dump_on_shutdown)
673         {
674                 if (pfile)
675                         FreeFile(pfile);
676                 return;
677         }
678
679         /*
680          * Attempt to load old statistics from the dump file.
681          */
682         file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
683         if (file == NULL)
684         {
685                 if (errno == ENOENT)
686                         return;                         /* ignore not-found error */
687                 /* No existing persisted stats file, so we're done */
688                 goto read_error;
689         }
690
691         buffer_size = plan_size;
692         buffer = (char *) palloc(buffer_size);
693
694         if (fread(&header, sizeof(uint32), 1, file) != 1 ||
695                 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
696                 fread(&num, sizeof(int32), 1, file) != 1)
697                 goto read_error;
698
699         if (header != PGSP_FILE_HEADER ||
700                 pgver != PGSP_PG_MAJOR_VERSION)
701                 goto data_error;
702
703         for (i = 0; i < num; i++)
704         {
705                 pgspEntry       temp;
706                 pgspEntry  *entry;
707                 Size            plan_offset = 0;
708
709                 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
710                         goto read_error;
711
712                 /* Encoding is the only field we can easily sanity-check */
713                 if (!PG_VALID_BE_ENCODING(temp.encoding))
714                         goto data_error;
715
716                 /* Previous incarnation might have had a larger plan_size */
717                 if (temp.plan_len >= buffer_size)
718                 {
719                         buffer = (char *) repalloc(buffer, temp.plan_len + 1);
720                         buffer_size = temp.plan_len + 1;
721                 }
722
723                 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
724                         goto read_error;
725
726                 /* Skip loading "sticky" entries */
727                 if (temp.counters.calls == 0)
728                         continue;
729
730                 /* Clip to available length if needed */
731                 if (temp.plan_len >= plan_size)
732                         temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
733                                                                                                    buffer,
734                                                                                                    temp.plan_len,
735                                                                                                    plan_size - 1);
736
737                 buffer[temp.plan_len] = '\0';
738
739                 if (plan_storage == PLAN_STORAGE_FILE)
740                 {
741                         /* Store the plan text */
742                         plan_offset = shared_state->extent;
743                         if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
744                                 temp.plan_len + 1)
745                                 goto write_error;
746                         shared_state->extent += temp.plan_len + 1;
747                 }
748
749                 /* make the hashtable entry (discards old entries if too many) */
750                 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
751
752                 if (plan_storage == PLAN_STORAGE_SHMEM)
753                         memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
754
755                 /* copy in the actual stats */
756                 entry->counters = temp.counters;
757         }
758
759         pfree(buffer);
760         FreeFile(file);
761
762         if (pfile)
763                 FreeFile(pfile);
764
765         /*
766          * Remove the file so it's not included in backups/replication slaves,
767          * etc. A new file will be written on next shutdown.
768          */
769         unlink(PGSP_DUMP_FILE);
770
771         return;
772
773 read_error:
774         ereport(LOG,
775                         (errcode_for_file_access(),
776                          errmsg("could not read file \"%s\": %m",
777                                         PGSP_DUMP_FILE)));
778         goto fail;
779 data_error:
780         ereport(LOG,
781                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
782                          errmsg("ignoring invalid data in file \"%s\"",
783                                         PGSP_DUMP_FILE)));
784         goto fail;
785 write_error:
786         ereport(LOG,
787                         (errcode_for_file_access(),
788                          errmsg("could not write file \"%s\": %m",
789                                         PGSP_TEXT_FILE)));
790 fail:
791         if (buffer)
792                 pfree(buffer);
793         if (file)
794                 FreeFile(file);
795         if (pfile)
796                 FreeFile(pfile);
797         /* If possible, throw away the bogus file; ignore any error */
798         unlink(PGSP_DUMP_FILE);
799
800         /*
801          * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
802          * server is running with pg_store_plans enabled
803          */
804 }
805
806 /*
807  * shmem_shutdown hook: Dump statistics into file.
808  *
809  * Note: we don't bother with acquiring lock, because there should be no
810  * other processes running when this is called.
811  */
812 static void
813 pgsp_shmem_shutdown(int code, Datum arg)
814 {
815         FILE       *file;
816         char       *pbuffer = NULL;
817         Size            pbuffer_size = 0;
818         HASH_SEQ_STATUS hash_seq;
819         int32           num_entries;
820         pgspEntry  *entry;
821
822         /* Don't try to dump during a crash. */
823         if (code)
824                 return;
825
826         /* Safety check ... shouldn't get here unless shmem is set up. */
827         if (!shared_state || !hash_table)
828                 return;
829
830         /* Don't dump if told not to. */
831         if (!dump_on_shutdown)
832                 return;
833
834         file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
835         if (file == NULL)
836                 goto error;
837
838         if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
839                 goto error;
840         if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
841                 goto error;
842         num_entries = hash_get_num_entries(hash_table);
843         if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
844                 goto error;
845
846         if (plan_storage == PLAN_STORAGE_FILE)
847         {
848                 pbuffer = ptext_load_file(&pbuffer_size);
849                 if (pbuffer == NULL)
850                         goto error;
851         }
852
853         hash_seq_init(&hash_seq, hash_table);
854         while ((entry = hash_seq_search(&hash_seq)) != NULL)
855         {
856                 int                     len = entry->plan_len;
857                 char       *pstr;
858
859                 if (plan_storage == PLAN_STORAGE_FILE)
860                         pstr = ptext_fetch(entry->plan_offset, len,
861                                                            pbuffer, pbuffer_size);
862                 else
863                         pstr = SHMEM_PLAN_PTR(entry);
864
865                 if (pstr == NULL)
866                         continue;                       /* Ignore any entries with bogus texts */
867
868                 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
869                         fwrite(pstr, 1, len + 1, file) != len + 1)
870                 {
871                         /* note: we assume hash_seq_term won't change errno */
872                         hash_seq_term(&hash_seq);
873                         goto error;
874                 }
875         }
876
877         if (FreeFile(file))
878         {
879                 file = NULL;
880                 goto error;
881         }
882
883         /*
884          * Rename file into place, so we atomically replace the old one.
885          */
886         if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
887                 ereport(LOG,
888                                 (errcode_for_file_access(),
889                                  errmsg("could not rename pg_store_plans file \"%s\": %m",
890                                                 PGSP_DUMP_FILE ".tmp")));
891
892         /* Unlink query-texts file; it's not needed while shutdown */
893         unlink(PGSP_TEXT_FILE);
894
895         return;
896
897 error:
898         ereport(LOG,
899                         (errcode_for_file_access(),
900                          errmsg("could not write pg_store_plans file \"%s\": %m",
901                                         PGSP_DUMP_FILE ".tmp")));
902         if (file)
903                 FreeFile(file);
904         unlink(PGSP_DUMP_FILE ".tmp");
905 }
906
907
908 /*
909  * ExecutorStart hook: start up tracking if needed
910  */
911 static void
912 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
913 {
914         if (log_analyze &&
915                 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
916         {
917                 queryDesc->instrument_options |=
918                         (log_timing ? INSTRUMENT_TIMER : 0)|
919                         (log_timing ? 0: INSTRUMENT_ROWS)|
920                         (log_buffers ? INSTRUMENT_BUFFERS : 0);
921         }
922
923         if (prev_ExecutorStart)
924                 prev_ExecutorStart(queryDesc, eflags);
925         else
926                 standard_ExecutorStart(queryDesc, eflags);
927
928         /*
929          * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
930          * context so as to be free at ExecutorEnd.
931          */
932         if (queryDesc->totaltime == NULL &&
933                         pgsp_enabled(queryDesc->plannedstmt->queryId))
934         {
935                 MemoryContext oldcxt;
936
937                 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
938                 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
939 #if PG_VERSION_NUM >= 140000
940                                                                                   , false
941 #endif
942                                                                                  );
943                 MemoryContextSwitchTo(oldcxt);
944         }
945
946 }
947
948 /*
949  * ExecutorRun hook: all we need do is track nesting depth
950  */
951 static void
952 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
953                                  bool execute_once)
954 {
955         nested_level++;
956         PG_TRY();
957         {
958                 if (prev_ExecutorRun)
959                         prev_ExecutorRun(queryDesc, direction, count, execute_once);
960                 else
961                         standard_ExecutorRun(queryDesc, direction, count, execute_once);
962                 nested_level--;
963         }
964         PG_CATCH();
965         {
966                 nested_level--;
967                 PG_RE_THROW();
968         }
969         PG_END_TRY();
970 }
971
972 /*
973  * ExecutorFinish hook: all we need do is track nesting depth
974  */
975 static void
976 pgsp_ExecutorFinish(QueryDesc *queryDesc)
977 {
978         nested_level++;
979         PG_TRY();
980         {
981                 if (prev_ExecutorFinish)
982                         prev_ExecutorFinish(queryDesc);
983                 else
984                         standard_ExecutorFinish(queryDesc);
985                 nested_level--;
986         }
987         PG_CATCH();
988         {
989                 nested_level--;
990                 PG_RE_THROW();
991         }
992         PG_END_TRY();
993 }
994
995 /*
996  * ExecutorEnd hook: store results if needed
997  */
998 static void
999 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1000 {
1001         if (queryDesc->totaltime)
1002         {
1003                 /*
1004                  * Make sure stats accumulation is done.  (Note: it's okay if several
1005                  * levels of hook all do this.)
1006                  */
1007                 InstrEndLoop(queryDesc->totaltime);
1008
1009                 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1010                         queryDesc->totaltime->total &&
1011                         queryDesc->totaltime->total >=
1012                         (double)min_duration / 1000.0)
1013                 {
1014                         queryid_t         queryid;
1015                         ExplainState *es;
1016                         StringInfo        es_str;
1017
1018                         es = NewExplainState();
1019                         es_str = es->str;
1020
1021                         es->analyze = queryDesc->instrument_options;
1022                         es->verbose = log_verbose;
1023                         es->buffers = (es->analyze && log_buffers);
1024                         es->timing = (es->analyze && log_timing);
1025                         es->format = EXPLAIN_FORMAT_JSON;
1026
1027                         ExplainBeginOutput(es);
1028                         ExplainPrintPlan(es, queryDesc);
1029                         if (log_triggers)
1030                                 pgspExplainTriggers(es, queryDesc);
1031                         ExplainEndOutput(es);
1032
1033                         /* Remove last line break */
1034                         if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1035                                 es_str->data[--es_str->len] = '\0';
1036
1037                         /* JSON outmost braces. */
1038                         es_str->data[0] = '{';
1039                         es_str->data[es_str->len - 1] = '}';
1040
1041                         queryid = queryDesc->plannedstmt->queryId;
1042 #if PG_VERSION_NUM < 140000
1043                         /*
1044                          * For versions before pg14, a queryid is only available if
1045                          * pg_stat_statements extension (or similar) if configured.  We
1046                          * don't want a hard requirement for such an extension so fallback
1047                          * to an internal queryid calculation in some case.
1048                          * For pg14 and above, core postgres can compute a queryid so we
1049                          * will rely on it.
1050                          */
1051                         if (queryid == PGSP_NO_QUERYID)
1052                                 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1053 #else
1054                         Assert(queryid != PGSP_NO_QUERYID);
1055 #endif
1056
1057                         pgsp_store(es_str->data,
1058                                                 queryid,
1059                                                 queryDesc->totaltime->total * 1000.0,   /* convert to msec */
1060                                                 queryDesc->estate->es_processed,
1061                                                 &queryDesc->totaltime->bufusage);
1062                         pfree(es_str->data);
1063                 }
1064         }
1065
1066         if (prev_ExecutorEnd)
1067                 prev_ExecutorEnd(queryDesc);
1068         else
1069                 standard_ExecutorEnd(queryDesc);
1070 }
1071
1072 /*
1073  * ProcessUtility hook
1074  */
1075 static void
1076 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1077 #if PG_VERSION_NUM >= 140000
1078                                         bool readOnlyTree,
1079 #endif
1080                                         ProcessUtilityContext context, ParamListInfo params,
1081                                         QueryEnvironment *queryEnv,
1082                                         DestReceiver *dest, COMPTAG_TYPE *completionTag)
1083 {
1084         if (prev_ProcessUtility)
1085                 prev_ProcessUtility(pstmt, queryString,
1086 #if PG_VERSION_NUM >= 140000
1087                                                         readOnlyTree,
1088 #endif
1089                                                         context, params, queryEnv,
1090                                                         dest, completionTag);
1091         else
1092                 standard_ProcessUtility(pstmt, queryString,
1093 #if PG_VERSION_NUM >= 140000
1094                                                                 readOnlyTree,
1095 #endif
1096                                                                 context, params, queryEnv,
1097                                                                 dest, completionTag);
1098 }
1099
1100 /*
1101  * hash_query: calculate internal query ID for a query
1102  *
1103  *  As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1104  *  queries to practical level. On the other hand pg_store_plans uses the
1105  *  combination of query hash and plan hash values as the hash table key and
1106  *  the resolution of the hash value effectively has the same degree so we
1107  *  continue to use uint32 as internal queryid.
1108  *
1109  *  This may merge plans from different queries into single internal query id
1110  *  but it is not a problem when pg_stat_statements is used together since the
1111  *  extension gives enough resolution on queries.
1112  */
1113 static uint32
1114 hash_query(const char* query)
1115 {
1116         uint32 queryid;
1117
1118         char *normquery = pstrdup(query);
1119         normalize_expr(normquery, false);
1120         queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1121         pfree(normquery);
1122
1123         /* If we are unlucky enough to get a hash of zero, use 1 instead */
1124         if (queryid == 0)
1125                 queryid = 1;
1126
1127         return queryid;
1128 }
1129
1130
1131 /*
1132  * Store some statistics for a plan.
1133  *
1134  * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1135  * value of the given plan, which is calculated in ths function.
1136  */
1137 static void
1138 pgsp_store(char *plan, queryid_t queryId,
1139                    double total_time, uint64 rows,
1140                    const BufferUsage *bufusage)
1141 {
1142         pgspHashKey key;
1143         pgspEntry  *entry;
1144         char       *norm_query = NULL;
1145         int             plan_len;
1146         char       *normalized_plan = NULL;
1147         char       *shorten_plan = NULL;
1148         volatile pgspEntry *e;
1149         Size            plan_offset = 0;
1150         bool            do_gc = false;
1151
1152         Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1153
1154         /* Safety check... */
1155         if (!shared_state || !hash_table)
1156                 return;
1157
1158         /* Set up key for hashtable search */
1159         key.userid = GetUserId();
1160         key.dbid = MyDatabaseId;
1161         key.queryid = queryId;
1162
1163         normalized_plan = pgsp_json_normalize(plan);
1164         shorten_plan = pgsp_json_shorten(plan);
1165         elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1166         elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1167         elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1168         plan_len = strlen(shorten_plan);
1169
1170         key.planid = hash_any((const unsigned char *)normalized_plan,
1171                                                   strlen(normalized_plan));
1172         pfree(normalized_plan);
1173
1174         if (plan_len >= shared_state->plan_size)
1175                 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1176                                                                                  shorten_plan,
1177                                                                                  plan_len,
1178                                                                                  shared_state->plan_size - 1);
1179
1180
1181         /* Look up the hash table entry with shared lock. */
1182         LWLockAcquire(shared_state->lock, LW_SHARED);
1183
1184         entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1185
1186         /* Store the plan text, if the entry not present */
1187         if (!entry && plan_storage == PLAN_STORAGE_FILE)
1188         {
1189                 int             gc_count;
1190                 bool    stored;
1191
1192                 /* Append new plan text to file with only shared lock held */
1193                 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1194
1195                 /*
1196                  * Determine whether we need to garbage collect external query texts
1197                  * while the shared lock is still held.  This micro-optimization
1198                  * avoids taking the time to decide this while holding exclusive lock.
1199                  */
1200                 do_gc = need_gc_ptexts();
1201
1202                 /* Acquire exclusive lock as required by entry_alloc() */
1203                 LWLockRelease(shared_state->lock);
1204                 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1205
1206                 /*
1207                  * A garbage collection may have occurred while we weren't holding the
1208                  * lock.  In the unlikely event that this happens, the plan text we
1209                  * stored above will have been garbage collected, so write it again.
1210                  * This should be infrequent enough that doing it while holding
1211                  * exclusive lock isn't a performance problem.
1212                  */
1213                 if (!stored || shared_state->gc_count != gc_count)
1214                         stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1215
1216                 /* If we failed to write to the text file, give up */
1217                 if (!stored)
1218                         goto done;
1219
1220         }
1221
1222         /* Create new entry, if not present */
1223         if (!entry)
1224         {
1225                 entry = entry_alloc(&key, plan_offset, plan_len, false);
1226
1227                 /* shorten_plan is terminated by NUL */
1228                 if (plan_storage == PLAN_STORAGE_SHMEM)
1229                         memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1230                         
1231
1232                 /* If needed, perform garbage collection while exclusive lock held */
1233                 if (do_gc)
1234                         gc_ptexts();
1235         }
1236
1237         /* Increment the counts, except when jstate is not NULL */
1238
1239         /*
1240          * Grab the spinlock while updating the counters (see comment about
1241          * locking rules at the head of the file)
1242          */
1243
1244         e = (volatile pgspEntry *) entry;
1245         SpinLockAcquire(&e->mutex);
1246
1247         /* "Unstick" entry if it was previously sticky */
1248         if (e->counters.calls == 0)
1249         {
1250                 e->counters.usage = USAGE_INIT;
1251                 e->counters.first_call = GetCurrentTimestamp();
1252         }
1253
1254         e->counters.calls += 1;
1255         e->counters.total_time += total_time;
1256         if (e->counters.calls == 1)
1257         {
1258                 e->counters.min_time = total_time;
1259                 e->counters.max_time = total_time;
1260                 e->counters.mean_time = total_time;
1261         }
1262         else
1263         {
1264                 /*
1265                  * Welford's method for accurately computing variance. See
1266                  * <http://www.johndcook.com/blog/standard_deviation/>
1267                  */
1268                 double          old_mean = e->counters.mean_time;
1269
1270                 e->counters.mean_time +=
1271                         (total_time - old_mean) / e->counters.calls;
1272                 e->counters.sum_var_time +=
1273                         (total_time - old_mean) * (total_time - e->counters.mean_time);
1274
1275                 /* calculate min and max time */
1276                 if (e->counters.min_time > total_time)
1277                         e->counters.min_time = total_time;
1278                 if (e->counters.max_time < total_time)
1279                         e->counters.max_time = total_time;
1280         }
1281
1282         e->counters.rows += rows;
1283         e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1284         e->counters.shared_blks_read += bufusage->shared_blks_read;
1285         e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1286         e->counters.shared_blks_written += bufusage->shared_blks_written;
1287         e->counters.local_blks_hit += bufusage->local_blks_hit;
1288         e->counters.local_blks_read += bufusage->local_blks_read;
1289         e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1290         e->counters.local_blks_written += bufusage->local_blks_written;
1291         e->counters.temp_blks_read += bufusage->temp_blks_read;
1292         e->counters.temp_blks_written += bufusage->temp_blks_written;
1293         e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1294         e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1295         e->counters.last_call = GetCurrentTimestamp();
1296         e->counters.usage += USAGE_EXEC(total_time);
1297
1298         SpinLockRelease(&e->mutex);
1299
1300 done:
1301         LWLockRelease(shared_state->lock);
1302
1303         /* We postpone this pfree until we're out of the lock */
1304         if (norm_query)
1305                 pfree(norm_query);
1306 }
1307
1308 /*
1309  * Reset all statement statistics.
1310  */
1311 Datum
1312 pg_store_plans_reset(PG_FUNCTION_ARGS)
1313 {
1314         if (!shared_state || !hash_table)
1315                 ereport(ERROR,
1316                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1317                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1318         entry_reset();
1319         PG_RETURN_VOID();
1320 }
1321
1322 /* Number of output arguments (columns) for each supported API version */
1323 #define PG_STORE_PLANS_COLS_V1_5        27
1324 #define PG_STORE_PLANS_COLS_V1_6        26
1325 #define PG_STORE_PLANS_COLS                     27      /* maximum of above; sizes the values[]/nulls[] arrays */
1326
1327 /*
1328  * Retrieve statement statistics.
1329  */
1330 Datum
1331 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1332 {
1333         pg_store_plans_internal(fcinfo, PGSP_V1_6);
1334
1335         return (Datum) 0;
1336 }
1337
1338 Datum
1339 pg_store_plans(PG_FUNCTION_ARGS)
1340 {
1341         pg_store_plans_internal(fcinfo, PGSP_V1_5);
1342
1343         return (Datum) 0;
1344 }
1345
1346 static void
1347 pg_store_plans_internal(FunctionCallInfo fcinfo,
1348                                                 pgspVersion api_version)
1349 {
1350         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1351         TupleDesc       tupdesc;
1352         Tuplestorestate *tupstore;
1353         MemoryContext per_query_ctx;
1354         MemoryContext oldcontext;
1355         Oid                     userid = GetUserId();
1356         bool            is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1357         int                     n_writers;
1358         char       *pbuffer = NULL;
1359         Size            pbuffer_size = 0;
1360         Size            extent = 0;
1361         int                     gc_count = 0;
1362         HASH_SEQ_STATUS hash_seq;
1363         pgspEntry  *entry;
1364
1365         if (!shared_state || !hash_table)
1366                 ereport(ERROR,
1367                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1368                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1369
1370         /* check to see if caller supports us returning a tuplestore */
1371         if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1372                 ereport(ERROR,
1373                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1374                                  errmsg("set-valued function called in context that cannot accept a set")));
1375         if (!(rsinfo->allowedModes & SFRM_Materialize))
1376                 ereport(ERROR,
1377                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1378                                  errmsg("materialize mode required, but it is not " \
1379                                                 "allowed in this context")));
1380
1381         /* Build a tuple descriptor for our result type */
1382         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1383                 elog(ERROR, "return type must be a row type");
1384
1385         per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1386         oldcontext = MemoryContextSwitchTo(per_query_ctx);
1387
1388         tupstore = tuplestore_begin_heap(true, false, work_mem);
1389         rsinfo->returnMode = SFRM_Materialize;
1390         rsinfo->setResult = tupstore;
1391         rsinfo->setDesc = tupdesc;
1392
1393         MemoryContextSwitchTo(oldcontext);
1394
1395         /*
1396          * We'd like to load the plan text file (if needed) while not holding any
1397          * lock on shared_state->lock.  In the worst case we'll have to do this
1398          * again after we have the lock, but it's unlikely enough to make this a
1399          * win despite occasional duplicated work.  We need to reload if anybody
1400          * writes to the file (either a retail ptext_store(), or a garbage
1401          * collection) between this point and where we've gotten shared lock.  If a
1402          * ptext_store is actually in progress when we look, we might as well skip
1403          * the speculative load entirely.
1404          */
1405
1406         /* Take the mutex so we can examine variables */
1407         {
1408                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1409
1410                 SpinLockAcquire(&s->mutex);
1411                 extent = s->extent;
1412                 n_writers = s->n_writers;
1413                 gc_count = s->gc_count;
1414                 SpinLockRelease(&s->mutex);
1415         }
1416
1417         /* No point in loading file now if there are active writers */
1418         if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1419                 pbuffer = ptext_load_file(&pbuffer_size);
1420
1421         /*
1422          * Get shared lock, load or reload the plan text file if we must, and
1423          * iterate over the hashtable entries.
1424          *
1425          * With a large hash table, we might be holding the lock rather longer
1426          * than one could wish.  However, this only blocks creation of new hash
1427          * table entries, and the larger the hash table the less likely that is to
1428          * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
1429          * we need to partition the hash table to limit the time spent holding any
1430          * one lock.
1431          */
1432         LWLockAcquire(shared_state->lock, LW_SHARED);
1433
1434         /*
1435          * Here it is safe to examine extent and gc_count without taking the mutex.
1436          * Note that although other processes might change shared_state->extent
1437          * just after we look at it, the strings they then write into the file
1438          * cannot yet be referenced in the hashtable, so we don't care whether we
1439          * see them or not.
1440          *
1441          * If ptext_load_file fails, we just press on; we'll return NULL for every
1442          * plan text.
1443          */
1444         if (plan_storage == PLAN_STORAGE_FILE &&
1445                 (pbuffer == NULL ||
1446                  shared_state->extent != extent ||
1447                  shared_state->gc_count != gc_count))
1448         {
1449                 if (pbuffer)
1450                         free(pbuffer);
1451                 pbuffer = ptext_load_file(&pbuffer_size);
1452         }
1453
1454         hash_seq_init(&hash_seq, hash_table);
1455         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1456         {
1457                 Datum           values[PG_STORE_PLANS_COLS];
1458                 bool            nulls[PG_STORE_PLANS_COLS];
1459                 int                     i = 0;
1460                 int64           queryid      = entry->key.queryid;
1461                 int64           planid       = entry->key.planid;
1462                 Counters        tmp;
1463                 double          stddev;
1464
1465                 memset(values, 0, sizeof(values));
1466                 memset(nulls, 0, sizeof(nulls));
1467
1468                 values[i++] = ObjectIdGetDatum(entry->key.userid);
1469                 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1470                 if (is_allowed_role || entry->key.userid == userid)
1471                 {
1472                         values[i++] = Int64GetDatumFast(queryid);
1473                         values[i++] = Int64GetDatumFast(planid);
1474
1475                         /* fill queryid_stat_statements with the same value with queryid */
1476                         if (api_version == PGSP_V1_5)
1477                                 values[i++] = Int64GetDatumFast(queryid);
1478                 }
1479                 else
1480                 {
1481                         nulls[i++] = true;      /* queryid */
1482                         nulls[i++] = true;      /* planid */
1483
1484                         /* queryid_stat_statemetns*/
1485                         if (api_version == PGSP_V1_5)
1486                                 nulls[i++] = true;
1487                 }
1488
1489                 if (is_allowed_role || entry->key.userid == userid)
1490                 {
1491                         char       *pstr; /* Plan string */
1492                         char       *mstr; /* Modified plan string */
1493                         char       *estr; /* Encoded modified plan string */
1494
1495                         if (plan_storage == PLAN_STORAGE_FILE)
1496                                 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1497                                                                    pbuffer, pbuffer_size);
1498                         else
1499                                 pstr = SHMEM_PLAN_PTR(entry);
1500
1501                         switch (plan_format)
1502                         {
1503                                 case PLAN_FORMAT_TEXT:
1504                                         mstr = pgsp_json_textize(pstr);
1505                                         break;
1506                                 case PLAN_FORMAT_JSON:
1507                                         mstr = pgsp_json_inflate(pstr);
1508                                         break;
1509                                 case PLAN_FORMAT_YAML:
1510                                         mstr = pgsp_json_yamlize(pstr);
1511                                         break;
1512                                 case PLAN_FORMAT_XML:
1513                                         mstr = pgsp_json_xmlize(pstr);
1514                                         break;
1515                                 default:
1516                                         break;
1517                         }
1518
1519                         estr = (char *)
1520                                 pg_do_encoding_conversion((unsigned char *) mstr,
1521                                                                                   strlen(mstr),
1522                                                                                   entry->encoding,
1523                                                                                   GetDatabaseEncoding());
1524                         values[i++] = CStringGetTextDatum(estr);
1525
1526                         if (estr != mstr)
1527                                 pfree(estr);
1528
1529                         if (mstr != pstr)
1530                                 pfree(mstr);
1531
1532                         /* pstr is a pointer onto pbuffer */
1533                 }
1534                 else
1535                         values[i++] = CStringGetTextDatum("<insufficient privilege>");
1536
1537                 /* copy counters to a local variable to keep locking time short */
1538                 {
1539                         volatile pgspEntry *e = (volatile pgspEntry *) entry;
1540
1541                         SpinLockAcquire(&e->mutex);
1542                         tmp = e->counters;
1543                         SpinLockRelease(&e->mutex);
1544                 }
1545
1546                 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1547                 if (tmp.calls == 0)
1548                         continue;
1549
1550                 values[i++] = Int64GetDatumFast(tmp.calls);
1551                 values[i++] = Float8GetDatumFast(tmp.total_time);
1552                 values[i++] = Float8GetDatumFast(tmp.min_time);
1553                 values[i++] = Float8GetDatumFast(tmp.max_time);
1554                 values[i++] = Float8GetDatumFast(tmp.mean_time);
1555
1556                 /*
1557                  * Note we are calculating the population variance here, not the
1558                  * sample variance, as we have data for the whole population, so
1559                  * Bessel's correction is not used, and we don't divide by
1560                  * tmp.calls - 1.
1561                  */
1562                 if (tmp.calls > 1)
1563                         stddev = sqrt(tmp.sum_var_time / tmp.calls);
1564                 else
1565                         stddev = 0.0;
1566                 values[i++] = Float8GetDatumFast(stddev);
1567
1568                 values[i++] = Int64GetDatumFast(tmp.rows);
1569                 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1570                 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1571                 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1572                 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1573                 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1574                 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1575                 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1576                 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1577                 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1578                 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1579                 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1580                 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1581                 values[i++] = TimestampTzGetDatum(tmp.first_call);
1582                 values[i++] = TimestampTzGetDatum(tmp.last_call);
1583
1584                 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1585                                          api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1586                                          -1 /* fail if you forget to update this assert */ ));
1587
1588                 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1589         }
1590
1591         LWLockRelease(shared_state->lock);
1592
1593         /* clean up and return the tuplestore */
1594         tuplestore_donestoring(tupstore);
1595 }
1596
1597 /* Number of output arguments (columns) for pg_stat_statements_info */
1598 #define PG_STORE_PLANS_INFO_COLS        2
1599
1600 /*
1601  * Return statistics of pg_stat_statements.
1602  */
1603 Datum
1604 pg_store_plans_info(PG_FUNCTION_ARGS)
1605 {
1606         pgspGlobalStats stats;
1607         TupleDesc       tupdesc;
1608         Datum           values[PG_STORE_PLANS_INFO_COLS];
1609         bool            nulls[PG_STORE_PLANS_INFO_COLS];
1610
1611         if (!shared_state || !hash_table)
1612                 ereport(ERROR,
1613                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1614                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1615
1616         /* Build a tuple descriptor for our result type */
1617         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1618                 elog(ERROR, "return type must be a row type");
1619
1620         MemSet(values, 0, sizeof(values));
1621         MemSet(nulls, 0, sizeof(nulls));
1622
1623         /* Read global statistics for pg_stat_statements */
1624         {
1625                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1626
1627                 SpinLockAcquire(&s->mutex);
1628                 stats = s->stats;
1629                 SpinLockRelease(&s->mutex);
1630         }
1631
1632         values[0] = Int64GetDatum(stats.dealloc);
1633         values[1] = TimestampTzGetDatum(stats.stats_reset);
1634
1635         PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1636 }
1637
1638 /*
1639  * Estimate shared memory space needed.
1640  */
1641 static Size
1642 shared_mem_size(void)
1643 {
1644         Size            size;
1645         int                     entry_size;
1646
1647         size = MAXALIGN(sizeof(pgspSharedState));
1648         entry_size = sizeof(pgspEntry);
1649
1650         /* plan text is apppended to the struct body */
1651         if (plan_storage == PLAN_STORAGE_SHMEM)
1652                 entry_size += max_plan_len;
1653
1654         size = add_size(size, hash_estimate_size(store_size, entry_size));
1655
1656         return size;
1657 }
1658
1659 /*
1660  * Allocate a new hashtable entry.
1661  * caller must hold an exclusive lock on shared_state->lock
1662  *
1663  * "plan" need not be null-terminated; we rely on plan_len instead
1664  *
1665  * If "sticky" is true, make the new entry artificially sticky so that it will
1666  * probably still be there when the query finishes execution.  We do this by
1667  * giving it a median usage value rather than the normal value.  (Strictly
1668  * speaking, query strings are normalized on a best effort basis, though it
1669  * would be difficult to demonstrate this even under artificial conditions.)
1670  *
1671  * Note: despite needing exclusive lock, it's not an error for the target
1672  * entry to already exist.      This is because pgsp_store releases and
1673  * reacquires lock after failing to find a match; so someone else could
1674  * have made the entry while we waited to get exclusive lock.
1675  */
1676 static pgspEntry *
1677 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1678 {
1679         pgspEntry  *entry;
1680         bool            found;
1681
1682         /* Make space if needed */
1683         while (hash_get_num_entries(hash_table) >= store_size)
1684                 entry_dealloc();
1685
1686         /* Find or create an entry with desired hash code */
1687         entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1688
1689         if (!found)
1690         {
1691                 /* New entry, initialize it */
1692
1693                 /* reset the statistics */
1694                 memset(&entry->counters, 0, sizeof(Counters));
1695                 /* set the appropriate initial usage count */
1696                 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1697                 /* re-initialize the mutex each time ... we assume no one using it */
1698                 SpinLockInit(&entry->mutex);
1699                 /* ... and don't forget the query text */
1700                 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1701                 entry->plan_offset = plan_offset;
1702                 entry->plan_len = plan_len;
1703                 entry->encoding = GetDatabaseEncoding();
1704         }
1705
1706         return entry;
1707 }
1708
1709 /*
1710  * qsort comparator for sorting into increasing usage order
1711  */
1712 static int
1713 entry_cmp(const void *lhs, const void *rhs)
1714 {
1715         double          l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1716         double          r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1717
1718         if (l_usage < r_usage)
1719                 return -1;
1720         else if (l_usage > r_usage)
1721                 return +1;
1722         else
1723                 return 0;
1724 }
1725
1726 /*
1727  * Deallocate least used entries.
1728  * Caller must hold an exclusive lock on shared_state->lock.
1729  */
1730 static void
1731 entry_dealloc(void)
1732 {
1733         HASH_SEQ_STATUS hash_seq;
1734         pgspEntry **entries;
1735         pgspEntry  *entry;
1736         int                     nvictims;
1737         int                     i;
1738         Size            tottextlen;
1739         int                     nvalidtexts;
1740
1741         /*
1742          * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1743          * While we're scanning the table, apply the decay factor to the usage
1744          * values.
1745          */
1746
1747         entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1748
1749         i = 0;
1750         hash_seq_init(&hash_seq, hash_table);
1751         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1752         {
1753                 entries[i++] = entry;
1754                 /* "Sticky" entries get a different usage decay rate. */
1755                 if (entry->counters.calls == 0)
1756                         entry->counters.usage *= STICKY_DECREASE_FACTOR;
1757                 else
1758                         entry->counters.usage *= USAGE_DECREASE_FACTOR;
1759
1760                 /* In the mean length computation, ignore dropped texts. */
1761                 if (entry->plan_len >= 0)
1762                 {
1763                         tottextlen += entry->plan_len + 1;
1764                         nvalidtexts++;
1765                 }
1766         }
1767
1768         qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1769
1770         /* Also, record the (approximate) median usage */
1771         if (i > 0)
1772                 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1773         /* Record the mean plan length */
1774         if (nvalidtexts > 0)
1775                 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1776         else
1777                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1778
1779         nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1780         nvictims = Min(nvictims, i);
1781
1782         for (i = 0; i < nvictims; i++)
1783         {
1784                 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1785         }
1786
1787         pfree(entries);
1788
1789         /* Increment the number of times entries are deallocated */
1790         {
1791                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1792
1793                 SpinLockAcquire(&s->mutex);
1794                 s->stats.dealloc += 1;
1795                 SpinLockRelease(&s->mutex);
1796         }
1797 }
1798
1799 /*
1800  * Given a plan string (not necessarily null-terminated), allocate a new
1801  * entry in the external plan text file and store the string there.
1802  *
1803  * If successful, returns true, and stores the new entry's offset in the file
1804  * into *plan_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
1805  * number of garbage collections that have occurred so far.
1806  *
1807  * On failure, returns false.
1808  *
1809  * At least a shared lock on shared_state->lock must be held by the caller, so
1810  * as to prevent a concurrent garbage collection.  Share-lock-holding callers
1811  * should pass a gc_count pointer to obtain the number of garbage collections,
1812  * so that they can recheck the count after obtaining exclusive lock to detect
1813  * whether a garbage collection occurred (and removed this entry).
1814  */
1815 static bool
1816 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1817 {
1818         Size            off;
1819         int                     fd;
1820
1821         Assert (plan_storage == PLAN_STORAGE_FILE);
1822
1823         /*
1824          * We use a spinlock to protect extent/n_writers/gc_count, so that
1825          * multiple processes may execute this function concurrently.
1826          */
1827         {
1828                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1829
1830                 SpinLockAcquire(&s->mutex);
1831                 off = s->extent;
1832                 s->extent += plan_len + 1;
1833                 s->n_writers++;
1834                 if (gc_count)
1835                         *gc_count = s->gc_count;
1836                 SpinLockRelease(&s->mutex);
1837         }
1838
1839         *plan_offset = off;
1840
1841         /* Now write the data into the successfully-reserved part of the file */
1842         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1843         if (fd < 0)
1844                 goto error;
1845
1846         if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1847                 goto error;
1848         if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1849                 goto error;
1850
1851         CloseTransientFile(fd);
1852
1853         /* Mark our write complete */
1854         {
1855                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1856
1857                 SpinLockAcquire(&s->mutex);
1858                 s->n_writers--;
1859                 SpinLockRelease(&s->mutex);
1860         }
1861
1862         return true;
1863
1864 error:
1865         ereport(LOG,
1866                         (errcode_for_file_access(),
1867                          errmsg("could not write file \"%s\": %m",
1868                                         PGSP_TEXT_FILE)));
1869
1870         if (fd >= 0)
1871                 CloseTransientFile(fd);
1872
1873         /* Mark our write complete */
1874         {
1875                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1876
1877                 SpinLockAcquire(&s->mutex);
1878                 s->n_writers--;
1879                 SpinLockRelease(&s->mutex);
1880         }
1881
1882         return false;
1883 }
1884
1885 /*
1886  * Read the external plan text file into a malloc'd buffer.
1887  *
1888  * Returns NULL (without throwing an error) if unable to read, eg
1889  * file not there or insufficient memory.
1890  *
1891  * On success, the buffer size is also returned into *buffer_size.
1892  *
1893  * This can be called without any lock on shared_state->lock, but in that case
1894  * the caller is responsible for verifying that the result is sane.
1895  */
1896 static char *
1897 ptext_load_file(Size *buffer_size)
1898 {
1899         char       *buf;
1900         int                     fd;
1901         struct stat stat;
1902         Size            nread;
1903
1904         Assert (plan_storage == PLAN_STORAGE_FILE);
1905
1906         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1907         if (fd < 0)
1908         {
1909                 if (errno != ENOENT)
1910                         ereport(LOG,
1911                                         (errcode_for_file_access(),
1912                                          errmsg("could not read file \"%s\": %m",
1913                                                         PGSP_TEXT_FILE)));
1914                 return NULL;
1915         }
1916
1917         /* Get file length */
1918         if (fstat(fd, &stat))
1919         {
1920                 ereport(LOG,
1921                                 (errcode_for_file_access(),
1922                                  errmsg("could not stat file \"%s\": %m",
1923                                                 PGSP_TEXT_FILE)));
1924                 CloseTransientFile(fd);
1925                 return NULL;
1926         }
1927
1928         /* Allocate buffer; beware that off_t might be wider than size_t */
1929         if (stat.st_size <= MaxAllocHugeSize)
1930                 buf = (char *) malloc(stat.st_size);
1931         else
1932                 buf = NULL;
1933         if (buf == NULL)
1934         {
1935                 ereport(LOG,
1936                                 (errcode(ERRCODE_OUT_OF_MEMORY),
1937                                  errmsg("out of memory"),
1938                                  errdetail("Could not allocate enough memory to read file \"%s\".",
1939                                                    PGSP_TEXT_FILE)));
1940                 CloseTransientFile(fd);
1941                 return NULL;
1942         }
1943
1944         /*
1945          * OK, slurp in the file.  Windows fails if we try to read more than
1946          * INT_MAX bytes at once, and other platforms might not like that either,
1947          * so read a very large file in 1GB segments.
1948          */
1949         nread = 0;
1950         while (nread < stat.st_size)
1951         {
1952                 int                     toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
1953
1954                 /*
1955                  * If we get a short read and errno doesn't get set, the reason is
1956                  * probably that garbage collection truncated the file since we did
1957                  * the fstat(), so we don't log a complaint --- but we don't return
1958                  * the data, either, since it's most likely corrupt due to concurrent
1959                  * writes from garbage collection.
1960                  */
1961                 errno = 0;
1962                 if (read(fd, buf + nread, toread) != toread)
1963                 {
1964                         if (errno)
1965                                 ereport(LOG,
1966                                                 (errcode_for_file_access(),
1967                                                  errmsg("could not read file \"%s\": %m",
1968                                                                 PGSP_TEXT_FILE)));
1969                         free(buf);
1970                         CloseTransientFile(fd);
1971                         return NULL;
1972                 }
1973                 nread += toread;
1974         }
1975
1976         if (CloseTransientFile(fd) != 0)
1977                 ereport(LOG,
1978                                 (errcode_for_file_access(),
1979                                  errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
1980
1981         *buffer_size = nread;
1982         return buf;
1983 }
1984
1985 /*
1986  * Locate a plan text in the file image previously read by ptext_load_file().
1987  *
1988  * We validate the given offset/length, and return NULL if bogus.  Otherwise,
1989  * the result points to a null-terminated string within the buffer.
1990  */
1991 static char *
1992 ptext_fetch(Size plan_offset, int plan_len,
1993                         char *buffer, Size buffer_size)
1994 {
1995         Assert (plan_storage == PLAN_STORAGE_FILE);
1996
1997         /* File read failed? */
1998         if (buffer == NULL)
1999                 return NULL;
2000         /* Bogus offset/length? */
2001         if (plan_len < 0 ||
2002                 plan_offset + plan_len >= buffer_size)
2003                 return NULL;
2004         /* As a further sanity check, make sure there's a trailing null */
2005         if (buffer[plan_offset + plan_len] != '\0')
2006                 return NULL;
2007         /* Looks OK */
2008         return buffer + plan_offset;
2009 }
2010
2011 /*
2012  * Do we need to garbage-collect the external plan text file?
2013  *
2014  * Caller should hold at least a shared lock on shared_state->lock.
2015  */
2016 static bool
2017 need_gc_ptexts(void)
2018 {
2019         Size            extent;
2020
2021         Assert (plan_storage == PLAN_STORAGE_FILE);
2022
2023         /* Read shared extent pointer */
2024         {
2025                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2026
2027                 SpinLockAcquire(&s->mutex);
2028                 extent = s->extent;
2029                 SpinLockRelease(&s->mutex);
2030         }
2031
2032         /* Don't proceed if file does not exceed 512 bytes per possible entry */
2033         if (extent < 512 * store_size)
2034                 return false;
2035
2036         /*
2037          * Don't proceed if file is less than about 50% bloat.  Nothing can or
2038          * should be done in the event of unusually large query texts accounting
2039          * for file's large size.  We go to the trouble of maintaining the mean
2040          * query length in order to prevent garbage collection from thrashing
2041          * uselessly.
2042          */
2043         if (extent < shared_state->mean_plan_len * store_size * 2)
2044                 return false;
2045
2046         return true;
2047 }
2048
2049 /*
2050  * Garbage-collect orphaned plan texts in external file.
2051  *
2052  * This won't be called often in the typical case, since it's likely that
2053  * there won't be too much churn, and besides, a similar compaction process
2054  * occurs when serializing to disk at shutdown or as part of resetting.
2055  * Despite this, it seems prudent to plan for the edge case where the file
2056  * becomes unreasonably large, with no other method of compaction likely to
2057  * occur in the foreseeable future.
2058  *
2059  * The caller must hold an exclusive lock on shared_state->lock.
2060  *
2061  * At the first sign of trouble we unlink the query text file to get a clean
2062  * slate (although existing statistics are retained), rather than risk
2063  * thrashing by allowing the same problem case to recur indefinitely.
2064  */
2065 static void
2066 gc_ptexts(void)
2067 {
2068         char       *pbuffer;
2069         Size            pbuffer_size;
2070         FILE       *pfile = NULL;
2071         HASH_SEQ_STATUS hash_seq;
2072         pgspEntry  *entry;
2073         Size            extent;
2074         int                     nentries;
2075
2076         Assert (plan_storage == PLAN_STORAGE_FILE);
2077
2078         /*
2079          * When called from store_entry, some other session might have proceeded
2080          * with garbage collection in the no-lock-held interim of lock strength
2081          * escalation.  Check once more that this is actually necessary.
2082          */
2083         if (!need_gc_ptexts())
2084                 return;
2085
2086         /*
2087          * Load the old texts file.  If we fail (out of memory, for instance),
2088          * invalidate query texts.  Hopefully this is rare.  It might seem better
2089          * to leave things alone on an OOM failure, but the problem is that the
2090          * file is only going to get bigger; hoping for a future non-OOM result is
2091          * risky and can easily lead to complete denial of service.
2092          */
2093         pbuffer = ptext_load_file(&pbuffer_size);
2094         if (pbuffer == NULL)
2095                 goto gc_fail;
2096
2097         /*
2098          * We overwrite the plan texts file in place, so as to reduce the risk of
2099          * an out-of-disk-space failure.  Since the file is guaranteed not to get
2100          * larger, this should always work on traditional filesystems; though we
2101          * could still lose on copy-on-write filesystems.
2102          */
2103         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2104         if (pfile == NULL)
2105         {
2106                 ereport(LOG,
2107                                 (errcode_for_file_access(),
2108                                  errmsg("could not write file \"%s\": %m",
2109                                                 PGSP_TEXT_FILE)));
2110                 goto gc_fail;
2111         }
2112
2113         extent = 0;
2114         nentries = 0;
2115
2116         hash_seq_init(&hash_seq, hash_table);
2117         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2118         {
2119                 int                     plan_len = entry->plan_len;
2120                 char       *plan = ptext_fetch(entry->plan_offset,
2121                                                                            plan_len,
2122                                                                            pbuffer,
2123                                                                            pbuffer_size);
2124
2125                 if (plan == NULL)
2126                 {
2127                         /* Trouble ... drop the text */
2128                         entry->plan_offset = 0;
2129                         entry->plan_len = -1;
2130                         /* entry will not be counted in mean plan length computation */
2131                         continue;
2132                 }
2133
2134                 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2135                 {
2136                         ereport(LOG,
2137                                         (errcode_for_file_access(),
2138                                          errmsg("could not write file \"%s\": %m",
2139                                                         PGSP_TEXT_FILE)));
2140                         hash_seq_term(&hash_seq);
2141                         goto gc_fail;
2142                 }
2143
2144                 entry->plan_offset = extent;
2145                 extent += plan_len + 1;
2146                 nentries++;
2147         }
2148
2149         /*
2150          * Truncate away any now-unused space.  If this fails for some odd reason,
2151          * we log it, but there's no need to fail.
2152          */
2153         if (ftruncate(fileno(pfile), extent) != 0)
2154                 ereport(LOG,
2155                                 (errcode_for_file_access(),
2156                                  errmsg("could not truncate file \"%s\": %m",
2157                                                 PGSP_TEXT_FILE)));
2158
2159         if (FreeFile(pfile))
2160         {
2161                 ereport(LOG,
2162                                 (errcode_for_file_access(),
2163                                  errmsg("could not write file \"%s\": %m",
2164                                                 PGSP_TEXT_FILE)));
2165                 pfile = NULL;
2166                 goto gc_fail;
2167         }
2168
2169         elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2170                  shared_state->extent, extent);
2171
2172         /* Reset the shared extent pointer */
2173         shared_state->extent = extent;
2174
2175         /*
2176          * Also update the mean plan length, to be sure that need_gc_ptexts()
2177          * won't still think we have a problem.
2178          */
2179         if (nentries > 0)
2180                 shared_state->mean_plan_len = extent / nentries;
2181         else
2182                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2183
2184         free(pbuffer);
2185
2186         return;
2187
2188 gc_fail:
2189         /* clean up resources */
2190         if (pfile)
2191                 FreeFile(pfile);
2192         if (pbuffer)
2193                 free(pbuffer);
2194
2195         /*
2196          * Since the contents of the external file are now uncertain, mark all
2197          * hashtable entries as having invalid texts.
2198          */
2199         hash_seq_init(&hash_seq, hash_table);
2200         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2201         {
2202                 entry->plan_offset = 0;
2203                 entry->plan_len = -1;
2204         }
2205
2206         /*
2207          * Destroy the query text file and create a new, empty one
2208          */
2209         (void) unlink(PGSP_TEXT_FILE);
2210         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2211         if (pfile == NULL)
2212                 ereport(LOG,
2213                                 (errcode_for_file_access(),
2214                                  errmsg("could not recreate file \"%s\": %m",
2215                                                 PGSP_TEXT_FILE)));
2216         else
2217                 FreeFile(pfile);
2218
2219         /* Reset the shared extent pointer */
2220         shared_state->extent = 0;
2221
2222         /* Reset mean_plan_len to match the new state */
2223         shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2224 }
2225
2226 /*
2227  * Release all entries.
2228  */
2229 static void
2230 entry_reset(void)
2231 {
2232         HASH_SEQ_STATUS hash_seq;
2233         pgspEntry  *entry;
2234         FILE       *pfile;
2235
2236         if (!shared_state || !hash_table)
2237                 ereport(ERROR,
2238                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2239                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2240
2241         LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2242
2243         hash_seq_init(&hash_seq, hash_table);
2244         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2245         {
2246                 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2247         }
2248
2249         /*
2250          * Reset global statistics for pg_store_plans.
2251          */
2252         {
2253                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2254                 TimestampTz stats_reset = GetCurrentTimestamp();
2255
2256                 SpinLockAcquire(&s->mutex);
2257                 s->stats.dealloc = 0;
2258                 s->stats.stats_reset = stats_reset;
2259                 SpinLockRelease(&s->mutex);
2260         }
2261
2262         /*
2263          * Write new empty plan file, perhaps even creating a new one to recover
2264          * if the file was missing.
2265          */
2266         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2267         if (pfile == NULL)
2268         {
2269                 ereport(LOG,
2270                                 (errcode_for_file_access(),
2271                                  errmsg("could not create file \"%s\": %m",
2272                                                 PGSP_TEXT_FILE)));
2273                 goto done;
2274         }
2275
2276         /* If ftruncate fails, log it, but it's not a fatal problem */
2277         if (ftruncate(fileno(pfile), 0) != 0)
2278                 ereport(LOG,
2279                                 (errcode_for_file_access(),
2280                                  errmsg("could not truncate file \"%s\": %m",
2281                                                 PGSP_TEXT_FILE)));
2282
2283         FreeFile(pfile);
2284
2285 done:
2286         shared_state->extent = 0;
2287         LWLockRelease(shared_state->lock);
2288 }
2289
2290 Datum
2291 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2292 {
2293         PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2294 }
2295
2296 Datum
2297 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2298 {
2299         text *short_plan = PG_GETARG_TEXT_P(0);
2300         char *cjson = text_to_cstring(short_plan);
2301         char *cshorten = pgsp_json_shorten(cjson);
2302         PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2303 }
2304
2305 Datum
2306 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2307 {
2308         text *short_plan = PG_GETARG_TEXT_P(0);
2309         char *cjson = text_to_cstring(short_plan);
2310         char *cnormalized = pgsp_json_normalize(cjson);
2311         PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2312 }
2313
2314 Datum
2315 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2316 {
2317         text *short_plan = PG_GETARG_TEXT_P(0);
2318         char *cshort = text_to_cstring(short_plan);
2319         char *cinflated = pgsp_json_inflate(cshort);
2320         PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2321 }
2322
2323 Datum
2324 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2325 {
2326         text *short_plan = PG_GETARG_TEXT_P(0);
2327         char *cshort = text_to_cstring(short_plan);
2328         char *ctextized = pgsp_json_textize(cshort);
2329
2330         PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2331 }
2332
2333 Datum
2334 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2335 {
2336         text *short_plan = PG_GETARG_TEXT_P(0);
2337         char *cshort = text_to_cstring(short_plan);
2338         char *cyamlized = pgsp_json_yamlize(cshort);
2339
2340         PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2341 }
2342
2343 Datum
2344 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2345 {
2346         text *short_plan = PG_GETARG_TEXT_P(0);
2347         char *cshort = text_to_cstring(short_plan);
2348         char *cxmlized = pgsp_json_xmlize(cshort);
2349
2350         PG_RETURN_TEXT_P(cstring_to_text(cxmlized));
2351 }