OSDN Git Service

Ignore commands under CREATE/ALTER EXTENSION commands
[pgstoreplans/pg_store_plans.git] / pg_store_plans.c
1 /*-------------------------------------------------------------------------
2  *
3  * pg_store_plans.c
4  *              Take statistics of plan selection across a whole database cluster.
5  *
6  * Execution costs are totaled for each distinct plan for each query,
7  * and plan and queryid are kept in a shared hashtable, each record in
8  * which is associated with a record in pg_stat_statements, if any, by
9  * the queryid.
10  *
11  * Postgres 9.3 and earlier do not expose the query id, so
12  * pg_store_plans needs to calculate it from the given query
13  * string using a different algorithm from pg_stat_statements, and later
14  * the id will be matched against the one made from the query string
15  * stored in pg_stat_statements. For this reason, queryid matching in
16  * this way will fail if the query string kept in pg_stat_statements
17  * is truncated in the middle.
18  *
19  * Plans are identified by fingerprinting plan representations in
20  * "shortened" JSON format with constants and unstable values such as
21  * rows, width, loops ignored. Nevertheless, stored plan entries hold
22  * the values from the latest execution. Entry eviction is done in the
23  * same way as in pg_stat_statements.
24  *
25  * Copyright (c) 2008-2020, PostgreSQL Global Development Group
26  * Copyright (c) 2012-2021, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
27  *
28  * IDENTIFICATION
29  *        pg_store_plans/pg_store_plans.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34
35 #include <sys/stat.h>
36 #include <unistd.h>
37 #include <dlfcn.h>
38 #include <math.h>
39
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
44 #include "funcapi.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
58 #endif
59 #include "utils/timestamp.h"
60
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
63
64 PG_MODULE_MAGIC;
65
66 /* Locations of the stats dump file and of the plan text file */
67 #define PGSP_DUMP_FILE  "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE  PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
69
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
72
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
75 static int max_plan_len = 5000;        /* backs the GUC pg_store_plans.max_plan_length */
76
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration)    (1.0)   /* constant; the argument is currently unused */
79 #define USAGE_INIT                              (1.0)   /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT             (10.0)  /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT             1024    /* initial assumed mean plan length */
82 #define USAGE_DECREASE_FACTOR   (0.99)  /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR  (0.50)  /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT   5               /* free this % of entries at once */
85
86 /* In PostgreSQL 11, queryid becomes a uint64 internally. */
87 #if PG_VERSION_NUM >= 110000
88 typedef uint64 queryid_t;
89 #define PGSP_NO_QUERYID         UINT64CONST(0)  /* value meaning "no query id" */
90 #else   /* PG_VERSION_NUM < 110000 */
91 typedef uint32 queryid_t;
92 #define PGSP_NO_QUERYID         0       /* value meaning "no query id" */
93 #endif  /* PG_VERSION_NUM >= 110000 */
94
95 /*
96  * Extension version number, for supporting older extension versions' objects; passed to pg_store_plans_internal() as api_version.
97  */
98 typedef enum pgspVersion
99 {
100         PGSP_V1_5 = 0,
101         PGSP_V1_6
102 } pgspVersion;
103
104 /*
105  * Hashtable key that defines the identity of a hashtable entry.  We separate
106  * plans by user and by database even if they are otherwise identical.
107  *
108  * An entry is identified by the (userid, dbid, queryid, planid) tuple.
109  * The encoding is not part of the key; it is kept in pgspEntry.encoding
110  * instead.
111  */
112 typedef struct pgspHashKey
113 {
114         Oid                     userid;                 /* user OID */
115         Oid                     dbid;                   /* database OID */
116         queryid_t       queryid;                /* query identifier */
117         uint32          planid;                 /* plan identifier */
118 } pgspHashKey;
119
120 /*
121  * The actual stats counters kept within pgspEntry (guarded by pgspEntry.mutex).
122  */
123 typedef struct Counters
124 {
125         int64           calls;                          /* # of times executed */
126         double          total_time;                     /* total execution time, in msec */
127         double          min_time;                       /* minimum execution time in msec */
128         double          max_time;                       /* maximum execution time in msec */
129         double          mean_time;                      /* mean execution time in msec */
130         double          sum_var_time;   /* sum of variances in execution time in msec */
131         int64           rows;                           /* total # of retrieved or affected rows */
132         int64           shared_blks_hit;        /* # of shared buffer hits */
133         int64           shared_blks_read;       /* # of shared disk blocks read */
134         int64           shared_blks_dirtied;/* # of shared disk blocks dirtied */
135         int64           shared_blks_written;/* # of shared disk blocks written */
136         int64           local_blks_hit;         /* # of local buffer hits */
137         int64           local_blks_read;        /* # of local disk blocks read */
138         int64           local_blks_dirtied;     /* # of local disk blocks dirtied */
139         int64           local_blks_written;     /* # of local disk blocks written */
140         int64           temp_blks_read;         /* # of temp blocks read */
141         int64           temp_blks_written;      /* # of temp blocks written */
142         double          blk_read_time;          /* time spent reading, in msec */
143         double          blk_write_time;         /* time spent writing, in msec */
144         TimestampTz     first_call;                     /* timestamp of first call */
145         TimestampTz     last_call;                      /* timestamp of last call */
146         double          usage;                          /* usage factor */
147 } Counters;
148
149 /*
150  * Global statistics for pg_store_plans (embedded in pgspSharedState.stats)
151  */
152 typedef struct pgspGlobalStats
153 {
154         int64           dealloc;                /* # of times entries were deallocated */
155         TimestampTz stats_reset;        /* timestamp of the reset of all stats */
156 } pgspGlobalStats;
157
158 /*
159  * Statistics per plan.  With plan_storage = shmem the plan text is stored right after this struct (see SHMEM_PLAN_PTR).
160  *
161  * NB: see the file read/write code before changing field order here.
162  */
163 typedef struct pgspEntry
164 {
165         pgspHashKey     key;                    /* hash key of entry - MUST BE FIRST */
166         Counters        counters;               /* the statistics for this plan */
167         Size            plan_offset;    /* plan text offset in extern file */
168         int                     plan_len;               /* # of valid bytes in plan text */
169         int                     encoding;               /* query encoding */
170         slock_t         mutex;                  /* protects the counters only */
171 } pgspEntry;
172
173 /*
174  * Global shared state (allocated in shared memory by pgsp_shmem_startup)
175  */
176 typedef struct pgspSharedState
177 {
178         LWLock     *lock;                       /* protects hashtable search/modification */
179         int                     plan_size;              /* max plan length in bytes */
180         double          cur_median_usage;       /* current median usage in hashtable */
181         Size            mean_plan_len;  /* current mean entry text length */
182         slock_t         mutex;                  /* protects following fields only: */
183         Size            extent;                 /* current extent of plan file */
184         int                     n_writers;              /* number of active writers to plan file */
185         int                     gc_count;               /* plan file garbage collection cycle count */
186         pgspGlobalStats stats;          /* global statistics for pgsp */
187 } pgspSharedState;
188
189 /*---- Local variables ----*/
190
191 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
192 static int      nested_level = 0;
193
194 /* Hook values saved by _PG_init() and restored by _PG_fini() */
195 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
196 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
197 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
198 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
199 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
200 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
201
202 /* Links to shared memory state; both are set in pgsp_shmem_startup() */
203 static pgspSharedState *shared_state = NULL;
204 static HTAB *hash_table = NULL;        /* entries keyed by pgspHashKey */
205
206 /*---- GUC variables ----*/
207
208 typedef enum
209 {
210         TRACK_LEVEL_NONE,                       /* track no statements */
211         TRACK_LEVEL_TOP,                                /* only top level statements */
212         TRACK_LEVEL_ALL,                                /* all statements, including nested ones */
213         TRACK_LEVEL_VERBOSE                     /* all statements, including internal ones */
214 }       PGSPTrackLevel;
215
216 static const struct config_enum_entry track_options[] =        /* accepted values of pg_store_plans.track */
217 {
218         {"none", TRACK_LEVEL_NONE, false},
219         {"top", TRACK_LEVEL_TOP, false},
220         {"all", TRACK_LEVEL_ALL, false},
221         {"verbose", TRACK_LEVEL_VERBOSE, false},
222         {NULL, 0, false}
223 };
224
225 typedef enum
226 {
227         PLAN_FORMAT_RAW,                /* No conversion: shortened JSON */
228         PLAN_FORMAT_TEXT,               /* Traditional text representation */
229         PLAN_FORMAT_JSON,               /* JSON representation */
230         PLAN_FORMAT_YAML,               /* YAML */
231         PLAN_FORMAT_XML,                /* XML  */
232 }       PGSPPlanFormats;
233
234 static const struct config_enum_entry plan_formats[] = /* accepted values of pg_store_plans.plan_format */
235 {
236         {"raw" , PLAN_FORMAT_RAW , false},
237         {"text", PLAN_FORMAT_TEXT, false},
238         {"json", PLAN_FORMAT_JSON, false},
239         {"yaml", PLAN_FORMAT_YAML, false},
240         {"xml" , PLAN_FORMAT_XML , false},
241         {NULL, 0, false}
242 };
243
244 /* options for plan storage */
245 typedef enum
246 {
247         PLAN_STORAGE_SHMEM,             /* plan is stored as a part of hash entry */
248         PLAN_STORAGE_FILE               /* plan is stored in a separate file */
249 }  pgspPlanStorage;
250
251 static const struct config_enum_entry plan_storage_options[] = /* accepted values of pg_store_plans.plan_storage */
252 {
253         {"shmem", PLAN_STORAGE_SHMEM, false},
254         {"file", PLAN_STORAGE_FILE, false},
255         {NULL, 0, false}
256 };
257
258 static int      store_size;                     /* max # plans to track */
259 static int      track_level;            /* tracking level */
260 static int      min_duration;           /* min duration to record, in milliseconds */
261 static bool dump_on_shutdown;   /* whether to save stats across shutdown */
262 static bool log_analyze;                /* Similar to EXPLAIN (ANALYZE *) */
263 static bool log_verbose;                /* Similar to EXPLAIN (VERBOSE *) */
264 static bool log_buffers;                /* Similar to EXPLAIN (BUFFERS *) */
265 static bool log_timing;                 /* Similar to EXPLAIN (TIMING *) */
266 static bool log_triggers;               /* whether to log trigger statistics  */
267 static int  plan_format;                /* Plan representation style in
268                                                                  * pg_store_plans.plan  */
269 static int  plan_storage;               /* Plan storage type */
270
271
272 /* disables tracking, overriding track_level (checked first by pgsp_enabled) */
273 static bool force_disabled = false;
274
275 #if PG_VERSION_NUM >= 140000
276 /*
277  * For pg14 and later, we rely on core queryid calculation.  If
278  * it's not available it means that the admin explicitly refused to
279  * compute it, for performance reason or other.  In that case, we
280  * will also consider that this extension is disabled.
281  */
282 #define pgsp_enabled(q) \
283         (!force_disabled &&                                                                                       \
284          (track_level >= TRACK_LEVEL_ALL ||                                                       \
285           (track_level == TRACK_LEVEL_TOP && nested_level == 0)) &&       \
286          ((q) != PGSP_NO_QUERYID))      /* argument parenthesized so any expression is safe */
287 #else   /* before pg14 the query id argument is not consulted */
288 #define pgsp_enabled(q) \
289         (!force_disabled &&                                                                                     \
290          (track_level >= TRACK_LEVEL_ALL ||                                                     \
291           (track_level == TRACK_LEVEL_TOP && nested_level == 0)))
292 #endif
293
294 #define SHMEM_PLAN_PTR(ent) (((char *) (ent)) + sizeof(pgspEntry))      /* address just past the entry struct; argument parenthesized so the cast applies to the whole expression */
295
296 /*---- Function declarations ----*/
297
298 void            _PG_init(void);
299 void            _PG_fini(void);
300
301 Datum           pg_store_plans_reset(PG_FUNCTION_ARGS);
302 Datum           pg_store_plans_hash_query(PG_FUNCTION_ARGS);
303 Datum           pg_store_plans(PG_FUNCTION_ARGS);
304 Datum           pg_store_plans_shorten(PG_FUNCTION_ARGS);
305 Datum           pg_store_plans_normalize(PG_FUNCTION_ARGS);
306 Datum           pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
307 Datum           pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
308 Datum           pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
309 Datum           pg_store_plans_textplan(PG_FUNCTION_ARGS);
310 Datum           pg_store_plans_info(PG_FUNCTION_ARGS);
311
312 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
313 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
314 PG_FUNCTION_INFO_V1(pg_store_plans);
315 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
316 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
317 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
318 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
319 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
320 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
321 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
322 PG_FUNCTION_INFO_V1(pg_store_plans_info);
323
324 #if PG_VERSION_NUM < 130000
325 #define COMPTAG_TYPE char
326 #else
327 #define COMPTAG_TYPE QueryCompletion
328 #endif  /* COMPTAG_TYPE: type of the completionTag argument of pgsp_ProcessUtility() */
329
330 #if PG_VERSION_NUM < 140000
331 #define ROLE_PG_READ_ALL_STATS          DEFAULT_ROLE_READ_ALL_STATS
332 #endif  /* before pg14, ROLE_PG_READ_ALL_STATS is mapped to DEFAULT_ROLE_READ_ALL_STATS */
333
334 static void pgsp_shmem_startup(void);
335 static void pgsp_shmem_shutdown(int code, Datum arg);
336 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
337 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
338                                  ScanDirection direction,
339                                                          uint64 count, bool execute_once);
340 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
341 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
342 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
343 #if PG_VERSION_NUM >= 140000
344                                         bool readOnlyTree,
345 #endif  /* readOnlyTree is declared for pg14 and later only */
346                                         ProcessUtilityContext context, ParamListInfo params,
347                                         QueryEnvironment *queryEnv,
348                                         DestReceiver *dest, COMPTAG_TYPE *completionTag);
349 static uint32 hash_query(const char* query);
350 static void pgsp_store(char *plan, queryid_t queryId,
351                    double total_time, uint64 rows,
352                    const BufferUsage *bufusage);
353 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
354                                                                         pgspVersion api_version);
355 static Size shared_mem_size(void);
356 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
357                                                           bool sticky);
358 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
359                                                 int *gc_count);
360 static char *ptext_load_file(Size *buffer_size);
361 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
362                                                  Size buffer_size);
363 static bool need_gc_ptexts(void);
364 static void gc_ptexts(void);
365 static void entry_dealloc(void);
366 static void entry_reset(void);
367
368 /*
369  * Module load callback
370  */
371 void
372 _PG_init(void)
373 {
374         /*
375          * In order to create our shared memory area, we have to be loaded via
376          * shared_preload_libraries.  If not, fall out without hooking into any of
377          * the main system.  (We don't throw error here because it seems useful to
378          * allow the pg_store_plans functions to be created even when the
379          * module isn't active.  The functions must protect themselves against
380          * being called then, however.)
381          */
382         if (!process_shared_preload_libraries_in_progress)
383                 return;
384
385 #if PG_VERSION_NUM >= 140000
386         /*
387          * Inform the postmaster that we want to enable query_id calculation if
388          * compute_query_id is set to auto.
389          */
390         EnableQueryId();
391 #endif
392
393         /*
394          * Define (or redefine) custom GUC variables.
395          */
396         DefineCustomIntVariable("pg_store_plans.max",
397           "Sets the maximum number of plans tracked by pg_store_plans.",
398                                                         NULL,
399                                                         &store_size,
400                                                         1000,
401                                                         100,
402                                                         INT_MAX,
403                                                         PGC_POSTMASTER,
404                                                         0,
405                                                         NULL,
406                                                         NULL,
407                                                         NULL);
408
409         DefineCustomIntVariable("pg_store_plans.max_plan_length",
410           "Sets the maximum length of plans stored by pg_store_plans.",
411                                                         NULL,
412                                                         &max_plan_len,
413                                                         5000,
414                                                         100,
415                                                         INT32_MAX,
416                                                         PGC_POSTMASTER,
417                                                         0,
418                                                         NULL,
419                                                         NULL,
420                                                         NULL);
421
422         DefineCustomEnumVariable("pg_store_plans.plan_storage",
423                            "Selects where to store plan texts.",
424                                                          NULL,
425                                                          &plan_storage,
426                                                          PLAN_STORAGE_FILE,
427                                                          plan_storage_options,
428                                                          PGC_POSTMASTER,
429                                                          0,
430                                                          NULL,
431                                                          NULL,
432                                                          NULL);
433
434         DefineCustomEnumVariable("pg_store_plans.track",
435                            "Selects which plans are tracked by pg_store_plans.",
436                                                          NULL,
437                                                          &track_level,
438                                                          TRACK_LEVEL_TOP,
439                                                          track_options,
440                                                          PGC_SUSET,
441                                                          0,
442                                                          NULL,
443                                                          NULL,
444                                                          NULL);
445
446         DefineCustomEnumVariable("pg_store_plans.plan_format",
447                            "Selects which format to be applied for plan representation in pg_store_plans.",
448                                                          NULL,
449                                                          &plan_format,
450                                                          PLAN_FORMAT_TEXT,
451                                                          plan_formats,
452                                                          PGC_USERSET,
453                                                          0,
454                                                          NULL,
455                                                          NULL,
456                                                          NULL);
457
458         DefineCustomIntVariable("pg_store_plans.min_duration",
459                                         "Minimum duration to record plan in milliseconds.",
460                                                         NULL,
461                                                         &min_duration,
462                                                         0,
463                                                         0,
464                                                         INT_MAX,
465                                                         PGC_SUSET,
466                                                         0,
467                                                         NULL,
468                                                         NULL,
469                                                         NULL);
470
471         DefineCustomBoolVariable("pg_store_plans.save",
472                            "Save pg_store_plans statistics across server shutdowns.",
473                                                          NULL,
474                                                          &dump_on_shutdown,
475                                                          true,
476                                                          PGC_SIGHUP,
477                                                          0,
478                                                          NULL,
479                                                          NULL,
480                                                          NULL);
481
482         DefineCustomBoolVariable("pg_store_plans.log_analyze",
483                                                          "Use EXPLAIN ANALYZE for plan logging.",
484                                                          NULL,
485                                                          &log_analyze,
486                                                          false,
487                                                          PGC_SUSET,
488                                                          0,
489                                                          NULL,
490                                                          NULL,
491                                                          NULL);
492
493         DefineCustomBoolVariable("pg_store_plans.log_buffers",
494                                                          "Log buffer usage.",
495                                                          NULL,
496                                                          &log_buffers,
497                                                          false,
498                                                          PGC_SUSET,
499                                                          0,
500                                                          NULL,
501                                                          NULL,
502                                                          NULL);
503
504         DefineCustomBoolVariable("pg_store_plans.log_timing",
505                                                          "Log timings.",
506                                                          NULL,
507                                                          &log_timing,
508                                                          true,
509                                                          PGC_SUSET,
510                                                          0,
511                                                          NULL,
512                                                          NULL,
513                                                          NULL);
514
515         DefineCustomBoolVariable("pg_store_plans.log_triggers",
516                                                          "Log trigger trace.",
517                                                          NULL,
518                                                          &log_triggers,
519                                                          false,
520                                                          PGC_SUSET,
521                                                          0,
522                                                          NULL,
523                                                          NULL,
524                                                          NULL);
525
526         DefineCustomBoolVariable("pg_store_plans.log_verbose",
527                            "Set VERBOSE for EXPLAIN on logging.",
528                                                          NULL,
529                                                          &log_verbose,
530                                                          false,
531                                                          PGC_SUSET,
532                                                          0,
533                                                          NULL,
534                                                          NULL,
535                                                          NULL);
536
537         EmitWarningsOnPlaceholders("pg_store_plans");
538
539         /*
540          * Request additional shared resources.  (These are no-ops if we're not in
541          * the postmaster process.)  We'll allocate or attach to the shared
542          * resources in pgsp_shmem_startup().
543          */
544         RequestAddinShmemSpace(shared_mem_size());
545         RequestNamedLWLockTranche("pg_store_plans", 1);
546
547         /*
548          * Install hooks, remembering the previous values so _PG_fini() can restore them.
549          */
550         prev_shmem_startup_hook = shmem_startup_hook;
551         shmem_startup_hook = pgsp_shmem_startup;
552         prev_ExecutorStart = ExecutorStart_hook;
553         ExecutorStart_hook = pgsp_ExecutorStart;
554         prev_ExecutorRun = ExecutorRun_hook;
555         ExecutorRun_hook = pgsp_ExecutorRun;
556         prev_ExecutorFinish = ExecutorFinish_hook;
557         ExecutorFinish_hook = pgsp_ExecutorFinish;
558         prev_ExecutorEnd = ExecutorEnd_hook;
559         ExecutorEnd_hook = pgsp_ExecutorEnd;
560         prev_ProcessUtility = ProcessUtility_hook;
561         ProcessUtility_hook = pgsp_ProcessUtility;
562 }
563
564 /*
565  * Module unload callback
566  */
567 void
568 _PG_fini(void)
569 {
570         /* Put back every hook value that _PG_init() saved. */
571         ProcessUtility_hook = prev_ProcessUtility;
572         ExecutorEnd_hook = prev_ExecutorEnd;
573         ExecutorFinish_hook = prev_ExecutorFinish;
574         ExecutorRun_hook = prev_ExecutorRun;
575         ExecutorStart_hook = prev_ExecutorStart;
576         shmem_startup_hook = prev_shmem_startup_hook;
577 }
578
579 /*
580  * shmem_startup hook: allocate or attach to shared memory,
581  * then load any pre-existing statistics from file.
582  */
583 static void
584 pgsp_shmem_startup(void)
585 {
586         bool            found;
587         HASHCTL         info;
588         FILE       *file = NULL;
589         FILE       *pfile = NULL;
590         uint32          header;
591         int32           num;
592         int32           pgver;
593         int32           i;
594         int                     plan_size;
595         int                     buffer_size;
596         char       *buffer = NULL;
597
598         if (prev_shmem_startup_hook)
599                 prev_shmem_startup_hook();
600
601         /* reset in case this is a restart within the postmaster */
602         shared_state = NULL;
603         hash_table = NULL;
604
605         /*
606          * Create or attach to the shared memory state, including hash table
607          */
608         LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
609
610         shared_state = ShmemInitStruct("pg_store_plans",
611                                                    sizeof(pgspSharedState),
612                                                    &found);
613
614         if (!found)
615         {
616                 /* First time through ... */
617                 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
618                 shared_state->plan_size = max_plan_len;
619                 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
620                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
621                 SpinLockInit(&shared_state->mutex);
622                 shared_state->extent = 0;
623                 shared_state->n_writers = 0;
624                 shared_state->gc_count = 0;
625                 shared_state->stats.dealloc = 0;
626                 shared_state->stats.stats_reset = GetCurrentTimestamp();
627         }
628
629         /* Be sure everyone agrees on the hash table entry size */
630         plan_size = shared_state->plan_size;
631
632         memset(&info, 0, sizeof(info));
633         info.keysize = sizeof(pgspHashKey);
634         info.entrysize = sizeof(pgspEntry);
635         if (plan_storage == PLAN_STORAGE_SHMEM)
636                 info.entrysize += max_plan_len;
637         hash_table = ShmemInitHash("pg_store_plans hash",
638                                                           store_size, store_size,
639                                                           &info, HASH_ELEM |
640                                                           HASH_BLOBS);
641
642         LWLockRelease(AddinShmemInitLock);
643
644         /*
645          * If we're in the postmaster (or a standalone backend...), set up a shmem
646          * exit hook to dump the statistics to disk.
647          */
648         if (!IsUnderPostmaster)
649                 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
650
651         /*
652          * Done if some other process already completed our initialization.
653          */
654         if (found)
655                 return;
656
657         /*
658          * Note: we don't bother with locks here, because there should be no other
659          * processes running when this code is reached.
660          */
661
662         /* Unlink query text file possibly left over from crash */
663         unlink(PGSP_TEXT_FILE);
664
665         if (plan_storage == PLAN_STORAGE_FILE)
666         {
667                 /* Allocate new query text temp file */
668                 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
669                 if (pfile == NULL)
670                         goto write_error;
671         }
672
673         /*
674          * If we were told not to load old statistics, we're done.  (Note we do
675          * not try to unlink any old dump file in this case.  This seems a bit
676          * questionable but it's the historical behavior.)
677          */
678         if (!dump_on_shutdown)
679         {
680                 if (pfile)
681                         FreeFile(pfile);
682                 return;
683         }
684
685         /*
686          * Attempt to load old statistics from the dump file.
687          */
688         file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
689         if (file == NULL)
690         {
691                 if (errno == ENOENT)
692                         return;                         /* ignore not-found error */
693                 /* No existing persisted stats file, so we're done */
694                 goto read_error;
695         }
696
697         buffer_size = plan_size;
698         buffer = (char *) palloc(buffer_size);
699
700         if (fread(&header, sizeof(uint32), 1, file) != 1 ||
701                 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
702                 fread(&num, sizeof(int32), 1, file) != 1)
703                 goto read_error;
704
705         if (header != PGSP_FILE_HEADER ||
706                 pgver != PGSP_PG_MAJOR_VERSION)
707                 goto data_error;
708
709         for (i = 0; i < num; i++)
710         {
711                 pgspEntry       temp;
712                 pgspEntry  *entry;
713                 Size            plan_offset = 0;
714
715                 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
716                         goto read_error;
717
718                 /* Encoding is the only field we can easily sanity-check */
719                 if (!PG_VALID_BE_ENCODING(temp.encoding))
720                         goto data_error;
721
722                 /* Previous incarnation might have had a larger plan_size */
723                 if (temp.plan_len >= buffer_size)
724                 {
725                         buffer = (char *) repalloc(buffer, temp.plan_len + 1);
726                         buffer_size = temp.plan_len + 1;
727                 }
728
729                 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
730                         goto read_error;
731
732                 /* Skip loading "sticky" entries */
733                 if (temp.counters.calls == 0)
734                         continue;
735
736                 /* Clip to available length if needed */
737                 if (temp.plan_len >= plan_size)
738                         temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
739                                                                                                    buffer,
740                                                                                                    temp.plan_len,
741                                                                                                    plan_size - 1);
742
743                 buffer[temp.plan_len] = '\0';
744
745                 if (plan_storage == PLAN_STORAGE_FILE)
746                 {
747                         /* Store the plan text */
748                         plan_offset = shared_state->extent;
749                         if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
750                                 temp.plan_len + 1)
751                                 goto write_error;
752                         shared_state->extent += temp.plan_len + 1;
753                 }
754
755                 /* make the hashtable entry (discards old entries if too many) */
756                 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
757
758                 if (plan_storage == PLAN_STORAGE_SHMEM)
759                         memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
760
761                 /* copy in the actual stats */
762                 entry->counters = temp.counters;
763         }
764
765         pfree(buffer);
766         FreeFile(file);
767
768         if (pfile)
769                 FreeFile(pfile);
770
771         /*
772          * Remove the file so it's not included in backups/replication slaves,
773          * etc. A new file will be written on next shutdown.
774          */
775         unlink(PGSP_DUMP_FILE);
776
777         return;
778
779 read_error:
780         ereport(LOG,
781                         (errcode_for_file_access(),
782                          errmsg("could not read file \"%s\": %m",
783                                         PGSP_DUMP_FILE)));
784         goto fail;
785 data_error:
786         ereport(LOG,
787                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
788                          errmsg("ignoring invalid data in file \"%s\"",
789                                         PGSP_DUMP_FILE)));
790         goto fail;
791 write_error:
792         ereport(LOG,
793                         (errcode_for_file_access(),
794                          errmsg("could not write file \"%s\": %m",
795                                         PGSP_TEXT_FILE)));
796 fail:
797         if (buffer)
798                 pfree(buffer);
799         if (file)
800                 FreeFile(file);
801         if (pfile)
802                 FreeFile(pfile);
803         /* If possible, throw away the bogus file; ignore any error */
804         unlink(PGSP_DUMP_FILE);
805
806         /*
807          * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
808          * server is running with pg_stat_statements enabled
809          */
810 }
811
812 /*
813  * shmem_shutdown hook: Dump statistics into file.
814  *
815  * Note: we don't bother with acquiring lock, because there should be no
816  * other processes running when this is called.
817  */
818 static void
819 pgsp_shmem_shutdown(int code, Datum arg)
820 {
821         FILE       *file;
822         char       *pbuffer = NULL;
823         Size            pbuffer_size = 0;
824         HASH_SEQ_STATUS hash_seq;
825         int32           num_entries;
826         pgspEntry  *entry;
827
828         /* Don't try to dump during a crash. */
829         if (code)
830                 return;
831
832         /* Safety check ... shouldn't get here unless shmem is set up. */
833         if (!shared_state || !hash_table)
834                 return;
835
836         /* Don't dump if told not to. */
837         if (!dump_on_shutdown)
838                 return;
839
840         file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
841         if (file == NULL)
842                 goto error;
843
844         if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
845                 goto error;
846         if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
847                 goto error;
848         num_entries = hash_get_num_entries(hash_table);
849         if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
850                 goto error;
851
852         if (plan_storage == PLAN_STORAGE_FILE)
853         {
854                 pbuffer = ptext_load_file(&pbuffer_size);
855                 if (pbuffer == NULL)
856                         goto error;
857         }
858
859         hash_seq_init(&hash_seq, hash_table);
860         while ((entry = hash_seq_search(&hash_seq)) != NULL)
861         {
862                 int                     len = entry->plan_len;
863                 char       *pstr;
864
865                 if (plan_storage == PLAN_STORAGE_FILE)
866                         pstr = ptext_fetch(entry->plan_offset, len,
867                                                            pbuffer, pbuffer_size);
868                 else
869                         pstr = SHMEM_PLAN_PTR(entry);
870
871                 if (pstr == NULL)
872                         continue;                       /* Ignore any entries with bogus texts */
873
874                 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
875                         fwrite(pstr, 1, len + 1, file) != len + 1)
876                 {
877                         /* note: we assume hash_seq_term won't change errno */
878                         hash_seq_term(&hash_seq);
879                         goto error;
880                 }
881         }
882
883         if (FreeFile(file))
884         {
885                 file = NULL;
886                 goto error;
887         }
888
889         /*
890          * Rename file into place, so we atomically replace the old one.
891          */
892         if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
893                 ereport(LOG,
894                                 (errcode_for_file_access(),
895                                  errmsg("could not rename pg_store_plans file \"%s\": %m",
896                                                 PGSP_DUMP_FILE ".tmp")));
897
898         /* Unlink query-texts file; it's not needed while shutdown */
899         unlink(PGSP_TEXT_FILE);
900
901         return;
902
903 error:
904         ereport(LOG,
905                         (errcode_for_file_access(),
906                          errmsg("could not write pg_store_plans file \"%s\": %m",
907                                         PGSP_DUMP_FILE ".tmp")));
908         if (file)
909                 FreeFile(file);
910         unlink(PGSP_DUMP_FILE ".tmp");
911 }
912
913
914 /*
915  * ExecutorStart hook: start up tracking if needed
916  */
917 static void
918 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
919 {
920         if (log_analyze &&
921                 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
922         {
923                 queryDesc->instrument_options |=
924                         (log_timing ? INSTRUMENT_TIMER : 0)|
925                         (log_timing ? 0: INSTRUMENT_ROWS)|
926                         (log_buffers ? INSTRUMENT_BUFFERS : 0);
927         }
928
929         if (prev_ExecutorStart)
930                 prev_ExecutorStart(queryDesc, eflags);
931         else
932                 standard_ExecutorStart(queryDesc, eflags);
933
934         /*
935          * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
936          * context so as to be free at ExecutorEnd.
937          */
938         if (queryDesc->totaltime == NULL &&
939                         pgsp_enabled(queryDesc->plannedstmt->queryId))
940         {
941                 MemoryContext oldcxt;
942
943                 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
944                 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
945 #if PG_VERSION_NUM >= 140000
946                                                                                   , false
947 #endif
948                                                                                  );
949                 MemoryContextSwitchTo(oldcxt);
950         }
951
952 }
953
954 /*
955  * ExecutorRun hook: all we need do is track nesting depth
956  */
957 static void
958 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
959                                  bool execute_once)
960 {
961         nested_level++;
962         PG_TRY();
963         {
964                 if (prev_ExecutorRun)
965                         prev_ExecutorRun(queryDesc, direction, count, execute_once);
966                 else
967                         standard_ExecutorRun(queryDesc, direction, count, execute_once);
968                 nested_level--;
969         }
970         PG_CATCH();
971         {
972                 nested_level--;
973                 PG_RE_THROW();
974         }
975         PG_END_TRY();
976 }
977
978 /*
979  * ExecutorFinish hook: all we need do is track nesting depth
980  */
981 static void
982 pgsp_ExecutorFinish(QueryDesc *queryDesc)
983 {
984         nested_level++;
985         PG_TRY();
986         {
987                 if (prev_ExecutorFinish)
988                         prev_ExecutorFinish(queryDesc);
989                 else
990                         standard_ExecutorFinish(queryDesc);
991                 nested_level--;
992         }
993         PG_CATCH();
994         {
995                 nested_level--;
996                 PG_RE_THROW();
997         }
998         PG_END_TRY();
999 }
1000
1001 /*
1002  * ExecutorEnd hook: store results if needed
1003  */
1004 static void
1005 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1006 {
1007         if (queryDesc->totaltime)
1008         {
1009                 /*
1010                  * Make sure stats accumulation is done.  (Note: it's okay if several
1011                  * levels of hook all do this.)
1012                  */
1013                 InstrEndLoop(queryDesc->totaltime);
1014
1015                 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1016                         queryDesc->totaltime->total &&
1017                         queryDesc->totaltime->total >=
1018                         (double)min_duration / 1000.0)
1019                 {
1020                         queryid_t         queryid;
1021                         ExplainState *es;
1022                         StringInfo        es_str;
1023
1024                         es = NewExplainState();
1025                         es_str = es->str;
1026
1027                         es->analyze = queryDesc->instrument_options;
1028                         es->verbose = log_verbose;
1029                         es->buffers = (es->analyze && log_buffers);
1030                         es->timing = (es->analyze && log_timing);
1031                         es->format = EXPLAIN_FORMAT_JSON;
1032
1033                         ExplainBeginOutput(es);
1034                         ExplainPrintPlan(es, queryDesc);
1035                         if (log_triggers)
1036                                 pgspExplainTriggers(es, queryDesc);
1037                         ExplainEndOutput(es);
1038
1039                         /* Remove last line break */
1040                         if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1041                                 es_str->data[--es_str->len] = '\0';
1042
1043                         /* JSON outmost braces. */
1044                         es_str->data[0] = '{';
1045                         es_str->data[es_str->len - 1] = '}';
1046
1047                         queryid = queryDesc->plannedstmt->queryId;
1048 #if PG_VERSION_NUM < 140000
1049                         /*
1050                          * For versions before pg14, a queryid is only available if
1051                          * pg_stat_statements extension (or similar) if configured.  We
1052                          * don't want a hard requirement for such an extension so fallback
1053                          * to an internal queryid calculation in some case.
1054                          * For pg14 and above, core postgres can compute a queryid so we
1055                          * will rely on it.
1056                          */
1057                         if (queryid == PGSP_NO_QUERYID)
1058                                 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1059 #else
1060                         Assert(queryid != PGSP_NO_QUERYID);
1061 #endif
1062
1063                         pgsp_store(es_str->data,
1064                                                 queryid,
1065                                                 queryDesc->totaltime->total * 1000.0,   /* convert to msec */
1066                                                 queryDesc->estate->es_processed,
1067                                                 &queryDesc->totaltime->bufusage);
1068                         pfree(es_str->data);
1069                 }
1070         }
1071
1072         if (prev_ExecutorEnd)
1073                 prev_ExecutorEnd(queryDesc);
1074         else
1075                 standard_ExecutorEnd(queryDesc);
1076 }
1077
1078 /*
1079  * ProcessUtility hook
1080  */
1081 static void
1082 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1083 #if PG_VERSION_NUM >= 140000
1084                                         bool readOnlyTree,
1085 #endif
1086                                         ProcessUtilityContext context, ParamListInfo params,
1087                                         QueryEnvironment *queryEnv,
1088                                         DestReceiver *dest, COMPTAG_TYPE *completionTag)
1089 {
1090         int                     tag = nodeTag(pstmt->utilityStmt);
1091         queryid_t       saved_queryId = pstmt->queryId;
1092         bool            reset_force_disabled = false;
1093
1094         if (pgsp_enabled(saved_queryId) &&
1095                 (tag == T_CreateExtensionStmt || tag == T_AlterExtensionStmt) &&
1096                 !force_disabled && track_level < TRACK_LEVEL_VERBOSE)
1097         {
1098                 force_disabled = true;
1099                 reset_force_disabled = true;
1100         }
1101
1102         PG_TRY();
1103         {
1104                 if (prev_ProcessUtility)
1105                 {
1106                         prev_ProcessUtility(pstmt, queryString,
1107 #if PG_VERSION_NUM >= 140000
1108                                                                 readOnlyTree,
1109 #endif
1110                                                                 context, params, queryEnv,
1111                                                                 dest, completionTag);
1112                 }
1113                 else
1114                         standard_ProcessUtility(pstmt, queryString,
1115 #if PG_VERSION_NUM >= 140000
1116                                                                         readOnlyTree,
1117 #endif
1118                                                                         context, params, queryEnv,
1119                                                                         dest, completionTag);
1120
1121                 if (reset_force_disabled)
1122                         force_disabled = false;
1123         }
1124         PG_CATCH();
1125         {
1126                 if (reset_force_disabled)
1127                         force_disabled = false;
1128                 PG_RE_THROW();
1129         }
1130         PG_END_TRY();
1131 }
1132
1133 /*
1134  * hash_query: calculate internal query ID for a query
1135  *
1136  *  As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1137  *  queries to practical level. On the other hand pg_store_plans uses the
1138  *  combination of query hash and plan hash values as the hash table key and
1139  *  the resolution of the hash value effectively has the same degree so we
1140  *  continue to use uint32 as internal queryid.
1141  *
1142  *  This may merge plans from different queries into single internal query id
1143  *  but it is not a problem when pg_stat_statements is used together since the
1144  *  extension gives enough resolution on queries.
1145  */
1146 static uint32
1147 hash_query(const char* query)
1148 {
1149         uint32 queryid;
1150
1151         char *normquery = pstrdup(query);
1152         normalize_expr(normquery, false);
1153         queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1154         pfree(normquery);
1155
1156         /* If we are unlucky enough to get a hash of zero, use 1 instead */
1157         if (queryid == 0)
1158                 queryid = 1;
1159
1160         return queryid;
1161 }
1162
1163
1164 /*
1165  * Store some statistics for a plan.
1166  *
1167  * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1168  * value of the given plan, which is calculated in ths function.
1169  */
1170 static void
1171 pgsp_store(char *plan, queryid_t queryId,
1172                    double total_time, uint64 rows,
1173                    const BufferUsage *bufusage)
1174 {
1175         pgspHashKey key;
1176         pgspEntry  *entry;
1177         char       *norm_query = NULL;
1178         int             plan_len;
1179         char       *normalized_plan = NULL;
1180         char       *shorten_plan = NULL;
1181         volatile pgspEntry *e;
1182         Size            plan_offset = 0;
1183         bool            do_gc = false;
1184
1185         Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1186
1187         /* Safety check... */
1188         if (!shared_state || !hash_table)
1189                 return;
1190
1191         /* Set up key for hashtable search */
1192         key.userid = GetUserId();
1193         key.dbid = MyDatabaseId;
1194         key.queryid = queryId;
1195
1196         normalized_plan = pgsp_json_normalize(plan);
1197         shorten_plan = pgsp_json_shorten(plan);
1198         elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1199         elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1200         elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1201         plan_len = strlen(shorten_plan);
1202
1203         key.planid = hash_any((const unsigned char *)normalized_plan,
1204                                                   strlen(normalized_plan));
1205         pfree(normalized_plan);
1206
1207         if (plan_len >= shared_state->plan_size)
1208                 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1209                                                                                  shorten_plan,
1210                                                                                  plan_len,
1211                                                                                  shared_state->plan_size - 1);
1212
1213
1214         /* Look up the hash table entry with shared lock. */
1215         LWLockAcquire(shared_state->lock, LW_SHARED);
1216
1217         entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1218
1219         /* Store the plan text, if the entry not present */
1220         if (!entry && plan_storage == PLAN_STORAGE_FILE)
1221         {
1222                 int             gc_count;
1223                 bool    stored;
1224
1225                 /* Append new plan text to file with only shared lock held */
1226                 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1227
1228                 /*
1229                  * Determine whether we need to garbage collect external query texts
1230                  * while the shared lock is still held.  This micro-optimization
1231                  * avoids taking the time to decide this while holding exclusive lock.
1232                  */
1233                 do_gc = need_gc_ptexts();
1234
1235                 /* Acquire exclusive lock as required by entry_alloc() */
1236                 LWLockRelease(shared_state->lock);
1237                 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1238
1239                 /*
1240                  * A garbage collection may have occurred while we weren't holding the
1241                  * lock.  In the unlikely event that this happens, the plan text we
1242                  * stored above will have been garbage collected, so write it again.
1243                  * This should be infrequent enough that doing it while holding
1244                  * exclusive lock isn't a performance problem.
1245                  */
1246                 if (!stored || shared_state->gc_count != gc_count)
1247                         stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1248
1249                 /* If we failed to write to the text file, give up */
1250                 if (!stored)
1251                         goto done;
1252
1253         }
1254
1255         /* Create new entry, if not present */
1256         if (!entry)
1257         {
1258                 entry = entry_alloc(&key, plan_offset, plan_len, false);
1259
1260                 /* shorten_plan is terminated by NUL */
1261                 if (plan_storage == PLAN_STORAGE_SHMEM)
1262                         memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1263                         
1264
1265                 /* If needed, perform garbage collection while exclusive lock held */
1266                 if (do_gc)
1267                         gc_ptexts();
1268         }
1269
1270         /* Increment the counts, except when jstate is not NULL */
1271
1272         /*
1273          * Grab the spinlock while updating the counters (see comment about
1274          * locking rules at the head of the file)
1275          */
1276
1277         e = (volatile pgspEntry *) entry;
1278         SpinLockAcquire(&e->mutex);
1279
1280         /* "Unstick" entry if it was previously sticky */
1281         if (e->counters.calls == 0)
1282         {
1283                 e->counters.usage = USAGE_INIT;
1284                 e->counters.first_call = GetCurrentTimestamp();
1285         }
1286
1287         e->counters.calls += 1;
1288         e->counters.total_time += total_time;
1289         if (e->counters.calls == 1)
1290         {
1291                 e->counters.min_time = total_time;
1292                 e->counters.max_time = total_time;
1293                 e->counters.mean_time = total_time;
1294         }
1295         else
1296         {
1297                 /*
1298                  * Welford's method for accurately computing variance. See
1299                  * <http://www.johndcook.com/blog/standard_deviation/>
1300                  */
1301                 double          old_mean = e->counters.mean_time;
1302
1303                 e->counters.mean_time +=
1304                         (total_time - old_mean) / e->counters.calls;
1305                 e->counters.sum_var_time +=
1306                         (total_time - old_mean) * (total_time - e->counters.mean_time);
1307
1308                 /* calculate min and max time */
1309                 if (e->counters.min_time > total_time)
1310                         e->counters.min_time = total_time;
1311                 if (e->counters.max_time < total_time)
1312                         e->counters.max_time = total_time;
1313         }
1314
1315         e->counters.rows += rows;
1316         e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1317         e->counters.shared_blks_read += bufusage->shared_blks_read;
1318         e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1319         e->counters.shared_blks_written += bufusage->shared_blks_written;
1320         e->counters.local_blks_hit += bufusage->local_blks_hit;
1321         e->counters.local_blks_read += bufusage->local_blks_read;
1322         e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1323         e->counters.local_blks_written += bufusage->local_blks_written;
1324         e->counters.temp_blks_read += bufusage->temp_blks_read;
1325         e->counters.temp_blks_written += bufusage->temp_blks_written;
1326         e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1327         e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1328         e->counters.last_call = GetCurrentTimestamp();
1329         e->counters.usage += USAGE_EXEC(total_time);
1330
1331         SpinLockRelease(&e->mutex);
1332
1333 done:
1334         LWLockRelease(shared_state->lock);
1335
1336         /* We postpone this pfree until we're out of the lock */
1337         if (norm_query)
1338                 pfree(norm_query);
1339 }
1340
1341 /*
1342  * Reset all statement statistics.
1343  */
1344 Datum
1345 pg_store_plans_reset(PG_FUNCTION_ARGS)
1346 {
1347         if (!shared_state || !hash_table)
1348                 ereport(ERROR,
1349                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1350                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1351         entry_reset();
1352         PG_RETURN_VOID();
1353 }
1354
1355 /* Number of output arguments (columns) for various API versions */
1356 #define PG_STORE_PLANS_COLS_V1_5        27
1357 #define PG_STORE_PLANS_COLS_V1_6        26
1358 #define PG_STORE_PLANS_COLS                     27      /* maximum of above */
1359
1360 /*
1361  * Retrieve statement statistics.
1362  */
1363 Datum
1364 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1365 {
1366         pg_store_plans_internal(fcinfo, PGSP_V1_6);
1367
1368         return (Datum) 0;
1369 }
1370
1371 Datum
1372 pg_store_plans(PG_FUNCTION_ARGS)
1373 {
1374         pg_store_plans_internal(fcinfo, PGSP_V1_5);
1375
1376         return (Datum) 0;
1377 }
1378
1379 static void
1380 pg_store_plans_internal(FunctionCallInfo fcinfo,
1381                                                 pgspVersion api_version)
1382 {
1383         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1384         TupleDesc       tupdesc;
1385         Tuplestorestate *tupstore;
1386         MemoryContext per_query_ctx;
1387         MemoryContext oldcontext;
1388         Oid                     userid = GetUserId();
1389         bool            is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1390         int                     n_writers;
1391         char       *pbuffer = NULL;
1392         Size            pbuffer_size = 0;
1393         Size            extent = 0;
1394         int                     gc_count = 0;
1395         HASH_SEQ_STATUS hash_seq;
1396         pgspEntry  *entry;
1397
1398         if (!shared_state || !hash_table)
1399                 ereport(ERROR,
1400                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1401                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1402
1403         /* check to see if caller supports us returning a tuplestore */
1404         if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1405                 ereport(ERROR,
1406                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1407                                  errmsg("set-valued function called in context that cannot accept a set")));
1408         if (!(rsinfo->allowedModes & SFRM_Materialize))
1409                 ereport(ERROR,
1410                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1411                                  errmsg("materialize mode required, but it is not " \
1412                                                 "allowed in this context")));
1413
1414         /* Build a tuple descriptor for our result type */
1415         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1416                 elog(ERROR, "return type must be a row type");
1417
1418         per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1419         oldcontext = MemoryContextSwitchTo(per_query_ctx);
1420
1421         tupstore = tuplestore_begin_heap(true, false, work_mem);
1422         rsinfo->returnMode = SFRM_Materialize;
1423         rsinfo->setResult = tupstore;
1424         rsinfo->setDesc = tupdesc;
1425
1426         MemoryContextSwitchTo(oldcontext);
1427
1428         /*
1429          * We'd like to load the plan text file (if needed) while not holding any
1430          * lock on shared_state->lock.  In the worst case we'll have to do this
1431          * again after we have the lock, but it's unlikely enough to make this a
1432          * win despite occasional duplicated work.  We need to reload if anybody
1433          * writes to the file (either a retail ptext_store(), or a garbage
1434          * collection) between this point and where we've gotten shared lock.  If a
1435          * ptext_store is actually in progress when we look, we might as well skip
1436          * the speculative load entirely.
1437          */
1438
1439         /* Take the mutex so we can examine variables */
1440         {
1441                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1442
1443                 SpinLockAcquire(&s->mutex);
1444                 extent = s->extent;
1445                 n_writers = s->n_writers;
1446                 gc_count = s->gc_count;
1447                 SpinLockRelease(&s->mutex);
1448         }
1449
1450         /* No point in loading file now if there are active writers */
1451         if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1452                 pbuffer = ptext_load_file(&pbuffer_size);
1453
1454         /*
1455          * Get shared lock, load or reload the plan text file if we must, and
1456          * iterate over the hashtable entries.
1457          *
1458          * With a large hash table, we might be holding the lock rather longer
1459          * than one could wish.  However, this only blocks creation of new hash
1460          * table entries, and the larger the hash table the less likely that is to
1461          * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
1462          * we need to partition the hash table to limit the time spent holding any
1463          * one lock.
1464          */
1465         LWLockAcquire(shared_state->lock, LW_SHARED);
1466
1467         /*
1468          * Here it is safe to examine extent and gc_count without taking the mutex.
1469          * Note that although other processes might change shared_state->extent
1470          * just after we look at it, the strings they then write into the file
1471          * cannot yet be referenced in the hashtable, so we don't care whether we
1472          * see them or not.
1473          *
1474          * If ptext_load_file fails, we just press on; we'll return NULL for every
1475          * plan text.
1476          */
1477         if (plan_storage == PLAN_STORAGE_FILE &&
1478                 (pbuffer == NULL ||
1479                  shared_state->extent != extent ||
1480                  shared_state->gc_count != gc_count))
1481         {
1482                 if (pbuffer)
1483                         free(pbuffer);
1484                 pbuffer = ptext_load_file(&pbuffer_size);
1485         }
1486
1487         hash_seq_init(&hash_seq, hash_table);
1488         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1489         {
1490                 Datum           values[PG_STORE_PLANS_COLS];
1491                 bool            nulls[PG_STORE_PLANS_COLS];
1492                 int                     i = 0;
1493                 int64           queryid      = entry->key.queryid;
1494                 int64           planid       = entry->key.planid;
1495                 Counters        tmp;
1496                 double          stddev;
1497
1498                 memset(values, 0, sizeof(values));
1499                 memset(nulls, 0, sizeof(nulls));
1500
1501                 values[i++] = ObjectIdGetDatum(entry->key.userid);
1502                 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1503                 if (is_allowed_role || entry->key.userid == userid)
1504                 {
1505                         values[i++] = Int64GetDatumFast(queryid);
1506                         values[i++] = Int64GetDatumFast(planid);
1507
1508                         /* fill queryid_stat_statements with the same value with queryid */
1509                         if (api_version == PGSP_V1_5)
1510                                 values[i++] = Int64GetDatumFast(queryid);
1511                 }
1512                 else
1513                 {
1514                         nulls[i++] = true;      /* queryid */
1515                         nulls[i++] = true;      /* planid */
1516
1517                         /* queryid_stat_statements */
1518                         if (api_version == PGSP_V1_5)
1519                                 nulls[i++] = true;
1520                 }
1521
1522                 if (is_allowed_role || entry->key.userid == userid)
1523                 {
1524                         char       *pstr; /* Plan string */
1525                         char       *mstr; /* Modified plan string */
1526                         char       *estr; /* Encoded modified plan string */
1527
1528                         if (plan_storage == PLAN_STORAGE_FILE)
1529                                 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1530                                                                    pbuffer, pbuffer_size);
1531                         else
1532                                 pstr = SHMEM_PLAN_PTR(entry);
1533
1534                         switch (plan_format)
1535                         {
1536                                 case PLAN_FORMAT_TEXT:
1537                                         mstr = pgsp_json_textize(pstr);
1538                                         break;
1539                                 case PLAN_FORMAT_JSON:
1540                                         mstr = pgsp_json_inflate(pstr);
1541                                         break;
1542                                 case PLAN_FORMAT_YAML:
1543                                         mstr = pgsp_json_yamlize(pstr);
1544                                         break;
1545                                 case PLAN_FORMAT_XML:
1546                                         mstr = pgsp_json_xmlize(pstr);
1547                                         break;
1548                                 default:
1549                                         mstr = pstr;
1550                                         break;
1551                         }
1552
1553                         estr = (char *)
1554                                 pg_do_encoding_conversion((unsigned char *) mstr,
1555                                                                                   strlen(mstr),
1556                                                                                   entry->encoding,
1557                                                                                   GetDatabaseEncoding());
1558                         values[i++] = CStringGetTextDatum(estr);
1559
1560                         if (estr != mstr)
1561                                 pfree(estr);
1562
1563                         if (mstr != pstr)
1564                                 pfree(mstr);
1565
1566                         /* pstr is a pointer onto pbuffer */
1567                 }
1568                 else
1569                         values[i++] = CStringGetTextDatum("<insufficient privilege>");
1570
1571                 /* copy counters to a local variable to keep locking time short */
1572                 {
1573                         volatile pgspEntry *e = (volatile pgspEntry *) entry;
1574
1575                         SpinLockAcquire(&e->mutex);
1576                         tmp = e->counters;
1577                         SpinLockRelease(&e->mutex);
1578                 }
1579
1580                 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1581                 if (tmp.calls == 0)
1582                         continue;
1583
1584                 values[i++] = Int64GetDatumFast(tmp.calls);
1585                 values[i++] = Float8GetDatumFast(tmp.total_time);
1586                 values[i++] = Float8GetDatumFast(tmp.min_time);
1587                 values[i++] = Float8GetDatumFast(tmp.max_time);
1588                 values[i++] = Float8GetDatumFast(tmp.mean_time);
1589
1590                 /*
1591                  * Note we are calculating the population variance here, not the
1592                  * sample variance, as we have data for the whole population, so
1593                  * Bessel's correction is not used, and we don't divide by
1594                  * tmp.calls - 1.
1595                  */
1596                 if (tmp.calls > 1)
1597                         stddev = sqrt(tmp.sum_var_time / tmp.calls);
1598                 else
1599                         stddev = 0.0;
1600                 values[i++] = Float8GetDatumFast(stddev);
1601
1602                 values[i++] = Int64GetDatumFast(tmp.rows);
1603                 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1604                 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1605                 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1606                 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1607                 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1608                 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1609                 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1610                 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1611                 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1612                 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1613                 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1614                 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1615                 values[i++] = TimestampTzGetDatum(tmp.first_call);
1616                 values[i++] = TimestampTzGetDatum(tmp.last_call);
1617
1618                 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1619                                          api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1620                                          -1 /* fail if you forget to update this assert */ ));
1621
1622                 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1623         }
1624
1625         LWLockRelease(shared_state->lock);
1626
1627         /* clean up and return the tuplestore */
1628         tuplestore_donestoring(tupstore);
1629 }
1630
1631 /* Number of output arguments (columns) for pg_store_plans_info */
1632 #define PG_STORE_PLANS_INFO_COLS        2
1633
1634 /*
1635  * Return statistics of pg_stat_statements.
1636  */
1637 Datum
1638 pg_store_plans_info(PG_FUNCTION_ARGS)
1639 {
1640         pgspGlobalStats stats;
1641         TupleDesc       tupdesc;
1642         Datum           values[PG_STORE_PLANS_INFO_COLS];
1643         bool            nulls[PG_STORE_PLANS_INFO_COLS];
1644
1645         if (!shared_state || !hash_table)
1646                 ereport(ERROR,
1647                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1648                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1649
1650         /* Build a tuple descriptor for our result type */
1651         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1652                 elog(ERROR, "return type must be a row type");
1653
1654         MemSet(values, 0, sizeof(values));
1655         MemSet(nulls, 0, sizeof(nulls));
1656
1657         /* Read global statistics for pg_stat_statements */
1658         {
1659                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1660
1661                 SpinLockAcquire(&s->mutex);
1662                 stats = s->stats;
1663                 SpinLockRelease(&s->mutex);
1664         }
1665
1666         values[0] = Int64GetDatum(stats.dealloc);
1667         values[1] = TimestampTzGetDatum(stats.stats_reset);
1668
1669         PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1670 }
1671
1672 /*
1673  * Estimate shared memory space needed.
1674  */
1675 static Size
1676 shared_mem_size(void)
1677 {
1678         Size            size;
1679         int                     entry_size;
1680
1681         size = MAXALIGN(sizeof(pgspSharedState));
1682         entry_size = sizeof(pgspEntry);
1683
1684         /* plan text is apppended to the struct body */
1685         if (plan_storage == PLAN_STORAGE_SHMEM)
1686                 entry_size += max_plan_len;
1687
1688         size = add_size(size, hash_estimate_size(store_size, entry_size));
1689
1690         return size;
1691 }
1692
1693 /*
1694  * Allocate a new hashtable entry.
1695  * caller must hold an exclusive lock on shared_state->lock
1696  *
1697  * "plan" need not be null-terminated; we rely on plan_len instead
1698  *
1699  * If "sticky" is true, make the new entry artificially sticky so that it will
1700  * probably still be there when the query finishes execution.  We do this by
1701  * giving it a median usage value rather than the normal value.  (Strictly
1702  * speaking, query strings are normalized on a best effort basis, though it
1703  * would be difficult to demonstrate this even under artificial conditions.)
1704  *
1705  * Note: despite needing exclusive lock, it's not an error for the target
1706  * entry to already exist.      This is because pgsp_store releases and
1707  * reacquires lock after failing to find a match; so someone else could
1708  * have made the entry while we waited to get exclusive lock.
1709  */
1710 static pgspEntry *
1711 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1712 {
1713         pgspEntry  *entry;
1714         bool            found;
1715
1716         /* Make space if needed */
1717         while (hash_get_num_entries(hash_table) >= store_size)
1718                 entry_dealloc();
1719
1720         /* Find or create an entry with desired hash code */
1721         entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1722
1723         if (!found)
1724         {
1725                 /* New entry, initialize it */
1726
1727                 /* reset the statistics */
1728                 memset(&entry->counters, 0, sizeof(Counters));
1729                 /* set the appropriate initial usage count */
1730                 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1731                 /* re-initialize the mutex each time ... we assume no one using it */
1732                 SpinLockInit(&entry->mutex);
1733                 /* ... and don't forget the query text */
1734                 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1735                 entry->plan_offset = plan_offset;
1736                 entry->plan_len = plan_len;
1737                 entry->encoding = GetDatabaseEncoding();
1738         }
1739
1740         return entry;
1741 }
1742
1743 /*
1744  * qsort comparator for sorting into increasing usage order
1745  */
1746 static int
1747 entry_cmp(const void *lhs, const void *rhs)
1748 {
1749         double          l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1750         double          r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1751
1752         if (l_usage < r_usage)
1753                 return -1;
1754         else if (l_usage > r_usage)
1755                 return +1;
1756         else
1757                 return 0;
1758 }
1759
1760 /*
1761  * Deallocate least used entries.
1762  * Caller must hold an exclusive lock on shared_state->lock.
1763  */
1764 static void
1765 entry_dealloc(void)
1766 {
1767         HASH_SEQ_STATUS hash_seq;
1768         pgspEntry **entries;
1769         pgspEntry  *entry;
1770         int                     nvictims;
1771         int                     i;
1772         Size            tottextlen;
1773         int                     nvalidtexts;
1774
1775         /*
1776          * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1777          * While we're scanning the table, apply the decay factor to the usage
1778          * values.
1779          */
1780
1781         entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1782
1783         i = 0;
1784         tottextlen = 0;
1785         nvalidtexts = 0;
1786
1787         hash_seq_init(&hash_seq, hash_table);
1788         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1789         {
1790                 entries[i++] = entry;
1791                 /* "Sticky" entries get a different usage decay rate. */
1792                 if (entry->counters.calls == 0)
1793                         entry->counters.usage *= STICKY_DECREASE_FACTOR;
1794                 else
1795                         entry->counters.usage *= USAGE_DECREASE_FACTOR;
1796
1797                 /* In the mean length computation, ignore dropped texts. */
1798                 if (entry->plan_len >= 0)
1799                 {
1800                         tottextlen += entry->plan_len + 1;
1801                         nvalidtexts++;
1802                 }
1803         }
1804
1805         qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1806
1807         /* Also, record the (approximate) median usage */
1808         if (i > 0)
1809                 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1810         /* Record the mean plan length */
1811         if (nvalidtexts > 0)
1812                 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1813         else
1814                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1815
1816         nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1817         nvictims = Min(nvictims, i);
1818
1819         for (i = 0; i < nvictims; i++)
1820         {
1821                 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1822         }
1823
1824         pfree(entries);
1825
1826         /* Increment the number of times entries are deallocated */
1827         {
1828                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1829
1830                 SpinLockAcquire(&s->mutex);
1831                 s->stats.dealloc += 1;
1832                 SpinLockRelease(&s->mutex);
1833         }
1834 }
1835
1836 /*
1837  * Given a plan string (not necessarily null-terminated), allocate a new
1838  * entry in the external plan text file and store the string there.
1839  *
1840  * If successful, returns true, and stores the new entry's offset in the file
1841  * into *plan_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
1842  * number of garbage collections that have occurred so far.
1843  *
1844  * On failure, returns false.
1845  *
1846  * At least a shared lock on shared_state->lock must be held by the caller, so
1847  * as to prevent a concurrent garbage collection.  Share-lock-holding callers
1848  * should pass a gc_count pointer to obtain the number of garbage collections,
1849  * so that they can recheck the count after obtaining exclusive lock to detect
1850  * whether a garbage collection occurred (and removed this entry).
1851  */
1852 static bool
1853 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1854 {
1855         Size            off;
1856         int                     fd;
1857
1858         Assert (plan_storage == PLAN_STORAGE_FILE);
1859
1860         /*
1861          * We use a spinlock to protect extent/n_writers/gc_count, so that
1862          * multiple processes may execute this function concurrently.
1863          */
1864         {
1865                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1866
1867                 SpinLockAcquire(&s->mutex);
1868                 off = s->extent;
1869                 s->extent += plan_len + 1;
1870                 s->n_writers++;
1871                 if (gc_count)
1872                         *gc_count = s->gc_count;
1873                 SpinLockRelease(&s->mutex);
1874         }
1875
1876         *plan_offset = off;
1877
1878         /* Now write the data into the successfully-reserved part of the file */
1879         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1880         if (fd < 0)
1881                 goto error;
1882
1883         if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1884                 goto error;
1885         if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1886                 goto error;
1887
1888         CloseTransientFile(fd);
1889
1890         /* Mark our write complete */
1891         {
1892                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1893
1894                 SpinLockAcquire(&s->mutex);
1895                 s->n_writers--;
1896                 SpinLockRelease(&s->mutex);
1897         }
1898
1899         return true;
1900
1901 error:
1902         ereport(LOG,
1903                         (errcode_for_file_access(),
1904                          errmsg("could not write file \"%s\": %m",
1905                                         PGSP_TEXT_FILE)));
1906
1907         if (fd >= 0)
1908                 CloseTransientFile(fd);
1909
1910         /* Mark our write complete */
1911         {
1912                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1913
1914                 SpinLockAcquire(&s->mutex);
1915                 s->n_writers--;
1916                 SpinLockRelease(&s->mutex);
1917         }
1918
1919         return false;
1920 }
1921
1922 /*
1923  * Read the external plan text file into a malloc'd buffer.
1924  *
1925  * Returns NULL (without throwing an error) if unable to read, eg
1926  * file not there or insufficient memory.
1927  *
1928  * On success, the buffer size is also returned into *buffer_size.
1929  *
1930  * This can be called without any lock on shared_state->lock, but in that case
1931  * the caller is responsible for verifying that the result is sane.
1932  */
1933 static char *
1934 ptext_load_file(Size *buffer_size)
1935 {
1936         char       *buf;
1937         int                     fd;
1938         struct stat stat;
1939         Size            nread;
1940
1941         Assert (plan_storage == PLAN_STORAGE_FILE);
1942
1943         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1944         if (fd < 0)
1945         {
1946                 if (errno != ENOENT)
1947                         ereport(LOG,
1948                                         (errcode_for_file_access(),
1949                                          errmsg("could not read file \"%s\": %m",
1950                                                         PGSP_TEXT_FILE)));
1951                 return NULL;
1952         }
1953
1954         /* Get file length */
1955         if (fstat(fd, &stat))
1956         {
1957                 ereport(LOG,
1958                                 (errcode_for_file_access(),
1959                                  errmsg("could not stat file \"%s\": %m",
1960                                                 PGSP_TEXT_FILE)));
1961                 CloseTransientFile(fd);
1962                 return NULL;
1963         }
1964
1965         /* Allocate buffer; beware that off_t might be wider than size_t */
1966         if (stat.st_size <= MaxAllocHugeSize)
1967                 buf = (char *) malloc(stat.st_size);
1968         else
1969                 buf = NULL;
1970         if (buf == NULL)
1971         {
1972                 ereport(LOG,
1973                                 (errcode(ERRCODE_OUT_OF_MEMORY),
1974                                  errmsg("out of memory"),
1975                                  errdetail("Could not allocate enough memory to read file \"%s\".",
1976                                                    PGSP_TEXT_FILE)));
1977                 CloseTransientFile(fd);
1978                 return NULL;
1979         }
1980
1981         /*
1982          * OK, slurp in the file.  Windows fails if we try to read more than
1983          * INT_MAX bytes at once, and other platforms might not like that either,
1984          * so read a very large file in 1GB segments.
1985          */
1986         nread = 0;
1987         while (nread < stat.st_size)
1988         {
1989                 int                     toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
1990
1991                 /*
1992                  * If we get a short read and errno doesn't get set, the reason is
1993                  * probably that garbage collection truncated the file since we did
1994                  * the fstat(), so we don't log a complaint --- but we don't return
1995                  * the data, either, since it's most likely corrupt due to concurrent
1996                  * writes from garbage collection.
1997                  */
1998                 errno = 0;
1999                 if (read(fd, buf + nread, toread) != toread)
2000                 {
2001                         if (errno)
2002                                 ereport(LOG,
2003                                                 (errcode_for_file_access(),
2004                                                  errmsg("could not read file \"%s\": %m",
2005                                                                 PGSP_TEXT_FILE)));
2006                         free(buf);
2007                         CloseTransientFile(fd);
2008                         return NULL;
2009                 }
2010                 nread += toread;
2011         }
2012
2013         if (CloseTransientFile(fd) != 0)
2014                 ereport(LOG,
2015                                 (errcode_for_file_access(),
2016                                  errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
2017
2018         *buffer_size = nread;
2019         return buf;
2020 }
2021
2022 /*
2023  * Locate a plan text in the file image previously read by ptext_load_file().
2024  *
2025  * We validate the given offset/length, and return NULL if bogus.  Otherwise,
2026  * the result points to a null-terminated string within the buffer.
2027  */
2028 static char *
2029 ptext_fetch(Size plan_offset, int plan_len,
2030                         char *buffer, Size buffer_size)
2031 {
2032         Assert (plan_storage == PLAN_STORAGE_FILE);
2033
2034         /* File read failed? */
2035         if (buffer == NULL)
2036                 return NULL;
2037         /* Bogus offset/length? */
2038         if (plan_len < 0 ||
2039                 plan_offset + plan_len >= buffer_size)
2040                 return NULL;
2041         /* As a further sanity check, make sure there's a trailing null */
2042         if (buffer[plan_offset + plan_len] != '\0')
2043                 return NULL;
2044         /* Looks OK */
2045         return buffer + plan_offset;
2046 }
2047
2048 /*
2049  * Do we need to garbage-collect the external plan text file?
2050  *
2051  * Caller should hold at least a shared lock on shared_state->lock.
2052  */
2053 static bool
2054 need_gc_ptexts(void)
2055 {
2056         Size            extent;
2057
2058         Assert (plan_storage == PLAN_STORAGE_FILE);
2059
2060         /* Read shared extent pointer */
2061         {
2062                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2063
2064                 SpinLockAcquire(&s->mutex);
2065                 extent = s->extent;
2066                 SpinLockRelease(&s->mutex);
2067         }
2068
2069         /* Don't proceed if file does not exceed 512 bytes per possible entry */
2070         if (extent < 512 * store_size)
2071                 return false;
2072
2073         /*
2074          * Don't proceed if file is less than about 50% bloat.  Nothing can or
2075          * should be done in the event of unusually large query texts accounting
2076          * for file's large size.  We go to the trouble of maintaining the mean
2077          * query length in order to prevent garbage collection from thrashing
2078          * uselessly.
2079          */
2080         if (extent < shared_state->mean_plan_len * store_size * 2)
2081                 return false;
2082
2083         return true;
2084 }
2085
2086 /*
2087  * Garbage-collect orphaned plan texts in external file.
2088  *
2089  * This won't be called often in the typical case, since it's likely that
2090  * there won't be too much churn, and besides, a similar compaction process
2091  * occurs when serializing to disk at shutdown or as part of resetting.
2092  * Despite this, it seems prudent to plan for the edge case where the file
2093  * becomes unreasonably large, with no other method of compaction likely to
2094  * occur in the foreseeable future.
2095  *
2096  * The caller must hold an exclusive lock on shared_state->lock.
2097  *
2098  * At the first sign of trouble we unlink the query text file to get a clean
2099  * slate (although existing statistics are retained), rather than risk
2100  * thrashing by allowing the same problem case to recur indefinitely.
2101  */
2102 static void
2103 gc_ptexts(void)
2104 {
2105         char       *pbuffer;
2106         Size            pbuffer_size;
2107         FILE       *pfile = NULL;
2108         HASH_SEQ_STATUS hash_seq;
2109         pgspEntry  *entry;
2110         Size            extent;
2111         int                     nentries;
2112
2113         Assert (plan_storage == PLAN_STORAGE_FILE);
2114
2115         /*
2116          * When called from store_entry, some other session might have proceeded
2117          * with garbage collection in the no-lock-held interim of lock strength
2118          * escalation.  Check once more that this is actually necessary.
2119          */
2120         if (!need_gc_ptexts())
2121                 return;
2122
2123         /*
2124          * Load the old texts file.  If we fail (out of memory, for instance),
2125          * invalidate query texts.  Hopefully this is rare.  It might seem better
2126          * to leave things alone on an OOM failure, but the problem is that the
2127          * file is only going to get bigger; hoping for a future non-OOM result is
2128          * risky and can easily lead to complete denial of service.
2129          */
2130         pbuffer = ptext_load_file(&pbuffer_size);
2131         if (pbuffer == NULL)
2132                 goto gc_fail;
2133
2134         /*
2135          * We overwrite the plan texts file in place, so as to reduce the risk of
2136          * an out-of-disk-space failure.  Since the file is guaranteed not to get
2137          * larger, this should always work on traditional filesystems; though we
2138          * could still lose on copy-on-write filesystems.
2139          */
2140         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2141         if (pfile == NULL)
2142         {
2143                 ereport(LOG,
2144                                 (errcode_for_file_access(),
2145                                  errmsg("could not write file \"%s\": %m",
2146                                                 PGSP_TEXT_FILE)));
2147                 goto gc_fail;
2148         }
2149
2150         extent = 0;
2151         nentries = 0;
2152
2153         hash_seq_init(&hash_seq, hash_table);
2154         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2155         {
2156                 int                     plan_len = entry->plan_len;
2157                 char       *plan = ptext_fetch(entry->plan_offset,
2158                                                                            plan_len,
2159                                                                            pbuffer,
2160                                                                            pbuffer_size);
2161
2162                 if (plan == NULL)
2163                 {
2164                         /* Trouble ... drop the text */
2165                         entry->plan_offset = 0;
2166                         entry->plan_len = -1;
2167                         /* entry will not be counted in mean plan length computation */
2168                         continue;
2169                 }
2170
2171                 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2172                 {
2173                         ereport(LOG,
2174                                         (errcode_for_file_access(),
2175                                          errmsg("could not write file \"%s\": %m",
2176                                                         PGSP_TEXT_FILE)));
2177                         hash_seq_term(&hash_seq);
2178                         goto gc_fail;
2179                 }
2180
2181                 entry->plan_offset = extent;
2182                 extent += plan_len + 1;
2183                 nentries++;
2184         }
2185
2186         /*
2187          * Truncate away any now-unused space.  If this fails for some odd reason,
2188          * we log it, but there's no need to fail.
2189          */
2190         if (ftruncate(fileno(pfile), extent) != 0)
2191                 ereport(LOG,
2192                                 (errcode_for_file_access(),
2193                                  errmsg("could not truncate file \"%s\": %m",
2194                                                 PGSP_TEXT_FILE)));
2195
2196         if (FreeFile(pfile))
2197         {
2198                 ereport(LOG,
2199                                 (errcode_for_file_access(),
2200                                  errmsg("could not write file \"%s\": %m",
2201                                                 PGSP_TEXT_FILE)));
2202                 pfile = NULL;
2203                 goto gc_fail;
2204         }
2205
2206         elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2207                  shared_state->extent, extent);
2208
2209         /* Reset the shared extent pointer */
2210         shared_state->extent = extent;
2211
2212         /*
2213          * Also update the mean plan length, to be sure that need_gc_ptexts()
2214          * won't still think we have a problem.
2215          */
2216         if (nentries > 0)
2217                 shared_state->mean_plan_len = extent / nentries;
2218         else
2219                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2220
2221         free(pbuffer);
2222
2223         return;
2224
2225 gc_fail:
2226         /* clean up resources */
2227         if (pfile)
2228                 FreeFile(pfile);
2229         if (pbuffer)
2230                 free(pbuffer);
2231
2232         /*
2233          * Since the contents of the external file are now uncertain, mark all
2234          * hashtable entries as having invalid texts.
2235          */
2236         hash_seq_init(&hash_seq, hash_table);
2237         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2238         {
2239                 entry->plan_offset = 0;
2240                 entry->plan_len = -1;
2241         }
2242
2243         /*
2244          * Destroy the query text file and create a new, empty one
2245          */
2246         (void) unlink(PGSP_TEXT_FILE);
2247         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2248         if (pfile == NULL)
2249                 ereport(LOG,
2250                                 (errcode_for_file_access(),
2251                                  errmsg("could not recreate file \"%s\": %m",
2252                                                 PGSP_TEXT_FILE)));
2253         else
2254                 FreeFile(pfile);
2255
2256         /* Reset the shared extent pointer */
2257         shared_state->extent = 0;
2258
2259         /* Reset mean_plan_len to match the new state */
2260         shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2261 }
2262
2263 /*
2264  * Release all entries.
2265  */
2266 static void
2267 entry_reset(void)
2268 {
2269         HASH_SEQ_STATUS hash_seq;
2270         pgspEntry  *entry;
2271         FILE       *pfile;
2272
2273         if (!shared_state || !hash_table)
2274                 ereport(ERROR,
2275                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2276                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2277
2278         LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2279
2280         hash_seq_init(&hash_seq, hash_table);
2281         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2282         {
2283                 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2284         }
2285
2286         /*
2287          * Reset global statistics for pg_store_plans.
2288          */
2289         {
2290                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2291                 TimestampTz stats_reset = GetCurrentTimestamp();
2292
2293                 SpinLockAcquire(&s->mutex);
2294                 s->stats.dealloc = 0;
2295                 s->stats.stats_reset = stats_reset;
2296                 SpinLockRelease(&s->mutex);
2297         }
2298
2299         /*
2300          * Write new empty plan file, perhaps even creating a new one to recover
2301          * if the file was missing.
2302          */
2303         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2304         if (pfile == NULL)
2305         {
2306                 ereport(LOG,
2307                                 (errcode_for_file_access(),
2308                                  errmsg("could not create file \"%s\": %m",
2309                                                 PGSP_TEXT_FILE)));
2310                 goto done;
2311         }
2312
2313         /* If ftruncate fails, log it, but it's not a fatal problem */
2314         if (ftruncate(fileno(pfile), 0) != 0)
2315                 ereport(LOG,
2316                                 (errcode_for_file_access(),
2317                                  errmsg("could not truncate file \"%s\": %m",
2318                                                 PGSP_TEXT_FILE)));
2319
2320         FreeFile(pfile);
2321
2322 done:
2323         shared_state->extent = 0;
2324         LWLockRelease(shared_state->lock);
2325 }
2326
2327 Datum
2328 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2329 {
2330         PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2331 }
2332
2333 Datum
2334 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2335 {
2336         text *short_plan = PG_GETARG_TEXT_P(0);
2337         char *cjson = text_to_cstring(short_plan);
2338         char *cshorten = pgsp_json_shorten(cjson);
2339         PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2340 }
2341
2342 Datum
2343 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2344 {
2345         text *short_plan = PG_GETARG_TEXT_P(0);
2346         char *cjson = text_to_cstring(short_plan);
2347         char *cnormalized = pgsp_json_normalize(cjson);
2348         PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2349 }
2350
2351 Datum
2352 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2353 {
2354         text *short_plan = PG_GETARG_TEXT_P(0);
2355         char *cshort = text_to_cstring(short_plan);
2356         char *cinflated = pgsp_json_inflate(cshort);
2357         PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2358 }
2359
2360 Datum
2361 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2362 {
2363         text *short_plan = PG_GETARG_TEXT_P(0);
2364         char *cshort = text_to_cstring(short_plan);
2365         char *ctextized = pgsp_json_textize(cshort);
2366
2367         PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2368 }
2369
2370 Datum
2371 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2372 {
2373         text *short_plan = PG_GETARG_TEXT_P(0);
2374         char *cshort = text_to_cstring(short_plan);
2375         char *cyamlized = pgsp_json_yamlize(cshort);
2376
2377         PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2378 }
2379
2380 Datum
2381 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2382 {
2383         text *short_plan = PG_GETARG_TEXT_P(0);
2384         char *cshort = text_to_cstring(short_plan);
2385         char *cxmlized = pgsp_json_xmlize(cshort);
2386
2387         PG_RETURN_TEXT_P(cstring_to_text(cxmlized));
2388 }