OSDN Git Service

acbc21b15fdca622f7f677de7c944f9c5383e020
[pgstoreplans/pg_store_plans.git] / pg_store_plans.c
1 /*-------------------------------------------------------------------------
2  *
3  * pg_store_plans.c
4  *              Take statistics of plan selection across a whole database cluster.
5  *
6  * Execution costs are totaled for each distinct plan for each query,
7  * and plan and queryid are kept in a shared hashtable, each record in
8  * which is associated with a record in pg_stat_statements, if any, by
9  * the queryid.
10  *
11  * For Postgres 9.3 or earlier does not expose query id so
12  * pg_store_plans needs to calculate it based on the given query
13  * string using different algorithm from pg_stat_statements, and later
14  * the id will be matched against the one made from query string
15  * stored in pg_stat_statements. For the reason, queryid matching in
16  * this way will fail if the query string kept in pg_stat_statements
17  * is truncated in the middle.
18  *
19  * Plans are identified by fingerprinting plan representations in
20  * "shortened" JSON format with constants and unstable values such as
21  * rows, width, loops ignored. Nevertheless, stored plan entries hold
22  * them of the latest execution. Entry eviction is done in the same
23  * way to pg_stat_statements.
24  *
25  * Copyright (c) 2008-2022, PostgreSQL Global Development Group
26  * Copyright (c) 2012-2022, NIPPON TELEGRAPH AND TELEPHONE CORPORATION
27  *
28  * IDENTIFICATION
29  *        pg_store_plans/pg_store_plans.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34
35 #include <sys/stat.h>
36 #include <unistd.h>
37 #include <dlfcn.h>
38 #include <math.h>
39
40 #include "catalog/pg_authid.h"
41 #include "commands/explain.h"
42 #include "access/hash.h"
43 #include "executor/instrument.h"
44 #include "funcapi.h"
45 #include "mb/pg_wchar.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/fd.h"
49 #include "storage/ipc.h"
50 #include "storage/lwlock.h"
51 #include "storage/spin.h"
52 #include "storage/shmem.h"
53 #include "tcop/utility.h"
54 #include "utils/acl.h"
55 #include "utils/builtins.h"
56 #if PG_VERSION_NUM >= 140000
57 #include "utils/queryjumble.h"
58 #endif
59 #include "utils/timestamp.h"
60
61 #include "pgsp_json.h"
62 #include "pgsp_explain.h"
63
64 PG_MODULE_MAGIC;
65
66 /* Location of stats file */
67 #define PGSP_DUMP_FILE  "global/pg_store_plans.stat"
68 #define PGSP_TEXT_FILE  PG_STAT_TMP_DIR "/pgsp_plan_texts.stat"
69
70 /* PostgreSQL major version number, changes in which invalidate all entries */
71 static const uint32 PGSP_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
72
73 /* This constant defines the magic number in the stats file header */
74 static const uint32 PGSP_FILE_HEADER = 0x20211125;
75 static int max_plan_len = 5000;
76
77 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
78 #define USAGE_EXEC(duration)    (1.0)
79 #define USAGE_INIT                              (1.0)   /* including initial planning */
80 #define ASSUMED_MEDIAN_INIT             (10.0)  /* initial assumed median usage */
81 #define ASSUMED_LENGTH_INIT             1024    /* initial assumed mean query length */
82 #define USAGE_DECREASE_FACTOR   (0.99)  /* decreased every entry_dealloc */
83 #define STICKY_DECREASE_FACTOR  (0.50)  /* factor for sticky entries */
84 #define USAGE_DEALLOC_PERCENT   5               /* free this % of entries at once */
85
86 /* In PostgreSQL 11, queryid becomes a uint64 internally. */
87 #if PG_VERSION_NUM >= 110000
88 typedef uint64 queryid_t;
89 #define PGSP_NO_QUERYID         UINT64CONST(0)
90 #else
91 typedef uint32 queryid_t;
92 #define PGSP_NO_QUERYID         0
93 #endif
94
95 /*
96  * Extension version number, for supporting older extension versions' objects
97  */
98 typedef enum pgspVersion
99 {
100         PGSP_V1_5 = 0,
101         PGSP_V1_6
102 } pgspVersion;
103
104 /*
105  * Hashtable key that defines the identity of a hashtable entry.  We separate
106  * queries by user and by database even if they are otherwise identical.
107  *
108  * Presently, the query encoding is fully determined by the source database
109  * and so we don't really need it to be in the key.  But that might not always
110  * be true. Anyway it's notationally convenient to pass it as part of the key.
111  */
112 typedef struct pgspHashKey
113 {
114         Oid                     userid;                 /* user OID */
115         Oid                     dbid;                   /* database OID */
116         queryid_t       queryid;                /* query identifier */
117         uint32          planid;                 /* plan identifier */
118 } pgspHashKey;
119
120 /*
121  * The actual stats counters kept within pgspEntry.
122  */
123 typedef struct Counters
124 {
125         int64           calls;                          /* # of times executed */
126         double          total_time;                     /* total execution time, in msec */
127         double          min_time;                       /* minimum execution time in msec */
128         double          max_time;                       /* maximum execution time in msec */
129         double          mean_time;                      /* mean execution time in msec */
130         double          sum_var_time;   /* sum of variances in execution time in msec */
131         int64           rows;                           /* total # of retrieved or affected rows */
132         int64           shared_blks_hit;        /* # of shared buffer hits */
133         int64           shared_blks_read;       /* # of shared disk blocks read */
134         int64           shared_blks_dirtied;/* # of shared disk blocks dirtied */
135         int64           shared_blks_written;/* # of shared disk blocks written */
136         int64           local_blks_hit;         /* # of local buffer hits */
137         int64           local_blks_read;        /* # of local disk blocks read */
138         int64           local_blks_dirtied;     /* # of local disk blocks dirtied */
139         int64           local_blks_written;     /* # of local disk blocks written */
140         int64           temp_blks_read;         /* # of temp blocks read */
141         int64           temp_blks_written;      /* # of temp blocks written */
142         double          blk_read_time;          /* time spent reading, in msec */
143         double          blk_write_time;         /* time spent writing, in msec */
144         TimestampTz     first_call;                     /* timestamp of first call  */
145         TimestampTz     last_call;                      /* timestamp of last call  */
146         double          usage;                          /* usage factor */
147 } Counters;
148
149 /*
150  * Global statistics for pg_store_plans
151  */
152 typedef struct pgspGlobalStats
153 {
154         int64           dealloc;                /* # of times entries were deallocated */
155         TimestampTz stats_reset;        /* timestamp with all stats reset */
156 } pgspGlobalStats;
157
158 /*
159  * Statistics per plan
160  *
161  * NB: see the file read/write code before changing field order here.
162  */
163 typedef struct pgspEntry
164 {
165         pgspHashKey     key;                    /* hash key of entry - MUST BE FIRST */
166         Counters        counters;               /* the statistics for this query */
167         Size            plan_offset;    /* plan text offset in extern file */
168         int                     plan_len;               /* # of valid bytes in query string */
169         int                     encoding;               /* query encoding */
170         slock_t         mutex;                  /* protects the counters only */
171 } pgspEntry;
172
173 /*
174  * Global shared state
175  */
176 typedef struct pgspSharedState
177 {
178         LWLock     *lock;                       /* protects hashtable search/modification */
179         int                     plan_size;              /* max query length in bytes */
180         double          cur_median_usage;       /* current median usage in hashtable */
181         Size            mean_plan_len;  /* current mean entry text length */
182         slock_t         mutex;                  /* protects following fields only: */
183         Size            extent;                 /* current extent of plan file */
184         int                     n_writers;              /* number of active writers to query file */
185         int                     gc_count;               /* plan file garbage collection cycle count */
186         pgspGlobalStats stats;          /* global statistics for pgsp */
187 } pgspSharedState;
188
189 /*---- Local variables ----*/
190
191 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
192 static int      nested_level = 0;
193
194 /* Saved hook values in case of unload */
195 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
196 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
197 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
198 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
199 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
200 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
201
202 /* Links to shared memory state */
203 static pgspSharedState *shared_state = NULL;
204 static HTAB *hash_table = NULL;
205
206 /*---- GUC variables ----*/
207
208 typedef enum
209 {
210         TRACK_LEVEL_NONE,                       /* track no statements */
211         TRACK_LEVEL_TOP,                                /* only top level statements */
212         TRACK_LEVEL_ALL,                                /* all statements, including nested ones */
213         TRACK_LEVEL_VERBOSE                     /* all statements, including internal ones */
214 }       PGSPTrackLevel;
215
216 static const struct config_enum_entry track_options[] =
217 {
218         {"none", TRACK_LEVEL_NONE, false},
219         {"top", TRACK_LEVEL_TOP, false},
220         {"all", TRACK_LEVEL_ALL, false},
221         {"verbose", TRACK_LEVEL_VERBOSE, false},
222         {NULL, 0, false}
223 };
224
225 typedef enum
226 {
227         PLAN_FORMAT_RAW,                /* No conversion. Shorten JSON */
228         PLAN_FORMAT_TEXT,               /* Traditional text representation */
229         PLAN_FORMAT_JSON,               /* JSON representation */
230         PLAN_FORMAT_YAML,               /* YAML */
231         PLAN_FORMAT_XML,                /* XML  */
232 }       PGSPPlanFormats;
233
234 static const struct config_enum_entry plan_formats[] =
235 {
236         {"raw" , PLAN_FORMAT_RAW , false},
237         {"text", PLAN_FORMAT_TEXT, false},
238         {"json", PLAN_FORMAT_JSON, false},
239         {"yaml", PLAN_FORMAT_YAML, false},
240         {"xml" , PLAN_FORMAT_XML , false},
241         {NULL, 0, false}
242 };
243
244 /* options for plan storage */
245 typedef enum
246 {
247         PLAN_STORAGE_SHMEM,             /* plan is stored as a part of hash entry */
248         PLAN_STORAGE_FILE               /* plan is stored in a separate file */
249 }  pgspPlanStorage;
250
251 static const struct config_enum_entry plan_storage_options[] =
252 {
253         {"shmem", PLAN_STORAGE_SHMEM, false},
254         {"file", PLAN_STORAGE_FILE, false},
255         {NULL, 0, false}
256 };
257
258 static int      store_size;                     /* max # statements to track */
259 static int      track_level;            /* tracking level */
260 static int      min_duration;           /* min duration to record */
261 static bool dump_on_shutdown;   /* whether to save stats across shutdown */
262 static bool log_analyze;                /* Similar to EXPLAIN (ANALYZE *) */
263 static bool log_verbose;                /* Similar to EXPLAIN (VERBOSE *) */
264 static bool log_buffers;                /* Similar to EXPLAIN (BUFFERS *) */
265 static bool log_timing;                 /* Similar to EXPLAIN (TIMING *) */
266 static bool log_triggers;               /* whether to log trigger statistics  */
267 static int  plan_format;                /* Plan representation style in
268                                                                  * pg_store_plans.plan  */
269 static int  plan_storage;               /* Plan storage type */
270
271
272 /* disables tracking overriding track_level */
273 static bool force_disabled = false;
274
275 #if PG_VERSION_NUM >= 140000
276 /*
277  * For pg14 and later, we rely on core queryid calculation.  If
278  * it's not available it means that the admin explicitly refused to
279  * compute it, for performance reason or other.  In that case, we
280  * will also consider that this extension is disabled.
281  */
282 #define pgsp_enabled(q) \
283         (!force_disabled &&                                                                                       \
284          (track_level >= TRACK_LEVEL_ALL ||                                                       \
285           (track_level == TRACK_LEVEL_TOP && nested_level == 0)) &&       \
286          (q != PGSP_NO_QUERYID))
287 #else
288 #define pgsp_enabled(q) \
289         (!force_disabled &&                                                                                     \
290          (track_level >= TRACK_LEVEL_ALL ||                                                     \
291           (track_level == TRACK_LEVEL_TOP && nested_level == 0)))
292 #endif
293
294 #define SHMEM_PLAN_PTR(ent) (((char *) ent) + sizeof(pgspEntry))
295
296 /*---- Function declarations ----*/
297
298 void            _PG_init(void);
299 void            _PG_fini(void);
300
301 Datum           pg_store_plans_reset(PG_FUNCTION_ARGS);
302 Datum           pg_store_plans_hash_query(PG_FUNCTION_ARGS);
303 Datum           pg_store_plans(PG_FUNCTION_ARGS);
304 Datum           pg_store_plans_shorten(PG_FUNCTION_ARGS);
305 Datum           pg_store_plans_normalize(PG_FUNCTION_ARGS);
306 Datum           pg_store_plans_jsonplan(PG_FUNCTION_ARGS);
307 Datum           pg_store_plans_yamlplan(PG_FUNCTION_ARGS);
308 Datum           pg_store_plans_xmlplan(PG_FUNCTION_ARGS);
309 Datum           pg_store_plans_textplan(PG_FUNCTION_ARGS);
310 Datum           pg_store_plans_info(PG_FUNCTION_ARGS);
311
312 PG_FUNCTION_INFO_V1(pg_store_plans_reset);
313 PG_FUNCTION_INFO_V1(pg_store_plans_hash_query);
314 PG_FUNCTION_INFO_V1(pg_store_plans);
315 PG_FUNCTION_INFO_V1(pg_store_plans_1_6);
316 PG_FUNCTION_INFO_V1(pg_store_plans_shorten);
317 PG_FUNCTION_INFO_V1(pg_store_plans_normalize);
318 PG_FUNCTION_INFO_V1(pg_store_plans_jsonplan);
319 PG_FUNCTION_INFO_V1(pg_store_plans_yamlplan);
320 PG_FUNCTION_INFO_V1(pg_store_plans_xmlplan);
321 PG_FUNCTION_INFO_V1(pg_store_plans_textplan);
322 PG_FUNCTION_INFO_V1(pg_store_plans_info);
323
324 #if PG_VERSION_NUM < 130000
325 #define COMPTAG_TYPE char
326 #else
327 #define COMPTAG_TYPE QueryCompletion
328 #endif
329
330 #if PG_VERSION_NUM < 140000
331 #define ROLE_PG_READ_ALL_STATS          DEFAULT_ROLE_READ_ALL_STATS
332 #endif
333
334 #if PG_VERSION_NUM >= 150000
335 static shmem_request_hook_type prev_shmem_request_hook = NULL;
336 #endif
337
338 static void pgsp_shmem_request(void);
339 static void pgsp_shmem_startup(void);
340 static void pgsp_shmem_shutdown(int code, Datum arg);
341 static void pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags);
342 static void pgsp_ExecutorRun(QueryDesc *queryDesc,
343                                  ScanDirection direction,
344                                                          uint64 count, bool execute_once);
345 static void pgsp_ExecutorFinish(QueryDesc *queryDesc);
346 static void pgsp_ExecutorEnd(QueryDesc *queryDesc);
347 static void pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
348 #if PG_VERSION_NUM >= 140000
349                                         bool readOnlyTree,
350 #endif
351                                         ProcessUtilityContext context, ParamListInfo params,
352                                         QueryEnvironment *queryEnv,
353                                         DestReceiver *dest, COMPTAG_TYPE *completionTag);
354 static uint32 hash_query(const char* query);
355 static void pgsp_store(char *plan, queryid_t queryId,
356                    double total_time, uint64 rows,
357                    const BufferUsage *bufusage);
358 static void pg_store_plans_internal(FunctionCallInfo fcinfo,
359                                                                         pgspVersion api_version);
360 static Size shared_mem_size(void);
361 static pgspEntry *entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len,
362                                                           bool sticky);
363 static bool ptext_store(const char *plan, int plan_len, Size *plan_offset,
364                                                 int *gc_count);
365 static char *ptext_load_file(Size *buffer_size);
366 static char *ptext_fetch(Size plan_offset, int plan_len, char *buffer,
367                                                  Size buffer_size);
368 static bool need_gc_ptexts(void);
369 static void gc_ptexts(void);
370 static void entry_dealloc(void);
371 static void entry_reset(void);
372
373 /*
374  * Module load callback
375  */
376 void
377 _PG_init(void)
378 {
379         /*
380          * In order to create our shared memory area, we have to be loaded via
381          * shared_preload_libraries.  If not, fall out without hooking into any of
382          * the main system.  (We don't throw error here because it seems useful to
383          * allow the pg_stat_statements functions to be created even when the
384          * module isn't active.  The functions must protect themselves against
385          * being called then, however.)
386          */
387         if (!process_shared_preload_libraries_in_progress)
388                 return;
389
390 #if PG_VERSION_NUM >= 140000
391         /*
392          * Inform the postmaster that we want to enable query_id calculation if
393          * compute_query_id is set to auto.
394          */
395         EnableQueryId();
396 #endif
397
398         /*
399          * Define (or redefine) custom GUC variables.
400          */
401         DefineCustomIntVariable("pg_store_plans.max",
402           "Sets the maximum number of plans tracked by pg_store_plans.",
403                                                         NULL,
404                                                         &store_size,
405                                                         1000,
406                                                         100,
407                                                         INT_MAX,
408                                                         PGC_POSTMASTER,
409                                                         0,
410                                                         NULL,
411                                                         NULL,
412                                                         NULL);
413
414         DefineCustomIntVariable("pg_store_plans.max_plan_length",
415           "Sets the maximum length of plans stored by pg_store_plans.",
416                                                         NULL,
417                                                         &max_plan_len,
418                                                         5000,
419                                                         100,
420                                                         INT32_MAX,
421                                                         PGC_POSTMASTER,
422                                                         0,
423                                                         NULL,
424                                                         NULL,
425                                                         NULL);
426
427         DefineCustomEnumVariable("pg_store_plans.plan_storage",
428                            "Selects where to store plan texts.",
429                                                          NULL,
430                                                          &plan_storage,
431                                                          PLAN_STORAGE_FILE,
432                                                          plan_storage_options,
433                                                          PGC_POSTMASTER,
434                                                          0,
435                                                          NULL,
436                                                          NULL,
437                                                          NULL);
438
439         DefineCustomEnumVariable("pg_store_plans.track",
440                            "Selects which plans are tracked by pg_store_plans.",
441                                                          NULL,
442                                                          &track_level,
443                                                          TRACK_LEVEL_TOP,
444                                                          track_options,
445                                                          PGC_SUSET,
446                                                          0,
447                                                          NULL,
448                                                          NULL,
449                                                          NULL);
450
451         DefineCustomEnumVariable("pg_store_plans.plan_format",
452                            "Selects which format to be appied for plan representation in pg_store_plans.",
453                                                          NULL,
454                                                          &plan_format,
455                                                          PLAN_FORMAT_TEXT,
456                                                          plan_formats,
457                                                          PGC_USERSET,
458                                                          0,
459                                                          NULL,
460                                                          NULL,
461                                                          NULL);
462
463         DefineCustomIntVariable("pg_store_plans.min_duration",
464                                         "Minimum duration to record plan in milliseconds.",
465                                                         NULL,
466                                                         &min_duration,
467                                                         0,
468                                                         0,
469                                                         INT_MAX,
470                                                         PGC_SUSET,
471                                                         0,
472                                                         NULL,
473                                                         NULL,
474                                                         NULL);
475
476         DefineCustomBoolVariable("pg_store_plans.save",
477                            "Save pg_store_plans statistics across server shutdowns.",
478                                                          NULL,
479                                                          &dump_on_shutdown,
480                                                          true,
481                                                          PGC_SIGHUP,
482                                                          0,
483                                                          NULL,
484                                                          NULL,
485                                                          NULL);
486
487         DefineCustomBoolVariable("pg_store_plans.log_analyze",
488                                                          "Use EXPLAIN ANALYZE for plan logging.",
489                                                          NULL,
490                                                          &log_analyze,
491                                                          false,
492                                                          PGC_SUSET,
493                                                          0,
494                                                          NULL,
495                                                          NULL,
496                                                          NULL);
497
498         DefineCustomBoolVariable("pg_store_plans.log_buffers",
499                                                          "Log buffer usage.",
500                                                          NULL,
501                                                          &log_buffers,
502                                                          false,
503                                                          PGC_SUSET,
504                                                          0,
505                                                          NULL,
506                                                          NULL,
507                                                          NULL);
508
509         DefineCustomBoolVariable("pg_store_plans.log_timing",
510                                                          "Log timings.",
511                                                          NULL,
512                                                          &log_timing,
513                                                          true,
514                                                          PGC_SUSET,
515                                                          0,
516                                                          NULL,
517                                                          NULL,
518                                                          NULL);
519
520         DefineCustomBoolVariable("pg_store_plans.log_triggers",
521                                                          "Log trigger trace.",
522                                                          NULL,
523                                                          &log_triggers,
524                                                          false,
525                                                          PGC_SUSET,
526                                                          0,
527                                                          NULL,
528                                                          NULL,
529                                                          NULL);
530
531         DefineCustomBoolVariable("pg_store_plans.log_verbose",
532                            "Set VERBOSE for EXPLAIN on logging.",
533                                                          NULL,
534                                                          &log_verbose,
535                                                          false,
536                                                          PGC_SUSET,
537                                                          0,
538                                                          NULL,
539                                                          NULL,
540                                                          NULL);
541
542         EmitWarningsOnPlaceholders("pg_store_plans");
543
544 #if PG_VERSION_NUM < 150000     
545         pgsp_shmem_request();
546 #endif
547
548         /*
549          * Install hooks.
550          */
551 #if PG_VERSION_NUM >= 150000
552         prev_shmem_request_hook = shmem_request_hook;
553         shmem_request_hook = pgsp_shmem_request;
554 #endif
555         prev_shmem_startup_hook = shmem_startup_hook;
556         shmem_startup_hook = pgsp_shmem_startup;
557         prev_ExecutorStart = ExecutorStart_hook;
558         ExecutorStart_hook = pgsp_ExecutorStart;
559         prev_ExecutorRun = ExecutorRun_hook;
560         ExecutorRun_hook = pgsp_ExecutorRun;
561         prev_ExecutorFinish = ExecutorFinish_hook;
562         ExecutorFinish_hook = pgsp_ExecutorFinish;
563         prev_ExecutorEnd = ExecutorEnd_hook;
564         ExecutorEnd_hook = pgsp_ExecutorEnd;
565         prev_ProcessUtility = ProcessUtility_hook;
566         ProcessUtility_hook = pgsp_ProcessUtility;
567 }
568
569 /*
570  * Module unload callback
571  */
572 void
573 _PG_fini(void)
574 {
575         /* Uninstall hooks. */
576         shmem_startup_hook = prev_shmem_startup_hook;
577         ExecutorStart_hook = prev_ExecutorStart;
578         ExecutorRun_hook = prev_ExecutorRun;
579         ExecutorFinish_hook = prev_ExecutorFinish;
580         ExecutorEnd_hook = prev_ExecutorEnd;
581         ProcessUtility_hook = prev_ProcessUtility;
582 }
583
584 /*
585  * pgsp_shmem_request: request shared memory to the core.
586  * Called as a hook in PG15 or later, otherwise called from _PG_init().
587  */
588 static void
589 pgsp_shmem_request(void)
590 {
591 #if PG_VERSION_NUM >= 150000
592         if (prev_shmem_request_hook)
593                 prev_shmem_request_hook();
594 #endif
595
596         RequestAddinShmemSpace(shared_mem_size());
597         RequestNamedLWLockTranche("pg_store_plans", 1);
598 }
599
600 /*
601  * shmem_startup hook: allocate or attach to shared memory,
602  * then load any pre-existing statistics from file.
603  */
604 static void
605 pgsp_shmem_startup(void)
606 {
607         bool            found;
608         HASHCTL         info;
609         FILE       *file = NULL;
610         FILE       *pfile = NULL;
611         uint32          header;
612         int32           num;
613         int32           pgver;
614         int32           i;
615         int                     plan_size;
616         int                     buffer_size;
617         char       *buffer = NULL;
618
619         if (prev_shmem_startup_hook)
620                 prev_shmem_startup_hook();
621
622         /* reset in case this is a restart within the postmaster */
623         shared_state = NULL;
624         hash_table = NULL;
625
626         /*
627          * Create or attach to the shared memory state, including hash table
628          */
629         LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
630
631         shared_state = ShmemInitStruct("pg_store_plans",
632                                                    sizeof(pgspSharedState),
633                                                    &found);
634
635         if (!found)
636         {
637                 /* First time through ... */
638                 shared_state->lock = &(GetNamedLWLockTranche("pg_store_plans"))->lock;
639                 shared_state->plan_size = max_plan_len;
640                 shared_state->cur_median_usage = ASSUMED_MEDIAN_INIT;
641                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
642                 SpinLockInit(&shared_state->mutex);
643                 shared_state->extent = 0;
644                 shared_state->n_writers = 0;
645                 shared_state->gc_count = 0;
646                 shared_state->stats.dealloc = 0;
647                 shared_state->stats.stats_reset = GetCurrentTimestamp();
648         }
649
650         /* Be sure everyone agrees on the hash table entry size */
651         plan_size = shared_state->plan_size;
652
653         memset(&info, 0, sizeof(info));
654         info.keysize = sizeof(pgspHashKey);
655         info.entrysize = sizeof(pgspEntry);
656         if (plan_storage == PLAN_STORAGE_SHMEM)
657                 info.entrysize += max_plan_len;
658         hash_table = ShmemInitHash("pg_store_plans hash",
659                                                           store_size, store_size,
660                                                           &info, HASH_ELEM |
661                                                           HASH_BLOBS);
662
663         LWLockRelease(AddinShmemInitLock);
664
665         /*
666          * If we're in the postmaster (or a standalone backend...), set up a shmem
667          * exit hook to dump the statistics to disk.
668          */
669         if (!IsUnderPostmaster)
670                 on_shmem_exit(pgsp_shmem_shutdown, (Datum) 0);
671
672         /*
673          * Done if some other process already completed our initialization.
674          */
675         if (found)
676                 return;
677
678         /*
679          * Note: we don't bother with locks here, because there should be no other
680          * processes running when this code is reached.
681          */
682
683         /* Unlink query text file possibly left over from crash */
684         unlink(PGSP_TEXT_FILE);
685
686         if (plan_storage == PLAN_STORAGE_FILE)
687         {
688                 /* Allocate new query text temp file */
689                 pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
690                 if (pfile == NULL)
691                         goto write_error;
692         }
693
694         /*
695          * If we were told not to load old statistics, we're done.  (Note we do
696          * not try to unlink any old dump file in this case.  This seems a bit
697          * questionable but it's the historical behavior.)
698          */
699         if (!dump_on_shutdown)
700         {
701                 if (pfile)
702                         FreeFile(pfile);
703                 return;
704         }
705
706         /*
707          * Attempt to load old statistics from the dump file.
708          */
709         file = AllocateFile(PGSP_DUMP_FILE, PG_BINARY_R);
710         if (file == NULL)
711         {
712                 /* ignore not-found error */
713                 if (errno != ENOENT)
714                         goto read_error;
715
716                 /* No existing persisted stats file, so we're done */
717                 if (pfile)
718                         FreeFile(pfile);
719                 return;
720         }
721
722         buffer_size = plan_size;
723         buffer = (char *) palloc(buffer_size);
724
725         if (fread(&header, sizeof(uint32), 1, file) != 1 ||
726                 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
727                 fread(&num, sizeof(int32), 1, file) != 1)
728                 goto read_error;
729
730         if (header != PGSP_FILE_HEADER ||
731                 pgver != PGSP_PG_MAJOR_VERSION)
732                 goto data_error;
733
734         for (i = 0; i < num; i++)
735         {
736                 pgspEntry       temp;
737                 pgspEntry  *entry;
738                 Size            plan_offset = 0;
739
740                 if (fread(&temp, sizeof(pgspEntry), 1, file) != 1)
741                         goto read_error;
742
743                 /* Encoding is the only field we can easily sanity-check */
744                 if (!PG_VALID_BE_ENCODING(temp.encoding))
745                         goto data_error;
746
747                 /* Previous incarnation might have had a larger plan_size */
748                 if (temp.plan_len >= buffer_size)
749                 {
750                         buffer = (char *) repalloc(buffer, temp.plan_len + 1);
751                         buffer_size = temp.plan_len + 1;
752                 }
753
754                 if (fread(buffer, 1, temp.plan_len + 1, file) != temp.plan_len + 1)
755                         goto read_error;
756
757                 /* Skip loading "sticky" entries */
758                 if (temp.counters.calls == 0)
759                         continue;
760
761                 /* Clip to available length if needed */
762                 if (temp.plan_len >= plan_size)
763                         temp.plan_len = pg_encoding_mbcliplen(temp.encoding,
764                                                                                                    buffer,
765                                                                                                    temp.plan_len,
766                                                                                                    plan_size - 1);
767
768                 buffer[temp.plan_len] = '\0';
769
770                 if (plan_storage == PLAN_STORAGE_FILE)
771                 {
772                         /* Store the plan text */
773                         plan_offset = shared_state->extent;
774                         if (fwrite(buffer, 1, temp.plan_len + 1, pfile) !=
775                                 temp.plan_len + 1)
776                                 goto write_error;
777                         shared_state->extent += temp.plan_len + 1;
778                 }
779
780                 /* make the hashtable entry (discards old entries if too many) */
781                 entry = entry_alloc(&temp.key, plan_offset, temp.plan_len, false);
782
783                 if (plan_storage == PLAN_STORAGE_SHMEM)
784                         memcpy(SHMEM_PLAN_PTR(entry), buffer, temp.plan_len + 1);
785
786                 /* copy in the actual stats */
787                 entry->counters = temp.counters;
788         }
789
790         pfree(buffer);
791         FreeFile(file);
792
793         if (pfile)
794                 FreeFile(pfile);
795
796         /*
797          * Remove the file so it's not included in backups/replication slaves,
798          * etc. A new file will be written on next shutdown.
799          */
800         unlink(PGSP_DUMP_FILE);
801
802         return;
803
804 read_error:
805         ereport(LOG,
806                         (errcode_for_file_access(),
807                          errmsg("could not read file \"%s\": %m",
808                                         PGSP_DUMP_FILE)));
809         goto fail;
810 data_error:
811         ereport(LOG,
812                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
813                          errmsg("ignoring invalid data in file \"%s\"",
814                                         PGSP_DUMP_FILE)));
815         goto fail;
816 write_error:
817         ereport(LOG,
818                         (errcode_for_file_access(),
819                          errmsg("could not write file \"%s\": %m",
820                                         PGSP_TEXT_FILE)));
821 fail:
822         if (buffer)
823                 pfree(buffer);
824         if (file)
825                 FreeFile(file);
826         if (pfile)
827                 FreeFile(pfile);
828         /* If possible, throw away the bogus file; ignore any error */
829         unlink(PGSP_DUMP_FILE);
830
831         /*
832          * Don't unlink PGSP_TEXT_FILE here; it should always be around while the
833          * server is running with pg_stat_statements enabled
834          */
835 }
836
837 /*
838  * shmem_shutdown hook: Dump statistics into file.
839  *
840  * Note: we don't bother with acquiring lock, because there should be no
841  * other processes running when this is called.
842  */
843 static void
844 pgsp_shmem_shutdown(int code, Datum arg)
845 {
846         FILE       *file;
847         char       *pbuffer = NULL;
848         Size            pbuffer_size = 0;
849         HASH_SEQ_STATUS hash_seq;
850         int32           num_entries;
851         pgspEntry  *entry;
852
853         /* Don't try to dump during a crash. */
854         if (code)
855                 return;
856
857         /* Safety check ... shouldn't get here unless shmem is set up. */
858         if (!shared_state || !hash_table)
859                 return;
860
861         /* Don't dump if told not to. */
862         if (!dump_on_shutdown)
863                 return;
864
865         file = AllocateFile(PGSP_DUMP_FILE ".tmp", PG_BINARY_W);
866         if (file == NULL)
867                 goto error;
868
869         if (fwrite(&PGSP_FILE_HEADER, sizeof(uint32), 1, file) != 1)
870                 goto error;
871         if (fwrite(&PGSP_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
872                 goto error;
873         num_entries = hash_get_num_entries(hash_table);
874         if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
875                 goto error;
876
877         if (plan_storage == PLAN_STORAGE_FILE)
878         {
879                 pbuffer = ptext_load_file(&pbuffer_size);
880                 if (pbuffer == NULL)
881                         goto error;
882         }
883
884         hash_seq_init(&hash_seq, hash_table);
885         while ((entry = hash_seq_search(&hash_seq)) != NULL)
886         {
887                 int                     len = entry->plan_len;
888                 char       *pstr;
889
890                 if (plan_storage == PLAN_STORAGE_FILE)
891                         pstr = ptext_fetch(entry->plan_offset, len,
892                                                            pbuffer, pbuffer_size);
893                 else
894                         pstr = SHMEM_PLAN_PTR(entry);
895
896                 if (pstr == NULL)
897                         continue;                       /* Ignore any entries with bogus texts */
898
899                 if (fwrite(entry, sizeof(pgspEntry), 1, file) != 1 ||
900                         fwrite(pstr, 1, len + 1, file) != len + 1)
901                 {
902                         /* note: we assume hash_seq_term won't change errno */
903                         hash_seq_term(&hash_seq);
904                         goto error;
905                 }
906         }
907
908         if (FreeFile(file))
909         {
910                 file = NULL;
911                 goto error;
912         }
913
914         /*
915          * Rename file into place, so we atomically replace the old one.
916          */
917         if (rename(PGSP_DUMP_FILE ".tmp", PGSP_DUMP_FILE) != 0)
918                 ereport(LOG,
919                                 (errcode_for_file_access(),
920                                  errmsg("could not rename pg_store_plans file \"%s\": %m",
921                                                 PGSP_DUMP_FILE ".tmp")));
922
923         /* Unlink query-texts file; it's not needed while shutdown */
924         unlink(PGSP_TEXT_FILE);
925
926         return;
927
928 error:
929         ereport(LOG,
930                         (errcode_for_file_access(),
931                          errmsg("could not write pg_store_plans file \"%s\": %m",
932                                         PGSP_DUMP_FILE ".tmp")));
933         if (file)
934                 FreeFile(file);
935         unlink(PGSP_DUMP_FILE ".tmp");
936 }
937
938
939 /*
940  * ExecutorStart hook: start up tracking if needed
941  */
942 static void
943 pgsp_ExecutorStart(QueryDesc *queryDesc, int eflags)
944 {
945         if (log_analyze &&
946                 (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
947         {
948                 queryDesc->instrument_options |=
949                         (log_timing ? INSTRUMENT_TIMER : 0)|
950                         (log_timing ? 0: INSTRUMENT_ROWS)|
951                         (log_buffers ? INSTRUMENT_BUFFERS : 0);
952         }
953
954         if (prev_ExecutorStart)
955                 prev_ExecutorStart(queryDesc, eflags);
956         else
957                 standard_ExecutorStart(queryDesc, eflags);
958
959         /*
960          * Set up to track total elapsed time in ExecutorRun. Allocate in per-query
961          * context so as to be free at ExecutorEnd.
962          */
963         if (queryDesc->totaltime == NULL &&
964                         pgsp_enabled(queryDesc->plannedstmt->queryId))
965         {
966                 MemoryContext oldcxt;
967
968                 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
969                 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL
970 #if PG_VERSION_NUM >= 140000
971                                                                                   , false
972 #endif
973                                                                                  );
974                 MemoryContextSwitchTo(oldcxt);
975         }
976
977 }
978
979 /*
980  * ExecutorRun hook: all we need do is track nesting depth
981  */
982 static void
983 pgsp_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
984                                  bool execute_once)
985 {
986         nested_level++;
987         PG_TRY();
988         {
989                 if (prev_ExecutorRun)
990                         prev_ExecutorRun(queryDesc, direction, count, execute_once);
991                 else
992                         standard_ExecutorRun(queryDesc, direction, count, execute_once);
993                 nested_level--;
994         }
995         PG_CATCH();
996         {
997                 nested_level--;
998                 PG_RE_THROW();
999         }
1000         PG_END_TRY();
1001 }
1002
1003 /*
1004  * ExecutorFinish hook: all we need do is track nesting depth
1005  */
1006 static void
1007 pgsp_ExecutorFinish(QueryDesc *queryDesc)
1008 {
1009         nested_level++;
1010         PG_TRY();
1011         {
1012                 if (prev_ExecutorFinish)
1013                         prev_ExecutorFinish(queryDesc);
1014                 else
1015                         standard_ExecutorFinish(queryDesc);
1016                 nested_level--;
1017         }
1018         PG_CATCH();
1019         {
1020                 nested_level--;
1021                 PG_RE_THROW();
1022         }
1023         PG_END_TRY();
1024 }
1025
1026 /*
1027  * ExecutorEnd hook: store results if needed
1028  */
1029 static void
1030 pgsp_ExecutorEnd(QueryDesc *queryDesc)
1031 {
1032         if (queryDesc->totaltime)
1033         {
1034                 /*
1035                  * Make sure stats accumulation is done.  (Note: it's okay if several
1036                  * levels of hook all do this.)
1037                  */
1038                 InstrEndLoop(queryDesc->totaltime);
1039
1040                 if (pgsp_enabled(queryDesc->plannedstmt->queryId) &&
1041                         queryDesc->totaltime->total &&
1042                         queryDesc->totaltime->total >=
1043                         (double)min_duration / 1000.0)
1044                 {
1045                         queryid_t         queryid;
1046                         ExplainState *es;
1047                         StringInfo        es_str;
1048
1049                         es = NewExplainState();
1050                         es_str = es->str;
1051
1052                         es->analyze = queryDesc->instrument_options;
1053                         es->verbose = log_verbose;
1054                         es->buffers = (es->analyze && log_buffers);
1055                         es->timing = (es->analyze && log_timing);
1056                         es->format = EXPLAIN_FORMAT_JSON;
1057
1058                         ExplainBeginOutput(es);
1059                         ExplainPrintPlan(es, queryDesc);
1060                         if (log_triggers)
1061                                 pgspExplainTriggers(es, queryDesc);
1062                         ExplainEndOutput(es);
1063
1064                         /* Remove last line break */
1065                         if (es_str->len > 0 && es_str->data[es_str->len - 1] == '\n')
1066                                 es_str->data[--es_str->len] = '\0';
1067
1068                         /* JSON outmost braces. */
1069                         es_str->data[0] = '{';
1070                         es_str->data[es_str->len - 1] = '}';
1071
1072                         queryid = queryDesc->plannedstmt->queryId;
1073 #if PG_VERSION_NUM < 140000
1074                         /*
1075                          * For versions before pg14, a queryid is only available if
1076                          * pg_stat_statements extension (or similar) if configured.  We
1077                          * don't want a hard requirement for such an extension so fallback
1078                          * to an internal queryid calculation in some case.
1079                          * For pg14 and above, core postgres can compute a queryid so we
1080                          * will rely on it.
1081                          */
1082                         if (queryid == PGSP_NO_QUERYID)
1083                                 queryid = (queryid_t) hash_query(queryDesc->sourceText);
1084 #else
1085                         Assert(queryid != PGSP_NO_QUERYID);
1086 #endif
1087
1088                         pgsp_store(es_str->data,
1089                                                 queryid,
1090                                                 queryDesc->totaltime->total * 1000.0,   /* convert to msec */
1091                                                 queryDesc->estate->es_processed,
1092                                                 &queryDesc->totaltime->bufusage);
1093                         pfree(es_str->data);
1094                 }
1095         }
1096
1097         if (prev_ExecutorEnd)
1098                 prev_ExecutorEnd(queryDesc);
1099         else
1100                 standard_ExecutorEnd(queryDesc);
1101 }
1102
1103 /*
1104  * ProcessUtility hook
1105  */
1106 static void
1107 pgsp_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1108 #if PG_VERSION_NUM >= 140000
1109                                         bool readOnlyTree,
1110 #endif
1111                                         ProcessUtilityContext context, ParamListInfo params,
1112                                         QueryEnvironment *queryEnv,
1113                                         DestReceiver *dest, COMPTAG_TYPE *completionTag)
1114 {
1115         int                     tag = nodeTag(pstmt->utilityStmt);
1116         queryid_t       saved_queryId = pstmt->queryId;
1117         bool            reset_force_disabled = false;
1118
1119         if (pgsp_enabled(saved_queryId) &&
1120                 (tag == T_CreateExtensionStmt || tag == T_AlterExtensionStmt) &&
1121                 !force_disabled && track_level < TRACK_LEVEL_VERBOSE)
1122         {
1123                 force_disabled = true;
1124                 reset_force_disabled = true;
1125         }
1126
1127         PG_TRY();
1128         {
1129                 if (prev_ProcessUtility)
1130                 {
1131                         prev_ProcessUtility(pstmt, queryString,
1132 #if PG_VERSION_NUM >= 140000
1133                                                                 readOnlyTree,
1134 #endif
1135                                                                 context, params, queryEnv,
1136                                                                 dest, completionTag);
1137                 }
1138                 else
1139                         standard_ProcessUtility(pstmt, queryString,
1140 #if PG_VERSION_NUM >= 140000
1141                                                                         readOnlyTree,
1142 #endif
1143                                                                         context, params, queryEnv,
1144                                                                         dest, completionTag);
1145
1146                 if (reset_force_disabled)
1147                         force_disabled = false;
1148         }
1149         PG_CATCH();
1150         {
1151                 if (reset_force_disabled)
1152                         force_disabled = false;
1153                 PG_RE_THROW();
1154         }
1155         PG_END_TRY();
1156 }
1157
1158 /*
1159  * hash_query: calculate internal query ID for a query
1160  *
1161  *  As of PG11, Query.queryId has been widen to 64 bit to reduce collision of
1162  *  queries to practical level. On the other hand pg_store_plans uses the
1163  *  combination of query hash and plan hash values as the hash table key and
1164  *  the resolution of the hash value effectively has the same degree so we
1165  *  continue to use uint32 as internal queryid.
1166  *
1167  *  This may merge plans from different queries into single internal query id
1168  *  but it is not a problem when pg_stat_statements is used together since the
1169  *  extension gives enough resolution on queries.
1170  */
1171 static uint32
1172 hash_query(const char* query)
1173 {
1174         uint32 queryid;
1175
1176         char *normquery = pstrdup(query);
1177         normalize_expr(normquery, false);
1178         queryid = hash_any((const unsigned char*)normquery, strlen(normquery));
1179         pfree(normquery);
1180
1181         /* If we are unlucky enough to get a hash of zero, use 1 instead */
1182         if (queryid == 0)
1183                 queryid = 1;
1184
1185         return queryid;
1186 }
1187
1188
1189 /*
1190  * Store some statistics for a plan.
1191  *
1192  * Table entry is keyed with userid.dbid.queryId.planId. planId is the hash
1193  * value of the given plan, which is calculated in ths function.
1194  */
1195 static void
1196 pgsp_store(char *plan, queryid_t queryId,
1197                    double total_time, uint64 rows,
1198                    const BufferUsage *bufusage)
1199 {
1200         pgspHashKey key;
1201         pgspEntry  *entry;
1202         char       *norm_query = NULL;
1203         int             plan_len;
1204         char       *normalized_plan = NULL;
1205         char       *shorten_plan = NULL;
1206         volatile pgspEntry *e;
1207         Size            plan_offset = 0;
1208         bool            do_gc = false;
1209
1210         Assert(plan != NULL && queryId != PGSP_NO_QUERYID);
1211
1212         /* Safety check... */
1213         if (!shared_state || !hash_table)
1214                 return;
1215
1216         /* Set up key for hashtable search */
1217         key.userid = GetUserId();
1218         key.dbid = MyDatabaseId;
1219         key.queryid = queryId;
1220
1221         normalized_plan = pgsp_json_normalize(plan);
1222         shorten_plan = pgsp_json_shorten(plan);
1223         elog(DEBUG3, "pg_store_plans: Normalized plan: %s", normalized_plan);
1224         elog(DEBUG3, "pg_store_plans: Shorten plan: %s", shorten_plan);
1225         elog(DEBUG3, "pg_store_plans: Original plan: %s", plan);
1226         plan_len = strlen(shorten_plan);
1227
1228         key.planid = hash_any((const unsigned char *)normalized_plan,
1229                                                   strlen(normalized_plan));
1230         pfree(normalized_plan);
1231
1232         if (plan_len >= shared_state->plan_size)
1233                 plan_len = pg_encoding_mbcliplen(GetDatabaseEncoding(),
1234                                                                                  shorten_plan,
1235                                                                                  plan_len,
1236                                                                                  shared_state->plan_size - 1);
1237
1238
1239         /* Look up the hash table entry with shared lock. */
1240         LWLockAcquire(shared_state->lock, LW_SHARED);
1241
1242         entry = (pgspEntry *) hash_search(hash_table, &key, HASH_FIND, NULL);
1243
1244         /* Store the plan text, if the entry not present */
1245         if (!entry && plan_storage == PLAN_STORAGE_FILE)
1246         {
1247                 int             gc_count;
1248                 bool    stored;
1249
1250                 /* Append new plan text to file with only shared lock held */
1251                 stored = ptext_store(shorten_plan, plan_len, &plan_offset, &gc_count);
1252
1253                 /*
1254                  * Determine whether we need to garbage collect external query texts
1255                  * while the shared lock is still held.  This micro-optimization
1256                  * avoids taking the time to decide this while holding exclusive lock.
1257                  */
1258                 do_gc = need_gc_ptexts();
1259
1260                 /* Acquire exclusive lock as required by entry_alloc() */
1261                 LWLockRelease(shared_state->lock);
1262                 LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
1263
1264                 /*
1265                  * A garbage collection may have occurred while we weren't holding the
1266                  * lock.  In the unlikely event that this happens, the plan text we
1267                  * stored above will have been garbage collected, so write it again.
1268                  * This should be infrequent enough that doing it while holding
1269                  * exclusive lock isn't a performance problem.
1270                  */
1271                 if (!stored || shared_state->gc_count != gc_count)
1272                         stored = ptext_store(shorten_plan, plan_len, &plan_offset, NULL);
1273
1274                 /* If we failed to write to the text file, give up */
1275                 if (!stored)
1276                         goto done;
1277
1278         }
1279
1280         /* Create new entry, if not present */
1281         if (!entry)
1282         {
1283                 entry = entry_alloc(&key, plan_offset, plan_len, false);
1284
1285                 /* shorten_plan is terminated by NUL */
1286                 if (plan_storage == PLAN_STORAGE_SHMEM)
1287                         memcpy(SHMEM_PLAN_PTR(entry), shorten_plan, plan_len + 1);
1288                         
1289
1290                 /* If needed, perform garbage collection while exclusive lock held */
1291                 if (do_gc)
1292                         gc_ptexts();
1293         }
1294
1295         /* Increment the counts, except when jstate is not NULL */
1296
1297         /*
1298          * Grab the spinlock while updating the counters (see comment about
1299          * locking rules at the head of the file)
1300          */
1301
1302         e = (volatile pgspEntry *) entry;
1303         SpinLockAcquire(&e->mutex);
1304
1305         /* "Unstick" entry if it was previously sticky */
1306         if (e->counters.calls == 0)
1307         {
1308                 e->counters.usage = USAGE_INIT;
1309                 e->counters.first_call = GetCurrentTimestamp();
1310         }
1311
1312         e->counters.calls += 1;
1313         e->counters.total_time += total_time;
1314         if (e->counters.calls == 1)
1315         {
1316                 e->counters.min_time = total_time;
1317                 e->counters.max_time = total_time;
1318                 e->counters.mean_time = total_time;
1319         }
1320         else
1321         {
1322                 /*
1323                  * Welford's method for accurately computing variance. See
1324                  * <http://www.johndcook.com/blog/standard_deviation/>
1325                  */
1326                 double          old_mean = e->counters.mean_time;
1327
1328                 e->counters.mean_time +=
1329                         (total_time - old_mean) / e->counters.calls;
1330                 e->counters.sum_var_time +=
1331                         (total_time - old_mean) * (total_time - e->counters.mean_time);
1332
1333                 /* calculate min and max time */
1334                 if (e->counters.min_time > total_time)
1335                         e->counters.min_time = total_time;
1336                 if (e->counters.max_time < total_time)
1337                         e->counters.max_time = total_time;
1338         }
1339
1340         e->counters.rows += rows;
1341         e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1342         e->counters.shared_blks_read += bufusage->shared_blks_read;
1343         e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1344         e->counters.shared_blks_written += bufusage->shared_blks_written;
1345         e->counters.local_blks_hit += bufusage->local_blks_hit;
1346         e->counters.local_blks_read += bufusage->local_blks_read;
1347         e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1348         e->counters.local_blks_written += bufusage->local_blks_written;
1349         e->counters.temp_blks_read += bufusage->temp_blks_read;
1350         e->counters.temp_blks_written += bufusage->temp_blks_written;
1351         e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1352         e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1353         e->counters.last_call = GetCurrentTimestamp();
1354         e->counters.usage += USAGE_EXEC(total_time);
1355
1356         SpinLockRelease(&e->mutex);
1357
1358 done:
1359         LWLockRelease(shared_state->lock);
1360
1361         /* We postpone this pfree until we're out of the lock */
1362         if (norm_query)
1363                 pfree(norm_query);
1364 }
1365
1366 /*
1367  * Reset all statement statistics.
1368  */
1369 Datum
1370 pg_store_plans_reset(PG_FUNCTION_ARGS)
1371 {
1372         if (!shared_state || !hash_table)
1373                 ereport(ERROR,
1374                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1375                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1376         entry_reset();
1377         PG_RETURN_VOID();
1378 }
1379
1380 /* Number of output arguments (columns) for various API versions */
1381 #define PG_STORE_PLANS_COLS_V1_5        27
1382 #define PG_STORE_PLANS_COLS_V1_6        26
1383 #define PG_STORE_PLANS_COLS                     27      /* maximum of above */
1384
1385 /*
1386  * Retrieve statement statistics.
1387  */
1388 Datum
1389 pg_store_plans_1_6(PG_FUNCTION_ARGS)
1390 {
1391         pg_store_plans_internal(fcinfo, PGSP_V1_6);
1392
1393         return (Datum) 0;
1394 }
1395
1396 Datum
1397 pg_store_plans(PG_FUNCTION_ARGS)
1398 {
1399         pg_store_plans_internal(fcinfo, PGSP_V1_5);
1400
1401         return (Datum) 0;
1402 }
1403
1404 static void
1405 pg_store_plans_internal(FunctionCallInfo fcinfo,
1406                                                 pgspVersion api_version)
1407 {
1408         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1409         TupleDesc       tupdesc;
1410         Tuplestorestate *tupstore;
1411         MemoryContext per_query_ctx;
1412         MemoryContext oldcontext;
1413         Oid                     userid = GetUserId();
1414         bool            is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
1415         int                     n_writers;
1416         char       *pbuffer = NULL;
1417         Size            pbuffer_size = 0;
1418         Size            extent = 0;
1419         int                     gc_count = 0;
1420         HASH_SEQ_STATUS hash_seq;
1421         pgspEntry  *entry;
1422
1423         if (!shared_state || !hash_table)
1424                 ereport(ERROR,
1425                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1426                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1427
1428         /* check to see if caller supports us returning a tuplestore */
1429         if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1430                 ereport(ERROR,
1431                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1432                                  errmsg("set-valued function called in context that cannot accept a set")));
1433         if (!(rsinfo->allowedModes & SFRM_Materialize))
1434                 ereport(ERROR,
1435                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1436                                  errmsg("materialize mode required, but it is not " \
1437                                                 "allowed in this context")));
1438
1439         /* Build a tuple descriptor for our result type */
1440         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1441                 elog(ERROR, "return type must be a row type");
1442
1443         per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1444         oldcontext = MemoryContextSwitchTo(per_query_ctx);
1445
1446         tupstore = tuplestore_begin_heap(true, false, work_mem);
1447         rsinfo->returnMode = SFRM_Materialize;
1448         rsinfo->setResult = tupstore;
1449         rsinfo->setDesc = tupdesc;
1450
1451         MemoryContextSwitchTo(oldcontext);
1452
1453         /*
1454          * We'd like to load the plan text file (if needed) while not holding any
1455          * lock on shared_state->lock.  In the worst case we'll have to do this
1456          * again after we have the lock, but it's unlikely enough to make this a
1457          * win despite occasional duplicated work.  We need to reload if anybody
1458          * writes to the file (either a retail ptext_store(), or a garbage
1459          * collection) between this point and where we've gotten shared lock.  If a
1460          * ptext_store is actually in progress when we look, we might as well skip
1461          * the speculative load entirely.
1462          */
1463
1464         /* Take the mutex so we can examine variables */
1465         {
1466                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1467
1468                 SpinLockAcquire(&s->mutex);
1469                 extent = s->extent;
1470                 n_writers = s->n_writers;
1471                 gc_count = s->gc_count;
1472                 SpinLockRelease(&s->mutex);
1473         }
1474
1475         /* No point in loading file now if there are active writers */
1476         if (n_writers == 0 && plan_storage == PLAN_STORAGE_FILE)
1477                 pbuffer = ptext_load_file(&pbuffer_size);
1478
1479         /*
1480          * Get shared lock, load or reload the plan text file if we must, and
1481          * iterate over the hashtable entries.
1482          *
1483          * With a large hash table, we might be holding the lock rather longer
1484          * than one could wish.  However, this only blocks creation of new hash
1485          * table entries, and the larger the hash table the less likely that is to
1486          * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
1487          * we need to partition the hash table to limit the time spent holding any
1488          * one lock.
1489          */
1490         LWLockAcquire(shared_state->lock, LW_SHARED);
1491
1492         /*
1493          * Here it is safe to examine extent and gc_count without taking the mutex.
1494          * Note that although other processes might change shared_state->extent
1495          * just after we look at it, the strings they then write into the file
1496          * cannot yet be referenced in the hashtable, so we don't care whether we
1497          * see them or not.
1498          *
1499          * If ptext_load_file fails, we just press on; we'll return NULL for every
1500          * plan text.
1501          */
1502         if (plan_storage == PLAN_STORAGE_FILE &&
1503                 (pbuffer == NULL ||
1504                  shared_state->extent != extent ||
1505                  shared_state->gc_count != gc_count))
1506         {
1507                 if (pbuffer)
1508                         free(pbuffer);
1509                 pbuffer = ptext_load_file(&pbuffer_size);
1510         }
1511
1512         hash_seq_init(&hash_seq, hash_table);
1513         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1514         {
1515                 Datum           values[PG_STORE_PLANS_COLS];
1516                 bool            nulls[PG_STORE_PLANS_COLS];
1517                 int                     i = 0;
1518                 int64           queryid      = entry->key.queryid;
1519                 int64           planid       = entry->key.planid;
1520                 Counters        tmp;
1521                 double          stddev;
1522
1523                 memset(values, 0, sizeof(values));
1524                 memset(nulls, 0, sizeof(nulls));
1525
1526                 values[i++] = ObjectIdGetDatum(entry->key.userid);
1527                 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1528                 if (is_allowed_role || entry->key.userid == userid)
1529                 {
1530                         values[i++] = Int64GetDatumFast(queryid);
1531                         values[i++] = Int64GetDatumFast(planid);
1532
1533                         /* fill queryid_stat_statements with the same value with queryid */
1534                         if (api_version == PGSP_V1_5)
1535                                 values[i++] = Int64GetDatumFast(queryid);
1536                 }
1537                 else
1538                 {
1539                         nulls[i++] = true;      /* queryid */
1540                         nulls[i++] = true;      /* planid */
1541
1542                         /* queryid_stat_statemetns*/
1543                         if (api_version == PGSP_V1_5)
1544                                 nulls[i++] = true;
1545                 }
1546
1547                 if (is_allowed_role || entry->key.userid == userid)
1548                 {
1549                         char       *pstr; /* Plan string */
1550                         char       *mstr; /* Modified plan string */
1551                         char       *estr; /* Encoded modified plan string */
1552
1553                         if (plan_storage == PLAN_STORAGE_FILE)
1554                                 pstr = ptext_fetch(entry->plan_offset, entry->plan_len,
1555                                                                    pbuffer, pbuffer_size);
1556                         else
1557                                 pstr = SHMEM_PLAN_PTR(entry);
1558
1559                         switch (plan_format)
1560                         {
1561                                 case PLAN_FORMAT_TEXT:
1562                                         mstr = pgsp_json_textize(pstr);
1563                                         break;
1564                                 case PLAN_FORMAT_JSON:
1565                                         mstr = pgsp_json_inflate(pstr);
1566                                         break;
1567                                 case PLAN_FORMAT_YAML:
1568                                         mstr = pgsp_json_yamlize(pstr);
1569                                         break;
1570                                 case PLAN_FORMAT_XML:
1571                                         mstr = pgsp_json_xmlize(pstr);
1572                                         break;
1573                                 default:
1574                                         mstr = pstr;
1575                                         break;
1576                         }
1577
1578                         estr = (char *)
1579                                 pg_do_encoding_conversion((unsigned char *) mstr,
1580                                                                                   strlen(mstr),
1581                                                                                   entry->encoding,
1582                                                                                   GetDatabaseEncoding());
1583                         values[i++] = CStringGetTextDatum(estr);
1584
1585                         if (estr != mstr)
1586                                 pfree(estr);
1587
1588                         if (mstr != pstr)
1589                                 pfree(mstr);
1590
1591                         /* pstr is a pointer onto pbuffer */
1592                 }
1593                 else
1594                         values[i++] = CStringGetTextDatum("<insufficient privilege>");
1595
1596                 /* copy counters to a local variable to keep locking time short */
1597                 {
1598                         volatile pgspEntry *e = (volatile pgspEntry *) entry;
1599
1600                         SpinLockAcquire(&e->mutex);
1601                         tmp = e->counters;
1602                         SpinLockRelease(&e->mutex);
1603                 }
1604
1605                 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1606                 if (tmp.calls == 0)
1607                         continue;
1608
1609                 values[i++] = Int64GetDatumFast(tmp.calls);
1610                 values[i++] = Float8GetDatumFast(tmp.total_time);
1611                 values[i++] = Float8GetDatumFast(tmp.min_time);
1612                 values[i++] = Float8GetDatumFast(tmp.max_time);
1613                 values[i++] = Float8GetDatumFast(tmp.mean_time);
1614
1615                 /*
1616                  * Note we are calculating the population variance here, not the
1617                  * sample variance, as we have data for the whole population, so
1618                  * Bessel's correction is not used, and we don't divide by
1619                  * tmp.calls - 1.
1620                  */
1621                 if (tmp.calls > 1)
1622                         stddev = sqrt(tmp.sum_var_time / tmp.calls);
1623                 else
1624                         stddev = 0.0;
1625                 values[i++] = Float8GetDatumFast(stddev);
1626
1627                 values[i++] = Int64GetDatumFast(tmp.rows);
1628                 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1629                 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1630                 values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1631                 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1632                 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1633                 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1634                 values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1635                 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1636                 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1637                 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1638                 values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1639                 values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1640                 values[i++] = TimestampTzGetDatum(tmp.first_call);
1641                 values[i++] = TimestampTzGetDatum(tmp.last_call);
1642
1643                 Assert(i == (api_version == PGSP_V1_5 ? PG_STORE_PLANS_COLS_V1_5 :
1644                                          api_version == PGSP_V1_6 ? PG_STORE_PLANS_COLS_V1_6 :
1645                                          -1 /* fail if you forget to update this assert */ ));
1646
1647                 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1648         }
1649
1650         LWLockRelease(shared_state->lock);
1651
1652         /* clean up and return the tuplestore */
1653         tuplestore_donestoring(tupstore);
1654 }
1655
1656 /* Number of output arguments (columns) for pg_stat_statements_info */
1657 #define PG_STORE_PLANS_INFO_COLS        2
1658
1659 /*
1660  * Return statistics of pg_stat_statements.
1661  */
1662 Datum
1663 pg_store_plans_info(PG_FUNCTION_ARGS)
1664 {
1665         pgspGlobalStats stats;
1666         TupleDesc       tupdesc;
1667         Datum           values[PG_STORE_PLANS_INFO_COLS];
1668         bool            nulls[PG_STORE_PLANS_INFO_COLS];
1669
1670         if (!shared_state || !hash_table)
1671                 ereport(ERROR,
1672                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1673                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
1674
1675         /* Build a tuple descriptor for our result type */
1676         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1677                 elog(ERROR, "return type must be a row type");
1678
1679         MemSet(values, 0, sizeof(values));
1680         MemSet(nulls, 0, sizeof(nulls));
1681
1682         /* Read global statistics for pg_stat_statements */
1683         {
1684                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1685
1686                 SpinLockAcquire(&s->mutex);
1687                 stats = s->stats;
1688                 SpinLockRelease(&s->mutex);
1689         }
1690
1691         values[0] = Int64GetDatum(stats.dealloc);
1692         values[1] = TimestampTzGetDatum(stats.stats_reset);
1693
1694         PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1695 }
1696
1697 /*
1698  * Estimate shared memory space needed.
1699  */
1700 static Size
1701 shared_mem_size(void)
1702 {
1703         Size            size;
1704         int                     entry_size;
1705
1706         size = MAXALIGN(sizeof(pgspSharedState));
1707         entry_size = sizeof(pgspEntry);
1708
1709         /* plan text is apppended to the struct body */
1710         if (plan_storage == PLAN_STORAGE_SHMEM)
1711                 entry_size += max_plan_len;
1712
1713         size = add_size(size, hash_estimate_size(store_size, entry_size));
1714
1715         return size;
1716 }
1717
1718 /*
1719  * Allocate a new hashtable entry.
1720  * caller must hold an exclusive lock on shared_state->lock
1721  *
1722  * "plan" need not be null-terminated; we rely on plan_len instead
1723  *
1724  * If "sticky" is true, make the new entry artificially sticky so that it will
1725  * probably still be there when the query finishes execution.  We do this by
1726  * giving it a median usage value rather than the normal value.  (Strictly
1727  * speaking, query strings are normalized on a best effort basis, though it
1728  * would be difficult to demonstrate this even under artificial conditions.)
1729  *
1730  * Note: despite needing exclusive lock, it's not an error for the target
1731  * entry to already exist.      This is because pgsp_store releases and
1732  * reacquires lock after failing to find a match; so someone else could
1733  * have made the entry while we waited to get exclusive lock.
1734  */
1735 static pgspEntry *
1736 entry_alloc(pgspHashKey *key, Size plan_offset, int plan_len, bool sticky)
1737 {
1738         pgspEntry  *entry;
1739         bool            found;
1740
1741         /* Make space if needed */
1742         while (hash_get_num_entries(hash_table) >= store_size)
1743                 entry_dealloc();
1744
1745         /* Find or create an entry with desired hash code */
1746         entry = (pgspEntry *) hash_search(hash_table, key, HASH_ENTER, &found);
1747
1748         if (!found)
1749         {
1750                 /* New entry, initialize it */
1751
1752                 /* reset the statistics */
1753                 memset(&entry->counters, 0, sizeof(Counters));
1754                 /* set the appropriate initial usage count */
1755                 entry->counters.usage = sticky ? shared_state->cur_median_usage : USAGE_INIT;
1756                 /* re-initialize the mutex each time ... we assume no one using it */
1757                 SpinLockInit(&entry->mutex);
1758                 /* ... and don't forget the query text */
1759                 Assert(plan_len >= 0 && plan_len < shared_state->plan_size);
1760                 entry->plan_offset = plan_offset;
1761                 entry->plan_len = plan_len;
1762                 entry->encoding = GetDatabaseEncoding();
1763         }
1764
1765         return entry;
1766 }
1767
1768 /*
1769  * qsort comparator for sorting into increasing usage order
1770  */
1771 static int
1772 entry_cmp(const void *lhs, const void *rhs)
1773 {
1774         double          l_usage = (*(pgspEntry *const *) lhs)->counters.usage;
1775         double          r_usage = (*(pgspEntry *const *) rhs)->counters.usage;
1776
1777         if (l_usage < r_usage)
1778                 return -1;
1779         else if (l_usage > r_usage)
1780                 return +1;
1781         else
1782                 return 0;
1783 }
1784
1785 /*
1786  * Deallocate least used entries.
1787  * Caller must hold an exclusive lock on shared_state->lock.
1788  */
1789 static void
1790 entry_dealloc(void)
1791 {
1792         HASH_SEQ_STATUS hash_seq;
1793         pgspEntry **entries;
1794         pgspEntry  *entry;
1795         int                     nvictims;
1796         int                     i;
1797         Size            tottextlen;
1798         int                     nvalidtexts;
1799
1800         /*
1801          * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1802          * While we're scanning the table, apply the decay factor to the usage
1803          * values.
1804          */
1805
1806         entries = palloc(hash_get_num_entries(hash_table) * sizeof(pgspEntry *));
1807
1808         i = 0;
1809         tottextlen = 0;
1810         nvalidtexts = 0;
1811
1812         hash_seq_init(&hash_seq, hash_table);
1813         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1814         {
1815                 entries[i++] = entry;
1816                 /* "Sticky" entries get a different usage decay rate. */
1817                 if (entry->counters.calls == 0)
1818                         entry->counters.usage *= STICKY_DECREASE_FACTOR;
1819                 else
1820                         entry->counters.usage *= USAGE_DECREASE_FACTOR;
1821
1822                 /* In the mean length computation, ignore dropped texts. */
1823                 if (entry->plan_len >= 0)
1824                 {
1825                         tottextlen += entry->plan_len + 1;
1826                         nvalidtexts++;
1827                 }
1828         }
1829
1830         qsort(entries, i, sizeof(pgspEntry *), entry_cmp);
1831
1832         /* Also, record the (approximate) median usage */
1833         if (i > 0)
1834                 shared_state->cur_median_usage = entries[i / 2]->counters.usage;
1835         /* Record the mean plan length */
1836         if (nvalidtexts > 0)
1837                 shared_state->mean_plan_len = tottextlen / nvalidtexts;
1838         else
1839                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
1840
1841         nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1842         nvictims = Min(nvictims, i);
1843
1844         for (i = 0; i < nvictims; i++)
1845         {
1846                 hash_search(hash_table, &entries[i]->key, HASH_REMOVE, NULL);
1847         }
1848
1849         pfree(entries);
1850
1851         /* Increment the number of times entries are deallocated */
1852         {
1853                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1854
1855                 SpinLockAcquire(&s->mutex);
1856                 s->stats.dealloc += 1;
1857                 SpinLockRelease(&s->mutex);
1858         }
1859 }
1860
1861 /*
1862  * Given a plan string (not necessarily null-terminated), allocate a new
1863  * entry in the external plan text file and store the string there.
1864  *
1865  * If successful, returns true, and stores the new entry's offset in the file
1866  * into *plan_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
1867  * number of garbage collections that have occurred so far.
1868  *
1869  * On failure, returns false.
1870  *
1871  * At least a shared lock on shared_state->lock must be held by the caller, so
1872  * as to prevent a concurrent garbage collection.  Share-lock-holding callers
1873  * should pass a gc_count pointer to obtain the number of garbage collections,
1874  * so that they can recheck the count after obtaining exclusive lock to detect
1875  * whether a garbage collection occurred (and removed this entry).
1876  */
1877 static bool
1878 ptext_store(const char *plan, int plan_len, Size *plan_offset, int *gc_count)
1879 {
1880         Size            off;
1881         int                     fd;
1882
1883         Assert (plan_storage == PLAN_STORAGE_FILE);
1884
1885         /*
1886          * We use a spinlock to protect extent/n_writers/gc_count, so that
1887          * multiple processes may execute this function concurrently.
1888          */
1889         {
1890                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1891
1892                 SpinLockAcquire(&s->mutex);
1893                 off = s->extent;
1894                 s->extent += plan_len + 1;
1895                 s->n_writers++;
1896                 if (gc_count)
1897                         *gc_count = s->gc_count;
1898                 SpinLockRelease(&s->mutex);
1899         }
1900
1901         *plan_offset = off;
1902
1903         /* Now write the data into the successfully-reserved part of the file */
1904         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
1905         if (fd < 0)
1906                 goto error;
1907
1908         if (pg_pwrite(fd, plan, plan_len, off) != plan_len)
1909                 goto error;
1910         if (pg_pwrite(fd, "\0", 1, off + plan_len) != 1)
1911                 goto error;
1912
1913         CloseTransientFile(fd);
1914
1915         /* Mark our write complete */
1916         {
1917                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1918
1919                 SpinLockAcquire(&s->mutex);
1920                 s->n_writers--;
1921                 SpinLockRelease(&s->mutex);
1922         }
1923
1924         return true;
1925
1926 error:
1927         ereport(LOG,
1928                         (errcode_for_file_access(),
1929                          errmsg("could not write file \"%s\": %m",
1930                                         PGSP_TEXT_FILE)));
1931
1932         if (fd >= 0)
1933                 CloseTransientFile(fd);
1934
1935         /* Mark our write complete */
1936         {
1937                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
1938
1939                 SpinLockAcquire(&s->mutex);
1940                 s->n_writers--;
1941                 SpinLockRelease(&s->mutex);
1942         }
1943
1944         return false;
1945 }
1946
1947 /*
1948  * Read the external plan text file into a malloc'd buffer.
1949  *
1950  * Returns NULL (without throwing an error) if unable to read, eg
1951  * file not there or insufficient memory.
1952  *
1953  * On success, the buffer size is also returned into *buffer_size.
1954  *
1955  * This can be called without any lock on shared_state->lock, but in that case
1956  * the caller is responsible for verifying that the result is sane.
1957  */
1958 static char *
1959 ptext_load_file(Size *buffer_size)
1960 {
1961         char       *buf;
1962         int                     fd;
1963         struct stat stat;
1964         Size            nread;
1965
1966         Assert (plan_storage == PLAN_STORAGE_FILE);
1967
1968         fd = OpenTransientFile(PGSP_TEXT_FILE, O_RDONLY | PG_BINARY);
1969         if (fd < 0)
1970         {
1971                 if (errno != ENOENT)
1972                         ereport(LOG,
1973                                         (errcode_for_file_access(),
1974                                          errmsg("could not read file \"%s\": %m",
1975                                                         PGSP_TEXT_FILE)));
1976                 return NULL;
1977         }
1978
1979         /* Get file length */
1980         if (fstat(fd, &stat))
1981         {
1982                 ereport(LOG,
1983                                 (errcode_for_file_access(),
1984                                  errmsg("could not stat file \"%s\": %m",
1985                                                 PGSP_TEXT_FILE)));
1986                 CloseTransientFile(fd);
1987                 return NULL;
1988         }
1989
1990         /* Allocate buffer; beware that off_t might be wider than size_t */
1991         if (stat.st_size <= MaxAllocHugeSize)
1992                 buf = (char *) malloc(stat.st_size);
1993         else
1994                 buf = NULL;
1995         if (buf == NULL)
1996         {
1997                 ereport(LOG,
1998                                 (errcode(ERRCODE_OUT_OF_MEMORY),
1999                                  errmsg("out of memory"),
2000                                  errdetail("Could not allocate enough memory to read file \"%s\".",
2001                                                    PGSP_TEXT_FILE)));
2002                 CloseTransientFile(fd);
2003                 return NULL;
2004         }
2005
2006         /*
2007          * OK, slurp in the file.  Windows fails if we try to read more than
2008          * INT_MAX bytes at once, and other platforms might not like that either,
2009          * so read a very large file in 1GB segments.
2010          */
2011         nread = 0;
2012         while (nread < stat.st_size)
2013         {
2014                 int                     toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2015
2016                 /*
2017                  * If we get a short read and errno doesn't get set, the reason is
2018                  * probably that garbage collection truncated the file since we did
2019                  * the fstat(), so we don't log a complaint --- but we don't return
2020                  * the data, either, since it's most likely corrupt due to concurrent
2021                  * writes from garbage collection.
2022                  */
2023                 errno = 0;
2024                 if (read(fd, buf + nread, toread) != toread)
2025                 {
2026                         if (errno)
2027                                 ereport(LOG,
2028                                                 (errcode_for_file_access(),
2029                                                  errmsg("could not read file \"%s\": %m",
2030                                                                 PGSP_TEXT_FILE)));
2031                         free(buf);
2032                         CloseTransientFile(fd);
2033                         return NULL;
2034                 }
2035                 nread += toread;
2036         }
2037
2038         if (CloseTransientFile(fd) != 0)
2039                 ereport(LOG,
2040                                 (errcode_for_file_access(),
2041                                  errmsg("could not close file \"%s\": %m", PGSP_TEXT_FILE)));
2042
2043         *buffer_size = nread;
2044         return buf;
2045 }
2046
2047 /*
2048  * Locate a plan text in the file image previously read by ptext_load_file().
2049  *
2050  * We validate the given offset/length, and return NULL if bogus.  Otherwise,
2051  * the result points to a null-terminated string within the buffer.
2052  */
2053 static char *
2054 ptext_fetch(Size plan_offset, int plan_len,
2055                         char *buffer, Size buffer_size)
2056 {
2057         Assert (plan_storage == PLAN_STORAGE_FILE);
2058
2059         /* File read failed? */
2060         if (buffer == NULL)
2061                 return NULL;
2062         /* Bogus offset/length? */
2063         if (plan_len < 0 ||
2064                 plan_offset + plan_len >= buffer_size)
2065                 return NULL;
2066         /* As a further sanity check, make sure there's a trailing null */
2067         if (buffer[plan_offset + plan_len] != '\0')
2068                 return NULL;
2069         /* Looks OK */
2070         return buffer + plan_offset;
2071 }
2072
2073 /*
2074  * Do we need to garbage-collect the external plan text file?
2075  *
2076  * Caller should hold at least a shared lock on shared_state->lock.
2077  */
2078 static bool
2079 need_gc_ptexts(void)
2080 {
2081         Size            extent;
2082
2083         Assert (plan_storage == PLAN_STORAGE_FILE);
2084
2085         /* Read shared extent pointer */
2086         {
2087                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2088
2089                 SpinLockAcquire(&s->mutex);
2090                 extent = s->extent;
2091                 SpinLockRelease(&s->mutex);
2092         }
2093
2094         /* Don't proceed if file does not exceed 512 bytes per possible entry */
2095         if (extent < 512 * store_size)
2096                 return false;
2097
2098         /*
2099          * Don't proceed if file is less than about 50% bloat.  Nothing can or
2100          * should be done in the event of unusually large query texts accounting
2101          * for file's large size.  We go to the trouble of maintaining the mean
2102          * query length in order to prevent garbage collection from thrashing
2103          * uselessly.
2104          */
2105         if (extent < shared_state->mean_plan_len * store_size * 2)
2106                 return false;
2107
2108         return true;
2109 }
2110
2111 /*
2112  * Garbage-collect orphaned plan texts in external file.
2113  *
2114  * This won't be called often in the typical case, since it's likely that
2115  * there won't be too much churn, and besides, a similar compaction process
2116  * occurs when serializing to disk at shutdown or as part of resetting.
2117  * Despite this, it seems prudent to plan for the edge case where the file
2118  * becomes unreasonably large, with no other method of compaction likely to
2119  * occur in the foreseeable future.
2120  *
2121  * The caller must hold an exclusive lock on shared_state->lock.
2122  *
2123  * At the first sign of trouble we unlink the query text file to get a clean
2124  * slate (although existing statistics are retained), rather than risk
2125  * thrashing by allowing the same problem case to recur indefinitely.
2126  */
2127 static void
2128 gc_ptexts(void)
2129 {
2130         char       *pbuffer;
2131         Size            pbuffer_size;
2132         FILE       *pfile = NULL;
2133         HASH_SEQ_STATUS hash_seq;
2134         pgspEntry  *entry;
2135         Size            extent;
2136         int                     nentries;
2137
2138         Assert (plan_storage == PLAN_STORAGE_FILE);
2139
2140         /*
2141          * When called from store_entry, some other session might have proceeded
2142          * with garbage collection in the no-lock-held interim of lock strength
2143          * escalation.  Check once more that this is actually necessary.
2144          */
2145         if (!need_gc_ptexts())
2146                 return;
2147
2148         /*
2149          * Load the old texts file.  If we fail (out of memory, for instance),
2150          * invalidate query texts.  Hopefully this is rare.  It might seem better
2151          * to leave things alone on an OOM failure, but the problem is that the
2152          * file is only going to get bigger; hoping for a future non-OOM result is
2153          * risky and can easily lead to complete denial of service.
2154          */
2155         pbuffer = ptext_load_file(&pbuffer_size);
2156         if (pbuffer == NULL)
2157                 goto gc_fail;
2158
2159         /*
2160          * We overwrite the plan texts file in place, so as to reduce the risk of
2161          * an out-of-disk-space failure.  Since the file is guaranteed not to get
2162          * larger, this should always work on traditional filesystems; though we
2163          * could still lose on copy-on-write filesystems.
2164          */
2165         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2166         if (pfile == NULL)
2167         {
2168                 ereport(LOG,
2169                                 (errcode_for_file_access(),
2170                                  errmsg("could not write file \"%s\": %m",
2171                                                 PGSP_TEXT_FILE)));
2172                 goto gc_fail;
2173         }
2174
2175         extent = 0;
2176         nentries = 0;
2177
2178         hash_seq_init(&hash_seq, hash_table);
2179         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2180         {
2181                 int                     plan_len = entry->plan_len;
2182                 char       *plan = ptext_fetch(entry->plan_offset,
2183                                                                            plan_len,
2184                                                                            pbuffer,
2185                                                                            pbuffer_size);
2186
2187                 if (plan == NULL)
2188                 {
2189                         /* Trouble ... drop the text */
2190                         entry->plan_offset = 0;
2191                         entry->plan_len = -1;
2192                         /* entry will not be counted in mean plan length computation */
2193                         continue;
2194                 }
2195
2196                 if (fwrite(plan, 1, plan_len + 1, pfile) != plan_len + 1)
2197                 {
2198                         ereport(LOG,
2199                                         (errcode_for_file_access(),
2200                                          errmsg("could not write file \"%s\": %m",
2201                                                         PGSP_TEXT_FILE)));
2202                         hash_seq_term(&hash_seq);
2203                         goto gc_fail;
2204                 }
2205
2206                 entry->plan_offset = extent;
2207                 extent += plan_len + 1;
2208                 nentries++;
2209         }
2210
2211         /*
2212          * Truncate away any now-unused space.  If this fails for some odd reason,
2213          * we log it, but there's no need to fail.
2214          */
2215         if (ftruncate(fileno(pfile), extent) != 0)
2216                 ereport(LOG,
2217                                 (errcode_for_file_access(),
2218                                  errmsg("could not truncate file \"%s\": %m",
2219                                                 PGSP_TEXT_FILE)));
2220
2221         if (FreeFile(pfile))
2222         {
2223                 ereport(LOG,
2224                                 (errcode_for_file_access(),
2225                                  errmsg("could not write file \"%s\": %m",
2226                                                 PGSP_TEXT_FILE)));
2227                 pfile = NULL;
2228                 goto gc_fail;
2229         }
2230
2231         elog(DEBUG1, "pgsp gc of queries file shrunk size from %zu to %zu",
2232                  shared_state->extent, extent);
2233
2234         /* Reset the shared extent pointer */
2235         shared_state->extent = extent;
2236
2237         /*
2238          * Also update the mean plan length, to be sure that need_gc_ptexts()
2239          * won't still think we have a problem.
2240          */
2241         if (nentries > 0)
2242                 shared_state->mean_plan_len = extent / nentries;
2243         else
2244                 shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2245
2246         free(pbuffer);
2247
2248         return;
2249
2250 gc_fail:
2251         /* clean up resources */
2252         if (pfile)
2253                 FreeFile(pfile);
2254         if (pbuffer)
2255                 free(pbuffer);
2256
2257         /*
2258          * Since the contents of the external file are now uncertain, mark all
2259          * hashtable entries as having invalid texts.
2260          */
2261         hash_seq_init(&hash_seq, hash_table);
2262         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2263         {
2264                 entry->plan_offset = 0;
2265                 entry->plan_len = -1;
2266         }
2267
2268         /*
2269          * Destroy the query text file and create a new, empty one
2270          */
2271         (void) unlink(PGSP_TEXT_FILE);
2272         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2273         if (pfile == NULL)
2274                 ereport(LOG,
2275                                 (errcode_for_file_access(),
2276                                  errmsg("could not recreate file \"%s\": %m",
2277                                                 PGSP_TEXT_FILE)));
2278         else
2279                 FreeFile(pfile);
2280
2281         /* Reset the shared extent pointer */
2282         shared_state->extent = 0;
2283
2284         /* Reset mean_plan_len to match the new state */
2285         shared_state->mean_plan_len = ASSUMED_LENGTH_INIT;
2286 }
2287
2288 /*
2289  * Release all entries.
2290  */
2291 static void
2292 entry_reset(void)
2293 {
2294         HASH_SEQ_STATUS hash_seq;
2295         pgspEntry  *entry;
2296         FILE       *pfile;
2297
2298         if (!shared_state || !hash_table)
2299                 ereport(ERROR,
2300                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2301                                  errmsg("pg_store_plans must be loaded via shared_preload_libraries")));
2302
2303         LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
2304
2305         hash_seq_init(&hash_seq, hash_table);
2306         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2307         {
2308                 hash_search(hash_table, &entry->key, HASH_REMOVE, NULL);
2309         }
2310
2311         /*
2312          * Reset global statistics for pg_store_plans.
2313          */
2314         {
2315                 volatile pgspSharedState *s = (volatile pgspSharedState *) shared_state;
2316                 TimestampTz stats_reset = GetCurrentTimestamp();
2317
2318                 SpinLockAcquire(&s->mutex);
2319                 s->stats.dealloc = 0;
2320                 s->stats.stats_reset = stats_reset;
2321                 SpinLockRelease(&s->mutex);
2322         }
2323
2324         /*
2325          * Write new empty plan file, perhaps even creating a new one to recover
2326          * if the file was missing.
2327          */
2328         pfile = AllocateFile(PGSP_TEXT_FILE, PG_BINARY_W);
2329         if (pfile == NULL)
2330         {
2331                 ereport(LOG,
2332                                 (errcode_for_file_access(),
2333                                  errmsg("could not create file \"%s\": %m",
2334                                                 PGSP_TEXT_FILE)));
2335                 goto done;
2336         }
2337
2338         /* If ftruncate fails, log it, but it's not a fatal problem */
2339         if (ftruncate(fileno(pfile), 0) != 0)
2340                 ereport(LOG,
2341                                 (errcode_for_file_access(),
2342                                  errmsg("could not truncate file \"%s\": %m",
2343                                                 PGSP_TEXT_FILE)));
2344
2345         FreeFile(pfile);
2346
2347 done:
2348         shared_state->extent = 0;
2349         LWLockRelease(shared_state->lock);
2350 }
2351
2352 Datum
2353 pg_store_plans_hash_query(PG_FUNCTION_ARGS)
2354 {
2355         PG_RETURN_OID(hash_query(text_to_cstring(PG_GETARG_TEXT_P(0))));
2356 }
2357
2358 Datum
2359 pg_store_plans_shorten(PG_FUNCTION_ARGS)
2360 {
2361         text *short_plan = PG_GETARG_TEXT_P(0);
2362         char *cjson = text_to_cstring(short_plan);
2363         char *cshorten = pgsp_json_shorten(cjson);
2364         PG_RETURN_TEXT_P(cstring_to_text(cshorten));
2365 }
2366
2367 Datum
2368 pg_store_plans_normalize(PG_FUNCTION_ARGS)
2369 {
2370         text *short_plan = PG_GETARG_TEXT_P(0);
2371         char *cjson = text_to_cstring(short_plan);
2372         char *cnormalized = pgsp_json_normalize(cjson);
2373         PG_RETURN_TEXT_P(cstring_to_text(cnormalized));
2374 }
2375
2376 Datum
2377 pg_store_plans_jsonplan(PG_FUNCTION_ARGS)
2378 {
2379         text *short_plan = PG_GETARG_TEXT_P(0);
2380         char *cshort = text_to_cstring(short_plan);
2381         char *cinflated = pgsp_json_inflate(cshort);
2382         PG_RETURN_TEXT_P(cstring_to_text(cinflated));
2383 }
2384
2385 Datum
2386 pg_store_plans_textplan(PG_FUNCTION_ARGS)
2387 {
2388         text *short_plan = PG_GETARG_TEXT_P(0);
2389         char *cshort = text_to_cstring(short_plan);
2390         char *ctextized = pgsp_json_textize(cshort);
2391
2392         PG_RETURN_TEXT_P(cstring_to_text(ctextized));
2393 }
2394
2395 Datum
2396 pg_store_plans_yamlplan(PG_FUNCTION_ARGS)
2397 {
2398         text *short_plan = PG_GETARG_TEXT_P(0);
2399         char *cshort = text_to_cstring(short_plan);
2400         char *cyamlized = pgsp_json_yamlize(cshort);
2401
2402         PG_RETURN_TEXT_P(cstring_to_text(cyamlized));
2403 }
2404
2405 Datum
2406 pg_store_plans_xmlplan(PG_FUNCTION_ARGS)
2407 {
2408         text *short_plan = PG_GETARG_TEXT_P(0);
2409         char *cshort = text_to_cstring(short_plan);
2410         char *cxmlized = pgsp_json_xmlize(cshort);
2411
2412         PG_RETURN_TEXT_P(cstring_to_text(cxmlized));
2413 }