OSDN Git Service

Interim support for 9.4beta2. pg_stat_statements.c is replaced with the latest versio...
[pghintplan/pg_hint_plan.git] / pg_stat_statements.c
1 /*-------------------------------------------------------------------------
2  *
3  * pg_stat_statements.c
4  *              Track statement execution times across a whole database cluster.
5  *
6  * Execution costs are totalled for each distinct source query, and kept in
7  * a shared hashtable.  (We track only as many distinct queries as will fit
8  * in the designated amount of shared memory.)
9  *
10  * As of Postgres 9.2, this module normalizes query entries.  Normalization
11  * is a process whereby similar queries, typically differing only in their
12  * constants (though the exact rules are somewhat more subtle than that) are
13  * recognized as equivalent, and are tracked as a single entry.  This is
14  * particularly useful for non-prepared queries.
15  *
16  * Normalization is implemented by fingerprinting queries, selectively
17  * serializing those fields of each query tree's nodes that are judged to be
18  * essential to the query.  This is referred to as a query jumble.  This is
19  * distinct from a regular serialization in that various extraneous
20  * information is ignored as irrelevant or not essential to the query, such
21  * as the collations of Vars and, most notably, the values of constants.
22  *
23  * This jumble is acquired at the end of parse analysis of each query, and
24  * a 32-bit hash of it is stored into the query's Query.queryId field.
25  * The server then copies this value around, making it available in plan
26  * tree(s) generated from the query.  The executor can then use this value
27  * to blame query costs on the proper queryId.
28  *
29  * To facilitate presenting entries to users, we create "representative" query
30  * strings in which constants are replaced with '?' characters, to make it
31  * clearer what a normalized entry can represent.  To save on shared memory,
32  * and to avoid having to truncate oversized query strings, we store these
33  * strings in a temporary external query-texts file.  Offsets into this
34  * file are kept in shared memory.
35  *
36  * Note about locking issues: to create or delete an entry in the shared
37  * hashtable, one must hold pgss->lock exclusively.  Modifying any field
38  * in an entry except the counters requires the same.  To look up an entry,
39  * one must hold the lock shared.  To read or update the counters within
40  * an entry, one must hold the lock shared or exclusive (so the entry doesn't
41  * disappear!) and also take the entry's mutex spinlock.
42  * The shared state variable pgss->extent (the next free spot in the external
43  * query-text file) should be accessed only while holding either the
44  * pgss->mutex spinlock, or exclusive lock on pgss->lock.  We use the mutex to
45  * allow reserving file space while holding only shared lock on pgss->lock.
46  * Rewriting the entire external query-text file, eg for garbage collection,
47  * requires holding pgss->lock exclusively; this allows individual entries
48  * in the file to be read or written while holding only shared lock.
49  *
50  *
51  * Copyright (c) 2008-2014, PostgreSQL Global Development Group
52  *
53  * IDENTIFICATION
54  *        contrib/pg_stat_statements/pg_stat_statements.c
55  *
56  *-------------------------------------------------------------------------
57  */
58 #include "postgres.h"
59
60 #include <sys/stat.h>
61
62 #ifdef NOT_USED
63 #include <unistd.h>
64 #endif
65
66 #include "access/hash.h"
67 #ifdef NOT_USED
68 #include "executor/instrument.h"
69 #include "funcapi.h"
70 #include "mb/pg_wchar.h"
71 #include "miscadmin.h"
72 #include "parser/analyze.h"
73 #include "parser/parsetree.h"
74 #endif
75 #include "parser/scanner.h"
76 #ifdef NOT_USED
77 #include "pgstat.h"
78 #include "storage/fd.h"
79 #include "storage/ipc.h"
80 #include "storage/spin.h"
81 #include "tcop/utility.h"
82 #include "utils/builtins.h"
83 #include "utils/memutils.h"
84
85
86 PG_MODULE_MAGIC;
87
88 /* Location of permanent stats file (valid when database is shut down) */
89 #define PGSS_DUMP_FILE  PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"
90
91 /*
92  * Location of external query text file.  We don't keep it in the core
93  * system's stats_temp_directory.  The core system can safely use that GUC
94  * setting, because the statistics collector temp file paths are set only once
95  * as part of changing the GUC, but pg_stat_statements has no way of avoiding
96  * race conditions.  Besides, we only expect modest, infrequent I/O for query
97  * strings, so placing the file on a faster filesystem is not compelling.
98  */
99 #define PGSS_TEXT_FILE  PG_STAT_TMP_DIR "/pgss_query_texts.stat"
100
101 /* Magic number identifying the stats file format */
102 static const uint32 PGSS_FILE_HEADER = 0x20140125;
103
104 /* PostgreSQL major version number, changes in which invalidate all entries */
105 static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
106
/*
 * Usage-accounting tunables.
 *
 * XXX: Should USAGE_EXEC reflect execution time and/or buffer usage?
 */
#define USAGE_EXEC(duration)	(1.0)
/* usage of a new entry, including initial planning */
#define USAGE_INIT				(1.0)
/* initial assumed median usage */
#define ASSUMED_MEDIAN_INIT		(10.0)
/* initial assumed mean query length */
#define ASSUMED_LENGTH_INIT		1024
/* decreased every entry_dealloc */
#define USAGE_DECREASE_FACTOR	(0.99)
/* factor for sticky entries */
#define STICKY_DECREASE_FACTOR	(0.50)
/* free this % of entries at once */
#define USAGE_DEALLOC_PERCENT	5

/* query serialization buffer size */
#define JUMBLE_SIZE				1024
117
/*
 * Version of the extension's SQL-level objects, kept so that objects created
 * by older extension versions can still be supported.
 */
typedef enum pgssVersion
{
	PGSS_V1_0 = 0,
	PGSS_V1_1 = 1,
	PGSS_V1_2 = 2
} pgssVersion;
127
128 /*
129  * Hashtable key that defines the identity of a hashtable entry.  We separate
130  * queries by user and by database even if they are otherwise identical.
131  */
132 typedef struct pgssHashKey
133 {
134         Oid                     userid;                 /* user OID */
135         Oid                     dbid;                   /* database OID */
136         uint32          queryid;                /* query identifier */
137 } pgssHashKey;
138
139 /*
140  * The actual stats counters kept within pgssEntry.
141  */
142 typedef struct Counters
143 {
144         int64           calls;                  /* # of times executed */
145         double          total_time;             /* total execution time, in msec */
146         int64           rows;                   /* total # of retrieved or affected rows */
147         int64           shared_blks_hit;        /* # of shared buffer hits */
148         int64           shared_blks_read;               /* # of shared disk blocks read */
149         int64           shared_blks_dirtied;    /* # of shared disk blocks dirtied */
150         int64           shared_blks_written;    /* # of shared disk blocks written */
151         int64           local_blks_hit; /* # of local buffer hits */
152         int64           local_blks_read;        /* # of local disk blocks read */
153         int64           local_blks_dirtied;             /* # of local disk blocks dirtied */
154         int64           local_blks_written;             /* # of local disk blocks written */
155         int64           temp_blks_read; /* # of temp blocks read */
156         int64           temp_blks_written;              /* # of temp blocks written */
157         double          blk_read_time;  /* time spent reading, in msec */
158         double          blk_write_time; /* time spent writing, in msec */
159         double          usage;                  /* usage factor */
160 } Counters;
161
162 /*
163  * Statistics per statement
164  *
165  * Note: in event of a failure in garbage collection of the query text file,
166  * we reset query_offset to zero and query_len to -1.  This will be seen as
167  * an invalid state by qtext_fetch().
168  */
169 typedef struct pgssEntry
170 {
171         pgssHashKey key;                        /* hash key of entry - MUST BE FIRST */
172         Counters        counters;               /* the statistics for this query */
173         Size            query_offset;   /* query text offset in external file */
174         int                     query_len;              /* # of valid bytes in query string */
175         int                     encoding;               /* query text encoding */
176         slock_t         mutex;                  /* protects the counters only */
177 } pgssEntry;
178
179 /*
180  * Global shared state
181  */
182 typedef struct pgssSharedState
183 {
184         LWLock     *lock;                       /* protects hashtable search/modification */
185         double          cur_median_usage;               /* current median usage in hashtable */
186         Size            mean_query_len; /* current mean entry text length */
187         slock_t         mutex;                  /* protects following fields only: */
188         Size            extent;                 /* current extent of query file */
189         int                     n_writers;              /* number of active writers to query file */
190         int                     gc_count;               /* query file garbage collection cycle count */
191 } pgssSharedState;
192
/*
 * Location and length of one constant, recorded during normalization
 */
typedef struct pgssLocationLen
{
	/* start offset in query text */
	int			location;

	/* length in bytes, or -1 to ignore */
	int			length;
} pgssLocationLen;
201
202 /*
203  * Working state for computing a query jumble and producing a normalized
204  * query string
205  */
206 typedef struct pgssJumbleState
207 {
208         /* Jumble of current query tree */
209         unsigned char *jumble;
210
211         /* Number of bytes used in jumble[] */
212         Size            jumble_len;
213
214         /* Array of locations of constants that should be removed */
215         pgssLocationLen *clocations;
216
217         /* Allocated length of clocations array */
218         int                     clocations_buf_size;
219
220         /* Current number of valid entries in clocations array */
221         int                     clocations_count;
222 } pgssJumbleState;
223
224 /*---- Local variables ----*/
225
226 /* Current nesting depth of ExecutorRun+ProcessUtility calls */
227 static int      nested_level = 0;
228
229 /* Saved hook values in case of unload */
230 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
231 static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
232 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
233 static ExecutorRun_hook_type prev_ExecutorRun = NULL;
234 static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
235 static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
236 static ProcessUtility_hook_type prev_ProcessUtility = NULL;
237
238 /* Links to shared memory state */
239 static pgssSharedState *pgss = NULL;
240 static HTAB *pgss_hash = NULL;
241
242 /*---- GUC variables ----*/
243
/* Possible settings of the statement tracking level */
typedef enum
{
	/* track no statements */
	PGSS_TRACK_NONE,
	/* only top level statements */
	PGSS_TRACK_TOP,
	/* all statements, including nested ones */
	PGSS_TRACK_ALL
}	PGSSTrackLevel;
250
251 static const struct config_enum_entry track_options[] =
252 {
253         {"none", PGSS_TRACK_NONE, false},
254         {"top", PGSS_TRACK_TOP, false},
255         {"all", PGSS_TRACK_ALL, false},
256         {NULL, 0, false}
257 };
258
259 static int      pgss_max;                       /* max # statements to track */
260 static int      pgss_track;                     /* tracking level */
261 static bool pgss_track_utility; /* whether to track utility commands */
262 static bool pgss_save;                  /* whether to save stats across shutdown */
263
264
/*
 * True when statements at the current nesting level are to be tracked:
 * always under PGSS_TRACK_ALL, and only at nesting level zero under
 * PGSS_TRACK_TOP.
 */
#define pgss_enabled() \
	((pgss_track == PGSS_TRACK_TOP && nested_level == 0) || \
	 pgss_track == PGSS_TRACK_ALL)
268
/*
 * Increment the shared garbage collection cycle count while holding the
 * shared state's mutex spinlock.
 */
#define record_gc_qtexts() \
	do { \
		volatile pgssSharedState *shared = (volatile pgssSharedState *) pgss; \
		SpinLockAcquire(&shared->mutex); \
		shared->gc_count++; \
		SpinLockRelease(&shared->mutex); \
	} while(0)
276
277 /*---- Function declarations ----*/
278
/* Module load and unload callbacks */
void		_PG_init(void);
void		_PG_fini(void);

/* SQL-callable functions */
PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
PG_FUNCTION_INFO_V1(pg_stat_statements);
285
286 static void pgss_shmem_startup(void);
287 static void pgss_shmem_shutdown(int code, Datum arg);
288 static void pgss_post_parse_analyze(ParseState *pstate, Query *query);
289 static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
290 static void pgss_ExecutorRun(QueryDesc *queryDesc,
291                                  ScanDirection direction,
292                                  long count);
293 static void pgss_ExecutorFinish(QueryDesc *queryDesc);
294 static void pgss_ExecutorEnd(QueryDesc *queryDesc);
295 static void pgss_ProcessUtility(Node *parsetree, const char *queryString,
296                                         ProcessUtilityContext context, ParamListInfo params,
297                                         DestReceiver *dest, char *completionTag);
298 static uint32 pgss_hash_fn(const void *key, Size keysize);
299 static int      pgss_match_fn(const void *key1, const void *key2, Size keysize);
300 static uint32 pgss_hash_string(const char *str);
301 static void pgss_store(const char *query, uint32 queryId,
302                    double total_time, uint64 rows,
303                    const BufferUsage *bufusage,
304                    pgssJumbleState *jstate);
305 static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
306                                                         pgssVersion api_version,
307                                                         bool showtext);
308 static Size pgss_memsize(void);
309 static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
310                         int encoding, bool sticky);
311 static void entry_dealloc(void);
312 static bool qtext_store(const char *query, int query_len,
313                         Size *query_offset, int *gc_count);
314 static char *qtext_load_file(Size *buffer_size);
315 static char *qtext_fetch(Size query_offset, int query_len,
316                         char *buffer, Size buffer_size);
317 static bool need_gc_qtexts(void);
318 static void gc_qtexts(void);
319 static void entry_reset(void);
320 #endif
321 static void AppendJumble(pgssJumbleState *jstate,
322                          const unsigned char *item, Size size);
323 static void JumbleQuery(pgssJumbleState *jstate, Query *query);
324 static void JumbleRangeTable(pgssJumbleState *jstate, List *rtable);
325 static void JumbleExpr(pgssJumbleState *jstate, Node *node);
326 static void RecordConstLocation(pgssJumbleState *jstate, int location);
327 #ifdef NOT_USED
328 static char *generate_normalized_query(pgssJumbleState *jstate, const char *query,
329                                                   int *query_len_p, int encoding);
330 #endif
331 static void fill_in_constant_lengths(pgssJumbleState *jstate, const char *query);
332 static int      comp_location(const void *a, const void *b);
333
334
335 #ifdef NOT_USED
336 /*
337  * Module load callback
338  */
339 void
340 _PG_init(void)
341 {
342         /*
343          * In order to create our shared memory area, we have to be loaded via
344          * shared_preload_libraries.  If not, fall out without hooking into any of
345          * the main system.  (We don't throw error here because it seems useful to
346          * allow the pg_stat_statements functions to be created even when the
347          * module isn't active.  The functions must protect themselves against
348          * being called then, however.)
349          */
350         if (!process_shared_preload_libraries_in_progress)
351                 return;
352
353         /*
354          * Define (or redefine) custom GUC variables.
355          */
356         DefineCustomIntVariable("pg_stat_statements.max",
357           "Sets the maximum number of statements tracked by pg_stat_statements.",
358                                                         NULL,
359                                                         &pgss_max,
360                                                         5000,
361                                                         100,
362                                                         INT_MAX,
363                                                         PGC_POSTMASTER,
364                                                         0,
365                                                         NULL,
366                                                         NULL,
367                                                         NULL);
368
369         DefineCustomEnumVariable("pg_stat_statements.track",
370                            "Selects which statements are tracked by pg_stat_statements.",
371                                                          NULL,
372                                                          &pgss_track,
373                                                          PGSS_TRACK_TOP,
374                                                          track_options,
375                                                          PGC_SUSET,
376                                                          0,
377                                                          NULL,
378                                                          NULL,
379                                                          NULL);
380
381         DefineCustomBoolVariable("pg_stat_statements.track_utility",
382            "Selects whether utility commands are tracked by pg_stat_statements.",
383                                                          NULL,
384                                                          &pgss_track_utility,
385                                                          true,
386                                                          PGC_SUSET,
387                                                          0,
388                                                          NULL,
389                                                          NULL,
390                                                          NULL);
391
392         DefineCustomBoolVariable("pg_stat_statements.save",
393                            "Save pg_stat_statements statistics across server shutdowns.",
394                                                          NULL,
395                                                          &pgss_save,
396                                                          true,
397                                                          PGC_SIGHUP,
398                                                          0,
399                                                          NULL,
400                                                          NULL,
401                                                          NULL);
402
403         EmitWarningsOnPlaceholders("pg_stat_statements");
404
405         /*
406          * Request additional shared resources.  (These are no-ops if we're not in
407          * the postmaster process.)  We'll allocate or attach to the shared
408          * resources in pgss_shmem_startup().
409          */
410         RequestAddinShmemSpace(pgss_memsize());
411         RequestAddinLWLocks(1);
412
413         /*
414          * Install hooks.
415          */
416         prev_shmem_startup_hook = shmem_startup_hook;
417         shmem_startup_hook = pgss_shmem_startup;
418         prev_post_parse_analyze_hook = post_parse_analyze_hook;
419         post_parse_analyze_hook = pgss_post_parse_analyze;
420         prev_ExecutorStart = ExecutorStart_hook;
421         ExecutorStart_hook = pgss_ExecutorStart;
422         prev_ExecutorRun = ExecutorRun_hook;
423         ExecutorRun_hook = pgss_ExecutorRun;
424         prev_ExecutorFinish = ExecutorFinish_hook;
425         ExecutorFinish_hook = pgss_ExecutorFinish;
426         prev_ExecutorEnd = ExecutorEnd_hook;
427         ExecutorEnd_hook = pgss_ExecutorEnd;
428         prev_ProcessUtility = ProcessUtility_hook;
429         ProcessUtility_hook = pgss_ProcessUtility;
430 }
431
432 /*
433  * Module unload callback
434  */
435 void
436 _PG_fini(void)
437 {
438         /* Uninstall hooks. */
439         shmem_startup_hook = prev_shmem_startup_hook;
440         post_parse_analyze_hook = prev_post_parse_analyze_hook;
441         ExecutorStart_hook = prev_ExecutorStart;
442         ExecutorRun_hook = prev_ExecutorRun;
443         ExecutorFinish_hook = prev_ExecutorFinish;
444         ExecutorEnd_hook = prev_ExecutorEnd;
445         ProcessUtility_hook = prev_ProcessUtility;
446 }
447
448 /*
449  * shmem_startup hook: allocate or attach to shared memory,
450  * then load any pre-existing statistics from file.
451  * Also create and load the query-texts file, which is expected to exist
452  * (even if empty) while the module is enabled.
453  */
454 static void
455 pgss_shmem_startup(void)
456 {
457         bool            found;
458         HASHCTL         info;
459         FILE       *file = NULL;
460         FILE       *qfile = NULL;
461         uint32          header;
462         int32           num;
463         int32           pgver;
464         int32           i;
465         int                     buffer_size;
466         char       *buffer = NULL;
467
468         if (prev_shmem_startup_hook)
469                 prev_shmem_startup_hook();
470
471         /* reset in case this is a restart within the postmaster */
472         pgss = NULL;
473         pgss_hash = NULL;
474
475         /*
476          * Create or attach to the shared memory state, including hash table
477          */
478         LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
479
480         pgss = ShmemInitStruct("pg_stat_statements",
481                                                    sizeof(pgssSharedState),
482                                                    &found);
483
484         if (!found)
485         {
486                 /* First time through ... */
487                 pgss->lock = LWLockAssign();
488                 pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
489                 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
490                 SpinLockInit(&pgss->mutex);
491                 pgss->extent = 0;
492                 pgss->n_writers = 0;
493                 pgss->gc_count = 0;
494         }
495
496         memset(&info, 0, sizeof(info));
497         info.keysize = sizeof(pgssHashKey);
498         info.entrysize = sizeof(pgssEntry);
499         info.hash = pgss_hash_fn;
500         info.match = pgss_match_fn;
501         pgss_hash = ShmemInitHash("pg_stat_statements hash",
502                                                           pgss_max, pgss_max,
503                                                           &info,
504                                                           HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
505
506         LWLockRelease(AddinShmemInitLock);
507
508         /*
509          * If we're in the postmaster (or a standalone backend...), set up a shmem
510          * exit hook to dump the statistics to disk.
511          */
512         if (!IsUnderPostmaster)
513                 on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
514
515         /*
516          * Done if some other process already completed our initialization.
517          */
518         if (found)
519                 return;
520
521         /*
522          * Note: we don't bother with locks here, because there should be no other
523          * processes running when this code is reached.
524          */
525
526         /* Unlink query text file possibly left over from crash */
527         unlink(PGSS_TEXT_FILE);
528
529         /* Allocate new query text temp file */
530         qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
531         if (qfile == NULL)
532                 goto write_error;
533
534         /*
535          * If we were told not to load old statistics, we're done.  (Note we do
536          * not try to unlink any old dump file in this case.  This seems a bit
537          * questionable but it's the historical behavior.)
538          */
539         if (!pgss_save)
540         {
541                 FreeFile(qfile);
542                 return;
543         }
544
545         /*
546          * Attempt to load old statistics from the dump file.
547          */
548         file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
549         if (file == NULL)
550         {
551                 if (errno != ENOENT)
552                         goto read_error;
553                 /* No existing persisted stats file, so we're done */
554                 FreeFile(qfile);
555                 return;
556         }
557
558         buffer_size = 2048;
559         buffer = (char *) palloc(buffer_size);
560
561         if (fread(&header, sizeof(uint32), 1, file) != 1 ||
562                 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
563                 fread(&num, sizeof(int32), 1, file) != 1)
564                 goto read_error;
565
566         if (header != PGSS_FILE_HEADER ||
567                 pgver != PGSS_PG_MAJOR_VERSION)
568                 goto data_error;
569
570         for (i = 0; i < num; i++)
571         {
572                 pgssEntry       temp;
573                 pgssEntry  *entry;
574                 Size            query_offset;
575
576                 if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
577                         goto read_error;
578
579                 /* Encoding is the only field we can easily sanity-check */
580                 if (!PG_VALID_BE_ENCODING(temp.encoding))
581                         goto data_error;
582
583                 /* Resize buffer as needed */
584                 if (temp.query_len >= buffer_size)
585                 {
586                         buffer_size = Max(buffer_size * 2, temp.query_len + 1);
587                         buffer = repalloc(buffer, buffer_size);
588                 }
589
590                 if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
591                         goto read_error;
592
593                 /* Should have a trailing null, but let's make sure */
594                 buffer[temp.query_len] = '\0';
595
596                 /* Skip loading "sticky" entries */
597                 if (temp.counters.calls == 0)
598                         continue;
599
600                 /* Store the query text */
601                 query_offset = pgss->extent;
602                 if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
603                         goto write_error;
604                 pgss->extent += temp.query_len + 1;
605
606                 /* make the hashtable entry (discards old entries if too many) */
607                 entry = entry_alloc(&temp.key, query_offset, temp.query_len,
608                                                         temp.encoding,
609                                                         false);
610
611                 /* copy in the actual stats */
612                 entry->counters = temp.counters;
613         }
614
615         pfree(buffer);
616         FreeFile(file);
617         FreeFile(qfile);
618
619         /*
620          * Remove the persisted stats file so it's not included in
621          * backups/replication slaves, etc.  A new file will be written on next
622          * shutdown.
623          *
624          * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
625          * because we remove that file on startup; it acts inversely to
626          * PGSS_DUMP_FILE, in that it is only supposed to be around when the
627          * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
628          * when the server is not running.  Leaving the file creates no danger of
629          * a newly restored database having a spurious record of execution costs,
630          * which is what we're really concerned about here.
631          */
632         unlink(PGSS_DUMP_FILE);
633
634         return;
635
636 read_error:
637         ereport(LOG,
638                         (errcode_for_file_access(),
639                          errmsg("could not read pg_stat_statement file \"%s\": %m",
640                                         PGSS_DUMP_FILE)));
641         goto fail;
642 data_error:
643         ereport(LOG,
644                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
645                          errmsg("ignoring invalid data in pg_stat_statement file \"%s\"",
646                                         PGSS_DUMP_FILE)));
647         goto fail;
648 write_error:
649         ereport(LOG,
650                         (errcode_for_file_access(),
651                          errmsg("could not write pg_stat_statement file \"%s\": %m",
652                                         PGSS_TEXT_FILE)));
653 fail:
654         if (buffer)
655                 pfree(buffer);
656         if (file)
657                 FreeFile(file);
658         if (qfile)
659                 FreeFile(qfile);
660         /* If possible, throw away the bogus file; ignore any error */
661         unlink(PGSS_DUMP_FILE);
662
663         /*
664          * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
665          * server is running with pg_stat_statements enabled
666          */
667 }
668
669 /*
670  * shmem_shutdown hook: Dump statistics into file.
671  *
672  * Note: we don't bother with acquiring lock, because there should be no
673  * other processes running when this is called.
674  */
675 static void
676 pgss_shmem_shutdown(int code, Datum arg)
677 {
678         FILE       *file;
679         char       *qbuffer = NULL;
680         Size            qbuffer_size = 0;
681         HASH_SEQ_STATUS hash_seq;
682         int32           num_entries;
683         pgssEntry  *entry;
684
685         /* Don't try to dump during a crash. */
686         if (code)
687                 return;
688
689         /* Safety check ... shouldn't get here unless shmem is set up. */
690         if (!pgss || !pgss_hash)
691                 return;
692
693         /* Don't dump if told not to. */
694         if (!pgss_save)
695                 return;
696
697         file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
698         if (file == NULL)
699                 goto error;
700
701         if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
702                 goto error;
703         if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
704                 goto error;
705         num_entries = hash_get_num_entries(pgss_hash);
706         if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
707                 goto error;
708
709         qbuffer = qtext_load_file(&qbuffer_size);
710         if (qbuffer == NULL)
711                 goto error;
712
713         /*
714          * When serializing to disk, we store query texts immediately after their
715          * entry data.  Any orphaned query texts are thereby excluded.
716          */
717         hash_seq_init(&hash_seq, pgss_hash);
718         while ((entry = hash_seq_search(&hash_seq)) != NULL)
719         {
720                 int                     len = entry->query_len;
721                 char       *qstr = qtext_fetch(entry->query_offset, len,
722                                                                            qbuffer, qbuffer_size);
723
724                 if (qstr == NULL)
725                         continue;                       /* Ignore any entries with bogus texts */
726
727                 if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
728                         fwrite(qstr, 1, len + 1, file) != len + 1)
729                 {
730                         /* note: we assume hash_seq_term won't change errno */
731                         hash_seq_term(&hash_seq);
732                         goto error;
733                 }
734         }
735
736         free(qbuffer);
737         qbuffer = NULL;
738
739         if (FreeFile(file))
740         {
741                 file = NULL;
742                 goto error;
743         }
744
745         /*
746          * Rename file into place, so we atomically replace any old one.
747          */
748         if (rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE) != 0)
749                 ereport(LOG,
750                                 (errcode_for_file_access(),
751                                  errmsg("could not rename pg_stat_statement file \"%s\": %m",
752                                                 PGSS_DUMP_FILE ".tmp")));
753
754         /* Unlink query-texts file; it's not needed while shutdown */
755         unlink(PGSS_TEXT_FILE);
756
757         return;
758
759 error:
760         ereport(LOG,
761                         (errcode_for_file_access(),
762                          errmsg("could not write pg_stat_statement file \"%s\": %m",
763                                         PGSS_DUMP_FILE ".tmp")));
764         if (qbuffer)
765                 free(qbuffer);
766         if (file)
767                 FreeFile(file);
768         unlink(PGSS_DUMP_FILE ".tmp");
769         unlink(PGSS_TEXT_FILE);
770 }
771
772 /*
773  * Post-parse-analysis hook: mark query with a queryId
774  */
775 static void
776 pgss_post_parse_analyze(ParseState *pstate, Query *query)
777 {
778         pgssJumbleState jstate;
779
780         if (prev_post_parse_analyze_hook)
781                 prev_post_parse_analyze_hook(pstate, query);
782
783         /* Assert we didn't do this already */
784         Assert(query->queryId == 0);
785
786         /* Safety check... */
787         if (!pgss || !pgss_hash)
788                 return;
789
790         /*
791          * Utility statements get queryId zero.  We do this even in cases where
792          * the statement contains an optimizable statement for which a queryId
793          * could be derived (such as EXPLAIN or DECLARE CURSOR).  For such cases,
794          * runtime control will first go through ProcessUtility and then the
795          * executor, and we don't want the executor hooks to do anything, since we
796          * are already measuring the statement's costs at the utility level.
797          */
798         if (query->utilityStmt)
799         {
800                 query->queryId = 0;
801                 return;
802         }
803
804         /* Set up workspace for query jumbling */
805         jstate.jumble = (unsigned char *) palloc(JUMBLE_SIZE);
806         jstate.jumble_len = 0;
807         jstate.clocations_buf_size = 32;
808         jstate.clocations = (pgssLocationLen *)
809                 palloc(jstate.clocations_buf_size * sizeof(pgssLocationLen));
810         jstate.clocations_count = 0;
811
812         /* Compute query ID and mark the Query node with it */
813         JumbleQuery(&jstate, query);
814         query->queryId = hash_any(jstate.jumble, jstate.jumble_len);
815
816         /*
817          * If we are unlucky enough to get a hash of zero, use 1 instead, to
818          * prevent confusion with the utility-statement case.
819          */
820         if (query->queryId == 0)
821                 query->queryId = 1;
822
823         /*
824          * If we were able to identify any ignorable constants, we immediately
825          * create a hash table entry for the query, so that we can record the
826          * normalized form of the query string.  If there were no such constants,
827          * the normalized string would be the same as the query text anyway, so
828          * there's no need for an early entry.
829          */
830         if (jstate.clocations_count > 0)
831                 pgss_store(pstate->p_sourcetext,
832                                    query->queryId,
833                                    0,
834                                    0,
835                                    NULL,
836                                    &jstate);
837 }
838
839 /*
840  * ExecutorStart hook: start up tracking if needed
841  */
842 static void
843 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
844 {
845         if (prev_ExecutorStart)
846                 prev_ExecutorStart(queryDesc, eflags);
847         else
848                 standard_ExecutorStart(queryDesc, eflags);
849
850         /*
851          * If query has queryId zero, don't track it.  This prevents double
852          * counting of optimizable statements that are directly contained in
853          * utility statements.
854          */
855         if (pgss_enabled() && queryDesc->plannedstmt->queryId != 0)
856         {
857                 /*
858                  * Set up to track total elapsed time in ExecutorRun.  Make sure the
859                  * space is allocated in the per-query context so it will go away at
860                  * ExecutorEnd.
861                  */
862                 if (queryDesc->totaltime == NULL)
863                 {
864                         MemoryContext oldcxt;
865
866                         oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
867                         queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL);
868                         MemoryContextSwitchTo(oldcxt);
869                 }
870         }
871 }
872
873 /*
874  * ExecutorRun hook: all we need do is track nesting depth
875  */
876 static void
877 pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
878 {
879         nested_level++;
880         PG_TRY();
881         {
882                 if (prev_ExecutorRun)
883                         prev_ExecutorRun(queryDesc, direction, count);
884                 else
885                         standard_ExecutorRun(queryDesc, direction, count);
886                 nested_level--;
887         }
888         PG_CATCH();
889         {
890                 nested_level--;
891                 PG_RE_THROW();
892         }
893         PG_END_TRY();
894 }
895
896 /*
897  * ExecutorFinish hook: all we need do is track nesting depth
898  */
899 static void
900 pgss_ExecutorFinish(QueryDesc *queryDesc)
901 {
902         nested_level++;
903         PG_TRY();
904         {
905                 if (prev_ExecutorFinish)
906                         prev_ExecutorFinish(queryDesc);
907                 else
908                         standard_ExecutorFinish(queryDesc);
909                 nested_level--;
910         }
911         PG_CATCH();
912         {
913                 nested_level--;
914                 PG_RE_THROW();
915         }
916         PG_END_TRY();
917 }
918
919 /*
920  * ExecutorEnd hook: store results if needed
921  */
922 static void
923 pgss_ExecutorEnd(QueryDesc *queryDesc)
924 {
925         uint32          queryId = queryDesc->plannedstmt->queryId;
926
927         if (queryId != 0 && queryDesc->totaltime && pgss_enabled())
928         {
929                 /*
930                  * Make sure stats accumulation is done.  (Note: it's okay if several
931                  * levels of hook all do this.)
932                  */
933                 InstrEndLoop(queryDesc->totaltime);
934
935                 pgss_store(queryDesc->sourceText,
936                                    queryId,
937                                    queryDesc->totaltime->total * 1000.0,                /* convert to msec */
938                                    queryDesc->estate->es_processed,
939                                    &queryDesc->totaltime->bufusage,
940                                    NULL);
941         }
942
943         if (prev_ExecutorEnd)
944                 prev_ExecutorEnd(queryDesc);
945         else
946                 standard_ExecutorEnd(queryDesc);
947 }
948
949 /*
950  * ProcessUtility hook
951  */
952 static void
953 pgss_ProcessUtility(Node *parsetree, const char *queryString,
954                                         ProcessUtilityContext context, ParamListInfo params,
955                                         DestReceiver *dest, char *completionTag)
956 {
957         /*
958          * If it's an EXECUTE statement, we don't track it and don't increment the
959          * nesting level.  This allows the cycles to be charged to the underlying
960          * PREPARE instead (by the Executor hooks), which is much more useful.
961          *
962          * We also don't track execution of PREPARE.  If we did, we would get one
963          * hash table entry for the PREPARE (with hash calculated from the query
964          * string), and then a different one with the same query string (but hash
965          * calculated from the query tree) would be used to accumulate costs of
966          * ensuing EXECUTEs.  This would be confusing, and inconsistent with other
967          * cases where planning time is not included at all.
968          */
969         if (pgss_track_utility && pgss_enabled() &&
970                 !IsA(parsetree, ExecuteStmt) &&
971                 !IsA(parsetree, PrepareStmt))
972         {
973                 instr_time      start;
974                 instr_time      duration;
975                 uint64          rows;
976                 BufferUsage bufusage_start,
977                                         bufusage;
978                 uint32          queryId;
979
980                 bufusage_start = pgBufferUsage;
981                 INSTR_TIME_SET_CURRENT(start);
982
983                 nested_level++;
984                 PG_TRY();
985                 {
986                         if (prev_ProcessUtility)
987                                 prev_ProcessUtility(parsetree, queryString,
988                                                                         context, params,
989                                                                         dest, completionTag);
990                         else
991                                 standard_ProcessUtility(parsetree, queryString,
992                                                                                 context, params,
993                                                                                 dest, completionTag);
994                         nested_level--;
995                 }
996                 PG_CATCH();
997                 {
998                         nested_level--;
999                         PG_RE_THROW();
1000                 }
1001                 PG_END_TRY();
1002
1003                 INSTR_TIME_SET_CURRENT(duration);
1004                 INSTR_TIME_SUBTRACT(duration, start);
1005
1006                 /* parse command tag to retrieve the number of affected rows. */
1007                 if (completionTag &&
1008                         strncmp(completionTag, "COPY ", 5) == 0)
1009                 {
1010 #ifdef HAVE_STRTOULL
1011                         rows = strtoull(completionTag + 5, NULL, 10);
1012 #else
1013                         rows = strtoul(completionTag + 5, NULL, 10);
1014 #endif
1015                 }
1016                 else
1017                         rows = 0;
1018
1019                 /* calc differences of buffer counters. */
1020                 bufusage.shared_blks_hit =
1021                         pgBufferUsage.shared_blks_hit - bufusage_start.shared_blks_hit;
1022                 bufusage.shared_blks_read =
1023                         pgBufferUsage.shared_blks_read - bufusage_start.shared_blks_read;
1024                 bufusage.shared_blks_dirtied =
1025                         pgBufferUsage.shared_blks_dirtied - bufusage_start.shared_blks_dirtied;
1026                 bufusage.shared_blks_written =
1027                         pgBufferUsage.shared_blks_written - bufusage_start.shared_blks_written;
1028                 bufusage.local_blks_hit =
1029                         pgBufferUsage.local_blks_hit - bufusage_start.local_blks_hit;
1030                 bufusage.local_blks_read =
1031                         pgBufferUsage.local_blks_read - bufusage_start.local_blks_read;
1032                 bufusage.local_blks_dirtied =
1033                         pgBufferUsage.local_blks_dirtied - bufusage_start.local_blks_dirtied;
1034                 bufusage.local_blks_written =
1035                         pgBufferUsage.local_blks_written - bufusage_start.local_blks_written;
1036                 bufusage.temp_blks_read =
1037                         pgBufferUsage.temp_blks_read - bufusage_start.temp_blks_read;
1038                 bufusage.temp_blks_written =
1039                         pgBufferUsage.temp_blks_written - bufusage_start.temp_blks_written;
1040                 bufusage.blk_read_time = pgBufferUsage.blk_read_time;
1041                 INSTR_TIME_SUBTRACT(bufusage.blk_read_time, bufusage_start.blk_read_time);
1042                 bufusage.blk_write_time = pgBufferUsage.blk_write_time;
1043                 INSTR_TIME_SUBTRACT(bufusage.blk_write_time, bufusage_start.blk_write_time);
1044
1045                 /* For utility statements, we just hash the query string directly */
1046                 queryId = pgss_hash_string(queryString);
1047
1048                 pgss_store(queryString,
1049                                    queryId,
1050                                    INSTR_TIME_GET_MILLISEC(duration),
1051                                    rows,
1052                                    &bufusage,
1053                                    NULL);
1054         }
1055         else
1056         {
1057                 if (prev_ProcessUtility)
1058                         prev_ProcessUtility(parsetree, queryString,
1059                                                                 context, params,
1060                                                                 dest, completionTag);
1061                 else
1062                         standard_ProcessUtility(parsetree, queryString,
1063                                                                         context, params,
1064                                                                         dest, completionTag);
1065         }
1066 }
1067
1068 /*
1069  * Calculate hash value for a key
1070  */
1071 static uint32
1072 pgss_hash_fn(const void *key, Size keysize)
1073 {
1074         const pgssHashKey *k = (const pgssHashKey *) key;
1075
1076         return hash_uint32((uint32) k->userid) ^
1077                 hash_uint32((uint32) k->dbid) ^
1078                 hash_uint32((uint32) k->queryid);
1079 }
1080
1081 /*
1082  * Compare two keys - zero means match
1083  */
1084 static int
1085 pgss_match_fn(const void *key1, const void *key2, Size keysize)
1086 {
1087         const pgssHashKey *k1 = (const pgssHashKey *) key1;
1088         const pgssHashKey *k2 = (const pgssHashKey *) key2;
1089
1090         if (k1->userid == k2->userid &&
1091                 k1->dbid == k2->dbid &&
1092                 k1->queryid == k2->queryid)
1093                 return 0;
1094         else
1095                 return 1;
1096 }
1097
1098 /*
1099  * Given an arbitrarily long query string, produce a hash for the purposes of
1100  * identifying the query, without normalizing constants.  Used when hashing
1101  * utility statements.
1102  */
1103 static uint32
1104 pgss_hash_string(const char *str)
1105 {
1106         return hash_any((const unsigned char *) str, strlen(str));
1107 }
1108
1109 /*
1110  * Store some statistics for a statement.
1111  *
1112  * If jstate is not NULL then we're trying to create an entry for which
1113  * we have no statistics as yet; we just want to record the normalized
1114  * query string.  total_time, rows, bufusage are ignored in this case.
1115  */
1116 static void
1117 pgss_store(const char *query, uint32 queryId,
1118                    double total_time, uint64 rows,
1119                    const BufferUsage *bufusage,
1120                    pgssJumbleState *jstate)
1121 {
1122         pgssHashKey key;
1123         pgssEntry  *entry;
1124         char       *norm_query = NULL;
1125         int                     encoding = GetDatabaseEncoding();
1126         int                     query_len;
1127
1128         Assert(query != NULL);
1129
1130         /* Safety check... */
1131         if (!pgss || !pgss_hash)
1132                 return;
1133
1134         query_len = strlen(query);
1135
1136         /* Set up key for hashtable search */
1137         key.userid = GetUserId();
1138         key.dbid = MyDatabaseId;
1139         key.queryid = queryId;
1140
1141         /* Lookup the hash table entry with shared lock. */
1142         LWLockAcquire(pgss->lock, LW_SHARED);
1143
1144         entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
1145
1146         /* Create new entry, if not present */
1147         if (!entry)
1148         {
1149                 Size            query_offset;
1150                 int                     gc_count;
1151                 bool            stored;
1152                 bool            do_gc;
1153
1154                 /*
1155                  * Create a new, normalized query string if caller asked.  We don't
1156                  * need to hold the lock while doing this work.  (Note: in any case,
1157                  * it's possible that someone else creates a duplicate hashtable entry
1158                  * in the interval where we don't hold the lock below.  That case is
1159                  * handled by entry_alloc.)
1160                  */
1161                 if (jstate)
1162                 {
1163                         LWLockRelease(pgss->lock);
1164                         norm_query = generate_normalized_query(jstate, query,
1165                                                                                                    &query_len,
1166                                                                                                    encoding);
1167                         LWLockAcquire(pgss->lock, LW_SHARED);
1168                 }
1169
1170                 /* Append new query text to file with only shared lock held */
1171                 stored = qtext_store(norm_query ? norm_query : query, query_len,
1172                                                          &query_offset, &gc_count);
1173
1174                 /*
1175                  * Determine whether we need to garbage collect external query texts
1176                  * while the shared lock is still held.  This micro-optimization
1177                  * avoids taking the time to decide this while holding exclusive lock.
1178                  */
1179                 do_gc = need_gc_qtexts();
1180
1181                 /* Need exclusive lock to make a new hashtable entry - promote */
1182                 LWLockRelease(pgss->lock);
1183                 LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
1184
1185                 /*
1186                  * A garbage collection may have occurred while we weren't holding the
1187                  * lock.  In the unlikely event that this happens, the query text we
1188                  * stored above will have been garbage collected, so write it again.
1189                  * This should be infrequent enough that doing it while holding
1190                  * exclusive lock isn't a performance problem.
1191                  */
1192                 if (!stored || pgss->gc_count != gc_count)
1193                         stored = qtext_store(norm_query ? norm_query : query, query_len,
1194                                                                  &query_offset, NULL);
1195
1196                 /* If we failed to write to the text file, give up */
1197                 if (!stored)
1198                         goto done;
1199
1200                 /* OK to create a new hashtable entry */
1201                 entry = entry_alloc(&key, query_offset, query_len, encoding,
1202                                                         jstate != NULL);
1203
1204                 /* If needed, perform garbage collection while exclusive lock held */
1205                 if (do_gc)
1206                         gc_qtexts();
1207         }
1208
1209         /* Increment the counts, except when jstate is not NULL */
1210         if (!jstate)
1211         {
1212                 /*
1213                  * Grab the spinlock while updating the counters (see comment about
1214                  * locking rules at the head of the file)
1215                  */
1216                 volatile pgssEntry *e = (volatile pgssEntry *) entry;
1217
1218                 SpinLockAcquire(&e->mutex);
1219
1220                 /* "Unstick" entry if it was previously sticky */
1221                 if (e->counters.calls == 0)
1222                         e->counters.usage = USAGE_INIT;
1223
1224                 e->counters.calls += 1;
1225                 e->counters.total_time += total_time;
1226                 e->counters.rows += rows;
1227                 e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1228                 e->counters.shared_blks_read += bufusage->shared_blks_read;
1229                 e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1230                 e->counters.shared_blks_written += bufusage->shared_blks_written;
1231                 e->counters.local_blks_hit += bufusage->local_blks_hit;
1232                 e->counters.local_blks_read += bufusage->local_blks_read;
1233                 e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1234                 e->counters.local_blks_written += bufusage->local_blks_written;
1235                 e->counters.temp_blks_read += bufusage->temp_blks_read;
1236                 e->counters.temp_blks_written += bufusage->temp_blks_written;
1237                 e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1238                 e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1239                 e->counters.usage += USAGE_EXEC(total_time);
1240
1241                 SpinLockRelease(&e->mutex);
1242         }
1243
1244 done:
1245         LWLockRelease(pgss->lock);
1246
1247         /* We postpone this clean-up until we're out of the lock */
1248         if (norm_query)
1249                 pfree(norm_query);
1250 }
1251
1252 /*
1253  * Reset all statement statistics.
1254  */
1255 Datum
1256 pg_stat_statements_reset(PG_FUNCTION_ARGS)
1257 {
1258         if (!pgss || !pgss_hash)
1259                 ereport(ERROR,
1260                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1261                                  errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
1262         entry_reset();
1263         PG_RETURN_VOID();
1264 }
1265
1266 /* Number of output arguments (columns) for each supported SQL API version; pg_stat_statements_internal() matches these against the result tuple descriptor's natts */
1267 #define PG_STAT_STATEMENTS_COLS_V1_0    14
1268 #define PG_STAT_STATEMENTS_COLS_V1_1    18
1269 #define PG_STAT_STATEMENTS_COLS_V1_2    19
1270 #define PG_STAT_STATEMENTS_COLS                 19              /* maximum of above; sizes the per-row values[] and nulls[] arrays */
1271
1272 /*
1273  * Retrieve statement statistics.
1274  *
1275  * The SQL API of this function has changed multiple times, and will likely
1276  * do so again in future.  To support the case where a newer version of this
1277  * loadable module is being used with an old SQL declaration of the function,
1278  * we continue to support the older API versions.  For 1.2 and later, the
1279  * expected API version is identified by embedding it in the C name of the
1280  * function.  Unfortunately we weren't bright enough to do that for 1.1.
1281  */
1282 Datum
1283 pg_stat_statements_1_2(PG_FUNCTION_ARGS)
1284 {
1285         bool            showtext = PG_GETARG_BOOL(0);
1286
1287         pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
1288
1289         return (Datum) 0;
1290 }
1291
1292 /*
1293  * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
1294  * This can be removed someday, perhaps.
1295  */
1296 Datum
1297 pg_stat_statements(PG_FUNCTION_ARGS)
1298 {
1299         /* If it's really API 1.1, we'll figure that out below */
1300         pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);
1301
1302         return (Datum) 0;
1303 }
1304
1305 /* Common code for all versions of pg_stat_statements() */
1306 static void
1307 pg_stat_statements_internal(FunctionCallInfo fcinfo,
1308                                                         pgssVersion api_version,
1309                                                         bool showtext)
1310 {
1311         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1312         TupleDesc       tupdesc;
1313         Tuplestorestate *tupstore;
1314         MemoryContext per_query_ctx;
1315         MemoryContext oldcontext;
1316         Oid                     userid = GetUserId();
1317         bool            is_superuser = superuser();
1318         char       *qbuffer = NULL;
1319         Size            qbuffer_size = 0;
1320         Size            extent = 0;
1321         int                     gc_count = 0;
1322         HASH_SEQ_STATUS hash_seq;
1323         pgssEntry  *entry;
1324
1325         /* hash table must exist already */
1326         if (!pgss || !pgss_hash)
1327                 ereport(ERROR,
1328                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1329                                  errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
1330
1331         /* check to see if caller supports us returning a tuplestore */
1332         if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
1333                 ereport(ERROR,
1334                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1335                                  errmsg("set-valued function called in context that cannot accept a set")));
1336         if (!(rsinfo->allowedModes & SFRM_Materialize))
1337                 ereport(ERROR,
1338                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1339                                  errmsg("materialize mode required, but it is not " \
1340                                                 "allowed in this context")));
1341
1342         /* Switch into long-lived context to construct returned data structures */
1343         per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
1344         oldcontext = MemoryContextSwitchTo(per_query_ctx);
1345
1346         /* Build a tuple descriptor for our result type */
1347         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1348                 elog(ERROR, "return type must be a row type");
1349
1350         /*
1351          * Check we have the expected number of output arguments.  Aside from
1352          * being a good safety check, we need a kluge here to detect API version
1353          * 1.1, which was wedged into the code in an ill-considered way.
1354          */
1355         switch (tupdesc->natts)
1356         {
1357                 case PG_STAT_STATEMENTS_COLS_V1_0:
1358                         if (api_version != PGSS_V1_0)
1359                                 elog(ERROR, "incorrect number of output arguments");
1360                         break;
1361                 case PG_STAT_STATEMENTS_COLS_V1_1:
1362                         /* pg_stat_statements() should have told us 1.0 */
1363                         if (api_version != PGSS_V1_0)
1364                                 elog(ERROR, "incorrect number of output arguments");
1365                         api_version = PGSS_V1_1;
1366                         break;
1367                 case PG_STAT_STATEMENTS_COLS_V1_2:
1368                         if (api_version != PGSS_V1_2)
1369                                 elog(ERROR, "incorrect number of output arguments");
1370                         break;
1371                 default:
1372                         elog(ERROR, "incorrect number of output arguments");
1373         }
1374
1375         tupstore = tuplestore_begin_heap(true, false, work_mem);
1376         rsinfo->returnMode = SFRM_Materialize;
1377         rsinfo->setResult = tupstore;
1378         rsinfo->setDesc = tupdesc;
1379
1380         MemoryContextSwitchTo(oldcontext);
1381
1382         /*
1383          * We'd like to load the query text file (if needed) while not holding any
1384          * lock on pgss->lock.  In the worst case we'll have to do this again
1385          * after we have the lock, but it's unlikely enough to make this a win
1386          * despite occasional duplicated work.  We need to reload if anybody
1387          * writes to the file (either a retail qtext_store(), or a garbage
1388          * collection) between this point and where we've gotten shared lock.  If
1389          * a qtext_store is actually in progress when we look, we might as well
1390          * skip the speculative load entirely.
1391          */
1392         if (showtext)
1393         {
1394                 int                     n_writers;
1395
1396                 /* Take the mutex so we can examine variables */
1397                 {
1398                         volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1399
1400                         SpinLockAcquire(&s->mutex);
1401                         extent = s->extent;
1402                         n_writers = s->n_writers;
1403                         gc_count = s->gc_count;
1404                         SpinLockRelease(&s->mutex);
1405                 }
1406
1407                 /* No point in loading file now if there are active writers */
1408                 if (n_writers == 0)
1409                         qbuffer = qtext_load_file(&qbuffer_size);
1410         }
1411
1412         /*
1413          * Get shared lock, load or reload the query text file if we must, and
1414          * iterate over the hashtable entries.
1415          *
1416          * With a large hash table, we might be holding the lock rather longer
1417          * than one could wish.  However, this only blocks creation of new hash
1418          * table entries, and the larger the hash table the less likely that is to
1419          * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
1420          * we need to partition the hash table to limit the time spent holding any
1421          * one lock.
1422          */
1423         LWLockAcquire(pgss->lock, LW_SHARED);
1424
1425         if (showtext)
1426         {
1427                 /*
1428                  * Here it is safe to examine extent and gc_count without taking the
1429                  * mutex.  Note that although other processes might change
1430                  * pgss->extent just after we look at it, the strings they then write
1431                  * into the file cannot yet be referenced in the hashtable, so we
1432                  * don't care whether we see them or not.
1433                  *
1434                  * If qtext_load_file fails, we just press on; we'll return NULL for
1435                  * every query text.
1436                  */
1437                 if (qbuffer == NULL ||
1438                         pgss->extent != extent ||
1439                         pgss->gc_count != gc_count)
1440                 {
1441                         if (qbuffer)
1442                                 free(qbuffer);
1443                         qbuffer = qtext_load_file(&qbuffer_size);
1444                 }
1445         }
1446
1447         hash_seq_init(&hash_seq, pgss_hash);
1448         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1449         {
1450                 Datum           values[PG_STAT_STATEMENTS_COLS];
1451                 bool            nulls[PG_STAT_STATEMENTS_COLS];
1452                 int                     i = 0;
1453                 Counters        tmp;
1454                 int64           queryid = entry->key.queryid;
1455
1456                 memset(values, 0, sizeof(values));
1457                 memset(nulls, 0, sizeof(nulls));
1458
1459                 values[i++] = ObjectIdGetDatum(entry->key.userid);
1460                 values[i++] = ObjectIdGetDatum(entry->key.dbid);
1461
1462                 if (is_superuser || entry->key.userid == userid)
1463                 {
1464                         if (api_version >= PGSS_V1_2)
1465                                 values[i++] = Int64GetDatumFast(queryid);
1466
1467                         if (showtext)
1468                         {
1469                                 char       *qstr = qtext_fetch(entry->query_offset,
1470                                                                                            entry->query_len,
1471                                                                                            qbuffer,
1472                                                                                            qbuffer_size);
1473
1474                                 if (qstr)
1475                                 {
1476                                         char       *enc;
1477
1478                                         enc = pg_any_to_server(qstr,
1479                                                                                    entry->query_len,
1480                                                                                    entry->encoding);
1481
1482                                         values[i++] = CStringGetTextDatum(enc);
1483
1484                                         if (enc != qstr)
1485                                                 pfree(enc);
1486                                 }
1487                                 else
1488                                 {
1489                                         /* Just return a null if we fail to find the text */
1490                                         nulls[i++] = true;
1491                                 }
1492                         }
1493                         else
1494                         {
1495                                 /* Query text not requested */
1496                                 nulls[i++] = true;
1497                         }
1498                 }
1499                 else
1500                 {
1501                         /* Don't show queryid */
1502                         if (api_version >= PGSS_V1_2)
1503                                 nulls[i++] = true;
1504
1505                         /*
1506                          * Don't show query text, but hint as to the reason for not doing
1507                          * so if it was requested
1508                          */
1509                         if (showtext)
1510                                 values[i++] = CStringGetTextDatum("<insufficient privilege>");
1511                         else
1512                                 nulls[i++] = true;
1513                 }
1514
1515                 /* copy counters to a local variable to keep locking time short */
1516                 {
1517                         volatile pgssEntry *e = (volatile pgssEntry *) entry;
1518
1519                         SpinLockAcquire(&e->mutex);
1520                         tmp = e->counters;
1521                         SpinLockRelease(&e->mutex);
1522                 }
1523
1524                 /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1525                 if (tmp.calls == 0)
1526                         continue;
1527
1528                 values[i++] = Int64GetDatumFast(tmp.calls);
1529                 values[i++] = Float8GetDatumFast(tmp.total_time);
1530                 values[i++] = Int64GetDatumFast(tmp.rows);
1531                 values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1532                 values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1533                 if (api_version >= PGSS_V1_1)
1534                         values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1535                 values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1536                 values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1537                 values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1538                 if (api_version >= PGSS_V1_1)
1539                         values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1540                 values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1541                 values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1542                 values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1543                 if (api_version >= PGSS_V1_1)
1544                 {
1545                         values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1546                         values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1547                 }
1548
1549                 Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
1550                                          api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
1551                                          api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
1552                                          -1 /* fail if you forget to update this assert */ ));
1553
1554                 tuplestore_putvalues(tupstore, tupdesc, values, nulls);
1555         }
1556
1557         /* clean up and return the tuplestore */
1558         LWLockRelease(pgss->lock);
1559
1560         if (qbuffer)
1561                 free(qbuffer);
1562
1563         tuplestore_donestoring(tupstore);
1564 }
1565
1566 /*
1567  * Estimate shared memory space needed.
1568  */
1569 static Size
1570 pgss_memsize(void)
1571 {
1572         Size            size;
1573
1574         size = MAXALIGN(sizeof(pgssSharedState));
1575         size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
1576
1577         return size;
1578 }
1579
1580 /*
1581  * Allocate a new hashtable entry.
1582  * caller must hold an exclusive lock on pgss->lock
1583  *
1584  * "query" need not be null-terminated; we rely on query_len instead
1585  *
1586  * If "sticky" is true, make the new entry artificially sticky so that it will
1587  * probably still be there when the query finishes execution.  We do this by
1588  * giving it a median usage value rather than the normal value.  (Strictly
1589  * speaking, query strings are normalized on a best effort basis, though it
1590  * would be difficult to demonstrate this even under artificial conditions.)
1591  *
1592  * Note: despite needing exclusive lock, it's not an error for the target
1593  * entry to already exist.  This is because pgss_store releases and
1594  * reacquires lock after failing to find a match; so someone else could
1595  * have made the entry while we waited to get exclusive lock.
1596  */
1597 static pgssEntry *
1598 entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
1599                         bool sticky)
1600 {
1601         pgssEntry  *entry;
1602         bool            found;
1603
1604         /* Make space if needed */
1605         while (hash_get_num_entries(pgss_hash) >= pgss_max)
1606                 entry_dealloc();
1607
1608         /* Find or create an entry with desired hash code */
1609         entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
1610
1611         if (!found)
1612         {
1613                 /* New entry, initialize it */
1614
1615                 /* reset the statistics */
1616                 memset(&entry->counters, 0, sizeof(Counters));
1617                 /* set the appropriate initial usage count */
1618                 entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
1619                 /* re-initialize the mutex each time ... we assume no one using it */
1620                 SpinLockInit(&entry->mutex);
1621                 /* ... and don't forget the query text metadata */
1622                 Assert(query_len >= 0);
1623                 entry->query_offset = query_offset;
1624                 entry->query_len = query_len;
1625                 entry->encoding = encoding;
1626         }
1627
1628         return entry;
1629 }
1630
1631 /*
1632  * qsort comparator for sorting into increasing usage order
1633  */
1634 static int
1635 entry_cmp(const void *lhs, const void *rhs)
1636 {
1637         double          l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
1638         double          r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
1639
1640         if (l_usage < r_usage)
1641                 return -1;
1642         else if (l_usage > r_usage)
1643                 return +1;
1644         else
1645                 return 0;
1646 }
1647
1648 /*
1649  * Deallocate least used entries.
1650  * Caller must hold an exclusive lock on pgss->lock.
1651  */
1652 static void
1653 entry_dealloc(void)
1654 {
1655         HASH_SEQ_STATUS hash_seq;
1656         pgssEntry **entries;
1657         pgssEntry  *entry;
1658         int                     nvictims;
1659         int                     i;
1660         Size            totlen = 0;
1661
1662         /*
1663          * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1664          * While we're scanning the table, apply the decay factor to the usage
1665          * values.
1666          */
1667
1668         entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
1669
1670         i = 0;
1671         hash_seq_init(&hash_seq, pgss_hash);
1672         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1673         {
1674                 entries[i++] = entry;
1675                 /* "Sticky" entries get a different usage decay rate. */
1676                 if (entry->counters.calls == 0)
1677                         entry->counters.usage *= STICKY_DECREASE_FACTOR;
1678                 else
1679                         entry->counters.usage *= USAGE_DECREASE_FACTOR;
1680                 /* Accumulate total size, too. */
1681                 totlen += entry->query_len + 1;
1682         }
1683
1684         qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
1685
1686         if (i > 0)
1687         {
1688                 /* Record the (approximate) median usage */
1689                 pgss->cur_median_usage = entries[i / 2]->counters.usage;
1690                 /* Record the mean query length */
1691                 pgss->mean_query_len = totlen / i;
1692         }
1693
1694         nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1695         nvictims = Min(nvictims, i);
1696
1697         for (i = 0; i < nvictims; i++)
1698         {
1699                 hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
1700         }
1701
1702         pfree(entries);
1703 }
1704
1705 /*
1706  * Given a null-terminated string, allocate a new entry in the external query
1707  * text file and store the string there.
1708  *
1709  * Although we could compute the string length via strlen(), callers already
1710  * have it handy, so we require them to pass it too.
1711  *
1712  * If successful, returns true, and stores the new entry's offset in the file
1713  * into *query_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
1714  * number of garbage collections that have occurred so far.
1715  *
1716  * On failure, returns false.
1717  *
1718  * At least a shared lock on pgss->lock must be held by the caller, so as
1719  * to prevent a concurrent garbage collection.  Share-lock-holding callers
1720  * should pass a gc_count pointer to obtain the number of garbage collections,
1721  * so that they can recheck the count after obtaining exclusive lock to
1722  * detect whether a garbage collection occurred (and removed this entry).
1723  */
1724 static bool
1725 qtext_store(const char *query, int query_len,
1726                         Size *query_offset, int *gc_count)
1727 {
1728         Size            off;
1729         int                     fd;
1730
1731         /*
1732          * We use a spinlock to protect extent/n_writers/gc_count, so that
1733          * multiple processes may execute this function concurrently.
1734          */
1735         {
1736                 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1737
1738                 SpinLockAcquire(&s->mutex);
1739                 off = s->extent;
1740                 s->extent += query_len + 1;
1741                 s->n_writers++;
1742                 if (gc_count)
1743                         *gc_count = s->gc_count;
1744                 SpinLockRelease(&s->mutex);
1745         }
1746
1747         *query_offset = off;
1748
1749         /* Now write the data into the successfully-reserved part of the file */
1750         fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY,
1751                                                    S_IRUSR | S_IWUSR);
1752         if (fd < 0)
1753                 goto error;
1754
1755         if (lseek(fd, off, SEEK_SET) != off)
1756                 goto error;
1757
1758         if (write(fd, query, query_len + 1) != query_len + 1)
1759                 goto error;
1760
1761         CloseTransientFile(fd);
1762
1763         /* Mark our write complete */
1764         {
1765                 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1766
1767                 SpinLockAcquire(&s->mutex);
1768                 s->n_writers--;
1769                 SpinLockRelease(&s->mutex);
1770         }
1771
1772         return true;
1773
1774 error:
1775         ereport(LOG,
1776                         (errcode_for_file_access(),
1777                          errmsg("could not write pg_stat_statement file \"%s\": %m",
1778                                         PGSS_TEXT_FILE)));
1779
1780         if (fd >= 0)
1781                 CloseTransientFile(fd);
1782
1783         /* Mark our write complete */
1784         {
1785                 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1786
1787                 SpinLockAcquire(&s->mutex);
1788                 s->n_writers--;
1789                 SpinLockRelease(&s->mutex);
1790         }
1791
1792         return false;
1793 }
1794
1795 /*
1796  * Read the external query text file into a malloc'd buffer.
1797  *
1798  * Returns NULL (without throwing an error) if unable to read, eg
1799  * file not there or insufficient memory.
1800  *
1801  * On success, the buffer size is also returned into *buffer_size.
1802  *
1803  * This can be called without any lock on pgss->lock, but in that case
1804  * the caller is responsible for verifying that the result is sane.
1805  */
1806 static char *
1807 qtext_load_file(Size *buffer_size)
1808 {
1809         char       *buf;
1810         int                     fd;
1811         struct stat stat;
1812
1813         fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY, 0);
1814         if (fd < 0)
1815         {
1816                 if (errno != ENOENT)
1817                         ereport(LOG,
1818                                         (errcode_for_file_access(),
1819                                    errmsg("could not read pg_stat_statement file \"%s\": %m",
1820                                                   PGSS_TEXT_FILE)));
1821                 return NULL;
1822         }
1823
1824         /* Get file length */
1825         if (fstat(fd, &stat))
1826         {
1827                 ereport(LOG,
1828                                 (errcode_for_file_access(),
1829                                  errmsg("could not stat pg_stat_statement file \"%s\": %m",
1830                                                 PGSS_TEXT_FILE)));
1831                 CloseTransientFile(fd);
1832                 return NULL;
1833         }
1834
1835         /* Allocate buffer; beware that off_t might be wider than size_t */
1836         if (stat.st_size <= MaxAllocSize)
1837                 buf = (char *) malloc(stat.st_size);
1838         else
1839                 buf = NULL;
1840         if (buf == NULL)
1841         {
1842                 ereport(LOG,
1843                                 (errcode(ERRCODE_OUT_OF_MEMORY),
1844                                  errmsg("out of memory")));
1845                 CloseTransientFile(fd);
1846                 return NULL;
1847         }
1848
1849         /*
1850          * OK, slurp in the file.  If we get a short read and errno doesn't get
1851          * set, the reason is probably that garbage collection truncated the file
1852          * since we did the fstat(), so we don't log a complaint --- but we don't
1853          * return the data, either, since it's most likely corrupt due to
1854          * concurrent writes from garbage collection.
1855          */
1856         errno = 0;
1857         if (read(fd, buf, stat.st_size) != stat.st_size)
1858         {
1859                 if (errno)
1860                         ereport(LOG,
1861                                         (errcode_for_file_access(),
1862                                    errmsg("could not read pg_stat_statement file \"%s\": %m",
1863                                                   PGSS_TEXT_FILE)));
1864                 free(buf);
1865                 CloseTransientFile(fd);
1866                 return NULL;
1867         }
1868
1869         CloseTransientFile(fd);
1870
1871         *buffer_size = stat.st_size;
1872         return buf;
1873 }
1874
/*
 * Locate a query text in the file image previously read by qtext_load_file().
 *
 * "buffer"/"buffer_size" describe the file image (buffer may be NULL if the
 * load failed); "query_offset"/"query_len" describe the wanted text, not
 * counting its trailing null.
 *
 * We validate the given offset/length, and return NULL if bogus.  Otherwise,
 * the result points to a null-terminated string within the buffer.
 */
static char *
qtext_fetch(Size query_offset, int query_len,
			char *buffer, Size buffer_size)
{
	/* File read failed? */
	if (buffer == NULL)
		return NULL;

	/*
	 * Bogus offset/length?  The text plus its trailing null must fit inside
	 * the buffer, ie query_offset + query_len < buffer_size.  The test is
	 * phrased as a subtraction so that a huge query_offset cannot wrap
	 * around and slip past the check.
	 */
	if (query_len < 0 ||
		query_offset >= buffer_size ||
		(Size) query_len >= buffer_size - query_offset)
		return NULL;

	/* As a further sanity check, make sure there's a trailing null */
	if (buffer[query_offset + query_len] != '\0')
		return NULL;

	/* Looks OK */
	return buffer + query_offset;
}
1898
1899 /*
1900  * Do we need to garbage-collect the external query text file?
1901  *
1902  * Caller should hold at least a shared lock on pgss->lock.
1903  */
1904 static bool
1905 need_gc_qtexts(void)
1906 {
1907         Size            extent;
1908
1909         /* Read shared extent pointer */
1910         {
1911                 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1912
1913                 SpinLockAcquire(&s->mutex);
1914                 extent = s->extent;
1915                 SpinLockRelease(&s->mutex);
1916         }
1917
1918         /* Don't proceed if file does not exceed 512 bytes per possible entry */
1919         if (extent < 512 * pgss_max)
1920                 return false;
1921
1922         /*
1923          * Don't proceed if file is less than about 50% bloat.  Nothing can or
1924          * should be done in the event of unusually large query texts accounting
1925          * for file's large size.  We go to the trouble of maintaining the mean
1926          * query length in order to prevent garbage collection from thrashing
1927          * uselessly.
1928          */
1929         if (extent < pgss->mean_query_len * pgss_max * 2)
1930                 return false;
1931
1932         return true;
1933 }
1934
1935 /*
1936  * Garbage-collect orphaned query texts in external file.
1937  *
1938  * This won't be called often in the typical case, since it's likely that
1939  * there won't be too much churn, and besides, a similar compaction process
1940  * occurs when serializing to disk at shutdown or as part of resetting.
1941  * Despite this, it seems prudent to plan for the edge case where the file
1942  * becomes unreasonably large, with no other method of compaction likely to
1943  * occur in the foreseeable future.
1944  *
1945  * The caller must hold an exclusive lock on pgss->lock.
1946  */
1947 static void
1948 gc_qtexts(void)
1949 {
1950         char       *qbuffer;
1951         Size            qbuffer_size;
1952         FILE       *qfile;
1953         HASH_SEQ_STATUS hash_seq;
1954         pgssEntry  *entry;
1955         Size            extent;
1956         int                     nentries;
1957
1958         /*
1959          * When called from pgss_store, some other session might have proceeded
1960          * with garbage collection in the no-lock-held interim of lock strength
1961          * escalation.  Check once more that this is actually necessary.
1962          */
1963         if (!need_gc_qtexts())
1964                 return;
1965
1966         /*
1967          * Load the old texts file.  If we fail (out of memory, for instance) just
1968          * skip the garbage collection.
1969          */
1970         qbuffer = qtext_load_file(&qbuffer_size);
1971         if (qbuffer == NULL)
1972                 return;
1973
1974         /*
1975          * We overwrite the query texts file in place, so as to reduce the risk of
1976          * an out-of-disk-space failure.  Since the file is guaranteed not to get
1977          * larger, this should always work on traditional filesystems; though we
1978          * could still lose on copy-on-write filesystems.
1979          */
1980         qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
1981         if (qfile == NULL)
1982         {
1983                 ereport(LOG,
1984                                 (errcode_for_file_access(),
1985                                  errmsg("could not write pg_stat_statement file \"%s\": %m",
1986                                                 PGSS_TEXT_FILE)));
1987                 goto gc_fail;
1988         }
1989
1990         extent = 0;
1991         nentries = 0;
1992
1993         hash_seq_init(&hash_seq, pgss_hash);
1994         while ((entry = hash_seq_search(&hash_seq)) != NULL)
1995         {
1996                 int                     query_len = entry->query_len;
1997                 char       *qry = qtext_fetch(entry->query_offset,
1998                                                                           query_len,
1999                                                                           qbuffer,
2000                                                                           qbuffer_size);
2001
2002                 if (qry == NULL)
2003                 {
2004                         /* Trouble ... drop the text */
2005                         entry->query_offset = 0;
2006                         entry->query_len = -1;
2007                         continue;
2008                 }
2009
2010                 if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
2011                 {
2012                         ereport(LOG,
2013                                         (errcode_for_file_access(),
2014                                   errmsg("could not write pg_stat_statement file \"%s\": %m",
2015                                                  PGSS_TEXT_FILE)));
2016                         hash_seq_term(&hash_seq);
2017                         goto gc_fail;
2018                 }
2019
2020                 entry->query_offset = extent;
2021                 extent += query_len + 1;
2022                 nentries++;
2023         }
2024
2025         /*
2026          * Truncate away any now-unused space.  If this fails for some odd reason,
2027          * we log it, but there's no need to fail.
2028          */
2029         if (ftruncate(fileno(qfile), extent) != 0)
2030                 ereport(LOG,
2031                                 (errcode_for_file_access(),
2032                            errmsg("could not truncate pg_stat_statement file \"%s\": %m",
2033                                           PGSS_TEXT_FILE)));
2034
2035         if (FreeFile(qfile))
2036         {
2037                 ereport(LOG,
2038                                 (errcode_for_file_access(),
2039                                  errmsg("could not write pg_stat_statement file \"%s\": %m",
2040                                                 PGSS_TEXT_FILE)));
2041                 qfile = NULL;
2042                 goto gc_fail;
2043         }
2044
2045         elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
2046                  pgss->extent, extent);
2047
2048         /* Reset the shared extent pointer */
2049         pgss->extent = extent;
2050
2051         /*
2052          * Also update the mean query length, to be sure that need_gc_qtexts()
2053          * won't still think we have a problem.
2054          */
2055         if (nentries > 0)
2056                 pgss->mean_query_len = extent / nentries;
2057         else
2058                 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2059
2060         free(qbuffer);
2061
2062         /*
2063          * OK, count a garbage collection cycle.  (Note: even though we have
2064          * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
2065          * other processes may examine gc_count while holding only the mutex.
2066          * Also, we have to advance the count *after* we've rewritten the file,
2067          * else other processes might not realize they read a stale file.)
2068          */
2069         record_gc_qtexts();
2070
2071         return;
2072
2073 gc_fail:
2074         /* clean up resources */
2075         if (qfile)
2076                 FreeFile(qfile);
2077         if (qbuffer)
2078                 free(qbuffer);
2079
2080         /*
2081          * Since the contents of the external file are now uncertain, mark all
2082          * hashtable entries as having invalid texts.
2083          */
2084         hash_seq_init(&hash_seq, pgss_hash);
2085         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2086         {
2087                 entry->query_offset = 0;
2088                 entry->query_len = -1;
2089         }
2090
2091         /* Seems like a good idea to bump the GC count even though we failed */
2092         record_gc_qtexts();
2093 }
2094
2095 /*
2096  * Release all entries.
2097  */
2098 static void
2099 entry_reset(void)
2100 {
2101         HASH_SEQ_STATUS hash_seq;
2102         pgssEntry  *entry;
2103         FILE       *qfile;
2104
2105         LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
2106
2107         hash_seq_init(&hash_seq, pgss_hash);
2108         while ((entry = hash_seq_search(&hash_seq)) != NULL)
2109         {
2110                 hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2111         }
2112
2113         /*
2114          * Write new empty query file, perhaps even creating a new one to recover
2115          * if the file was missing.
2116          */
2117         qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2118         if (qfile == NULL)
2119         {
2120                 ereport(LOG,
2121                                 (errcode_for_file_access(),
2122                                  errmsg("could not create pg_stat_statement file \"%s\": %m",
2123                                                 PGSS_TEXT_FILE)));
2124                 goto done;
2125         }
2126
2127         /* If ftruncate fails, log it, but it's not a fatal problem */
2128         if (ftruncate(fileno(qfile), 0) != 0)
2129                 ereport(LOG,
2130                                 (errcode_for_file_access(),
2131                            errmsg("could not truncate pg_stat_statement file \"%s\": %m",
2132                                           PGSS_TEXT_FILE)));
2133
2134         FreeFile(qfile);
2135
2136 done:
2137         pgss->extent = 0;
2138         /* This counts as a query text garbage collection for our purposes */
2139         record_gc_qtexts();
2140
2141         LWLockRelease(pgss->lock);
2142 }
2143 #endif
2144
2145 /*
2146  * AppendJumble: Append a value that is substantive in a given query to
2147  * the current jumble.
2148  */
2149 static void
2150 AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size)
2151 {
2152         unsigned char *jumble = jstate->jumble;
2153         Size            jumble_len = jstate->jumble_len;
2154
2155         /*
2156          * Whenever the jumble buffer is full, we hash the current contents and
2157          * reset the buffer to contain just that hash value, thus relying on the
2158          * hash to summarize everything so far.
2159          */
2160         while (size > 0)
2161         {
2162                 Size            part_size;
2163
2164                 if (jumble_len >= JUMBLE_SIZE)
2165                 {
2166                         uint32          start_hash = hash_any(jumble, JUMBLE_SIZE);
2167
2168                         memcpy(jumble, &start_hash, sizeof(start_hash));
2169                         jumble_len = sizeof(start_hash);
2170                 }
2171                 part_size = Min(size, JUMBLE_SIZE - jumble_len);
2172                 memcpy(jumble + jumble_len, item, part_size);
2173                 jumble_len += part_size;
2174                 item += part_size;
2175                 size -= part_size;
2176         }
2177         jstate->jumble_len = jumble_len;
2178 }
2179
/*
 * Shorthands for feeding a value to AppendJumble().  Both expect a variable
 * named "jstate" to be in scope at the point of use.
 *
 * APP_JUMB appends the raw bytes of an lvalue (sizeof(item) of them);
 * APP_JUMB_STRING appends a C string including its terminating null.
 */
#define APP_JUMB(item) AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
#define APP_JUMB_STRING(str) AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1)
2188
2189 /*
2190  * JumbleQuery: Selectively serialize the query tree, appending significant
2191  * data to the "query jumble" while ignoring nonsignificant data.
2192  *
2193  * Rule of thumb for what to include is that we should ignore anything not
2194  * semantically significant (such as alias names) as well as anything that can
2195  * be deduced from child nodes (else we'd just be double-hashing that piece
2196  * of information).
2197  */
2198 static void
2199 JumbleQuery(pgssJumbleState *jstate, Query *query)
2200 {
2201         Assert(IsA(query, Query));
2202         Assert(query->utilityStmt == NULL);
2203
2204         APP_JUMB(query->commandType);
2205         /* resultRelation is usually predictable from commandType */
2206         JumbleExpr(jstate, (Node *) query->cteList);
2207         JumbleRangeTable(jstate, query->rtable);
2208         JumbleExpr(jstate, (Node *) query->jointree);
2209         JumbleExpr(jstate, (Node *) query->targetList);
2210         JumbleExpr(jstate, (Node *) query->returningList);
2211         JumbleExpr(jstate, (Node *) query->groupClause);
2212         JumbleExpr(jstate, query->havingQual);
2213         JumbleExpr(jstate, (Node *) query->windowClause);
2214         JumbleExpr(jstate, (Node *) query->distinctClause);
2215         JumbleExpr(jstate, (Node *) query->sortClause);
2216         JumbleExpr(jstate, query->limitOffset);
2217         JumbleExpr(jstate, query->limitCount);
2218         /* we ignore rowMarks */
2219         JumbleExpr(jstate, query->setOperations);
2220 }
2221
2222 /*
2223  * Jumble a range table
2224  */
2225 static void
2226 JumbleRangeTable(pgssJumbleState *jstate, List *rtable)
2227 {
2228         ListCell   *lc;
2229
2230         foreach(lc, rtable)
2231         {
2232                 RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
2233
2234                 Assert(IsA(rte, RangeTblEntry));
2235                 APP_JUMB(rte->rtekind);
2236                 switch (rte->rtekind)
2237                 {
2238                         case RTE_RELATION:
2239                                 APP_JUMB(rte->relid);
2240                                 break;
2241                         case RTE_SUBQUERY:
2242                                 JumbleQuery(jstate, rte->subquery);
2243                                 break;
2244                         case RTE_JOIN:
2245                                 APP_JUMB(rte->jointype);
2246                                 break;
2247                         case RTE_FUNCTION:
2248                                 JumbleExpr(jstate, (Node *) rte->functions);
2249                                 break;
2250                         case RTE_VALUES:
2251                                 JumbleExpr(jstate, (Node *) rte->values_lists);
2252                                 break;
2253                         case RTE_CTE:
2254
2255                                 /*
2256                                  * Depending on the CTE name here isn't ideal, but it's the
2257                                  * only info we have to identify the referenced WITH item.
2258                                  */
2259                                 APP_JUMB_STRING(rte->ctename);
2260                                 APP_JUMB(rte->ctelevelsup);
2261                                 break;
2262                         default:
2263                                 elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
2264                                 break;
2265                 }
2266         }
2267 }
2268
2269 /*
2270  * Jumble an expression tree
2271  *
2272  * In general this function should handle all the same node types that
2273  * expression_tree_walker() does, and therefore it's coded to be as parallel
2274  * to that function as possible.  However, since we are only invoked on
2275  * queries immediately post-parse-analysis, we need not handle node types
2276  * that only appear in planning.
2277  *
2278  * Note: the reason we don't simply use expression_tree_walker() is that the
2279  * point of that function is to support tree walkers that don't care about
2280  * most tree node types, but here we care about all types.  We should complain
2281  * about any unrecognized node type.
2282  */
2283 static void
2284 JumbleExpr(pgssJumbleState *jstate, Node *node)
2285 {
2286         ListCell   *temp;
2287
2288         if (node == NULL)
2289                 return;
2290
2291         /* Guard against stack overflow due to overly complex expressions */
2292         check_stack_depth();
2293
2294         /*
2295          * We always emit the node's NodeTag, then any additional fields that are
2296          * considered significant, and then we recurse to any child nodes.
2297          */
2298         APP_JUMB(node->type);
2299
2300         switch (nodeTag(node))
2301         {
2302                 case T_Var:
2303                         {
2304                                 Var                *var = (Var *) node;
2305
2306                                 APP_JUMB(var->varno);
2307                                 APP_JUMB(var->varattno);
2308                                 APP_JUMB(var->varlevelsup);
2309                         }
2310                         break;
2311                 case T_Const:
2312                         {
2313                                 Const      *c = (Const *) node;
2314
2315                                 /* We jumble only the constant's type, not its value */
2316                                 APP_JUMB(c->consttype);
2317                                 /* Also, record its parse location for query normalization */
2318                                 RecordConstLocation(jstate, c->location);
2319                         }
2320                         break;
2321                 case T_Param:
2322                         {
2323                                 Param      *p = (Param *) node;
2324
2325                                 APP_JUMB(p->paramkind);
2326                                 APP_JUMB(p->paramid);
2327                                 APP_JUMB(p->paramtype);
2328                         }
2329                         break;
2330                 case T_Aggref:
2331                         {
2332                                 Aggref     *expr = (Aggref *) node;
2333
2334                                 APP_JUMB(expr->aggfnoid);
2335                                 JumbleExpr(jstate, (Node *) expr->aggdirectargs);
2336                                 JumbleExpr(jstate, (Node *) expr->args);
2337                                 JumbleExpr(jstate, (Node *) expr->aggorder);
2338                                 JumbleExpr(jstate, (Node *) expr->aggdistinct);
2339                                 JumbleExpr(jstate, (Node *) expr->aggfilter);
2340                         }
2341                         break;
2342                 case T_WindowFunc:
2343                         {
2344                                 WindowFunc *expr = (WindowFunc *) node;
2345
2346                                 APP_JUMB(expr->winfnoid);
2347                                 APP_JUMB(expr->winref);
2348                                 JumbleExpr(jstate, (Node *) expr->args);
2349                                 JumbleExpr(jstate, (Node *) expr->aggfilter);
2350                         }
2351                         break;
2352                 case T_ArrayRef:
2353                         {
2354                                 ArrayRef   *aref = (ArrayRef *) node;
2355
2356                                 JumbleExpr(jstate, (Node *) aref->refupperindexpr);
2357                                 JumbleExpr(jstate, (Node *) aref->reflowerindexpr);
2358                                 JumbleExpr(jstate, (Node *) aref->refexpr);
2359                                 JumbleExpr(jstate, (Node *) aref->refassgnexpr);
2360                         }
2361                         break;
2362                 case T_FuncExpr:
2363                         {
2364                                 FuncExpr   *expr = (FuncExpr *) node;
2365
2366                                 APP_JUMB(expr->funcid);
2367                                 JumbleExpr(jstate, (Node *) expr->args);
2368                         }
2369                         break;
2370                 case T_NamedArgExpr:
2371                         {
2372                                 NamedArgExpr *nae = (NamedArgExpr *) node;
2373
2374                                 APP_JUMB(nae->argnumber);
2375                                 JumbleExpr(jstate, (Node *) nae->arg);
2376                         }
2377                         break;
2378                 case T_OpExpr:
2379                 case T_DistinctExpr:    /* struct-equivalent to OpExpr */
2380                 case T_NullIfExpr:              /* struct-equivalent to OpExpr */
2381                         {
2382                                 OpExpr     *expr = (OpExpr *) node;
2383
2384                                 APP_JUMB(expr->opno);
2385                                 JumbleExpr(jstate, (Node *) expr->args);
2386                         }
2387                         break;
2388                 case T_ScalarArrayOpExpr:
2389                         {
2390                                 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;
2391
2392                                 APP_JUMB(expr->opno);
2393                                 APP_JUMB(expr->useOr);
2394                                 JumbleExpr(jstate, (Node *) expr->args);
2395                         }
2396                         break;
2397                 case T_BoolExpr:
2398                         {
2399                                 BoolExpr   *expr = (BoolExpr *) node;
2400
2401                                 APP_JUMB(expr->boolop);
2402                                 JumbleExpr(jstate, (Node *) expr->args);
2403                         }
2404                         break;
2405                 case T_SubLink:
2406                         {
2407                                 SubLink    *sublink = (SubLink *) node;
2408
2409                                 APP_JUMB(sublink->subLinkType);
2410                                 JumbleExpr(jstate, (Node *) sublink->testexpr);
2411                                 JumbleQuery(jstate, (Query *) sublink->subselect);
2412                         }
2413                         break;
2414                 case T_FieldSelect:
2415                         {
2416                                 FieldSelect *fs = (FieldSelect *) node;
2417
2418                                 APP_JUMB(fs->fieldnum);
2419                                 JumbleExpr(jstate, (Node *) fs->arg);
2420                         }
2421                         break;
2422                 case T_FieldStore:
2423                         {
2424                                 FieldStore *fstore = (FieldStore *) node;
2425
2426                                 JumbleExpr(jstate, (Node *) fstore->arg);
2427                                 JumbleExpr(jstate, (Node *) fstore->newvals);
2428                         }
2429                         break;
2430                 case T_RelabelType:
2431                         {
2432                                 RelabelType *rt = (RelabelType *) node;
2433
2434                                 APP_JUMB(rt->resulttype);
2435                                 JumbleExpr(jstate, (Node *) rt->arg);
2436                         }
2437                         break;
2438                 case T_CoerceViaIO:
2439                         {
2440                                 CoerceViaIO *cio = (CoerceViaIO *) node;
2441
2442                                 APP_JUMB(cio->resulttype);
2443                                 JumbleExpr(jstate, (Node *) cio->arg);
2444                         }
2445                         break;
2446                 case T_ArrayCoerceExpr:
2447                         {
2448                                 ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;
2449
2450                                 APP_JUMB(acexpr->resulttype);
2451                                 JumbleExpr(jstate, (Node *) acexpr->arg);
2452                         }
2453                         break;
2454                 case T_ConvertRowtypeExpr:
2455                         {
2456                                 ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;
2457
2458                                 APP_JUMB(crexpr->resulttype);
2459                                 JumbleExpr(jstate, (Node *) crexpr->arg);
2460                         }
2461                         break;
2462                 case T_CollateExpr:
2463                         {
2464                                 CollateExpr *ce = (CollateExpr *) node;
2465
2466                                 APP_JUMB(ce->collOid);
2467                                 JumbleExpr(jstate, (Node *) ce->arg);
2468                         }
2469                         break;
2470                 case T_CaseExpr:
2471                         {
2472                                 CaseExpr   *caseexpr = (CaseExpr *) node;
2473
2474                                 JumbleExpr(jstate, (Node *) caseexpr->arg);
2475                                 foreach(temp, caseexpr->args)
2476                                 {
2477                                         CaseWhen   *when = (CaseWhen *) lfirst(temp);
2478
2479                                         Assert(IsA(when, CaseWhen));
2480                                         JumbleExpr(jstate, (Node *) when->expr);
2481                                         JumbleExpr(jstate, (Node *) when->result);
2482                                 }
2483                                 JumbleExpr(jstate, (Node *) caseexpr->defresult);
2484                         }
2485                         break;
2486                 case T_CaseTestExpr:
2487                         {
2488                                 CaseTestExpr *ct = (CaseTestExpr *) node;
2489
2490                                 APP_JUMB(ct->typeId);
2491                         }
2492                         break;
2493                 case T_ArrayExpr:
2494                         JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
2495                         break;
2496                 case T_RowExpr:
2497                         JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
2498                         break;
2499                 case T_RowCompareExpr:
2500                         {
2501                                 RowCompareExpr *rcexpr = (RowCompareExpr *) node;
2502
2503                                 APP_JUMB(rcexpr->rctype);
2504                                 JumbleExpr(jstate, (Node *) rcexpr->largs);
2505                                 JumbleExpr(jstate, (Node *) rcexpr->rargs);
2506                         }
2507                         break;
2508                 case T_CoalesceExpr:
2509                         JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
2510                         break;
2511                 case T_MinMaxExpr:
2512                         {
2513                                 MinMaxExpr *mmexpr = (MinMaxExpr *) node;
2514
2515                                 APP_JUMB(mmexpr->op);
2516                                 JumbleExpr(jstate, (Node *) mmexpr->args);
2517                         }
2518                         break;
2519                 case T_XmlExpr:
2520                         {
2521                                 XmlExpr    *xexpr = (XmlExpr *) node;
2522
2523                                 APP_JUMB(xexpr->op);
2524                                 JumbleExpr(jstate, (Node *) xexpr->named_args);
2525                                 JumbleExpr(jstate, (Node *) xexpr->args);
2526                         }
2527                         break;
2528                 case T_NullTest:
2529                         {
2530                                 NullTest   *nt = (NullTest *) node;
2531
2532                                 APP_JUMB(nt->nulltesttype);
2533                                 JumbleExpr(jstate, (Node *) nt->arg);
2534                         }
2535                         break;
2536                 case T_BooleanTest:
2537                         {
2538                                 BooleanTest *bt = (BooleanTest *) node;
2539
2540                                 APP_JUMB(bt->booltesttype);
2541                                 JumbleExpr(jstate, (Node *) bt->arg);
2542                         }
2543                         break;
2544                 case T_CoerceToDomain:
2545                         {
2546                                 CoerceToDomain *cd = (CoerceToDomain *) node;
2547
2548                                 APP_JUMB(cd->resulttype);
2549                                 JumbleExpr(jstate, (Node *) cd->arg);
2550                         }
2551                         break;
2552                 case T_CoerceToDomainValue:
2553                         {
2554                                 CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;
2555
2556                                 APP_JUMB(cdv->typeId);
2557                         }
2558                         break;
2559                 case T_SetToDefault:
2560                         {
2561                                 SetToDefault *sd = (SetToDefault *) node;
2562
2563                                 APP_JUMB(sd->typeId);
2564                         }
2565                         break;
2566                 case T_CurrentOfExpr:
2567                         {
2568                                 CurrentOfExpr *ce = (CurrentOfExpr *) node;
2569
2570                                 APP_JUMB(ce->cvarno);
2571                                 if (ce->cursor_name)
2572                                         APP_JUMB_STRING(ce->cursor_name);
2573                                 APP_JUMB(ce->cursor_param);
2574                         }
2575                         break;
2576                 case T_TargetEntry:
2577                         {
2578                                 TargetEntry *tle = (TargetEntry *) node;
2579
2580                                 APP_JUMB(tle->resno);
2581                                 APP_JUMB(tle->ressortgroupref);
2582                                 JumbleExpr(jstate, (Node *) tle->expr);
2583                         }
2584                         break;
2585                 case T_RangeTblRef:
2586                         {
2587                                 RangeTblRef *rtr = (RangeTblRef *) node;
2588
2589                                 APP_JUMB(rtr->rtindex);
2590                         }
2591                         break;
2592                 case T_JoinExpr:
2593                         {
2594                                 JoinExpr   *join = (JoinExpr *) node;
2595
2596                                 APP_JUMB(join->jointype);
2597                                 APP_JUMB(join->isNatural);
2598                                 APP_JUMB(join->rtindex);
2599                                 JumbleExpr(jstate, join->larg);
2600                                 JumbleExpr(jstate, join->rarg);
2601                                 JumbleExpr(jstate, join->quals);
2602                         }
2603                         break;
2604                 case T_FromExpr:
2605                         {
2606                                 FromExpr   *from = (FromExpr *) node;
2607
2608                                 JumbleExpr(jstate, (Node *) from->fromlist);
2609                                 JumbleExpr(jstate, from->quals);
2610                         }
2611                         break;
2612                 case T_List:
2613                         foreach(temp, (List *) node)
2614                         {
2615                                 JumbleExpr(jstate, (Node *) lfirst(temp));
2616                         }
2617                         break;
2618                 case T_SortGroupClause:
2619                         {
2620                                 SortGroupClause *sgc = (SortGroupClause *) node;
2621
2622                                 APP_JUMB(sgc->tleSortGroupRef);
2623                                 APP_JUMB(sgc->eqop);
2624                                 APP_JUMB(sgc->sortop);
2625                                 APP_JUMB(sgc->nulls_first);
2626                         }
2627                         break;
2628                 case T_WindowClause:
2629                         {
2630                                 WindowClause *wc = (WindowClause *) node;
2631
2632                                 APP_JUMB(wc->winref);
2633                                 APP_JUMB(wc->frameOptions);
2634                                 JumbleExpr(jstate, (Node *) wc->partitionClause);
2635                                 JumbleExpr(jstate, (Node *) wc->orderClause);
2636                                 JumbleExpr(jstate, wc->startOffset);
2637                                 JumbleExpr(jstate, wc->endOffset);
2638                         }
2639                         break;
2640                 case T_CommonTableExpr:
2641                         {
2642                                 CommonTableExpr *cte = (CommonTableExpr *) node;
2643
2644                                 /* we store the string name because RTE_CTE RTEs need it */
2645                                 APP_JUMB_STRING(cte->ctename);
2646                                 JumbleQuery(jstate, (Query *) cte->ctequery);
2647                         }
2648                         break;
2649                 case T_SetOperationStmt:
2650                         {
2651                                 SetOperationStmt *setop = (SetOperationStmt *) node;
2652
2653                                 APP_JUMB(setop->op);
2654                                 APP_JUMB(setop->all);
2655                                 JumbleExpr(jstate, setop->larg);
2656                                 JumbleExpr(jstate, setop->rarg);
2657                         }
2658                         break;
2659                 case T_RangeTblFunction:
2660                         {
2661                                 RangeTblFunction *rtfunc = (RangeTblFunction *) node;
2662
2663                                 JumbleExpr(jstate, rtfunc->funcexpr);
2664                         }
2665                         break;
2666                 default:
2667                         /* Only a warning, since we can stumble along anyway */
2668                         elog(WARNING, "unrecognized node type: %d",
2669                                  (int) nodeTag(node));
2670                         break;
2671         }
2672 }
2673
2674 /*
2675  * Record location of constant within query string of query tree
2676  * that is currently being walked.
2677  */
2678 static void
2679 RecordConstLocation(pgssJumbleState *jstate, int location)
2680 {
2681         /* -1 indicates unknown or undefined location */
2682         if (location >= 0)
2683         {
2684                 /* enlarge array if needed */
2685                 if (jstate->clocations_count >= jstate->clocations_buf_size)
2686                 {
2687                         jstate->clocations_buf_size *= 2;
2688                         jstate->clocations = (pgssLocationLen *)
2689                                 repalloc(jstate->clocations,
2690                                                  jstate->clocations_buf_size *
2691                                                  sizeof(pgssLocationLen));
2692                 }
2693                 jstate->clocations[jstate->clocations_count].location = location;
2694                 /* initialize lengths to -1 to simplify fill_in_constant_lengths */
2695                 jstate->clocations[jstate->clocations_count].length = -1;
2696                 jstate->clocations_count++;
2697         }
2698 }
2699
2700 /*
2701  * Generate a normalized version of the query string that will be used to
2702  * represent all similar queries.
2703  *
2704  * Note that the normalized representation may well vary depending on
2705  * just which "equivalent" query is used to create the hashtable entry.
2706  * We assume this is OK.
2707  *
2708  * *query_len_p contains the input string length, and is updated with
2709  * the result string length (which cannot be longer) on exit.
2710  *
2711  * Returns a palloc'd string.
2712  */
2713 static char *
2714 generate_normalized_query(pgssJumbleState *jstate, const char *query,
2715                                                   int *query_len_p, int encoding)
2716 {
2717         char       *norm_query;
2718         int                     query_len = *query_len_p;
2719         int                     i,
2720                                 len_to_wrt,             /* Length (in bytes) to write */
2721                                 quer_loc = 0,   /* Source query byte location */
2722                                 n_quer_loc = 0, /* Normalized query byte location */
2723                                 last_off = 0,   /* Offset from start for previous tok */
2724                                 last_tok_len = 0;               /* Length (in bytes) of that tok */
2725
2726         /*
2727          * Get constants' lengths (core system only gives us locations).  Note
2728          * this also ensures the items are sorted by location.
2729          */
2730         fill_in_constant_lengths(jstate, query);
2731
2732         /* Allocate result buffer */
2733         norm_query = palloc(query_len + 1);
2734
2735         for (i = 0; i < jstate->clocations_count; i++)
2736         {
2737                 int                     off,            /* Offset from start for cur tok */
2738                                         tok_len;        /* Length (in bytes) of that tok */
2739
2740                 off = jstate->clocations[i].location;
2741                 tok_len = jstate->clocations[i].length;
2742
2743                 if (tok_len < 0)
2744                         continue;                       /* ignore any duplicates */
2745
2746                 /* Copy next chunk (what precedes the next constant) */
2747                 len_to_wrt = off - last_off;
2748                 len_to_wrt -= last_tok_len;
2749
2750                 Assert(len_to_wrt >= 0);
2751                 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
2752                 n_quer_loc += len_to_wrt;
2753
2754                 /* And insert a '?' in place of the constant token */
2755                 norm_query[n_quer_loc++] = '?';
2756
2757                 quer_loc = off + tok_len;
2758                 last_off = off;
2759                 last_tok_len = tok_len;
2760         }
2761
2762         /*
2763          * We've copied up until the last ignorable constant.  Copy over the
2764          * remaining bytes of the original query string.
2765          */
2766         len_to_wrt = query_len - quer_loc;
2767
2768         Assert(len_to_wrt >= 0);
2769         memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
2770         n_quer_loc += len_to_wrt;
2771
2772         Assert(n_quer_loc <= query_len);
2773         norm_query[n_quer_loc] = '\0';
2774
2775         *query_len_p = n_quer_loc;
2776         return norm_query;
2777 }
2778
2779 /*
2780  * Given a valid SQL string and an array of constant-location records,
2781  * fill in the textual lengths of those constants.
2782  *
2783  * The constants may use any allowed constant syntax, such as float literals,
2784  * bit-strings, single-quoted strings and dollar-quoted strings.  This is
2785  * accomplished by using the public API for the core scanner.
2786  *
2787  * It is the caller's job to ensure that the string is a valid SQL statement
2788  * with constants at the indicated locations.  Since in practice the string
2789  * has already been parsed, and the locations that the caller provides will
2790  * have originated from within the authoritative parser, this should not be
2791  * a problem.
2792  *
2793  * Duplicate constant pointers are possible, and will have their lengths
2794  * marked as '-1', so that they are later ignored.  (Actually, we assume the
2795  * lengths were initialized as -1 to start with, and don't change them here.)
2796  *
2797  * N.B. There is an assumption that a '-' character at a Const location begins
2798  * a negative numeric constant.  This precludes there ever being another
2799  * reason for a constant to start with a '-'.
2800  */
2801 static void
2802 fill_in_constant_lengths(pgssJumbleState *jstate, const char *query)
2803 {
2804         pgssLocationLen *locs;
2805         core_yyscan_t yyscanner;
2806         core_yy_extra_type yyextra;
2807         core_YYSTYPE yylval;
2808         YYLTYPE         yylloc;
2809         int                     last_loc = -1;
2810         int                     i;
2811
2812         /*
2813          * Sort the records by location so that we can process them in order while
2814          * scanning the query text.
2815          */
2816         if (jstate->clocations_count > 1)
2817                 qsort(jstate->clocations, jstate->clocations_count,
2818                           sizeof(pgssLocationLen), comp_location);
2819         locs = jstate->clocations;
2820
2821         /* initialize the flex scanner --- should match raw_parser() */
2822         yyscanner = scanner_init(query,
2823                                                          &yyextra,
2824                                                          ScanKeywords,
2825                                                          NumScanKeywords);
2826
2827         /* Search for each constant, in sequence */
2828         for (i = 0; i < jstate->clocations_count; i++)
2829         {
2830                 int                     loc = locs[i].location;
2831                 int                     tok;
2832
2833                 Assert(loc >= 0);
2834
2835                 if (loc <= last_loc)
2836                         continue;                       /* Duplicate constant, ignore */
2837
2838                 /* Lex tokens until we find the desired constant */
2839                 for (;;)
2840                 {
2841                         tok = core_yylex(&yylval, &yylloc, yyscanner);
2842
2843                         /* We should not hit end-of-string, but if we do, behave sanely */
2844                         if (tok == 0)
2845                                 break;                  /* out of inner for-loop */
2846
2847                         /*
2848                          * We should find the token position exactly, but if we somehow
2849                          * run past it, work with that.
2850                          */
2851                         if (yylloc >= loc)
2852                         {
2853                                 if (query[loc] == '-')
2854                                 {
2855                                         /*
2856                                          * It's a negative value - this is the one and only case
2857                                          * where we replace more than a single token.
2858                                          *
2859                                          * Do not compensate for the core system's special-case
2860                                          * adjustment of location to that of the leading '-'
2861                                          * operator in the event of a negative constant.  It is
2862                                          * also useful for our purposes to start from the minus
2863                                          * symbol.  In this way, queries like "select * from foo
2864                                          * where bar = 1" and "select * from foo where bar = -2"
2865                                          * will have identical normalized query strings.
2866                                          */
2867                                         tok = core_yylex(&yylval, &yylloc, yyscanner);
2868                                         if (tok == 0)
2869                                                 break;  /* out of inner for-loop */
2870                                 }
2871
2872                                 /*
2873                                  * We now rely on the assumption that flex has placed a zero
2874                                  * byte after the text of the current token in scanbuf.
2875                                  */
2876                                 locs[i].length = strlen(yyextra.scanbuf + loc);
2877                                 break;                  /* out of inner for-loop */
2878                         }
2879                 }
2880
2881                 /* If we hit end-of-string, give up, leaving remaining lengths -1 */
2882                 if (tok == 0)
2883                         break;
2884
2885                 last_loc = loc;
2886         }
2887
2888         scanner_finish(yyscanner);
2889 }
2890
2891 /*
2892  * comp_location: comparator for qsorting pgssLocationLen structs by location
2893  */
2894 static int
2895 comp_location(const void *a, const void *b)
2896 {
2897         int                     l = ((const pgssLocationLen *) a)->location;
2898         int                     r = ((const pgssLocationLen *) b)->location;
2899
2900         if (l < r)
2901                 return -1;
2902         else if (l > r)
2903                 return +1;
2904         else
2905                 return 0;
2906 }