1 /*-------------------------------------------------------------------------
5 * Part of pg_stat_statements.c in PostgreSQL 10.
7 * Copyright (c) 2008-2020, PostgreSQL Global Development Group
9 *-------------------------------------------------------------------------
13 #include "parser/scanner.h"
15 static char *generate_normalized_query(JumbleState *jstate, const char *query,
16 int query_loc, int *query_len_p);
17 static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
19 static int comp_location(const void *a, const void *b);
24 * Generate a normalized version of the query string that will be used to
25 * represent all similar queries.
27 * Note that the normalized representation may well vary depending on
28 * just which "equivalent" query is used to create the hashtable entry.
29 * We assume this is OK.
31 * If query_loc > 0, then "query" has been advanced by that much compared to
32 * the original string start, so we need to translate the provided locations
33 * to compensate. (This lets us avoid re-scanning statements before the one
34 * of interest, so it's worth doing.)
36 * *query_len_p contains the input string length, and is updated with
37 * the result string length on exit. The resulting string might be longer
38 * or shorter depending on what happens with replacement of constants.
40 * Returns a palloc'd string.
43 generate_normalized_query(JumbleState *jstate, const char *query,
44 int query_loc, int *query_len_p)
47 int query_len = *query_len_p;
49 norm_query_buflen, /* Space allowed for norm_query */
50 len_to_wrt, /* Length (in bytes) to write */
51 quer_loc = 0, /* Source query byte location */
52 n_quer_loc = 0, /* Normalized query byte location */
53 last_off = 0, /* Offset from start for previous tok */
54 last_tok_len = 0; /* Length (in bytes) of that tok */
57 * Get constants' lengths (core system only gives us locations). Note
58 * this also ensures the items are sorted by location.
60 fill_in_constant_lengths(jstate, query, query_loc);
63 * Allow for $n symbols to be longer than the constants they replace.
64 * Constants must take at least one byte in text form, while a $n symbol
65 * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
66 * could refine that limit based on the max value of n for the current
67 * query, but it hardly seems worth any extra effort to do so.
69 norm_query_buflen = query_len + jstate->clocations_count * 10;
71 /* Allocate result buffer */
72 norm_query = palloc(norm_query_buflen + 1);
74 for (i = 0; i < jstate->clocations_count; i++)
76 int off, /* Offset from start for cur tok */
77 tok_len; /* Length (in bytes) of that tok */
79 off = jstate->clocations[i].location;
80 /* Adjust recorded location if we're dealing with partial string */
83 tok_len = jstate->clocations[i].length;
86 continue; /* ignore any duplicates */
88 /* Copy next chunk (what precedes the next constant) */
89 len_to_wrt = off - last_off;
90 len_to_wrt -= last_tok_len;
92 Assert(len_to_wrt >= 0);
93 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
94 n_quer_loc += len_to_wrt;
96 /* And insert a param symbol in place of the constant token */
98 /* !!! START: HERE IS THE PART WHICH IS MODIFIED FOR PG_HINT_PLAN !!! */
99 n_quer_loc += sprintf(norm_query + n_quer_loc, "?");
100 /* !!! END: HERE IS THE PART WHICH IS MODIFIED FOR PG_HINT_PLAN !!! */
102 quer_loc = off + tok_len;
104 last_tok_len = tok_len;
108 * We've copied up until the last ignorable constant. Copy over the
109 * remaining bytes of the original query string.
111 len_to_wrt = query_len - quer_loc;
113 Assert(len_to_wrt >= 0);
114 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
115 n_quer_loc += len_to_wrt;
117 Assert(n_quer_loc <= norm_query_buflen);
118 norm_query[n_quer_loc] = '\0';
120 *query_len_p = n_quer_loc;
125 * Given a valid SQL string and an array of constant-location records,
126 * fill in the textual lengths of those constants.
128 * The constants may use any allowed constant syntax, such as float literals,
129 * bit-strings, single-quoted strings and dollar-quoted strings. This is
130 * accomplished by using the public API for the core scanner.
132 * It is the caller's job to ensure that the string is a valid SQL statement
133 * with constants at the indicated locations. Since in practice the string
134 * has already been parsed, and the locations that the caller provides will
135 * have originated from within the authoritative parser, this should not be
138 * Duplicate constant pointers are possible, and will have their lengths
139 * marked as '-1', so that they are later ignored. (Actually, we assume the
140 * lengths were initialized as -1 to start with, and don't change them here.)
142 * If query_loc > 0, then "query" has been advanced by that much compared to
143 * the original string start, so we need to translate the provided locations
144 * to compensate. (This lets us avoid re-scanning statements before the one
145 * of interest, so it's worth doing.)
147 * N.B. There is an assumption that a '-' character at a Const location begins
148 * a negative numeric constant. This precludes there ever being another
149 * reason for a constant to start with a '-'.
152 fill_in_constant_lengths(JumbleState *jstate, const char *query,
156 core_yyscan_t yyscanner;
157 core_yy_extra_type yyextra;
164 * Sort the records by location so that we can process them in order while
165 * scanning the query text.
167 if (jstate->clocations_count > 1)
168 qsort(jstate->clocations, jstate->clocations_count,
169 sizeof(LocationLen), comp_location);
170 locs = jstate->clocations;
172 /* initialize the flex scanner --- should match raw_parser() */
173 yyscanner = scanner_init(query,
178 /* we don't want to re-emit any escape string warnings */
179 yyextra.escape_string_warning = false;
181 /* Search for each constant, in sequence */
182 for (i = 0; i < jstate->clocations_count; i++)
184 int loc = locs[i].location;
187 /* Adjust recorded location if we're dealing with partial string */
193 continue; /* Duplicate constant, ignore */
195 /* Lex tokens until we find the desired constant */
198 tok = core_yylex(&yylval, &yylloc, yyscanner);
200 /* We should not hit end-of-string, but if we do, behave sanely */
202 break; /* out of inner for-loop */
205 * We should find the token position exactly, but if we somehow
206 * run past it, work with that.
210 if (query[loc] == '-')
213 * It's a negative value - this is the one and only case
214 * where we replace more than a single token.
216 * Do not compensate for the core system's special-case
217 * adjustment of location to that of the leading '-'
218 * operator in the event of a negative constant. It is
219 * also useful for our purposes to start from the minus
220 * symbol. In this way, queries like "select * from foo
221 * where bar = 1" and "select * from foo where bar = -2"
222 * will have identical normalized query strings.
224 tok = core_yylex(&yylval, &yylloc, yyscanner);
226 break; /* out of inner for-loop */
230 * We now rely on the assumption that flex has placed a zero
231 * byte after the text of the current token in scanbuf.
233 locs[i].length = strlen(yyextra.scanbuf + loc);
234 break; /* out of inner for-loop */
238 /* If we hit end-of-string, give up, leaving remaining lengths -1 */
245 scanner_finish(yyscanner);
249 * comp_location: comparator for qsorting LocationLen structs by location
252 comp_location(const void *a, const void *b)
254 int l = ((const LocationLen *) a)->location;
255 int r = ((const LocationLen *) b)->location;