1 /*-------------------------------------------------------------------------
5 * Part of pg_stat_statements.c in PostgreSQL 10.
7 * Copyright (c) 2008-2019, PostgreSQL Global Development Group
9 *-------------------------------------------------------------------------
15 #include "access/hash.h"
16 #include "parser/scanner.h"
18 static void AppendJumble(pgssJumbleState *jstate,
19 const unsigned char *item, Size size);
20 static void JumbleQuery(pgssJumbleState *jstate, Query *query);
21 static void JumbleRangeTable(pgssJumbleState *jstate, List *rtable);
22 static void JumbleExpr(pgssJumbleState *jstate, Node *node);
23 static void RecordConstLocation(pgssJumbleState *jstate, int location);
24 static char *generate_normalized_query(pgssJumbleState *jstate, const char *query,
25 int query_loc, int *query_len_p, int encoding);
26 static void fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
28 static int comp_location(const void *a, const void *b);
31 * AppendJumble: Append a value that is substantive in a given query to
35 AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size)
37 unsigned char *jumble = jstate->jumble;
38 Size jumble_len = jstate->jumble_len;
41 * Whenever the jumble buffer is full, we hash the current contents and
42 * reset the buffer to contain just that hash value, thus relying on the
43 * hash to summarize everything so far.
49 if (jumble_len >= JUMBLE_SIZE)
51 uint32 start_hash = hash_any(jumble, JUMBLE_SIZE);
53 memcpy(jumble, &start_hash, sizeof(start_hash));
54 jumble_len = sizeof(start_hash);
56 part_size = Min(size, JUMBLE_SIZE - jumble_len);
57 memcpy(jumble + jumble_len, item, part_size);
58 jumble_len += part_size;
62 jstate->jumble_len = jumble_len;
66 * Wrappers around AppendJumble to encapsulate details of serialization
67 * of individual local variable elements.
69 #define APP_JUMB(item) \
70 AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
71 #define APP_JUMB_STRING(str) \
72 AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1)
75 * JumbleQuery: Selectively serialize the query tree, appending significant
76 * data to the "query jumble" while ignoring nonsignificant data.
78 * Rule of thumb for what to include is that we should ignore anything not
79 * semantically significant (such as alias names) as well as anything that can
80 * be deduced from child nodes (else we'd just be double-hashing that piece
84 JumbleQuery(pgssJumbleState *jstate, Query *query)
86 Assert(IsA(query, Query));
87 Assert(query->utilityStmt == NULL);
89 APP_JUMB(query->commandType);
90 /* resultRelation is usually predictable from commandType */
91 JumbleExpr(jstate, (Node *) query->cteList);
92 JumbleRangeTable(jstate, query->rtable);
93 JumbleExpr(jstate, (Node *) query->jointree);
94 JumbleExpr(jstate, (Node *) query->targetList);
95 JumbleExpr(jstate, (Node *) query->onConflict);
96 JumbleExpr(jstate, (Node *) query->returningList);
97 JumbleExpr(jstate, (Node *) query->groupClause);
98 JumbleExpr(jstate, (Node *) query->groupingSets);
99 JumbleExpr(jstate, query->havingQual);
100 JumbleExpr(jstate, (Node *) query->windowClause);
101 JumbleExpr(jstate, (Node *) query->distinctClause);
102 JumbleExpr(jstate, (Node *) query->sortClause);
103 JumbleExpr(jstate, query->limitOffset);
104 JumbleExpr(jstate, query->limitCount);
105 /* we ignore rowMarks */
106 JumbleExpr(jstate, query->setOperations);
110 * Jumble a range table
113 JumbleRangeTable(pgssJumbleState *jstate, List *rtable)
119 RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
121 APP_JUMB(rte->rtekind);
122 switch (rte->rtekind)
125 APP_JUMB(rte->relid);
126 JumbleExpr(jstate, (Node *) rte->tablesample);
129 JumbleQuery(jstate, rte->subquery);
132 APP_JUMB(rte->jointype);
135 JumbleExpr(jstate, (Node *) rte->functions);
138 JumbleExpr(jstate, (Node *) rte->tablefunc);
141 JumbleExpr(jstate, (Node *) rte->values_lists);
146 * Depending on the CTE name here isn't ideal, but it's the
147 * only info we have to identify the referenced WITH item.
149 APP_JUMB_STRING(rte->ctename);
150 APP_JUMB(rte->ctelevelsup);
152 case RTE_NAMEDTUPLESTORE:
153 APP_JUMB_STRING(rte->enrname);
156 elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
163 * Jumble an expression tree
165 * In general this function should handle all the same node types that
166 * expression_tree_walker() does, and therefore it's coded to be as parallel
167 * to that function as possible. However, since we are only invoked on
168 * queries immediately post-parse-analysis, we need not handle node types
169 * that only appear in planning.
171 * Note: the reason we don't simply use expression_tree_walker() is that the
172 * point of that function is to support tree walkers that don't care about
173 * most tree node types, but here we care about all types. We should complain
174 * about any unrecognized node type.
177 JumbleExpr(pgssJumbleState *jstate, Node *node)
184 /* Guard against stack overflow due to overly complex expressions */
188 * We always emit the node's NodeTag, then any additional fields that are
189 * considered significant, and then we recurse to any child nodes.
191 APP_JUMB(node->type);
193 switch (nodeTag(node))
197 Var *var = (Var *) node;
199 APP_JUMB(var->varno);
200 APP_JUMB(var->varattno);
201 APP_JUMB(var->varlevelsup);
206 Const *c = (Const *) node;
208 /* We jumble only the constant's type, not its value */
209 APP_JUMB(c->consttype);
210 /* Also, record its parse location for query normalization */
211 RecordConstLocation(jstate, c->location);
216 Param *p = (Param *) node;
218 APP_JUMB(p->paramkind);
219 APP_JUMB(p->paramid);
220 APP_JUMB(p->paramtype);
221 /* Also, track the highest external Param id */
222 if (p->paramkind == PARAM_EXTERN &&
223 p->paramid > jstate->highest_extern_param_id)
224 jstate->highest_extern_param_id = p->paramid;
229 Aggref *expr = (Aggref *) node;
231 APP_JUMB(expr->aggfnoid);
232 JumbleExpr(jstate, (Node *) expr->aggdirectargs);
233 JumbleExpr(jstate, (Node *) expr->args);
234 JumbleExpr(jstate, (Node *) expr->aggorder);
235 JumbleExpr(jstate, (Node *) expr->aggdistinct);
236 JumbleExpr(jstate, (Node *) expr->aggfilter);
241 GroupingFunc *grpnode = (GroupingFunc *) node;
243 JumbleExpr(jstate, (Node *) grpnode->refs);
248 WindowFunc *expr = (WindowFunc *) node;
250 APP_JUMB(expr->winfnoid);
251 APP_JUMB(expr->winref);
252 JumbleExpr(jstate, (Node *) expr->args);
253 JumbleExpr(jstate, (Node *) expr->aggfilter);
258 ArrayRef *aref = (ArrayRef *) node;
260 JumbleExpr(jstate, (Node *) aref->refupperindexpr);
261 JumbleExpr(jstate, (Node *) aref->reflowerindexpr);
262 JumbleExpr(jstate, (Node *) aref->refexpr);
263 JumbleExpr(jstate, (Node *) aref->refassgnexpr);
268 FuncExpr *expr = (FuncExpr *) node;
270 APP_JUMB(expr->funcid);
271 JumbleExpr(jstate, (Node *) expr->args);
276 NamedArgExpr *nae = (NamedArgExpr *) node;
278 APP_JUMB(nae->argnumber);
279 JumbleExpr(jstate, (Node *) nae->arg);
283 case T_DistinctExpr: /* struct-equivalent to OpExpr */
284 case T_NullIfExpr: /* struct-equivalent to OpExpr */
286 OpExpr *expr = (OpExpr *) node;
288 APP_JUMB(expr->opno);
289 JumbleExpr(jstate, (Node *) expr->args);
292 case T_ScalarArrayOpExpr:
294 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;
296 APP_JUMB(expr->opno);
297 APP_JUMB(expr->useOr);
298 JumbleExpr(jstate, (Node *) expr->args);
303 BoolExpr *expr = (BoolExpr *) node;
305 APP_JUMB(expr->boolop);
306 JumbleExpr(jstate, (Node *) expr->args);
311 SubLink *sublink = (SubLink *) node;
313 APP_JUMB(sublink->subLinkType);
314 APP_JUMB(sublink->subLinkId);
315 JumbleExpr(jstate, (Node *) sublink->testexpr);
316 JumbleQuery(jstate, castNode(Query, sublink->subselect));
321 FieldSelect *fs = (FieldSelect *) node;
323 APP_JUMB(fs->fieldnum);
324 JumbleExpr(jstate, (Node *) fs->arg);
329 FieldStore *fstore = (FieldStore *) node;
331 JumbleExpr(jstate, (Node *) fstore->arg);
332 JumbleExpr(jstate, (Node *) fstore->newvals);
337 RelabelType *rt = (RelabelType *) node;
339 APP_JUMB(rt->resulttype);
340 JumbleExpr(jstate, (Node *) rt->arg);
345 CoerceViaIO *cio = (CoerceViaIO *) node;
347 APP_JUMB(cio->resulttype);
348 JumbleExpr(jstate, (Node *) cio->arg);
351 case T_ArrayCoerceExpr:
353 ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;
355 APP_JUMB(acexpr->resulttype);
356 JumbleExpr(jstate, (Node *) acexpr->arg);
359 case T_ConvertRowtypeExpr:
361 ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;
363 APP_JUMB(crexpr->resulttype);
364 JumbleExpr(jstate, (Node *) crexpr->arg);
369 CollateExpr *ce = (CollateExpr *) node;
371 APP_JUMB(ce->collOid);
372 JumbleExpr(jstate, (Node *) ce->arg);
377 CaseExpr *caseexpr = (CaseExpr *) node;
379 JumbleExpr(jstate, (Node *) caseexpr->arg);
380 foreach(temp, caseexpr->args)
382 CaseWhen *when = lfirst_node(CaseWhen, temp);
384 JumbleExpr(jstate, (Node *) when->expr);
385 JumbleExpr(jstate, (Node *) when->result);
387 JumbleExpr(jstate, (Node *) caseexpr->defresult);
392 CaseTestExpr *ct = (CaseTestExpr *) node;
394 APP_JUMB(ct->typeId);
398 JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
401 JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
403 case T_RowCompareExpr:
405 RowCompareExpr *rcexpr = (RowCompareExpr *) node;
407 APP_JUMB(rcexpr->rctype);
408 JumbleExpr(jstate, (Node *) rcexpr->largs);
409 JumbleExpr(jstate, (Node *) rcexpr->rargs);
413 JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
417 MinMaxExpr *mmexpr = (MinMaxExpr *) node;
419 APP_JUMB(mmexpr->op);
420 JumbleExpr(jstate, (Node *) mmexpr->args);
423 case T_SQLValueFunction:
425 SQLValueFunction *svf = (SQLValueFunction *) node;
428 /* type is fully determined by op */
429 APP_JUMB(svf->typmod);
434 XmlExpr *xexpr = (XmlExpr *) node;
437 JumbleExpr(jstate, (Node *) xexpr->named_args);
438 JumbleExpr(jstate, (Node *) xexpr->args);
443 NullTest *nt = (NullTest *) node;
445 APP_JUMB(nt->nulltesttype);
446 JumbleExpr(jstate, (Node *) nt->arg);
451 BooleanTest *bt = (BooleanTest *) node;
453 APP_JUMB(bt->booltesttype);
454 JumbleExpr(jstate, (Node *) bt->arg);
457 case T_CoerceToDomain:
459 CoerceToDomain *cd = (CoerceToDomain *) node;
461 APP_JUMB(cd->resulttype);
462 JumbleExpr(jstate, (Node *) cd->arg);
465 case T_CoerceToDomainValue:
467 CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;
469 APP_JUMB(cdv->typeId);
474 SetToDefault *sd = (SetToDefault *) node;
476 APP_JUMB(sd->typeId);
479 case T_CurrentOfExpr:
481 CurrentOfExpr *ce = (CurrentOfExpr *) node;
483 APP_JUMB(ce->cvarno);
485 APP_JUMB_STRING(ce->cursor_name);
486 APP_JUMB(ce->cursor_param);
489 case T_NextValueExpr:
491 NextValueExpr *nve = (NextValueExpr *) node;
493 APP_JUMB(nve->seqid);
494 APP_JUMB(nve->typeId);
497 case T_InferenceElem:
499 InferenceElem *ie = (InferenceElem *) node;
501 APP_JUMB(ie->infercollid);
502 APP_JUMB(ie->inferopclass);
503 JumbleExpr(jstate, ie->expr);
508 TargetEntry *tle = (TargetEntry *) node;
510 APP_JUMB(tle->resno);
511 APP_JUMB(tle->ressortgroupref);
512 JumbleExpr(jstate, (Node *) tle->expr);
517 RangeTblRef *rtr = (RangeTblRef *) node;
519 APP_JUMB(rtr->rtindex);
524 JoinExpr *join = (JoinExpr *) node;
526 APP_JUMB(join->jointype);
527 APP_JUMB(join->isNatural);
528 APP_JUMB(join->rtindex);
529 JumbleExpr(jstate, join->larg);
530 JumbleExpr(jstate, join->rarg);
531 JumbleExpr(jstate, join->quals);
536 FromExpr *from = (FromExpr *) node;
538 JumbleExpr(jstate, (Node *) from->fromlist);
539 JumbleExpr(jstate, from->quals);
542 case T_OnConflictExpr:
544 OnConflictExpr *conf = (OnConflictExpr *) node;
546 APP_JUMB(conf->action);
547 JumbleExpr(jstate, (Node *) conf->arbiterElems);
548 JumbleExpr(jstate, conf->arbiterWhere);
549 JumbleExpr(jstate, (Node *) conf->onConflictSet);
550 JumbleExpr(jstate, conf->onConflictWhere);
551 APP_JUMB(conf->constraint);
552 APP_JUMB(conf->exclRelIndex);
553 JumbleExpr(jstate, (Node *) conf->exclRelTlist);
557 foreach(temp, (List *) node)
559 JumbleExpr(jstate, (Node *) lfirst(temp));
563 foreach(temp, (List *) node)
565 APP_JUMB(lfirst_int(temp));
568 case T_SortGroupClause:
570 SortGroupClause *sgc = (SortGroupClause *) node;
572 APP_JUMB(sgc->tleSortGroupRef);
574 APP_JUMB(sgc->sortop);
575 APP_JUMB(sgc->nulls_first);
580 GroupingSet *gsnode = (GroupingSet *) node;
582 JumbleExpr(jstate, (Node *) gsnode->content);
587 WindowClause *wc = (WindowClause *) node;
589 APP_JUMB(wc->winref);
590 APP_JUMB(wc->frameOptions);
591 JumbleExpr(jstate, (Node *) wc->partitionClause);
592 JumbleExpr(jstate, (Node *) wc->orderClause);
593 JumbleExpr(jstate, wc->startOffset);
594 JumbleExpr(jstate, wc->endOffset);
597 case T_CommonTableExpr:
599 CommonTableExpr *cte = (CommonTableExpr *) node;
601 /* we store the string name because RTE_CTE RTEs need it */
602 APP_JUMB_STRING(cte->ctename);
603 JumbleQuery(jstate, castNode(Query, cte->ctequery));
606 case T_SetOperationStmt:
608 SetOperationStmt *setop = (SetOperationStmt *) node;
611 APP_JUMB(setop->all);
612 JumbleExpr(jstate, setop->larg);
613 JumbleExpr(jstate, setop->rarg);
616 case T_RangeTblFunction:
618 RangeTblFunction *rtfunc = (RangeTblFunction *) node;
620 JumbleExpr(jstate, rtfunc->funcexpr);
625 TableFunc *tablefunc = (TableFunc *) node;
627 JumbleExpr(jstate, tablefunc->docexpr);
628 JumbleExpr(jstate, tablefunc->rowexpr);
629 JumbleExpr(jstate, (Node *) tablefunc->colexprs);
632 case T_TableSampleClause:
634 TableSampleClause *tsc = (TableSampleClause *) node;
636 APP_JUMB(tsc->tsmhandler);
637 JumbleExpr(jstate, (Node *) tsc->args);
638 JumbleExpr(jstate, (Node *) tsc->repeatable);
642 /* Only a warning, since we can stumble along anyway */
643 elog(WARNING, "unrecognized node type: %d",
644 (int) nodeTag(node));
650 * Record location of constant within query string of query tree
651 * that is currently being walked.
654 RecordConstLocation(pgssJumbleState *jstate, int location)
656 /* -1 indicates unknown or undefined location */
659 /* enlarge array if needed */
660 if (jstate->clocations_count >= jstate->clocations_buf_size)
662 jstate->clocations_buf_size *= 2;
663 jstate->clocations = (pgssLocationLen *)
664 repalloc(jstate->clocations,
665 jstate->clocations_buf_size *
666 sizeof(pgssLocationLen));
668 jstate->clocations[jstate->clocations_count].location = location;
669 /* initialize lengths to -1 to simplify fill_in_constant_lengths */
670 jstate->clocations[jstate->clocations_count].length = -1;
671 jstate->clocations_count++;
676 * Generate a normalized version of the query string that will be used to
677 * represent all similar queries.
679 * Note that the normalized representation may well vary depending on
680 * just which "equivalent" query is used to create the hashtable entry.
681 * We assume this is OK.
683 * If query_loc > 0, then "query" has been advanced by that much compared to
684 * the original string start, so we need to translate the provided locations
685 * to compensate. (This lets us avoid re-scanning statements before the one
686 * of interest, so it's worth doing.)
688 * *query_len_p contains the input string length, and is updated with
689 * the result string length on exit. The resulting string might be longer
690 * or shorter depending on what happens with replacement of constants.
692 * Returns a palloc'd string.
695 generate_normalized_query(pgssJumbleState *jstate, const char *query,
696 int query_loc, int *query_len_p, int encoding)
699 int query_len = *query_len_p;
701 norm_query_buflen, /* Space allowed for norm_query */
702 len_to_wrt, /* Length (in bytes) to write */
703 quer_loc = 0, /* Source query byte location */
704 n_quer_loc = 0, /* Normalized query byte location */
705 last_off = 0, /* Offset from start for previous tok */
706 last_tok_len = 0; /* Length (in bytes) of that tok */
709 * Get constants' lengths (core system only gives us locations). Note
710 * this also ensures the items are sorted by location.
712 fill_in_constant_lengths(jstate, query, query_loc);
715 * Allow for $n symbols to be longer than the constants they replace.
716 * Constants must take at least one byte in text form, while a $n symbol
717 * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
718 * could refine that limit based on the max value of n for the current
719 * query, but it hardly seems worth any extra effort to do so.
721 norm_query_buflen = query_len + jstate->clocations_count * 10;
723 /* Allocate result buffer */
724 norm_query = palloc(norm_query_buflen + 1);
726 for (i = 0; i < jstate->clocations_count; i++)
728 int off, /* Offset from start for cur tok */
729 tok_len; /* Length (in bytes) of that tok */
731 off = jstate->clocations[i].location;
732 /* Adjust recorded location if we're dealing with partial string */
735 tok_len = jstate->clocations[i].length;
738 continue; /* ignore any duplicates */
740 /* Copy next chunk (what precedes the next constant) */
741 len_to_wrt = off - last_off;
742 len_to_wrt -= last_tok_len;
744 Assert(len_to_wrt >= 0);
745 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
746 n_quer_loc += len_to_wrt;
749 * PG_HINT_PLAN: DON'T TAKE IN a6f22e8356 so that the designed behavior
752 /* And insert a '?' in place of the constant token */
753 norm_query[n_quer_loc++] = '?';
755 quer_loc = off + tok_len;
757 last_tok_len = tok_len;
761 * We've copied up until the last ignorable constant. Copy over the
762 * remaining bytes of the original query string.
764 len_to_wrt = query_len - quer_loc;
766 Assert(len_to_wrt >= 0);
767 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
768 n_quer_loc += len_to_wrt;
770 Assert(n_quer_loc <= norm_query_buflen);
771 norm_query[n_quer_loc] = '\0';
773 *query_len_p = n_quer_loc;
778 * Given a valid SQL string and an array of constant-location records,
779 * fill in the textual lengths of those constants.
781 * The constants may use any allowed constant syntax, such as float literals,
782 * bit-strings, single-quoted strings and dollar-quoted strings. This is
783 * accomplished by using the public API for the core scanner.
785 * It is the caller's job to ensure that the string is a valid SQL statement
786 * with constants at the indicated locations. Since in practice the string
787 * has already been parsed, and the locations that the caller provides will
788 * have originated from within the authoritative parser, this should not be
791 * Duplicate constant pointers are possible, and will have their lengths
792 * marked as '-1', so that they are later ignored. (Actually, we assume the
793 * lengths were initialized as -1 to start with, and don't change them here.)
795 * If query_loc > 0, then "query" has been advanced by that much compared to
796 * the original string start, so we need to translate the provided locations
797 * to compensate. (This lets us avoid re-scanning statements before the one
798 * of interest, so it's worth doing.)
800 * N.B. There is an assumption that a '-' character at a Const location begins
801 * a negative numeric constant. This precludes there ever being another
802 * reason for a constant to start with a '-'.
805 fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
808 pgssLocationLen *locs;
809 core_yyscan_t yyscanner;
810 core_yy_extra_type yyextra;
817 * Sort the records by location so that we can process them in order while
818 * scanning the query text.
820 if (jstate->clocations_count > 1)
821 qsort(jstate->clocations, jstate->clocations_count,
822 sizeof(pgssLocationLen), comp_location);
823 locs = jstate->clocations;
825 /* initialize the flex scanner --- should match raw_parser() */
826 yyscanner = scanner_init(query,
831 /* we don't want to re-emit any escape string warnings */
832 yyextra.escape_string_warning = false;
834 /* Search for each constant, in sequence */
835 for (i = 0; i < jstate->clocations_count; i++)
837 int loc = locs[i].location;
840 /* Adjust recorded location if we're dealing with partial string */
846 continue; /* Duplicate constant, ignore */
848 /* Lex tokens until we find the desired constant */
851 tok = core_yylex(&yylval, &yylloc, yyscanner);
853 /* We should not hit end-of-string, but if we do, behave sanely */
855 break; /* out of inner for-loop */
858 * We should find the token position exactly, but if we somehow
859 * run past it, work with that.
863 if (query[loc] == '-')
866 * It's a negative value - this is the one and only case
867 * where we replace more than a single token.
869 * Do not compensate for the core system's special-case
870 * adjustment of location to that of the leading '-'
871 * operator in the event of a negative constant. It is
872 * also useful for our purposes to start from the minus
873 * symbol. In this way, queries like "select * from foo
874 * where bar = 1" and "select * from foo where bar = -2"
875 * will have identical normalized query strings.
877 tok = core_yylex(&yylval, &yylloc, yyscanner);
879 break; /* out of inner for-loop */
883 * We now rely on the assumption that flex has placed a zero
884 * byte after the text of the current token in scanbuf.
886 locs[i].length = strlen(yyextra.scanbuf + loc);
887 break; /* out of inner for-loop */
891 /* If we hit end-of-string, give up, leaving remaining lengths -1 */
898 scanner_finish(yyscanner);
902 * comp_location: comparator for qsorting pgssLocationLen structs by location
905 comp_location(const void *a, const void *b)
907 int l = ((const pgssLocationLen *) a)->location;
908 int r = ((const pgssLocationLen *) b)->location;