From 978de9d06d54de07547049e49ad5ed500e64edf0 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Fri, 7 Sep 2007 16:03:40 +0000 Subject: [PATCH] Improvements from Heikki Linnakangas - change the alignment requirement of lexemes in TSVector slightly. Lexeme strings were always padded to 2-byte aligned length to make sure that if there's position array (uint16[]) it has the right alignment. The patch changes that so that the padding is not done when there's no positions. That makes the storage of tsvectors without positions slightly more compact. - added some #include "miscadmin.h" lines I missed in the earlier when I added calls to check_stack_depth(). - Reimplement the send/recv functions, and added a comment above them describing the on-wire format. The CRC is now recalculated in tsquery as well per previous discussion. --- src/backend/utils/adt/tsginidx.c | 6 +- src/backend/utils/adt/tsquery.c | 249 +++++++++++++++----------------- src/backend/utils/adt/tsquery_cleanup.c | 3 +- src/backend/utils/adt/tsquery_rewrite.c | 3 +- src/backend/utils/adt/tsquery_util.c | 3 +- src/backend/utils/adt/tsrank.c | 9 +- src/backend/utils/adt/tsvector.c | 162 ++++++++++++++------- src/backend/utils/adt/tsvector_op.c | 37 +++-- src/include/tsearch/ts_type.h | 33 +++-- 9 files changed, 286 insertions(+), 219 deletions(-) diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c index 10b80dc956..974a1b7ae4 100644 --- a/src/backend/utils/adt/tsginidx.c +++ b/src/backend/utils/adt/tsginidx.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.3 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -22,7 +22,7 @@ Datum gin_extract_tsvector(PG_FUNCTION_ARGS) { TSVector vector = PG_GETARG_TSVECTOR(0); - uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); Datum *entries = NULL; *nentries = 0; @@ -55,7 +55,7 @@ Datum gin_extract_query(PG_FUNCTION_ARGS) { TSQuery query = PG_GETARG_TSQUERY(0); - uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); StrategyNumber strategy = PG_GETARG_UINT16(2); Datum *entries = NULL; diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 83f2c602a8..c53d526bdd 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.4 2007/09/07 15:35:10 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.5 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -21,7 +21,6 @@ #include "tsearch/ts_utils.h" #include "utils/memutils.h" #include "utils/pg_crc.h" -#include "nodes/bitmapset.h" struct TSQueryParserStateData @@ -384,16 +383,15 @@ makepol(TSQueryParserState state, } } -/* - * Fills in the left-fields previously left unfilled. The input - * QueryItems must be in polish (prefix) notation. - */ static void -findoprnd(QueryItem *ptr, uint32 *pos) +findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); + if (*pos >= nnodes) + elog(ERROR, "malformed tsquery; operand not found"); + if (ptr[*pos].type == QI_VAL || ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, * they haven't been cleaned @@ -410,7 +408,7 @@ findoprnd(QueryItem *ptr, uint32 *pos) { ptr[*pos].operator.left = 1; (*pos)++; - findoprnd(ptr, pos); + findoprnd_recurse(ptr, pos, nnodes); } else { @@ -420,13 +418,31 @@ findoprnd(QueryItem *ptr, uint32 *pos) Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); (*pos)++; - findoprnd(ptr, pos); + findoprnd_recurse(ptr, pos, nnodes); curitem->left = *pos - tmp; - findoprnd(ptr, pos); + findoprnd_recurse(ptr, pos, nnodes); } } } + +/* + * Fills in the left-fields previously left unfilled. The input + * QueryItems must be in polish (prefix) notation. + */ +static void +findoprnd(QueryItem *ptr, int size) +{ + uint32 pos; + + pos = 0; + findoprnd_recurse(ptr, &pos, size); + + if (pos != size) + elog(ERROR, "malformed tsquery; extra nodes"); +} + + /* * Each value (operand) in the query is be passed to pushval. pushval can * transform the simple value to an arbitrarily complex expression using @@ -452,7 +468,6 @@ parse_tsquery(char *buf, TSQuery query; int commonlen; QueryItem *ptr; - uint32 pos = 0; ListCell *cell; /* init state */ @@ -522,8 +537,7 @@ parse_tsquery(char *buf, pfree(state.op); /* Set left operand pointers for every operator. */ - pos = 0; - findoprnd(ptr, &pos); + findoprnd(ptr, query->size); return query; } @@ -734,6 +748,22 @@ tsqueryout(PG_FUNCTION_ARGS) PG_RETURN_CSTRING(nrm.buf); } +/* + * Binary Input / Output functions. The binary format is as follows: + * + * uint32 number of operators/operands in the query + * + * Followed by the operators and operands, in prefix notation. For each + * operand: + * + * uint8 type, QI_VAL + * uint8 weight + * operand text in client encoding, null-terminated + * + * For each operator: + * uint8 type, QI_OPR + * uint8 operator, one of OP_AND, OP_OR, OP_NOT. + */ Datum tsquerysend(PG_FUNCTION_ARGS) { @@ -744,7 +774,7 @@ tsquerysend(PG_FUNCTION_ARGS) pq_begintypsend(&buf); - pq_sendint(&buf, query->size, sizeof(int32)); + pq_sendint(&buf, query->size, sizeof(uint32)); for (i = 0; i < query->size; i++) { pq_sendint(&buf, item->type, sizeof(item->type)); @@ -752,16 +782,13 @@ tsquerysend(PG_FUNCTION_ARGS) switch(item->type) { case QI_VAL: - pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight)); - pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc)); - pq_sendint(&buf, item->operand.length, sizeof(int16)); + pq_sendint(&buf, item->operand.weight, sizeof(uint8)); + pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance); /* istrue flag is just for temporary use in tsrank.c/Cover, * so we don't need to transfer that */ break; case QI_OPR: pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper)); - if (item->operator.oper != OP_NOT) - pq_sendint(&buf, item->operator.left, sizeof(item->operator.left)); break; default: elog(ERROR, "unknown tsquery node type %d", item->type); @@ -769,14 +796,6 @@ tsquerysend(PG_FUNCTION_ARGS) item++; } - item = GETQUERY(query); - for (i = 0; i < query->size; i++) - { - if (item->type == QI_VAL) - pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length); - item++; - } - PG_FREE_IF_COPY(query, 0); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); @@ -788,141 +807,113 @@ tsqueryrecv(PG_FUNCTION_ARGS) StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSQuery query; int i, - size, len; QueryItem *item; - int datalen = 0; + int datalen; char *ptr; - Bitmapset *parentset = NULL; + uint32 size; + const char **operands; size = pq_getmsgint(buf, sizeof(uint32)); - if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) + if (size > (MaxAllocSize / sizeof(QueryItem))) elog(ERROR, "invalid size of tsquery"); - len = HDRSIZETQ + sizeof(QueryItem) * size; + /* Allocate space to temporarily hold operand strings */ + operands = palloc(size * sizeof(char *)); + /* Allocate space for all the QueryItems. */ + len = HDRSIZETQ + sizeof(QueryItem) * size; query = (TSQuery) palloc0(len); query->size = size; item = GETQUERY(query); + datalen = 0; for (i = 0; i < size; i++) { item->type = (int8) pq_getmsgint(buf, sizeof(int8)); - switch(item->type) + if (item->type == QI_VAL) { - case QI_VAL: - item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8)); - item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32)); - item->operand.length = pq_getmsgint(buf, sizeof(int16)); - - /* Check that the weight bitmap is valid */ - if (item->operand.weight < 0 || item->operand.weight > 0xF) - elog(ERROR, "invalid weight bitmap"); - - /* XXX: We don't check that the CRC is valid. Actually, if we - * bothered to calculate it to verify, there would be no need - * to transfer it. - */ - - /* - * Check that datalen doesn't grow too large. Without the - * check, a malicious client could induce a buffer overflow - * by sending a tsquery whose size exceeds 2GB. datalen - * would overflow, we would allocate a too small buffer below, - * and overflow the buffer. Because operand.length is a 20-bit - * field, adding one such value to datalen must exceed - * MaxAllocSize before wrapping over the 32-bit datalen field, - * so this check will protect from it. - */ - if (datalen > MAXSTRLEN) - elog(ERROR, "invalid tsquery; total operand length exceeded"); - - /* We can calculate distance from datalen, no need to send it - * across the wire. If we did, we would have to check that - * it's valid anyway. - */ - item->operand.distance = datalen; - - datalen += item->operand.length + 1; /* \0 */ - - break; - case QI_OPR: - item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8)); - if (item->operator.oper != OP_NOT && - item->operator.oper != OP_OR && - item->operator.oper != OP_AND) - elog(ERROR, "unknown operator type %d", (int) item->operator.oper); - - /* - * Check that no previous operator node points to the right - * operand. That would mean that the operand node - * has two parents. - */ - if (bms_is_member(i + 1, parentset)) - elog(ERROR, "malformed query tree"); - - parentset = bms_add_member(parentset, i + 1); - - if(item->operator.oper != OP_NOT) - { - uint32 left = (uint32) pq_getmsgint(buf, sizeof(uint32)); - - /* - * Right operand is implicitly at "this+1". Don't allow - * left to point to the right operand, or to self. - */ - if (left <= 1 || i + left >= size) - elog(ERROR, "invalid pointer to left operand"); - - /* - * Check that no previous operator node points to the left - * operand. - */ - if (bms_is_member(i + left, parentset)) - elog(ERROR, "malformed query tree"); - - parentset = bms_add_member(parentset, i + left); - - item->operator.left = left; - } - else - item->operator.left = 1; /* do not leave uninitialized fields */ - - if (i == size - 1) - elog(ERROR, "invalid pointer to right operand"); - break; - default: - elog(ERROR, "unknown tsquery node type %d", item->type); + size_t val_len; /* length after recoding to server encoding */ + uint8 weight; + const char *val; + pg_crc32 valcrc; + + weight = (uint8) pq_getmsgint(buf, sizeof(uint8)); + val = pq_getmsgstring(buf); + val_len = strlen(val); + + /* Sanity checks */ + + if (weight > 0xF) + elog(ERROR, "invalid tsquery; invalid weight bitmap"); + + if (val_len > MAXSTRLEN) + elog(ERROR, "invalid tsquery; operand too long"); + + if (datalen > MAXSTRPOS) + elog(ERROR, "invalid tsquery; total operand length exceeded"); + + /* Looks valid. */ + + INIT_CRC32(valcrc); + COMP_CRC32(valcrc, val, val_len); + FIN_CRC32(valcrc); + + item->operand.weight = weight; + item->operand.valcrc = (int32) valcrc; + item->operand.length = val_len; + item->operand.distance = datalen; + + /* + * Operand strings are copied to the final struct after this loop; + * here we just collect them to an array + */ + operands[i] = val; + + datalen += val_len + 1; /* + 1 for the '\0' terminator */ + } + else if (item->type == QI_OPR) + { + int8 oper; + oper = (int8) pq_getmsgint(buf, sizeof(int8)); + if (oper != OP_NOT && oper != OP_OR && oper != OP_AND) + elog(ERROR, "invalid tsquery; unknown operator type %d", (int) oper); + if (i == size - 1) + elog(ERROR, "invalid pointer to right operand"); + + item->operator.oper = oper; } + else + elog(ERROR, "unknown tsquery node type %d", item->type); item++; } - /* Now check that each node, except the root, has a parent. We - * already checked above that no node has more than one parent. */ - if (bms_num_members(parentset) != size - 1 && size != 0) - elog(ERROR, "malformed query tree"); - - bms_free( parentset ); - + /* Enlarge buffer to make room for the operand values. */ query = (TSQuery) repalloc(query, len + datalen); - item = GETQUERY(query); ptr = GETOPERAND(query); + + /* + * Fill in the left-pointers. Checks that the tree is well-formed + * as a side-effect. + */ + findoprnd(item, size); + + /* Copy operands to output struct */ for (i = 0; i < size; i++) { if (item->type == QI_VAL) { - memcpy(ptr, - pq_getmsgbytes(buf, item->operand.length), - item->operand.length); - ptr += item->operand.length; - *ptr++ = '\0'; + memcpy(ptr, operands[i], item->operand.length + 1); + ptr += item->operand.length + 1; } item++; } + pfree(operands); + Assert(ptr - GETOPERAND(query) == datalen); SET_VARSIZE(query, len + datalen); diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c index 6ab58228b5..bc123378e0 100644 --- a/src/backend/utils/adt/tsquery_cleanup.c +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -17,6 +17,7 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" +#include "miscadmin.h" typedef struct NODE { diff --git a/src/backend/utils/adt/tsquery_rewrite.c b/src/backend/utils/adt/tsquery_rewrite.c index d733a51d8e..77e3a8b173 100644 --- a/src/backend/utils/adt/tsquery_rewrite.c +++ b/src/backend/utils/adt/tsquery_rewrite.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -17,6 +17,7 @@ #include "executor/spi.h" #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" +#include "miscadmin.h" static int diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c index 60de44cc6f..a4fd0937c4 100644 --- a/src/backend/utils/adt/tsquery_util.c +++ b/src/backend/utils/adt/tsquery_util.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -16,6 +16,7 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" +#include "miscadmin.h" QTNode * QT2QTN(QueryItem * in, char *operand) diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index 30ccfa3f61..535a3541bf 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -18,6 +18,7 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" #include "utils/array.h" +#include "miscadmin.h" static float weights[] = {0.1, 0.2, 0.4, 1.0}; @@ -176,8 +177,9 @@ SortAndUniqItems(TSQuery q, int *size) return res; } +/* A dummy WordEntryPos array to use when haspos is false */ static WordEntryPos POSNULL[] = { - 0, + 1, /* Number of elements that follow */ 0 }; @@ -207,7 +209,6 @@ calc_rank_and(float *w, TSVector t, TSQuery q) } pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); memset(pos, 0, sizeof(uint16 *) * q->size); - *(uint16 *) POSNULL = lengthof(POSNULL) - 1; WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); for (i = 0; i < size; i++) @@ -265,7 +266,6 @@ calc_rank_or(float *w, TSVector t, TSQuery q) QueryOperand **item; int size = q->size; - *(uint16 *) POSNULL = lengthof(POSNULL) - 1; item = SortAndUniqItems(q, &size); for (i = 0; i < size; i++) @@ -593,7 +593,6 @@ get_docrep(TSVector txt, TSQuery query, int *doclen) DocRepresentation *doc; char *operand; - *(uint16 *) POSNULL = lengthof(POSNULL) - 1; doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); operand = GETOPERAND(query); reset_istrue_flag(query); diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 2866e028da..992da4a9b4 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -75,18 +75,20 @@ uniquePos(WordEntryPos * a, int l) } static int -compareentry(const void *a, const void *b, void *arg) +compareentry(const void *va, const void *vb, void *arg) { char *BufferStr = (char *) arg; + WordEntryIN *a = (WordEntryIN *) va; + WordEntryIN *b = (WordEntryIN *) vb; - if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len) + if (a->entry.len == b->entry.len) { - return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos], - &BufferStr[((WordEntryIN *) b)->entry.pos], - ((WordEntryIN *) a)->entry.len); + return strncmp(&BufferStr[a->entry.pos], + &BufferStr[b->entry.pos], + a->entry.len); } - return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1; + return (a->entry.len > b->entry.len) ? 1 : -1; } static int @@ -104,6 +106,9 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) a->poslen = uniquePos(a->pos, a->poslen); *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); } + else + *outbuflen = a->entry.len; + return l; } res = a; @@ -118,10 +123,12 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { if (res->entry.haspos) { + *outbuflen += SHORTALIGN(res->entry.len); res->poslen = uniquePos(res->pos, res->poslen); *outbuflen += res->poslen * sizeof(WordEntryPos); } - *outbuflen += SHORTALIGN(res->entry.len); + else + *outbuflen += res->entry.len; res++; memcpy(res, ptr, sizeof(WordEntryIN)); } @@ -147,12 +154,18 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) } ptr++; } + + /* add last item */ + if (res->entry.haspos) { + *outbuflen += SHORTALIGN(res->entry.len); + res->poslen = uniquePos(res->pos, res->poslen); *outbuflen += res->poslen * sizeof(WordEntryPos); } - *outbuflen += SHORTALIGN(res->entry.len); + else + *outbuflen += res->entry.len; return res + 1 - a; } @@ -367,6 +380,18 @@ tsvectorout(PG_FUNCTION_ARGS) PG_RETURN_CSTRING(outbuf); } +/* + * Binary Input / Output functions. The binary format is as follows: + * + * uint32 number of lexemes + * + * for each lexeme: + * lexeme text in client encoding, null-terminated + * uint16 number of positions + * for each position: + * uint16 WordEntryPos + */ + Datum tsvectorsend(PG_FUNCTION_ARGS) { @@ -381,18 +406,22 @@ tsvectorsend(PG_FUNCTION_ARGS) pq_sendint(&buf, vec->size, sizeof(int32)); for (i = 0; i < vec->size; i++) { - /* - * We are sure that sizeof(WordEntry) == sizeof(int32) + uint16 npos; + + /* the strings in the TSVector array are not null-terminated, so + * we have to send the null-terminator separately */ - pq_sendint(&buf, *(int32 *) weptr, sizeof(int32)); + pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); + pq_sendbyte(&buf, '\0'); - pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len); - if (weptr->haspos) + npos = POSDATALEN(vec, weptr); + pq_sendint(&buf, npos, sizeof(uint16)); + + if(npos > 0) { WordEntryPos *wepptr = POSDATAPTR(vec, weptr); - pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos)); - for (j = 0; j < POSDATALEN(vec, weptr); j++) + for (j = 0; j < npos; j++) pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); } weptr++; @@ -407,71 +436,92 @@ tsvectorrecv(PG_FUNCTION_ARGS) StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSVector vec; int i; - uint32 size; - WordEntry *weptr; - int datalen = 0; - Size len; - - size = pq_getmsgint(buf, sizeof(uint32)); - if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry))) + int32 nentries; + int datalen; /* number of bytes used in the variable size area + * after fixed size TSVector header and WordEntries + */ + Size hdrlen; + Size len; /* allocated size of vec */ + + nentries = pq_getmsgint(buf, sizeof(int32)); + if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) elog(ERROR, "invalid size of tsvector"); - len = DATAHDRSIZE + sizeof(WordEntry) * size; + hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; - len = len * 2; /* times two to make room for lexemes */ + len = hdrlen * 2; /* times two to make room for lexemes */ vec = (TSVector) palloc0(len); - vec->size = size; + vec->size = nentries; - weptr = ARRPTR(vec); - for (i = 0; i < size; i++) + datalen = 0; + for (i = 0; i < nentries; i++) { - int32 tmp; + const char *lexeme; + uint16 npos; + size_t lex_len; + + lexeme = pq_getmsgstring(buf); + npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); + + /* sanity checks */ + + lex_len = strlen(lexeme); + if (lex_len < 0 || lex_len > MAXSTRLEN) + elog(ERROR, "invalid tsvector; lexeme too long"); + + if (datalen > MAXSTRPOS) + elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded"); - weptr = ARRPTR(vec) + i; + if (npos > MAXNUMPOS) + elog(ERROR, "unexpected number of positions"); /* - * We are sure that sizeof(WordEntry) == sizeof(int32) + * Looks valid. Fill the WordEntry struct, and copy lexeme. + * + * But make sure the buffer is large enough first. */ - tmp = pq_getmsgint(buf, sizeof(int32)); - *weptr = *(WordEntry *) & tmp; - - while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len) + while (hdrlen + SHORTALIGN(datalen + lex_len) + + (npos + 1) * sizeof(WordEntryPos) >= len) { len *= 2; vec = (TSVector) repalloc(vec, len); - weptr = ARRPTR(vec) + i; } - memcpy(STRPTR(vec) + weptr->pos, - pq_getmsgbytes(buf, weptr->len), - weptr->len); - datalen += SHORTALIGN(weptr->len); + vec->entries[i].haspos = (npos > 0) ? 1 : 0; + vec->entries[i].len = lex_len; + vec->entries[i].pos = datalen; + + memcpy(STRPTR(vec) + datalen, lexeme, lex_len); + + datalen += lex_len; - if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0) + if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) elog(ERROR, "lexemes are unordered"); - if (weptr->haspos) + /* Receive positions */ + + if (npos > 0) { - uint16 j, - npos; + uint16 j; WordEntryPos *wepptr; - npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); - if (npos > MAXNUMPOS) - elog(ERROR, "unexpected number of positions"); - - while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len) + /* + * Pad to 2-byte alignment if necessary. Though we used palloc0 + * for the initial allocation, subsequent repalloc'd memory + * areas are not initialized to zero. + */ + if (datalen != SHORTALIGN(datalen)) { - len *= 2; - vec = (TSVector) repalloc(vec, len); - weptr = ARRPTR(vec) + i; + *(STRPTR(vec) + datalen) = '\0'; + datalen = SHORTALIGN(datalen); } - memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16)); - wepptr = POSDATAPTR(vec, weptr); + memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); + + wepptr = POSDATAPTR(vec, &vec->entries[i]); for (j = 0; j < npos; j++) { - wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16)); + wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) elog(ERROR, "position information is unordered"); } @@ -480,7 +530,7 @@ tsvectorrecv(PG_FUNCTION_ARGS) } } - SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen)); + SET_VARSIZE(vec, hdrlen + datalen); PG_RETURN_TSVECTOR(vec); } diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index d34ab1fcf0..8e7593513f 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -165,7 +165,7 @@ tsvector_strip(PG_FUNCTION_ARGS) char *cur; for (i = 0; i < in->size; i++) - len += SHORTALIGN(arrin[i].len); + len += arrin[i].len; len = CALCDATASIZE(in->size, len); out = (TSVector) palloc0(len); @@ -179,7 +179,7 @@ tsvector_strip(PG_FUNCTION_ARGS) arrout[i].haspos = 0; arrout[i].len = arrin[i].len; arrout[i].pos = cur - STRPTR(out); - cur += SHORTALIGN(arrout[i].len); + cur += arrout[i].len; } PG_FREE_IF_COPY(in, 0); @@ -351,12 +351,15 @@ tsvector_concat(PG_FUNCTION_ARGS) ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr1->len); if (ptr->haspos) { + cur += SHORTALIGN(ptr1->len); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr1->len; + ptr++; ptr1++; i1--; @@ -367,16 +370,20 @@ tsvector_concat(PG_FUNCTION_ARGS) ptr->len = ptr2->len; memcpy(cur, data2 + ptr2->pos, ptr2->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr2->len); if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + cur += SHORTALIGN(ptr2->len); + if (addlen == 0) ptr->haspos = 0; else cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr2->len; + ptr++; ptr2++; i2--; @@ -387,9 +394,9 @@ tsvector_concat(PG_FUNCTION_ARGS) ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr1->len); if (ptr->haspos) { + cur += SHORTALIGN(ptr1->len); if (ptr1->haspos) { memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); @@ -407,6 +414,9 @@ tsvector_concat(PG_FUNCTION_ARGS) cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } + else + cur += ptr1->len; + ptr++; ptr1++; ptr2++; @@ -421,12 +431,15 @@ tsvector_concat(PG_FUNCTION_ARGS) ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr1->len); if (ptr->haspos) { + cur += SHORTALIGN(ptr1->len); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr1->len; + ptr++; ptr1++; i1--; @@ -438,16 +451,20 @@ tsvector_concat(PG_FUNCTION_ARGS) ptr->len = ptr2->len; memcpy(cur, data2 + ptr2->pos, ptr2->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr2->len); if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + cur += SHORTALIGN(ptr2->len); + if (addlen == 0) ptr->haspos = 0; else cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr2->len; + ptr++; ptr2++; i2--; @@ -484,8 +501,8 @@ ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item) static bool checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item) { - WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16)); - uint16 len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len))); + WordEntryPos *ptr = (WordEntryPos *) (chkval->values + SHORTALIGN(val->pos + val->len) + sizeof(uint16)); + uint16 len = *((uint16 *) (chkval->values + SHORTALIGN(val->pos + val->len))); while (len--) { diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index 15aa9f5c6e..0aa95e892c 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -5,7 +5,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.3 2007/09/07 15:35:11 teodor Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.4 2007/09/07 16:03:40 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -62,26 +62,33 @@ typedef uint16 WordEntryPos; * bytes from end of WordEntry array to start of * corresponding lexeme. * 4) Lexeme's storage: - * SHORTALIGNED(lexeme) and position information if it exists - * Position information: first int2 - is a number of positions and it - * follows array of WordEntryPos + * lexeme (without null-terminator) + * if haspos is true: + * padding byte if necessary to make the number of positions 2-byte aligned + * uint16 number of positions that follow. + * uint16[] positions + * + * The positions must be sorted. */ typedef struct { int32 vl_len_; /* varlena header (do not touch directly!) */ - uint32 size; - char data[1]; + int32 size; + WordEntry entries[1]; /* var size */ + /* lexemes follow */ } TSVectorData; typedef TSVectorData *TSVector; -#define DATAHDRSIZE (VARHDRSZ + sizeof(int4)) -#define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) ) -#define ARRPTR(x) ( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) ) -#define STRPTR(x) ( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((TSVector)(x))->size ) ) -#define STRSIZE(x) ( ((TSVector)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((TSVector)(x))->size ) ) -#define _POSDATAPTR(x,e) (STRPTR(x)+((WordEntry*)(e))->pos+SHORTALIGN(((WordEntry*)(e))->len)) +#define DATAHDRSIZE (offsetof(TSVectorData, entries)) +#define CALCDATASIZE(x, lenstr) (DATAHDRSIZE + (x) * sizeof(WordEntry) + (lenstr) ) +#define ARRPTR(x) ( (x)->entries ) + +/* returns a pointer to the beginning of lexemes */ +#define STRPTR(x) ( (char *) &(x)->entries[x->size] ) + +#define _POSDATAPTR(x,e) (STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)) #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 ) #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) ) @@ -159,7 +166,7 @@ typedef int8 QueryItemType; typedef struct { QueryItemType type; /* operand or kind of operator (ts_tokentype) */ - int8 weight; /* weights of operand to search. It's a bitmask of allowed weights. + uint8 weight; /* weights of operand to search. It's a bitmask of allowed weights. * if it =0 then any weight are allowed. * Weights and bit map: * A: 1<<3 -- 2.11.0