From 234163649e4eb7eb348e2a00192fec4a30cf0e33 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Tue, 11 Jul 2006 16:55:34 +0000 Subject: [PATCH] GIN improvements - Replace sorted array of entries in maintenance_work_mem to binary tree, this should improve create performance. - More precisely calculate allocated memory, eliminate leaks with user-defined extractValue() - Improve wordings in tsearch2 --- contrib/tsearch2/ts_cfg.c | 8 +- src/backend/access/gin/ginbulk.c | 250 ++++++++++++++++++++++++++++--------- src/backend/access/gin/gininsert.c | 22 +++- src/include/access/gin.h | 15 ++- 4 files changed, 221 insertions(+), 74 deletions(-) diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c index 5a662b7dbb..d0e1891668 100644 --- a/contrib/tsearch2/ts_cfg.c +++ b/contrib/tsearch2/ts_cfg.c @@ -313,12 +313,12 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) #ifdef IGNORE_LONGLEXEME ereport(NOTICE, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("word is too long"))); + errmsg("A word you are indexing is too long. It will be ignored."))); continue; #else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("word is too long"))); + errmsg("A word you are indexing is too long"))); #endif } @@ -470,12 +470,12 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 #ifdef IGNORE_LONGLEXEME ereport(NOTICE, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("word is too long"))); + errmsg("A word you are indexing is too long. It will be ignored."))); continue; #else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("word is too long"))); + errmsg("A word you are indexing is too long"))); #endif } diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index 6d73f070ba..58fa1b34d7 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.1 2006/05/02 11:28:54 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.2 2006/07/11 16:55:34 teodor Exp $ *------------------------------------------------------------------------- */ @@ -22,17 +22,29 @@ #include "utils/memutils.h" #include "access/tuptoaster.h" -#define DEF_NENTRY 128 +#define DEF_NENTRY 2048 #define DEF_NPTR 4 void ginInitBA(BuildAccumulator *accum) { - - accum->number = 0; - accum->curget = 0; - accum->length = DEF_NENTRY; - accum->entries = (EntryAccumulator*)palloc0( sizeof(EntryAccumulator) * DEF_NENTRY ); - accum->allocatedMemory = sizeof(EntryAccumulator) * DEF_NENTRY; + accum->maxdepth = 1; + accum->stackpos = 0; + accum->entries = NULL; + accum->stack = NULL; + accum->allocatedMemory = 0; + accum->entryallocator = NULL; +} + +static EntryAccumulator* +EAAllocate( BuildAccumulator *accum ) { + if ( accum->entryallocator == NULL || accum->length>=DEF_NENTRY ) { + accum->entryallocator = palloc(sizeof(EntryAccumulator)*DEF_NENTRY); + accum->allocatedMemory += sizeof(EntryAccumulator)*DEF_NENTRY; + accum->length = 0; + } + + accum->length++; + return accum->entryallocator + accum->length - 1; } /* @@ -61,64 +73,133 @@ ginInsertData(BuildAccumulator *accum, EntryAccumulator *entry, ItemPointer heap entry->number++; } +static Datum +getDatumCopy(BuildAccumulator *accum, Datum value) { + Form_pg_attribute *att = accum->ginstate->tupdesc->attrs; + Datum newvalue; + int data_length = 0; + void *ptr; + + if ( att[0]->attbyval ) { + store_att_byval(&newvalue, value, att[0]->attlen); + } else { + /* pass-by-reference */ + if (att[0]->attlen == -1) { + /* varlena */ + data_length = VARATT_SIZE(DatumGetPointer(value)); + } else if (att[0]->attlen == -2) { + /* c-string */ + data_length = strlen(DatumGetCString(value)) + 1; + } else { + /* fixed-length pass-by-reference */ + Assert(att[0]->attlen > 0); + data_length = att[0]->attlen; + } + + ptr = palloc( data_length ); + memcpy(ptr, DatumGetPointer(value), data_length); + newvalue = PointerGetDatum(ptr); + } + + accum->allocatedMemory+=data_length; + + return newvalue; +} + /* * Find/store one entry from indexed value. - * It supposes, that entry should be located between low and end of array of - * entries. Returns position of found/inserted entry */ -static uint32 -ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, Datum entry, uint32 low) { - uint32 high = accum->number, mid; - int res; - - while(high>low) { - mid = low + ((high - low) / 2); - - res = compareEntries(accum->ginstate, entry, accum->entries[mid].value); - - if ( res == 0 ) { - ginInsertData( accum, accum->entries+mid, heapptr ); - return mid; - } else if ( res > 0 ) - low = mid + 1; - else - high = mid; +static void +ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, Datum entry) { + EntryAccumulator *ea = accum->entries, *pea = NULL; + int res = 0; + uint32 depth = 1; + + while( ea ) { + res = compareEntries(accum->ginstate, entry, ea->value); + if ( res == 0 ) + break; /* found */ + else { + pea = ea; + if ( res < 0 ) + ea = ea->left; + else + ea = ea->right; + } + depth++; } - /* did not find an entry, insert */ - if ( accum->number >= accum->length ) { - accum->allocatedMemory += sizeof(EntryAccumulator) * accum->length; - accum->length *= 2; - accum->entries = (EntryAccumulator*)repalloc( accum->entries, - sizeof(EntryAccumulator) * accum->length ); - } - - if ( high != accum->number ) - memmove( accum->entries+high+1, accum->entries+high, sizeof(EntryAccumulator) * (accum->number-high) ); + if ( depth > accum->maxdepth ) + accum->maxdepth = depth; - accum->entries[high].value = entry; - accum->entries[high].length = DEF_NPTR; - accum->entries[high].number = 1; - accum->entries[high].shouldSort = FALSE; - accum->entries[high].list = (ItemPointerData*)palloc(sizeof(ItemPointerData)*DEF_NPTR); - accum->entries[high].list[0] = *heapptr; + if ( ea == NULL ) { + ea = EAAllocate(accum); - accum->allocatedMemory += sizeof(ItemPointerData)*DEF_NPTR; - accum->number++; + ea->left = ea->right = NULL; + ea->value = getDatumCopy(accum, entry); + ea->length = DEF_NPTR; + ea->number = 1; + ea->shouldSort = FALSE; + ea->list = (ItemPointerData*)palloc(sizeof(ItemPointerData)*DEF_NPTR); + ea->list[0] = *heapptr; + accum->allocatedMemory += sizeof(ItemPointerData)*DEF_NPTR; - return high; + if ( pea == NULL ) + accum->entries = ea; + else { + Assert( res != 0 ); + if ( res < 0 ) + pea->left = ea; + else + pea->right = ea; + } + } else + ginInsertData( accum, ea, heapptr ); } +/* + * insert middle of left part the middle of right one, + * then calls itself for each parts + */ +static void +ginChooseElem(BuildAccumulator *accum, ItemPointer heapptr, Datum *entries, uint32 nentry, + uint32 low, uint32 high, uint32 offset) { + uint32 pos; + uint32 middle = (low+high)>>1; + + pos = (low+middle)>>1; + if ( low!=middle && pos>=offset && pos-offset < nentry ) + ginInsertEntry( accum, heapptr, entries[ pos-offset ]); + pos = (high+middle+1)>>1; + if ( middle+1 != high && pos>=offset && pos-offset < nentry ) + ginInsertEntry( accum, heapptr, entries[ pos-offset ]); + + if ( low!=middle ) + ginChooseElem(accum, heapptr, entries, nentry, low, middle, offset ); + if ( high!=middle+1 ) + ginChooseElem(accum, heapptr, entries, nentry, middle+1, high, offset ); +} /* - * Insert one heap pointer. Requires entries to be sorted! + * Insert one heap pointer. Suppose entries is sorted. + * Insertion order trys to get binary tree balanced: first insert middle value, + * next middle on left part and middle of right part. */ void ginInsertRecordBA( BuildAccumulator *accum, ItemPointer heapptr, Datum *entries, uint32 nentry ) { - uint32 start=0,i; + uint32 i, nbit=0, offset; + + if (nentry==0) + return; + + i=nentry-1; + for(;i>0;i>>=1) nbit++; - for(i=0;i>1)-offset ]); + ginChooseElem(accum, heapptr, entries, nentry, 0, nbit, offset); } static int @@ -128,28 +209,79 @@ qsortCompareItemPointers( const void *a, const void *b ) { return res; } +/* + * walk on binary tree and returns ordered nodes + */ +static EntryAccumulator* +walkTree( BuildAccumulator *accum ) { + EntryAccumulator *entry = accum->stack[ accum->stackpos ]; + + if ( entry->list != NULL ) { + /* return entry itself: we already was at left sublink */ + return entry; + } else if ( entry->right && entry->right != accum->stack[ accum->stackpos+1 ] ) { + /* go on right sublink */ + accum->stackpos++; + entry = entry->right; + + /* find most-left value */ + for(;;) { + accum->stack[ accum->stackpos ] = entry; + if ( entry->left ) { + accum->stackpos++; + entry = entry->left; + } else + break; + } + } else { + /* we already return all left subtree, itself and right subtree */ + if ( accum->stackpos == 0 ) + return 0; + accum->stackpos--; + return walkTree(accum); + } + + return entry; +} + ItemPointerData* ginGetEntry(BuildAccumulator *accum, Datum *value, uint32 *n) { EntryAccumulator *entry; - ItemPointerData *list; - if ( accum->curget >= accum->number ) + + + if ( accum->stack == NULL ) { + /* first call */ + accum->stack = palloc0(sizeof(EntryAccumulator*)*(accum->maxdepth+1)); + entry = accum->entries; + + /* find most-left value */ + for(;;) { + accum->stack[ accum->stackpos ] = entry; + if ( entry->left ) { + accum->stackpos++; + entry = entry->left; + } else + break; + } + } else { + pfree( accum->stack[ accum->stackpos ]->list ); + accum->stack[ accum->stackpos ]->list = NULL; + entry = walkTree( accum ); + } + + if ( entry == NULL ) return NULL; - else if ( accum->curget > 0 ) - pfree( accum->entries[ accum->curget-1 ].list ); - entry = accum->entries + accum->curget; *n = entry->number; *value = entry->value; list = entry->list; - accum->curget++; + + Assert(list != NULL); if ( entry->shouldSort && entry->number > 1 ) qsort(list, *n, sizeof(ItemPointerData), qsortCompareItemPointers); - return list; } - - diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index a4416a94cb..b67bf6e821 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.2 2006/05/10 23:18:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.3 2006/07/11 16:55:34 teodor Exp $ *------------------------------------------------------------------------- */ @@ -26,6 +26,7 @@ typedef struct { GinState ginstate; double indtuples; MemoryContext tmpCtx; + MemoryContext funcCtx; BuildAccumulator accum; } GinBuildState; @@ -189,19 +190,22 @@ ginEntryInsert( Relation index, GinState *ginstate, Datum value, ItemPointerData * Function isnt use during normal insert */ static uint32 -ginHeapTupleBulkInsert(BuildAccumulator *accum, Datum value, ItemPointer heapptr) { +ginHeapTupleBulkInsert(GinBuildState *buildstate, Datum value, ItemPointer heapptr) { Datum *entries; uint32 nentries; + MemoryContext oldCtx; - entries = extractEntriesSU( accum->ginstate, value, &nentries); + oldCtx = MemoryContextSwitchTo(buildstate->funcCtx); + entries = extractEntriesSU( buildstate->accum.ginstate, value, &nentries); + MemoryContextSwitchTo(oldCtx); if ( nentries==0 ) /* nothing to insert */ return 0; - ginInsertRecordBA( accum, heapptr, entries, nentries); + ginInsertRecordBA( &buildstate->accum, heapptr, entries, nentries); - pfree( entries ); + MemoryContextReset(buildstate->funcCtx); return nentries; } @@ -218,7 +222,7 @@ ginBuildCallback(Relation index, HeapTuple htup, Datum *values, oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); - buildstate->indtuples += ginHeapTupleBulkInsert(&buildstate->accum, *values, &htup->t_self); + buildstate->indtuples += ginHeapTupleBulkInsert(buildstate, *values, &htup->t_self); /* we use only half maintenance_work_mem, because there is some leaks during insertion and extract values */ @@ -297,6 +301,12 @@ ginbuild(PG_FUNCTION_ARGS) { ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx, + "Gin build temporary context for user-defined function", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA( &buildstate.accum ); diff --git a/src/include/access/gin.h b/src/include/access/gin.h index c13d726aa9..ba7d15cf1f 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -3,7 +3,7 @@ * header file for postgres inverted index access method implementation. * * Copyright (c) 2006, PostgreSQL Global Development Group - * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.4 2006/07/11 13:54:24 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.5 2006/07/11 16:55:34 teodor Exp $ *-------------------------------------------------------------------------- */ @@ -414,21 +414,26 @@ extern Datum arraycontains(PG_FUNCTION_ARGS); extern Datum arraycontained(PG_FUNCTION_ARGS); /* ginbulk.c */ -typedef struct { +typedef struct EntryAccumulator { Datum value; uint32 length; uint32 number; ItemPointerData *list; bool shouldSort; + struct EntryAccumulator *left; + struct EntryAccumulator *right; } EntryAccumulator; typedef struct { GinState *ginstate; EntryAccumulator *entries; - uint32 length; - uint32 number; - uint32 curget; + uint32 maxdepth; + EntryAccumulator **stack; + uint32 stackpos; uint32 allocatedMemory; + + uint32 length; + EntryAccumulator *entryallocator; } BuildAccumulator; extern void ginInitBA(BuildAccumulator *accum); -- 2.11.0