From 391eb5e5b6a78fce5179808379cdae20baedd9c3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 4 Mar 2003 21:51:22 +0000 Subject: [PATCH] Reimplement free-space-map management as per recent discussions. Adjustable threshold is gone in favor of keeping track of total requested page storage and doling out proportional fractions to each relation (with a minimum amount per relation, and some quantization of the results to avoid thrashing with small changes in page counts). Provide special- case code for indexes so as not to waste space storing useless page free space counts. Restructure internal data storage to be a flat array instead of list-of-chunks; this may cost a little more work in data copying when reorganizing, but allows binary search to be used during lookup_fsm_page_entry(). --- doc/src/sgml/runtime.sgml | 14 +- src/backend/access/nbtree/nbtpage.c | 9 +- src/backend/access/nbtree/nbtree.c | 29 +- src/backend/commands/vacuum.c | 42 +- src/backend/commands/vacuumlazy.c | 47 +- src/backend/storage/freespace/freespace.c | 1602 +++++++++++++++---------- src/backend/storage/smgr/smgr.c | 4 +- src/backend/utils/misc/guc.c | 6 +- src/backend/utils/misc/postgresql.conf.sample | 5 +- src/include/storage/freespace.h | 22 +- 10 files changed, 1102 insertions(+), 678 deletions(-) diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml index b023940dc3..86d5ddc285 100644 --- a/doc/src/sgml/runtime.sgml +++ b/doc/src/sgml/runtime.sgml @@ -1,5 +1,5 @@ @@ -1725,7 +1725,9 @@ dynamic_library_path = '/usr/local/lib/postgresql:/home/my_project/lib:$libdir' Sets the maximum number of disk pages for which free space will - be tracked in the shared free-space map. The default is 10000. + be tracked in the shared free-space map. Six bytes of shared memory + are consumed for each page slot. This setting must be more than + 16 * max_fsm_relations. The default is 20000. This option can only be set at server start. @@ -1735,9 +1737,11 @@ dynamic_library_path = '/usr/local/lib/postgresql:/home/my_project/lib:$libdir' MAX_FSM_RELATIONS (integer) - Sets the maximum number of relations (tables) for which free - space will be tracked in the shared free-space map. The default - is 1000. This option can only be set at server start. + Sets the maximum number of relations (tables and indexes) for which + free space will be tracked in the shared free-space map. Roughly + fifty bytes of shared memory are consumed for each slot. + The default is 1000. + This option can only be set at server start. diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c8fb7628ea..f9d108e769 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.63 2003/02/23 23:20:52 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.64 2003/03/04 21:51:20 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -401,15 +401,10 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) * that the page is still free. (For example, an already-free page * could have been re-used between the time the last VACUUM scanned * it and the time the VACUUM made its FSM updates.) - * - * The request size should be more than half of what btvacuumcleanup - * logs as the per-page free space. We use BLCKSZ/2 and BLCKSZ-1 - * to try to get some use out of FSM's space management algorithm. 
- * XXX this needs some more thought... */ for (;;) { - blkno = GetPageWithFreeSpace(&rel->rd_node, BLCKSZ/2); + blkno = GetFreeIndexPage(&rel->rd_node); if (blkno == InvalidBlockNumber) break; buf = ReadBuffer(rel, blkno); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c393e907b5..b1722244e6 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.100 2003/02/24 00:57:17 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.101 2003/03/04 21:51:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -697,7 +697,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(2); BlockNumber num_pages; BlockNumber blkno; - PageFreeSpaceInfo *pageSpaces; + BlockNumber *freePages; int nFreePages, maxFreePages; BlockNumber pages_deleted = 0; @@ -712,7 +712,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) maxFreePages = MaxFSMPages; if ((BlockNumber) maxFreePages > num_pages) maxFreePages = (int) num_pages + 1; /* +1 to avoid palloc(0) */ - pageSpaces = (PageFreeSpaceInfo *) palloc(maxFreePages * sizeof(PageFreeSpaceInfo)); + freePages = (BlockNumber *) palloc(maxFreePages * sizeof(BlockNumber)); nFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ @@ -740,12 +740,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) { /* Okay to recycle this page */ if (nFreePages < maxFreePages) - { - pageSpaces[nFreePages].blkno = blkno; - /* claimed avail-space must be < BLCKSZ */ - pageSpaces[nFreePages].avail = BLCKSZ-1; - nFreePages++; - } + freePages[nFreePages++] = blkno; pages_deleted++; } else if (P_ISDELETED(opaque)) @@ -781,12 +776,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) if (ndel && info->vacuum_full) { if (nFreePages < maxFreePages) - { - pageSpaces[nFreePages].blkno = blkno; - /* claimed avail-space must be < BLCKSZ */ - pageSpaces[nFreePages].avail = BLCKSZ-1; - nFreePages++; - } + freePages[nFreePages++] = blkno; } MemoryContextSwitchTo(oldcontext); @@ -805,8 +795,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) { BlockNumber new_pages = num_pages; - while (nFreePages > 0 && - pageSpaces[nFreePages-1].blkno == new_pages-1) + while (nFreePages > 0 && freePages[nFreePages-1] == new_pages-1) { new_pages--; pages_deleted--; @@ -841,12 +830,12 @@ btvacuumcleanup(PG_FUNCTION_ARGS) /* * Update the shared Free Space Map with the info we now have about - * free space in the index, discarding any old info the map may have. + * free pages in the index, discarding any old info the map may have. * We do not need to sort the page numbers; they're in order already. 
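The btvacuumcleanup change above reduces the index side of the FSM protocol to an ordered array of bare page numbers. A minimal standalone sketch of that collection pattern, using a hypothetical stub in place of RecordIndexFreeSpace() and an invented recyclability test (none of this is the real PostgreSQL code):

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned int BlockNumber;   /* stand-in for the PostgreSQL typedef */

    /* Hypothetical stand-in for RecordIndexFreeSpace(): just report the pages */
    static void
    record_index_free_space(int nFreePages, BlockNumber *freePages)
    {
        for (int i = 0; i < nFreePages; i++)
            printf("free index page %u\n", freePages[i]);
    }

    int
    main(void)
    {
        BlockNumber num_pages = 100;    /* pretend the index has 100 pages */
        int         maxFreePages = 10;  /* cap, as the MaxFSMPages clamp does */
        BlockNumber *freePages = malloc(maxFreePages * sizeof(BlockNumber));
        int         nFreePages = 0;

        /*
         * Scan pages in block-number order, collecting recyclable ones.
         * Scanning in order leaves the array sorted, which the recording
         * function requires ("we do not need to sort the page numbers").
         */
        for (BlockNumber blkno = 1; blkno < num_pages; blkno++)
        {
            int recyclable = (blkno % 7 == 0);  /* arbitrary stand-in test */

            if (recyclable && nFreePages < maxFreePages)
                freePages[nFreePages++] = blkno;
        }
        record_index_free_space(nFreePages, freePages);
        free(freePages);
        return 0;
    }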
*/ - MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces); + RecordIndexFreeSpace(&rel->rd_node, nFreePages, freePages); - pfree(pageSpaces); + pfree(freePages); MemoryContextDelete(mycontext); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 8b8cff6d30..03bd3de328 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.250 2003/02/24 00:57:17 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.251 2003/03/04 21:51:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -337,6 +337,13 @@ vacuum(VacuumStmt *vacstmt) StartTransactionCommand(true); /* + * If it was a database-wide VACUUM, print FSM usage statistics + * (we don't make you be superuser to see these). + */ + if (vacstmt->relation == NULL) + PrintFreeSpaceMapStatistics(elevel); + + /* * If we completed a database-wide VACUUM without skipping any * relations, update the database's pg_database row with info * about the transaction IDs used, and try to truncate pg_clog. @@ -2781,31 +2788,48 @@ vac_update_fsm(Relation onerel, VacPageList fraged_pages, BlockNumber rel_pages) { int nPages = fraged_pages->num_pages; - int i; + VacPage *pagedesc = fraged_pages->pagedesc; + Size threshold; PageFreeSpaceInfo *pageSpaces; + int outPages; + int i; + + /* + * We only report pages with free space at least equal to the average + * request size --- this avoids cluttering FSM with uselessly-small bits + * of space. Although FSM would discard pages with little free space + * anyway, it's important to do this prefiltering because (a) it reduces + * the time spent holding the FSM lock in RecordRelationFreeSpace, and + * (b) FSM uses the number of pages reported as a statistic for guiding + * space management. If we didn't threshold our reports the same way + * vacuumlazy.c does, we'd be skewing that statistic. + */ + threshold = GetAvgFSMRequestSize(&onerel->rd_node); /* +1 to avoid palloc(0) */ pageSpaces = (PageFreeSpaceInfo *) palloc((nPages + 1) * sizeof(PageFreeSpaceInfo)); + outPages = 0; for (i = 0; i < nPages; i++) { - pageSpaces[i].blkno = fraged_pages->pagedesc[i]->blkno; - pageSpaces[i].avail = fraged_pages->pagedesc[i]->free; - /* * fraged_pages may contain entries for pages that we later * decided to truncate from the relation; don't enter them into * the free space map! */ - if (pageSpaces[i].blkno >= rel_pages) - { - nPages = i; + if (pagedesc[i]->blkno >= rel_pages) break; + + if (pagedesc[i]->free >= threshold) + { + pageSpaces[outPages].blkno = pagedesc[i]->blkno; + pageSpaces[outPages].avail = pagedesc[i]->free; + outPages++; } } - MultiRecordFreeSpace(&onerel->rd_node, 0, nPages, pageSpaces); + RecordRelationFreeSpace(&onerel->rd_node, outPages, pageSpaces); pfree(pageSpaces); } diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 6eee5765e7..95f13a514e 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -31,7 +31,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.26 2003/02/24 00:57:17 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.27 2003/03/04 21:51:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -51,21 +51,11 @@ /* * Space/time tradeoff parameters: do these need to be user-tunable? 
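The prefilter loop added to vac_update_fsm above can be exercised on its own. A standalone sketch with invented page data, simplified types, and a made-up threshold standing in for GetAvgFSMRequestSize():

    #include <stddef.h>
    #include <stdio.h>

    typedef unsigned int BlockNumber;

    typedef struct
    {
        BlockNumber blkno;
        size_t      avail;
    } PageFreeSpaceInfo;

    int
    main(void)
    {
        /* pretend input: sorted by blkno, as fraged_pages would be */
        PageFreeSpaceInfo in[] = {
            {2, 500}, {5, 40}, {9, 900}, {11, 300}
        };
        int         nPages = 4;
        BlockNumber rel_pages = 10;     /* pages 10+ were truncated away */
        size_t      threshold = 128;    /* stand-in for GetAvgFSMRequestSize() */
        PageFreeSpaceInfo out[4];
        int         outPages = 0;

        for (int i = 0; i < nPages; i++)
        {
            /* stop at entries for pages truncated off the relation */
            if (in[i].blkno >= rel_pages)
                break;
            /* drop uselessly small amounts of free space */
            if (in[i].avail >= threshold)
                out[outPages++] = in[i];
        }
        for (int i = 0; i < outPages; i++)
            printf("report page %u with %zu bytes free\n",
                   out[i].blkno, out[i].avail);
        return 0;
    }

Only pages 2 and 9 survive: page 5 falls below the threshold, and page 11 lies beyond the truncation point.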
* - * A page with less than PAGE_SPACE_THRESHOLD free space will be forgotten - * immediately, and not even passed to the free space map. Removing the - * uselessly small entries early saves cycles, and in particular reduces - * the amount of time we spend holding the FSM lock when we finally call - * MultiRecordFreeSpace. Since the FSM will ignore pages below its own - * runtime threshold anyway, there's no point in making this really small. - * XXX Is it worth trying to measure average tuple size, and using that to - * set the threshold? Problem is we don't know average tuple size very - * accurately for the first few pages... - * * To consider truncating the relation, we want there to be at least - * relsize / REL_TRUNCATE_FRACTION potentially-freeable pages. + * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever + * is less) potentially-freeable pages. */ -#define PAGE_SPACE_THRESHOLD ((Size) (BLCKSZ / 32)) - +#define REL_TRUNCATE_MINIMUM 1000 #define REL_TRUNCATE_FRACTION 16 /* MAX_TUPLES_PER_PAGE can be a conservative upper limit */ @@ -78,6 +68,7 @@ typedef struct LVRelStats BlockNumber rel_pages; double rel_tuples; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + Size threshold; /* minimum interesting free space */ /* List of TIDs of tuples we intend to delete */ /* NB: this list is ordered by TID address */ int num_dead_tuples; /* current # of entries */ @@ -149,6 +140,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); + /* Set threshold for interesting free space = average request size */ + /* XXX should we scale it up or down? Adjust vacuum.c too, if so */ + vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node); + /* Open all indexes of the relation */ vac_open_indexes(onerel, &nindexes, &Irel); hasindex = (nindexes > 0); @@ -166,7 +161,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) * number of pages. Otherwise, the time taken isn't worth it. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; - if (possibly_freeable > vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) + if (possibly_freeable >= REL_TRUNCATE_MINIMUM || + possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) lazy_truncate_heap(onerel, vacrelstats); /* Update shared free space map with final free space info */ @@ -943,8 +939,21 @@ lazy_record_free_space(LVRelStats *vacrelstats, PageFreeSpaceInfo *pageSpaces; int n; - /* Ignore pages with little free space */ - if (avail < PAGE_SPACE_THRESHOLD) + /* + * A page with less than stats->threshold free space will be forgotten + * immediately, and never passed to the free space map. Removing the + * uselessly small entries early saves cycles, and in particular reduces + * the amount of time we spend holding the FSM lock when we finally call + * RecordRelationFreeSpace. Since the FSM will probably drop pages with + * little free space anyway, there's no point in making this really small. + * + * XXX Is it worth trying to measure average tuple size, and using that to + * adjust the threshold? Would be worthwhile if FSM has no stats yet + * for this relation. But changing the threshold as we scan the rel + * might lead to bizarre behavior, too. Also, it's probably better if + * vacuum.c has the same thresholding behavior as we do here. 
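The reworked truncation test accepts either the absolute floor or the relative fraction, so the effective requirement is min(REL_TRUNCATE_MINIMUM, rel_pages / REL_TRUNCATE_FRACTION) freeable pages. A small self-contained restatement of that test (the constants are the patch's; the harness and helper name are invented):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int BlockNumber;

    #define REL_TRUNCATE_MINIMUM    1000
    #define REL_TRUNCATE_FRACTION   16

    /* Restates the test lazy_vacuum_rel now applies (not the real code) */
    static bool
    worth_truncating(BlockNumber rel_pages, BlockNumber possibly_freeable)
    {
        return possibly_freeable >= REL_TRUNCATE_MINIMUM ||
            possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION;
    }

    int
    main(void)
    {
        /* a 100000-page relation now needs 1000 freeable pages, not 6250 */
        printf("%d\n", worth_truncating(100000, 1200)); /* 1 */
        /* a small relation still uses the 1/16 rule */
        printf("%d\n", worth_truncating(160, 10));      /* 1 */
        printf("%d\n", worth_truncating(160, 9));       /* 0 */
        return 0;
    }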
+ */ + if (avail < vacrelstats->threshold) return; /* Copy pointers to local variables for notational simplicity */ @@ -1079,13 +1088,13 @@ lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats) int nPages = vacrelstats->num_free_pages; /* - * Sort data into order, as required by MultiRecordFreeSpace. + * Sort data into order, as required by RecordRelationFreeSpace. */ if (nPages > 1) qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo), vac_cmp_page_spaces); - MultiRecordFreeSpace(&onerel->rd_node, 0, nPages, pageSpaces); + RecordRelationFreeSpace(&onerel->rd_node, nPages, pageSpaces); } /* diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 4b6d92eb35..83b60bad35 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/freespace/freespace.c,v 1.15 2003/02/23 06:17:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/freespace/freespace.c,v 1.16 2003/03/04 21:51:21 tgl Exp $ * * * NOTES: @@ -20,66 +20,126 @@ * * The number of distinct relations tracked is limited by a configuration * variable (MaxFSMRelations). When this would be exceeded, we discard the - * least frequently used relation. A previously-unknown relation is always - * entered into the map with useCount 1 on first reference, even if this - * causes an existing entry with higher useCount to be discarded. This may - * cause a little bit of thrashing among the bottom entries in the list, - * but if we didn't do it then there'd be no way for a relation not in the - * map to get in once the map is full. Note we allow a relation to be in the - * map even if no pages are currently stored for it: this allows us to track - * its useCount & threshold, which may eventually go high enough to give it - * priority for page storage. + * least recently used relation. A doubly-linked list with move-to-front + * behavior keeps track of which relation is least recently used. * - * The total number of pages tracked is also set by a configuration variable - * (MaxFSMPages). We allocate these dynamically among the known relations, - * giving preference to the more-frequently-referenced relations and those - * with smaller tuples. This is done by a thresholding mechanism: for each - * relation we keep track of a current target threshold, and allow only pages - * with free space >= threshold to be stored in the map. The threshold starts - * at BLCKSZ/2 (a somewhat arbitrary choice) for a newly entered relation. - * On each GetFreeSpace reference, the threshold is moved towards the - * request size using a slow exponential moving average; this means that - * for heavily used relations, the threshold will approach the average - * freespace request size (average tuple size). Whenever we run out of - * storage space in the map, we double the threshold of all existing relations - * (but not to more than BLCKSZ, so as to prevent runaway thresholds). - * Infrequently used relations will thus tend to have large thresholds. + * For each known relation, we track the average request size given to + * GetPageWithFreeSpace() as well as the most recent number of pages given + * to RecordRelationFreeSpace(). The average request size is not directly + * used in this module, but we expect VACUUM to use it to filter out + * uninteresting amounts of space before calling RecordRelationFreeSpace(). 
+ * The sum of the RRFS page counts is thus the total number of "interesting"
+ * pages that we would like to track; this is called DesiredFSMPages.
 *
- * XXX this thresholding mechanism is experimental; need to see how well
- * it works in practice.
+ * The number of pages actually tracked is limited by a configuration variable
+ * (MaxFSMPages). When this is less than DesiredFSMPages, each relation
+ * gets to keep a fraction MaxFSMPages/DesiredFSMPages of its free pages.
+ * We discard pages with less free space to reach this target.
+ *
+ * Actually, our space allocation is done in "chunks" of CHUNKPAGES pages,
+ * with each relation guaranteed at least one chunk. This reduces thrashing
+ * of the storage allocations when there are small changes in the RRFS page
+ * counts from one VACUUM to the next. (XXX it might also be worthwhile to
+ * impose some kind of moving-average smoothing on the RRFS page counts?)
+ *
+ * So the actual arithmetic is: for each relation compute myRequest as the
+ * number of chunks needed to hold its RRFS page count (not counting the
+ * first, guaranteed chunk); compute sumRequests as the sum of these values
+ * over all relations; then for each relation figure its actual allocation
+ * as
+ *	1 + round(spareChunks * myRequest / sumRequests)
+ * where spareChunks = totalChunks - numRels is the number of chunks we have
+ * a choice what to do with. We round off these numbers because truncating
+ * all of them would waste significant space. But because of roundoff, it's
+ * possible for the last few relations to get less space than they should;
+ * the computed allocation must be checked against remaining available space.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <limits.h>
+#include <math.h>

#include "storage/freespace.h"
-#include "storage/itemid.h"
+#include "storage/itemptr.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"


+/* Initial value for average-request moving average */
+#define INITIAL_AVERAGE ((Size) (BLCKSZ / 32))
+
+/*
+ * Number of pages and bytes per allocation chunk. Indexes can squeeze 50%
+ * more pages into the same space because they don't need to remember how much
+ * free space is on each page. The nominal number of pages, CHUNKPAGES, is for
+ * regular rels, and INDEXCHUNKPAGES is for indexes. CHUNKPAGES should be
+ * even so that no space is wasted in the index case.
+ */
+#define CHUNKPAGES 16
+#define CHUNKBYTES (CHUNKPAGES * sizeof(FSMPageData))
+#define INDEXCHUNKPAGES ((int) (CHUNKBYTES / sizeof(IndexFSMPageData)))
+
+
+/*
+ * Typedefs and macros for items in the page-storage arena. We use the
+ * existing ItemPointer and BlockId data structures, which are designed
+ * to pack well (they should be 6 and 4 bytes apiece regardless of machine
+ * alignment issues). Unfortunately we can't use the ItemPointer access
+ * macros, because they include Asserts insisting that ip_posid != 0.
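The allocation arithmetic described above is easy to check in isolation. A standalone sketch with invented relation sizes and arena size; the edge cases of the real fsm_calc_request may differ slightly, and the real code must also guard against sumRequests being zero:

    /* compile with -lm for rint() */
    #include <math.h>
    #include <stdio.h>

    #define CHUNKPAGES 16

    int
    main(void)
    {
        int     pageCounts[] = {5, 40, 1000};   /* made-up RRFS page counts */
        int     numRels = 3;
        int     totalChunks = 40;               /* made-up arena size */
        int     requests[3];
        double  sumRequests = 0;
        int     spareChunks;

        /* myRequest = chunks needed beyond the first, guaranteed one */
        for (int i = 0; i < numRels; i++)
        {
            requests[i] = (pageCounts[i] <= CHUNKPAGES) ? 0 :
                (pageCounts[i] - 1) / CHUNKPAGES;
            sumRequests += requests[i];
        }
        spareChunks = totalChunks - numRels;

        for (int i = 0; i < numRels; i++)
        {
            int alloc = 1 +
                (int) rint((double) spareChunks * requests[i] / sumRequests);

            printf("rel %d: request %d, allocation %d chunks (%d pages)\n",
                   i, requests[i], alloc, alloc * CHUNKPAGES);
        }
        return 0;
    }

With these numbers the requests are 0, 2, and 62 chunks, and the allocations (1, 2, and 37 chunks) happen to sum exactly to the 40 available; as the comment warns, roundoff can instead overrun, which is why the computed allocation must still be checked against remaining space.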
+ */ +typedef ItemPointerData FSMPageData; +typedef BlockIdData IndexFSMPageData; + +#define FSMPageGetPageNum(ptr) \ + BlockIdGetBlockNumber(&(ptr)->ip_blkid) +#define FSMPageGetSpace(ptr) \ + ((Size) (ptr)->ip_posid) +#define FSMPageSetPageNum(ptr, pg) \ + BlockIdSet(&(ptr)->ip_blkid, pg) +#define FSMPageSetSpace(ptr, sz) \ + ((ptr)->ip_posid = (OffsetNumber) (sz)) +#define IndexFSMPageGetPageNum(ptr) \ + BlockIdGetBlockNumber(ptr) +#define IndexFSMPageSetPageNum(ptr, pg) \ + BlockIdSet(ptr, pg) + + /* * Shared free-space-map objects * + * The per-relation objects are indexed by a hash table, and are also members + * of two linked lists: one ordered by recency of usage (most recent first), + * and the other ordered by physical location of the associated storage in + * the page-info arena. + * + * Each relation owns one or more chunks of per-page storage in the "arena". + * The chunks for each relation are always consecutive, so that it can treat + * its page storage as a simple array. We further insist that its page data + * be ordered by block number, so that binary search is possible. + * * Note: we handle pointers to these items as pointers, not as SHMEM_OFFSETs. * This assumes that all processes accessing the map will have the shared * memory segment mapped at the same place in their address space. */ typedef struct FSMHeader FSMHeader; typedef struct FSMRelation FSMRelation; -typedef struct FSMChunk FSMChunk; /* Header for whole map */ struct FSMHeader { HTAB *relHash; /* hashtable of FSMRelation entries */ - FSMRelation *relList; /* FSMRelations in useCount order */ - FSMRelation *relListTail; /* tail of FSMRelation list */ + FSMRelation *usageList; /* FSMRelations in usage-recency order */ + FSMRelation *usageListTail; /* tail of usage-recency list */ + FSMRelation *firstRel; /* FSMRelations in arena storage order */ + FSMRelation *lastRel; /* tail of storage-order list */ int numRels; /* number of FSMRelations now in use */ - FSMChunk *freeChunks; /* linked list of currently-free chunks */ - int numFreeChunks; /* number of free chunks */ + double sumRequests; /* sum of requested chunks over all rels */ + char *arena; /* arena for page-info storage */ + int totalChunks; /* total size of arena, in chunks */ + int usedChunks; /* # of chunks assigned */ + /* NB: there are totalChunks - usedChunks free chunks at end of arena */ }; /* @@ -90,36 +150,16 @@ struct FSMHeader struct FSMRelation { RelFileNode key; /* hash key (must be first) */ - FSMRelation *nextRel; /* next rel in useCount order */ - FSMRelation *priorRel; /* prior rel in useCount order */ - int useCount; /* use count for prioritizing rels */ - Size threshold; /* minimum amount of free space to keep */ + FSMRelation *nextUsage; /* next rel in usage-recency order */ + FSMRelation *priorUsage; /* prior rel in usage-recency order */ + FSMRelation *nextPhysical; /* next rel in arena-storage order */ + FSMRelation *priorPhysical; /* prior rel in arena-storage order */ + bool isIndex; /* if true, we store only page numbers */ + Size avgRequest; /* moving average of space requests */ + int lastPageCount; /* pages passed to RecordRelationFreeSpace */ + int firstChunk; /* chunk # of my first chunk in arena */ + int storedPages; /* # of pages stored in arena */ int nextPage; /* index (from 0) to start next search at */ - int numPages; /* total number of pages we have info - * about */ - int numChunks; /* number of FSMChunks allocated to rel */ - FSMChunk *relChunks; /* linked list of page info chunks */ - FSMChunk *lastChunk; /* 
last chunk in linked list */ -}; - -/* - * Info about individual pages in a relation is stored in chunks to reduce - * allocation overhead. Note that we allow any chunk of a relation's list - * to be partly full, not only the last chunk; this speeds deletion of - * individual page entries. When the total number of unused slots reaches - * CHUNKPAGES, we compact out the unused slots so that we can return a chunk - * to the freelist; but there's no point in doing the compaction before that. - */ - -#define CHUNKPAGES 32 /* each chunk can store this many pages */ - -struct FSMChunk -{ - FSMChunk *next; /* linked-list link */ - int numPages; /* number of pages described here */ - BlockNumber pages[CHUNKPAGES]; /* page numbers within relation */ - ItemLength bytes[CHUNKPAGES]; /* free space available on each - * page */ }; @@ -132,24 +172,26 @@ static FSMHeader *FreeSpaceMap; /* points to FSMHeader in shared memory */ static FSMRelation *lookup_fsm_rel(RelFileNode *rel); static FSMRelation *create_fsm_rel(RelFileNode *rel); static void delete_fsm_rel(FSMRelation *fsmrel); -static void link_fsm_rel_after(FSMRelation *fsmrel, FSMRelation *oldrel); -static void unlink_fsm_rel(FSMRelation *fsmrel); -static void free_chunk_chain(FSMChunk *fchunk); +static void link_fsm_rel_usage(FSMRelation *fsmrel); +static void unlink_fsm_rel_usage(FSMRelation *fsmrel); +static void link_fsm_rel_storage(FSMRelation *fsmrel); +static void unlink_fsm_rel_storage(FSMRelation *fsmrel); static BlockNumber find_free_space(FSMRelation *fsmrel, Size spaceNeeded); +static BlockNumber find_index_free_space(FSMRelation *fsmrel); static void fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail); static bool lookup_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page, - FSMChunk **outChunk, int *outChunkRelIndex); -static bool insert_fsm_page_entry(FSMRelation *fsmrel, - BlockNumber page, Size spaceAvail, - FSMChunk *chunk, int chunkRelIndex); -static bool append_fsm_chunk(FSMRelation *fsmrel); -static bool push_fsm_page_entry(BlockNumber page, Size spaceAvail, - FSMChunk *chunk, int chunkRelIndex); -static void delete_fsm_page_entry(FSMRelation *fsmrel, FSMChunk *chunk, - int chunkRelIndex); -static void compact_fsm_page_list(FSMRelation *fsmrel); -static void acquire_fsm_free_space(void); + int *outPageIndex); +static void compact_fsm_storage(void); +static void push_fsm_rels_after(FSMRelation *afterRel); +static void pack_incoming_pages(FSMPageData *newLocation, int newPages, + PageFreeSpaceInfo *pageSpaces, int nPages); +static void pack_existing_pages(FSMPageData *newLocation, int newPages, + FSMPageData *oldLocation, int oldPages); +static int fsm_calc_request(FSMRelation *fsmrel); +static int fsm_calc_target_allocation(int myRequest); +static int fsm_current_chunks(FSMRelation *fsmrel); +static int fsm_current_allocation(FSMRelation *fsmrel); /* @@ -170,8 +212,6 @@ void InitFreeSpaceMap(void) { HASHCTL info; - FSMChunk *chunks, - *prevchunk; int nchunks; /* Create table header */ @@ -194,22 +234,20 @@ InitFreeSpaceMap(void) if (!FreeSpaceMap->relHash) elog(FATAL, "Insufficient shared memory for free space map"); - /* Allocate FSMChunks and fill up the free-chunks list */ + /* Allocate page-storage arena */ nchunks = (MaxFSMPages - 1) / CHUNKPAGES + 1; - FreeSpaceMap->numFreeChunks = nchunks; + /* This check ensures spareChunks will be greater than zero */ + if (nchunks <= MaxFSMRelations) + elog(FATAL, "max_fsm_pages must exceed max_fsm_relations * %d", + CHUNKPAGES); - chunks = (FSMChunk *) 
ShmemAlloc(nchunks * sizeof(FSMChunk)); - if (chunks == NULL) + FreeSpaceMap->arena = (char *) ShmemAlloc(nchunks * CHUNKBYTES); + if (FreeSpaceMap->arena == NULL) elog(FATAL, "Insufficient shared memory for free space map"); - prevchunk = NULL; - while (nchunks-- > 0) - { - chunks->next = prevchunk; - prevchunk = chunks; - chunks++; - } - FreeSpaceMap->freeChunks = prevchunk; + FreeSpaceMap->totalChunks = nchunks; + FreeSpaceMap->usedChunks = 0; + FreeSpaceMap->sumRequests = 0; } /* @@ -227,10 +265,13 @@ FreeSpaceShmemSize(void) /* hash table, including the FSMRelation objects */ size += hash_estimate_size(MaxFSMRelations, sizeof(FSMRelation)); - /* FSMChunk objects */ + /* page-storage arena */ nchunks = (MaxFSMPages - 1) / CHUNKPAGES + 1; - size += MAXALIGN(nchunks * sizeof(FSMChunk)); + if (nchunks >= (INT_MAX / CHUNKBYTES)) + elog(FATAL, "max_fsm_pages is too large"); + + size += MAXALIGN(nchunks * CHUNKBYTES); return size; } @@ -244,8 +285,9 @@ FreeSpaceShmemSize(void) * The caller must be prepared for the possibility that the returned page * will turn out to have too little space available by the time the caller * gets a lock on it. In that case, the caller should report the actual - * amount of free space available on that page (via RecordFreeSpace) and - * then try again. If InvalidBlockNumber is returned, extend the relation. + * amount of free space available on that page and then try again (see + * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned, + * extend the relation. */ BlockNumber GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded) @@ -261,21 +303,17 @@ GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded) fsmrel = create_fsm_rel(rel); /* - * Adjust the threshold towards the space request. This essentially - * implements an exponential moving average with an equivalent period - * of about 63 requests. Ignore silly requests, however, to ensure - * that the average stays in bounds. - * - * In theory, if the threshold increases here we should immediately - * delete any pages that fall below the new threshold. In practice it - * seems OK to wait until we have a need to compact space. + * Update the moving average of space requests. This code implements an + * exponential moving average with an equivalent period of about 63 + * requests. Ignore silly requests, however, to ensure that the average + * stays sane. */ if (spaceNeeded > 0 && spaceNeeded < BLCKSZ) { - int cur_avg = (int) fsmrel->threshold; + int cur_avg = (int) fsmrel->avgRequest; cur_avg += ((int) spaceNeeded - cur_avg) / 32; - fsmrel->threshold = (Size) cur_avg; + fsmrel->avgRequest = (Size) cur_avg; } freepage = find_free_space(fsmrel, spaceNeeded); LWLockRelease(FreeSpaceLock); @@ -283,37 +321,10 @@ GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded) } /* - * RecordFreeSpace - record the amount of free space available on a page. + * RecordAndGetPageWithFreeSpace - update info about a page and try again. * - * The FSM is at liberty to forget about the page instead, if the amount of - * free space is too small to be interesting. The only guarantee is that - * a subsequent request to get a page with more than spaceAvail space free - * will not return this page. 
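The avgRequest update above is a fixed-point exponential moving average: each request moves the average 1/32 of the way toward the new value, and with alpha = 2/(N+1) = 1/32 that corresponds to the comment's equivalent period of about 63 requests. A self-contained demonstration of the convergence behavior (the BLCKSZ value is the usual default, assumed here):

    #include <stdio.h>

    #define BLCKSZ 8192

    int
    main(void)
    {
        int cur_avg = BLCKSZ / 32;      /* INITIAL_AVERAGE */

        /*
         * Feed 200 identical 500-byte requests; the average climbs from 256
         * and stalls within one step (32 bytes) of 500, since the integer
         * division truncates once the gap drops below 32.
         */
        for (int i = 1; i <= 200; i++)
        {
            int spaceNeeded = 500;

            /* the same update GetPageWithFreeSpace applies to avgRequest */
            if (spaceNeeded > 0 && spaceNeeded < BLCKSZ)
                cur_avg += (spaceNeeded - cur_avg) / 32;
            if (i % 50 == 0)
                printf("after %3d requests: avg = %d\n", i, cur_avg);
        }
        return 0;
    }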
- */ -void -RecordFreeSpace(RelFileNode *rel, BlockNumber page, Size spaceAvail) -{ - FSMRelation *fsmrel; - - /* Sanity check: ensure spaceAvail will fit into ItemLength */ - AssertArg(spaceAvail < BLCKSZ); - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - - /* - * We choose not to add rels to the hashtable unless they've been - * inquired about with GetPageWithFreeSpace. Also, a Record operation - * does not increase the useCount or change threshold, only Get does. - */ - fsmrel = lookup_fsm_rel(rel); - if (fsmrel) - fsm_record_free_space(fsmrel, page, spaceAvail); - LWLockRelease(FreeSpaceLock); -} - -/* - * RecordAndGetPageWithFreeSpace - combo form to save one lock and - * hash table lookup cycle. + * We provide this combo form, instead of a separate Record operation, + * to save one lock and hash table lookup cycle. */ BlockNumber RecordAndGetPageWithFreeSpace(RelFileNode *rel, @@ -324,7 +335,7 @@ RecordAndGetPageWithFreeSpace(RelFileNode *rel, FSMRelation *fsmrel; BlockNumber freepage; - /* Sanity check: ensure spaceAvail will fit into ItemLength */ + /* Sanity check: ensure spaceAvail will fit into OffsetNumber */ AssertArg(oldSpaceAvail < BLCKSZ); LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); @@ -334,23 +345,19 @@ RecordAndGetPageWithFreeSpace(RelFileNode *rel, */ fsmrel = create_fsm_rel(rel); + /* Do the Record */ + fsm_record_free_space(fsmrel, oldPage, oldSpaceAvail); /* - * Adjust the threshold towards the space request, same as in + * Update the moving average of space requests, same as in * GetPageWithFreeSpace. - * - * Note that we do this before storing data for oldPage, which means this - * isn't exactly equivalent to Record followed by Get; but it seems - * appropriate to adjust the threshold first. */ if (spaceNeeded > 0 && spaceNeeded < BLCKSZ) { - int cur_avg = (int) fsmrel->threshold; + int cur_avg = (int) fsmrel->avgRequest; cur_avg += ((int) spaceNeeded - cur_avg) / 32; - fsmrel->threshold = (Size) cur_avg; + fsmrel->avgRequest = (Size) cur_avg; } - /* Do the Record */ - fsm_record_free_space(fsmrel, oldPage, oldSpaceAvail); /* Do the Get */ freepage = find_free_space(fsmrel, spaceNeeded); LWLockRelease(FreeSpaceLock); @@ -358,91 +365,270 @@ RecordAndGetPageWithFreeSpace(RelFileNode *rel, } /* - * MultiRecordFreeSpace - record available-space info about multiple pages - * of a relation in one call. + * GetAvgFSMRequestSize - get average FSM request size for a relation. + * + * If the relation is not known to FSM, return a default value. + */ +Size +GetAvgFSMRequestSize(RelFileNode *rel) +{ + Size result; + FSMRelation *fsmrel; + + LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + fsmrel = lookup_fsm_rel(rel); + if (fsmrel) + result = fsmrel->avgRequest; + else + result = INITIAL_AVERAGE; + LWLockRelease(FreeSpaceLock); + return result; +} + +/* + * RecordRelationFreeSpace - record available-space info about a relation. * - * First, the FSM must discard any entries it has for pages >= minPage. - * This allows obsolete info to be discarded (for example, it is used when - * truncating a relation). Any entries before minPage should be kept. + * Any pre-existing info about the relation is assumed obsolete and discarded. * - * Second, if nPages > 0, record the page numbers and free space amounts in - * the given pageSpaces[] array. As with RecordFreeSpace, the FSM is at - * liberty to discard some of this information. The pageSpaces[] array must - * be sorted in order by blkno, and may not contain entries before minPage. 
+ * The given pageSpaces[] array must be sorted in order by blkno. Note that + * the FSM is at liberty to discard some or all of the data. */ void -MultiRecordFreeSpace(RelFileNode *rel, - BlockNumber minPage, - int nPages, - PageFreeSpaceInfo *pageSpaces) +RecordRelationFreeSpace(RelFileNode *rel, + int nPages, + PageFreeSpaceInfo *pageSpaces) { FSMRelation *fsmrel; - int i; + + /* Limit nPages to something sane */ + if (nPages < 0) + nPages = 0; + else if (nPages > MaxFSMPages) + nPages = MaxFSMPages; LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + /* + * Note we don't record info about a relation unless there's already + * an FSM entry for it, implying someone has done GetPageWithFreeSpace + * for it. Inactive rels thus will not clutter the map simply by being + * vacuumed. + */ fsmrel = lookup_fsm_rel(rel); if (fsmrel) { + int myRequest; + int myAlloc; + int curAlloc; + int curAllocPages; + FSMPageData *newLocation; + /* - * Remove entries >= minPage + * Delete existing entries, and update request status. */ - FSMChunk *chunk; - int chunkRelIndex; - - /* Use lookup to locate first entry >= minPage */ - lookup_fsm_page_entry(fsmrel, minPage, &chunk, &chunkRelIndex); - /* Set free space to 0 for each page >= minPage */ - while (chunk) + fsmrel->storedPages = 0; + FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel); + fsmrel->lastPageCount = nPages; + fsmrel->isIndex = false; + myRequest = fsm_calc_request(fsmrel); + FreeSpaceMap->sumRequests += myRequest; + myAlloc = fsm_calc_target_allocation(myRequest); + /* + * Need to reallocate space if (a) my target allocation is more + * than my current allocation, AND (b) my actual immediate need + * (myRequest+1 chunks) is more than my current allocation. + * Otherwise just store the new data in-place. + */ + curAlloc = fsm_current_allocation(fsmrel); + if (myAlloc > curAlloc && (myRequest+1) > curAlloc && nPages > 0) { - int numPages = chunk->numPages; - - for (i = chunkRelIndex; i < numPages; i++) - chunk->bytes[i] = 0; - chunk = chunk->next; - chunkRelIndex = 0; + /* Remove entry from storage list, and compact */ + unlink_fsm_rel_storage(fsmrel); + compact_fsm_storage(); + /* Reattach to end of storage list */ + link_fsm_rel_storage(fsmrel); + /* And allocate storage */ + fsmrel->firstChunk = FreeSpaceMap->usedChunks; + FreeSpaceMap->usedChunks += myAlloc; + curAlloc = myAlloc; + /* Watch out for roundoff error */ + if (FreeSpaceMap->usedChunks > FreeSpaceMap->totalChunks) + { + FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks; + curAlloc = FreeSpaceMap->totalChunks - fsmrel->firstChunk; + } } - /* Now compact out the zeroed entries, along with any other junk */ - compact_fsm_page_list(fsmrel); - /* - * Add new entries, if appropriate. - * - * This can be much cheaper than a full fsm_record_free_space() - * call because we know we are appending to the end of the relation. + * If the data fits in our current allocation, just copy it; + * otherwise must compress. */ - for (i = 0; i < nPages; i++) + newLocation = (FSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + curAllocPages = curAlloc * CHUNKPAGES; + if (nPages <= curAllocPages) { - BlockNumber page = pageSpaces[i].blkno; - Size avail = pageSpaces[i].avail; - FSMChunk *chunk; + int i; - /* Check caller provides sorted data */ - if (i > 0 ? 
(page <= pageSpaces[i-1].blkno) : (page < minPage)) - elog(ERROR, "MultiRecordFreeSpace: data not in page order"); + for (i = 0; i < nPages; i++) + { + BlockNumber page = pageSpaces[i].blkno; + Size avail = pageSpaces[i].avail; + + /* Check caller provides sorted data */ + if (i > 0 && page <= pageSpaces[i-1].blkno) + elog(ERROR, "RecordRelationFreeSpace: data not in page order"); + FSMPageSetPageNum(newLocation, page); + FSMPageSetSpace(newLocation, avail); + newLocation++; + } + fsmrel->storedPages = nPages; + } + else + { + pack_incoming_pages(newLocation, curAllocPages, + pageSpaces, nPages); + fsmrel->storedPages = curAllocPages; + } + } + LWLockRelease(FreeSpaceLock); +} - /* Ignore pages too small to fit */ - if (avail < fsmrel->threshold) - continue; +/* + * GetFreeIndexPage - like GetPageWithFreeSpace, but for indexes + */ +BlockNumber +GetFreeIndexPage(RelFileNode *rel) +{ + FSMRelation *fsmrel; + BlockNumber freepage; + + LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + + /* + * We always add a rel to the hashtable when it is inquired about. + */ + fsmrel = create_fsm_rel(rel); + + freepage = find_index_free_space(fsmrel); + LWLockRelease(FreeSpaceLock); + return freepage; +} + +/* + * RecordIndexFreeSpace - like RecordRelationFreeSpace, but for indexes + */ +void +RecordIndexFreeSpace(RelFileNode *rel, + int nPages, + BlockNumber *pages) +{ + FSMRelation *fsmrel; + + /* Limit nPages to something sane */ + if (nPages < 0) + nPages = 0; + else if (nPages > MaxFSMPages) + nPages = MaxFSMPages; + + LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + /* + * Note we don't record info about a relation unless there's already + * an FSM entry for it, implying someone has done GetFreeIndexPage + * for it. Inactive rels thus will not clutter the map simply by being + * vacuumed. + */ + fsmrel = lookup_fsm_rel(rel); + if (fsmrel) + { + int myRequest; + int myAlloc; + int curAlloc; + int curAllocPages; + int i; + IndexFSMPageData *newLocation; - /* Get another chunk if needed */ - /* We may need to loop if acquire_fsm_free_space() fails */ - while ((chunk = fsmrel->lastChunk) == NULL || - chunk->numPages >= CHUNKPAGES) + /* + * Delete existing entries, and update request status. + */ + fsmrel->storedPages = 0; + FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel); + fsmrel->lastPageCount = nPages; + fsmrel->isIndex = true; + myRequest = fsm_calc_request(fsmrel); + FreeSpaceMap->sumRequests += myRequest; + myAlloc = fsm_calc_target_allocation(myRequest); + /* + * Need to reallocate space if (a) my target allocation is more + * than my current allocation, AND (b) my actual immediate need + * (myRequest+1 chunks) is more than my current allocation. + * Otherwise just store the new data in-place. 
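Both record paths use the same reallocate-or-store-in-place guard. Restated as a standalone predicate with invented inputs (the helper name is hypothetical):

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Reallocate arena space only if the target allocation grew AND the
     * immediate need (myRequest chunks plus the guaranteed first chunk)
     * exceeds what the relation already owns, and there is data to store.
     */
    static bool
    needs_reallocation(int myAlloc, int myRequest, int curAlloc, int nPages)
    {
        return myAlloc > curAlloc && (myRequest + 1) > curAlloc && nPages > 0;
    }

    int
    main(void)
    {
        /* target grew (3 > 2) but 1+1 chunks still fit in the current 2 */
        printf("%d\n", needs_reallocation(3, 1, 2, 100));   /* 0: in place */
        /* target grew and current space is genuinely too small */
        printf("%d\n", needs_reallocation(3, 2, 2, 100));   /* 1: reallocate */
        /* nothing to store, never reallocate */
        printf("%d\n", needs_reallocation(3, 2, 2, 0));     /* 0 */
        return 0;
    }

Deferring the move until the immediate need demands it is what keeps small request-count fluctuations from shuffling the arena on every VACUUM.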
+ */ + curAlloc = fsm_current_allocation(fsmrel); + if (myAlloc > curAlloc && (myRequest+1) > curAlloc && nPages > 0) + { + /* Remove entry from storage list, and compact */ + unlink_fsm_rel_storage(fsmrel); + compact_fsm_storage(); + /* Reattach to end of storage list */ + link_fsm_rel_storage(fsmrel); + /* And allocate storage */ + fsmrel->firstChunk = FreeSpaceMap->usedChunks; + FreeSpaceMap->usedChunks += myAlloc; + curAlloc = myAlloc; + /* Watch out for roundoff error */ + if (FreeSpaceMap->usedChunks > FreeSpaceMap->totalChunks) { - if (!append_fsm_chunk(fsmrel)) - acquire_fsm_free_space(); + FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks; + curAlloc = FreeSpaceMap->totalChunks - fsmrel->firstChunk; } + } + /* + * If the data fits in our current allocation, just copy it; + * otherwise must compress. But compression is easy: we merely + * forget extra pages. + */ + newLocation = (IndexFSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + curAllocPages = curAlloc * INDEXCHUNKPAGES; + if (nPages > curAllocPages) + nPages = curAllocPages; - /* Recheck in case threshold was raised by acquire */ - if (avail < fsmrel->threshold) - continue; + for (i = 0; i < nPages; i++) + { + BlockNumber page = pages[i]; - /* Okay to store */ - chunk->pages[chunk->numPages] = page; - chunk->bytes[chunk->numPages] = (ItemLength) avail; - chunk->numPages++; - fsmrel->numPages++; + /* Check caller provides sorted data */ + if (i > 0 && page <= pages[i-1]) + elog(ERROR, "RecordIndexFreeSpace: data not in page order"); + IndexFSMPageSetPageNum(newLocation, page); + newLocation++; } + fsmrel->storedPages = nPages; + } + LWLockRelease(FreeSpaceLock); +} + +/* + * FreeSpaceMapTruncateRel - adjust for truncation of a relation. + * + * We need to delete any stored data past the new relation length, so that + * we don't bogusly return removed block numbers. + */ +void +FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks) +{ + FSMRelation *fsmrel; + + LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + fsmrel = lookup_fsm_rel(rel); + if (fsmrel) + { + int pageIndex; + + /* Use lookup to locate first entry >= nblocks */ + (void) lookup_fsm_page_entry(fsmrel, nblocks, &pageIndex); + /* Delete all such entries */ + fsmrel->storedPages = pageIndex; + /* XXX should we adjust rel's lastPageCount and sumRequests? */ } LWLockRelease(FreeSpaceLock); } @@ -482,15 +668,53 @@ FreeSpaceMapForgetDatabase(Oid dbid) *nextrel; LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - for (fsmrel = FreeSpaceMap->relList; fsmrel; fsmrel = nextrel) + for (fsmrel = FreeSpaceMap->usageList; fsmrel; fsmrel = nextrel) { - nextrel = fsmrel->nextRel; /* in case we delete it */ + nextrel = fsmrel->nextUsage; /* in case we delete it */ if (fsmrel->key.tblNode == dbid) delete_fsm_rel(fsmrel); } LWLockRelease(FreeSpaceLock); } +/* + * PrintFreeSpaceMapStatistics - print statistics about FSM contents + * + * The info is sent to elog() with the specified message level. This is + * intended for use during VACUUM. 
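The statistics this routine prints convert chunk-denominated demand back into page slots: every relation needs its requested chunks plus its one guaranteed chunk, hence the needed = (sumRequests + numRels) * CHUNKPAGES computation in the function below. A trivial check with invented numbers:

    #include <stdio.h>

    #define CHUNKPAGES 16

    int
    main(void)
    {
        double  sumRequests = 150.0;    /* invented: sum of per-rel requests */
        int     numRels = 40;           /* invented: tracked relations */

        /* every relation gets one guaranteed chunk plus its request */
        double  needed = (sumRequests + numRels) * CHUNKPAGES;

        printf("%.0f page slots needed\n", needed); /* 3040 */
        /* compare against max_fsm_pages when tuning the server */
        return 0;
    }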
+ */ +void +PrintFreeSpaceMapStatistics(int elevel) +{ + FSMRelation *fsmrel; + int storedPages = 0; + int numRels; + double sumRequests; + double needed; + + LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + /* Count total space used --- tedious, but seems useful */ + for (fsmrel = FreeSpaceMap->firstRel; + fsmrel != NULL; + fsmrel = fsmrel->nextPhysical) + { + storedPages += fsmrel->storedPages; + } + /* Copy other stats before dropping lock */ + numRels = FreeSpaceMap->numRels; + sumRequests = FreeSpaceMap->sumRequests; + LWLockRelease(FreeSpaceLock); + + /* Convert stats to actual number of page slots needed */ + needed = (sumRequests + numRels) * CHUNKPAGES; + + elog(elevel, "Free space map: %d relations, %d pages stored; %.0f total pages needed." + "\n\tAllocated FSM size: %d relations + %d pages = %.0f KB shared mem.", + numRels, storedPages, needed, + MaxFSMRelations, MaxFSMPages, + (double) FreeSpaceShmemSize() / 1024.0); +} + /* * Internal routines. These all assume the caller holds the FreeSpaceLock. @@ -498,7 +722,8 @@ FreeSpaceMapForgetDatabase(Oid dbid) /* * Lookup a relation in the hash table. If not present, return NULL. - * The relation's useCount is not changed. + * + * The relation's position in the LRU list is not changed. */ static FSMRelation * lookup_fsm_rel(RelFileNode *rel) @@ -518,14 +743,12 @@ lookup_fsm_rel(RelFileNode *rel) /* * Lookup a relation in the hash table, creating an entry if not present. * - * On successful lookup, the relation's useCount is incremented, possibly - * causing it to move up in the priority list. + * On successful lookup, the relation is moved to the front of the LRU list. */ static FSMRelation * create_fsm_rel(RelFileNode *rel) { - FSMRelation *fsmrel, - *oldrel; + FSMRelation *fsmrel; bool found; fsmrel = (FSMRelation *) hash_search(FreeSpaceMap->relHash, @@ -538,49 +761,31 @@ create_fsm_rel(RelFileNode *rel) if (!found) { /* New hashtable entry, initialize it (hash_search set the key) */ - fsmrel->useCount = 1; - fsmrel->threshold = BLCKSZ / 2; /* starting point for new entry */ + fsmrel->isIndex = false; /* until we learn different */ + fsmrel->avgRequest = INITIAL_AVERAGE; + fsmrel->lastPageCount = 0; + fsmrel->firstChunk = -1; /* no space allocated */ + fsmrel->storedPages = 0; fsmrel->nextPage = 0; - fsmrel->numPages = 0; - fsmrel->numChunks = 0; - fsmrel->relChunks = NULL; - fsmrel->lastChunk = NULL; + /* Discard lowest-priority existing rel, if we are over limit */ if (FreeSpaceMap->numRels >= MaxFSMRelations) - delete_fsm_rel(FreeSpaceMap->relListTail); + delete_fsm_rel(FreeSpaceMap->usageListTail); - /* - * Add new entry in front of any others with useCount 1 (since it - * is more recently used than them). 
- */ - oldrel = FreeSpaceMap->relListTail; - while (oldrel && oldrel->useCount <= 1) - oldrel = oldrel->priorRel; - link_fsm_rel_after(fsmrel, oldrel); + /* Add new entry at front of LRU list */ + link_fsm_rel_usage(fsmrel); + fsmrel->nextPhysical = NULL; /* not in physical-storage list */ + fsmrel->priorPhysical = NULL; FreeSpaceMap->numRels++; + /* sumRequests is unchanged because request must be zero */ } else { - int myCount; - - /* Existing entry, advance its useCount */ - if (++(fsmrel->useCount) >= INT_MAX / 2) - { - /* When useCounts threaten to overflow, reduce 'em all 2X */ - for (oldrel = FreeSpaceMap->relList; - oldrel != NULL; - oldrel = oldrel->nextRel) - oldrel->useCount >>= 1; - } - /* If warranted, move it up the priority list */ - oldrel = fsmrel->priorRel; - myCount = fsmrel->useCount; - if (oldrel && oldrel->useCount <= myCount) + /* Existing entry, move to front of LRU list */ + if (fsmrel->priorUsage != NULL) { - unlink_fsm_rel(fsmrel); - while (oldrel && oldrel->useCount <= myCount) - oldrel = oldrel->priorRel; - link_fsm_rel_after(fsmrel, oldrel); + unlink_fsm_rel_usage(fsmrel); + link_fsm_rel_usage(fsmrel); } } @@ -595,8 +800,9 @@ delete_fsm_rel(FSMRelation *fsmrel) { FSMRelation *result; - free_chunk_chain(fsmrel->relChunks); - unlink_fsm_rel(fsmrel); + FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel); + unlink_fsm_rel_usage(fsmrel); + unlink_fsm_rel_storage(fsmrel); FreeSpaceMap->numRels--; result = (FSMRelation *) hash_search(FreeSpaceMap->relHash, (void *) &(fsmrel->key), @@ -607,74 +813,78 @@ delete_fsm_rel(FSMRelation *fsmrel) } /* - * Link a FSMRelation into the priority list just after the given existing - * entry (or at the head of the list, if oldrel is NULL). + * Link a FSMRelation into the LRU list (always at the head). */ static void -link_fsm_rel_after(FSMRelation *fsmrel, FSMRelation *oldrel) +link_fsm_rel_usage(FSMRelation *fsmrel) { - if (oldrel == NULL) - { - /* insert at head */ - fsmrel->priorRel = NULL; - fsmrel->nextRel = FreeSpaceMap->relList; - FreeSpaceMap->relList = fsmrel; - if (fsmrel->nextRel != NULL) - fsmrel->nextRel->priorRel = fsmrel; - else - FreeSpaceMap->relListTail = fsmrel; - } + fsmrel->priorUsage = NULL; + fsmrel->nextUsage = FreeSpaceMap->usageList; + FreeSpaceMap->usageList = fsmrel; + if (fsmrel->nextUsage != NULL) + fsmrel->nextUsage->priorUsage = fsmrel; else - { - /* insert after oldrel */ - fsmrel->priorRel = oldrel; - fsmrel->nextRel = oldrel->nextRel; - oldrel->nextRel = fsmrel; - if (fsmrel->nextRel != NULL) - fsmrel->nextRel->priorRel = fsmrel; - else - FreeSpaceMap->relListTail = fsmrel; - } + FreeSpaceMap->usageListTail = fsmrel; } /* - * Delink a FSMRelation from the priority list. + * Delink a FSMRelation from the LRU list. */ static void -unlink_fsm_rel(FSMRelation *fsmrel) +unlink_fsm_rel_usage(FSMRelation *fsmrel) { - if (fsmrel->priorRel != NULL) - fsmrel->priorRel->nextRel = fsmrel->nextRel; + if (fsmrel->priorUsage != NULL) + fsmrel->priorUsage->nextUsage = fsmrel->nextUsage; else - FreeSpaceMap->relList = fsmrel->nextRel; - if (fsmrel->nextRel != NULL) - fsmrel->nextRel->priorRel = fsmrel->priorRel; + FreeSpaceMap->usageList = fsmrel->nextUsage; + if (fsmrel->nextUsage != NULL) + fsmrel->nextUsage->priorUsage = fsmrel->priorUsage; else - FreeSpaceMap->relListTail = fsmrel->priorRel; + FreeSpaceMap->usageListTail = fsmrel->priorUsage; + /* + * We don't bother resetting fsmrel's links, since it's about to be + * deleted or relinked at the head. 
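The usage list maintained by these routines is a textbook intrusive doubly-linked LRU: touching an entry means unlink-then-relink at the head, and eviction takes usageListTail. The same discipline in a free-standing model (simple int ids instead of FSMRelations):

    #include <stdio.h>

    typedef struct Node
    {
        int         id;
        struct Node *prior;
        struct Node *next;
    } Node;

    static Node *head, *tail;

    static void
    link_at_head(Node *n)
    {
        n->prior = NULL;
        n->next = head;
        if (head)
            head->prior = n;
        else
            tail = n;
        head = n;
    }

    static void
    unlink_node(Node *n)
    {
        if (n->prior)
            n->prior->next = n->next;
        else
            head = n->next;
        if (n->next)
            n->next->prior = n->prior;
        else
            tail = n->prior;
    }

    int
    main(void)
    {
        Node a = {1}, b = {2}, c = {3};

        link_at_head(&a);
        link_at_head(&b);
        link_at_head(&c);               /* order: 3 2 1 */
        unlink_node(&a);
        link_at_head(&a);               /* touch 1: order 1 3 2 */
        printf("LRU victim is %d\n", tail->id); /* prints 2 */
        return 0;
    }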
+ */ } /* - * Return all the FSMChunks in the chain starting at fchunk to the freelist. - * (Caller must handle unlinking them from wherever they were.) + * Link a FSMRelation into the storage-order list (always at the tail). */ static void -free_chunk_chain(FSMChunk *fchunk) +link_fsm_rel_storage(FSMRelation *fsmrel) { - int nchunks; - FSMChunk *lchunk; + fsmrel->nextPhysical = NULL; + fsmrel->priorPhysical = FreeSpaceMap->lastRel; + if (FreeSpaceMap->lastRel != NULL) + FreeSpaceMap->lastRel->nextPhysical = fsmrel; + else + FreeSpaceMap->firstRel = fsmrel; + FreeSpaceMap->lastRel = fsmrel; +} - if (fchunk == NULL) - return; - nchunks = 1; - lchunk = fchunk; - while (lchunk->next != NULL) +/* + * Delink a FSMRelation from the storage-order list, if it's in it. + */ +static void +unlink_fsm_rel_storage(FSMRelation *fsmrel) +{ + if (fsmrel->priorPhysical != NULL || FreeSpaceMap->firstRel == fsmrel) { - nchunks++; - lchunk = lchunk->next; + if (fsmrel->priorPhysical != NULL) + fsmrel->priorPhysical->nextPhysical = fsmrel->nextPhysical; + else + FreeSpaceMap->firstRel = fsmrel->nextPhysical; + if (fsmrel->nextPhysical != NULL) + fsmrel->nextPhysical->priorPhysical = fsmrel->priorPhysical; + else + FreeSpaceMap->lastRel = fsmrel->priorPhysical; } - lchunk->next = FreeSpaceMap->freeChunks; - FreeSpaceMap->freeChunks = fchunk; - FreeSpaceMap->numFreeChunks += nchunks; + /* mark as not in list, since we may not put it back immediately */ + fsmrel->nextPhysical = NULL; + fsmrel->priorPhysical = NULL; + /* Also mark it as having no storage */ + fsmrel->firstChunk = -1; + fsmrel->storedPages = 0; } /* @@ -688,104 +898,108 @@ free_chunk_chain(FSMChunk *fchunk) static BlockNumber find_free_space(FSMRelation *fsmrel, Size spaceNeeded) { + FSMPageData *info; int pagesToCheck, /* outer loop counter */ - pageIndex, /* current page index relative to relation */ - chunkRelIndex; /* current page index relative to curChunk */ - FSMChunk *curChunk; + pageIndex; /* current page index */ + if (fsmrel->isIndex) + elog(ERROR, "find_free_space: called for an index relation"); + info = (FSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); pageIndex = fsmrel->nextPage; /* Last operation may have left nextPage pointing past end */ - if (pageIndex >= fsmrel->numPages) + if (pageIndex >= fsmrel->storedPages) pageIndex = 0; - curChunk = fsmrel->relChunks; - chunkRelIndex = pageIndex; - for (pagesToCheck = fsmrel->numPages; pagesToCheck > 0; pagesToCheck--) + for (pagesToCheck = fsmrel->storedPages; pagesToCheck > 0; pagesToCheck--) { - /* - * Make sure we are in the right chunk. First time through, we - * may have to advance through multiple chunks; subsequent loops - * should do this at most once. - */ - while (chunkRelIndex >= curChunk->numPages) - { - chunkRelIndex -= curChunk->numPages; - curChunk = curChunk->next; - } - /* Check the next page */ - if ((Size) curChunk->bytes[chunkRelIndex] >= spaceNeeded) + FSMPageData *page = info + pageIndex; + Size spaceAvail = FSMPageGetSpace(page); + + /* Check this page */ + if (spaceAvail >= spaceNeeded) { /* - * Found what we want --- adjust the entry. In theory we could - * delete the entry immediately if it drops below threshold, - * but it seems better to wait till we next need space. + * Found what we want --- adjust the entry, and update nextPage. 
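find_free_space is thus a next-fit scan over the relation's slice of the arena: resume at nextPage, wrap around, and visit each stored entry at most once. The same control flow over a plain array, with the space-decrement left out for brevity (the helper name is invented):

    #include <stdio.h>

    /*
     * Next-fit over a circular array: start at *cursor, wrap, check each
     * entry once; on success leave *cursor just past the hit so the next
     * search resumes there.
     */
    static int
    next_fit(const unsigned avail[], int n, int *cursor, unsigned needed)
    {
        int idx = (*cursor >= n) ? 0 : *cursor;

        for (int pagesToCheck = n; pagesToCheck > 0; pagesToCheck--)
        {
            if (avail[idx] >= needed)
            {
                *cursor = idx + 1;
                return idx;
            }
            if (++idx >= n)
                idx = 0;
        }
        return -1;                      /* nothing found */
    }

    int
    main(void)
    {
        unsigned    avail[] = {100, 900, 50, 700};
        int         cursor = 2;

        printf("%d\n", next_fit(avail, 4, &cursor, 600));   /* 3 */
        printf("%d\n", next_fit(avail, 4, &cursor, 600));   /* 1 (wrapped) */
        printf("%d\n", next_fit(avail, 4, &cursor, 2000));  /* -1 */
        return 0;
    }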
*/ - curChunk->bytes[chunkRelIndex] -= (ItemLength) spaceNeeded; + FSMPageSetSpace(page, spaceAvail - spaceNeeded); fsmrel->nextPage = pageIndex + 1; - return curChunk->pages[chunkRelIndex]; + return FSMPageGetPageNum(page); } - /* Advance pageIndex and chunkRelIndex, wrapping around if needed */ - if (++pageIndex >= fsmrel->numPages) - { + /* Advance pageIndex, wrapping around if needed */ + if (++pageIndex >= fsmrel->storedPages) pageIndex = 0; - curChunk = fsmrel->relChunks; - chunkRelIndex = 0; - } - else - chunkRelIndex++; } return InvalidBlockNumber; /* nothing found */ } /* - * fsm_record_free_space - guts of RecordFreeSpace, which is also used by - * RecordAndGetPageWithFreeSpace. + * As above, but for index case --- we only deal in whole pages. + */ +static BlockNumber +find_index_free_space(FSMRelation *fsmrel) +{ + IndexFSMPageData *info; + BlockNumber result; + + /* + * If isIndex isn't set, it could be that RecordIndexFreeSpace() has + * never yet been called on this relation, and we're still looking + * at the default setting from create_fsm_rel(). If so, just act as + * though there's no space. + */ + if (!fsmrel->isIndex) + { + if (fsmrel->storedPages == 0) + return InvalidBlockNumber; + elog(ERROR, "find_index_free_space: called for a non-index relation"); + } + /* + * For indexes, there's no need for the nextPage state variable; we just + * remove and return the first available page. (We could save cycles here + * by returning the last page, but it seems better to encourage re-use + * of lower-numbered pages.) + */ + if (fsmrel->storedPages <= 0) + return InvalidBlockNumber; /* no pages available */ + info = (IndexFSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + result = IndexFSMPageGetPageNum(info); + fsmrel->storedPages--; + memmove(info, info + 1, fsmrel->storedPages * sizeof(IndexFSMPageData)); + return result; +} + +/* + * fsm_record_free_space - guts of RecordFreeSpace operation (now only + * provided as part of RecordAndGetPageWithFreeSpace). */ static void fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail) { - FSMChunk *chunk; - int chunkRelIndex; + int pageIndex; - if (lookup_fsm_page_entry(fsmrel, page, &chunk, &chunkRelIndex)) + if (fsmrel->isIndex) + elog(ERROR, "fsm_record_free_space: called for an index relation"); + if (lookup_fsm_page_entry(fsmrel, page, &pageIndex)) { - /* Found an existing entry for page; update or delete it */ - if (spaceAvail >= fsmrel->threshold) - chunk->bytes[chunkRelIndex] = (ItemLength) spaceAvail; - else - delete_fsm_page_entry(fsmrel, chunk, chunkRelIndex); + /* Found an existing entry for page; update it */ + FSMPageData *info; + + info = (FSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + info += pageIndex; + FSMPageSetSpace(info, spaceAvail); } else { /* - * No existing entry; add one if spaceAvail exceeds threshold. - * - * CORNER CASE: if we have to do acquire_fsm_free_space then our own - * threshold will increase, possibly meaning that we shouldn't - * store the page after all. Loop to redo the test if that - * happens. The loop also covers the possibility that - * acquire_fsm_free_space must be executed more than once to free - * any space (ie, thresholds must be more than doubled). + * No existing entry; ignore the call. We used to add the page + * to the FSM --- but in practice, if the page hasn't got enough + * space to satisfy the caller who's kicking it back to us, then + * it's probably uninteresting to everyone else as well. 
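find_index_free_space above hands out the lowest-numbered stored page and closes the gap with memmove, i.e. a pop-front on a packed array; per its comment, returning the low end encourages re-use of lower-numbered pages. In miniature (the helper name is invented):

    #include <stdio.h>
    #include <string.h>

    typedef unsigned int BlockNumber;

    /* Pop the first (lowest) page number off a packed, sorted array */
    static BlockNumber
    pop_front(BlockNumber *pages, int *storedPages)
    {
        BlockNumber result = pages[0];

        (*storedPages)--;
        memmove(pages, pages + 1, *storedPages * sizeof(BlockNumber));
        return result;
    }

    int
    main(void)
    {
        BlockNumber pages[] = {4, 9, 17};
        int         n = 3;

        printf("got page %u, %d left\n", pop_front(pages, &n), n);  /* 4, 2 */
        printf("got page %u, %d left\n", pop_front(pages, &n), n);  /* 9, 1 */
        return 0;
    }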
*/ - while (spaceAvail >= fsmrel->threshold) - { - if (insert_fsm_page_entry(fsmrel, page, spaceAvail, - chunk, chunkRelIndex)) - break; - /* No space, acquire some and recheck threshold */ - acquire_fsm_free_space(); - if (spaceAvail < fsmrel->threshold) - break; - - /* - * Need to redo the lookup since our own page list may well - * have lost entries, so position is not correct anymore. - */ - if (lookup_fsm_page_entry(fsmrel, page, - &chunk, &chunkRelIndex)) - elog(ERROR, "fsm_record_free_space: unexpected match"); - } } } @@ -793,293 +1007,474 @@ fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail) * Look for an entry for a specific page (block number) in a FSMRelation. * Returns TRUE if a matching entry exists, else FALSE. * - * The output arguments *outChunk, *outChunkRelIndex are set to indicate where - * the entry exists (if TRUE result) or could be inserted (if FALSE result). - * *chunk is set to NULL if there is no place to insert, ie, the entry would - * need to be added to a new chunk. + * The output argument *outPageIndex is set to indicate where the entry exists + * (if TRUE result) or could be inserted (if FALSE result). */ static bool lookup_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page, - FSMChunk **outChunk, int *outChunkRelIndex) + int *outPageIndex) { - FSMChunk *chunk; - int chunkRelIndex; - - for (chunk = fsmrel->relChunks; chunk; chunk = chunk->next) + /* Check for empty relation */ + if (fsmrel->storedPages <= 0) { - int numPages = chunk->numPages; + *outPageIndex = 0; + return false; + } - /* Can skip the chunk quickly if page must be after last in chunk */ - if (numPages > 0 && page <= chunk->pages[numPages - 1]) + /* Do binary search */ + if (fsmrel->isIndex) + { + IndexFSMPageData *info; + int low, + high; + + info = (IndexFSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + low = 0; + high = fsmrel->storedPages - 1; + while (low <= high) { - for (chunkRelIndex = 0; chunkRelIndex < numPages; chunkRelIndex++) + int middle; + BlockNumber probe; + + middle = low + (high - low) / 2; + probe = IndexFSMPageGetPageNum(info + middle); + if (probe == page) { - if (page <= chunk->pages[chunkRelIndex]) - { - *outChunk = chunk; - *outChunkRelIndex = chunkRelIndex; - return (page == chunk->pages[chunkRelIndex]); - } + *outPageIndex = middle; + return true; } - /* Should not get here, given above test */ - Assert(false); + else if (probe < page) + low = middle + 1; + else + high = middle - 1; } - - /* - * If we are about to fall off the end, and there's space - * available in the end chunk, return a pointer to it. - */ - if (chunk->next == NULL && numPages < CHUNKPAGES) + *outPageIndex = low; + return false; + } + else + { + FSMPageData *info; + int low, + high; + + info = (FSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + low = 0; + high = fsmrel->storedPages - 1; + while (low <= high) { - *outChunk = chunk; - *outChunkRelIndex = numPages; - return false; + int middle; + BlockNumber probe; + + middle = low + (high - low) / 2; + probe = FSMPageGetPageNum(info + middle); + if (probe == page) + { + *outPageIndex = middle; + return true; + } + else if (probe < page) + low = middle + 1; + else + high = middle - 1; } + *outPageIndex = low; + return false; } - - /* - * Adding the page would require a new chunk (or, perhaps, compaction - * of available free space --- not my problem here). 
- */ - *outChunk = NULL; - *outChunkRelIndex = 0; - return false; } /* - * Insert a new page entry into a FSMRelation's list at given position - * (chunk == NULL implies at end). - * - * If there is no space available to insert the entry, return FALSE. + * Re-pack the FSM storage arena, dropping data if necessary to meet the + * current allocation target for each relation. At conclusion, all available + * space in the arena will be coalesced at the end. */ -static bool -insert_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail, - FSMChunk *chunk, int chunkRelIndex) +static void +compact_fsm_storage(void) { - /* Outer loop handles retry after compacting rel's page list */ - for (;;) + int nextChunkIndex = 0; + FSMRelation *fsmrel; + + for (fsmrel = FreeSpaceMap->firstRel; + fsmrel != NULL; + fsmrel = fsmrel->nextPhysical) { - if (fsmrel->numPages >= fsmrel->numChunks * CHUNKPAGES) + int newAlloc; + int newAllocPages; + int newChunkIndex; + int oldChunkIndex; + int curChunks; + char *newLocation; + char *oldLocation; + + /* + * Calculate target allocation, make sure we don't overrun due to + * roundoff error + */ + newAlloc = fsm_calc_target_allocation(fsm_calc_request(fsmrel)); + if (newAlloc > FreeSpaceMap->totalChunks - nextChunkIndex) + newAlloc = FreeSpaceMap->totalChunks - nextChunkIndex; + if (fsmrel->isIndex) + newAllocPages = newAlloc * INDEXCHUNKPAGES; + else + newAllocPages = newAlloc * CHUNKPAGES; + newChunkIndex = nextChunkIndex; + nextChunkIndex += newAlloc; + /* + * Determine current size, current and new locations + */ + curChunks = fsm_current_chunks(fsmrel); + oldChunkIndex = fsmrel->firstChunk; + newLocation = FreeSpaceMap->arena + newChunkIndex * CHUNKBYTES; + oldLocation = FreeSpaceMap->arena + oldChunkIndex * CHUNKBYTES; + /* + * It's possible that we have to move data down, not up, if the + * allocations of previous rels expanded. This should mean that + * our allocation expanded too (or at least got no worse), and + * ditto for later rels. So there should be room --- but we might + * have to push down following rels to make it. We don't want to + * do the push more than once, so pack everything against the + * end of the arena if so. 
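The allocation-and-clamping step at the top of compact_fsm_storage is easy to lose among the data-movement logic. A toy pass with three relations and invented targets, showing how consecutive firstChunk offsets are assigned and how the clamp keeps roundoff in the targets from overrunning the arena:

#include <stdio.h>

#define TOTAL_CHUNKS 10

int
main(void)
{
    /* per-relation targets, as fsm_calc_target_allocation would
     * compute them (values invented for the example) */
    int target[] = {5, 4, 3};
    int firstChunk[3];
    int nextChunkIndex = 0;
    int i;

    for (i = 0; i < 3; i++)
    {
        int newAlloc = target[i];

        /* don't overrun the arena due to roundoff in the targets */
        if (newAlloc > TOTAL_CHUNKS - nextChunkIndex)
            newAlloc = TOTAL_CHUNKS - nextChunkIndex;
        firstChunk[i] = nextChunkIndex;
        nextChunkIndex += newAlloc;
        printf("rel %d: firstChunk %d, alloc %d\n",
               i, firstChunk[i], newAlloc);
    }
    /* rel 0 gets chunks 0-4, rel 1 gets 5-8, rel 2 is clamped to
     * the single remaining chunk 9 */
    return 0;
}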
+ */ + if (newChunkIndex > oldChunkIndex) { - /* No free space within chunk list, so need another chunk */ - if (!append_fsm_chunk(fsmrel)) - return false; /* can't do it */ - if (chunk == NULL) + int limitChunkIndex; + + if (newAllocPages < fsmrel->storedPages) + elog(PANIC, "compact_fsm_storage: can't juggle and compress too"); + if (fsmrel->nextPhysical != NULL) + limitChunkIndex = fsmrel->nextPhysical->firstChunk; + else + limitChunkIndex = FreeSpaceMap->totalChunks; + if (newChunkIndex + curChunks > limitChunkIndex) { - /* Original search found that new page belongs at end */ - chunk = fsmrel->lastChunk; - chunkRelIndex = 0; + /* need to push down additional rels */ + push_fsm_rels_after(fsmrel); + /* recheck for safety */ + if (fsmrel->nextPhysical != NULL) + limitChunkIndex = fsmrel->nextPhysical->firstChunk; + else + limitChunkIndex = FreeSpaceMap->totalChunks; + if (newChunkIndex + curChunks > limitChunkIndex) + elog(PANIC, "compact_fsm_storage: insufficient room"); } + memmove(newLocation, oldLocation, curChunks * CHUNKBYTES); } - - /* - * Try to insert it the easy way, ie, just move down subsequent - * data - */ - if (chunk && - push_fsm_page_entry(page, spaceAvail, chunk, chunkRelIndex)) + else if (newAllocPages < fsmrel->storedPages) { - fsmrel->numPages++; - fsmrel->nextPage++; /* don't return same page twice running */ - return true; + /* + * Need to compress the page data. For an index, "compression" + * just means dropping excess pages; otherwise we try to keep + * the ones with the most space. + */ + if (fsmrel->isIndex) + { + fsmrel->storedPages = newAllocPages; + /* may need to move data */ + if (newChunkIndex != oldChunkIndex) + memmove(newLocation, oldLocation, newAlloc * CHUNKBYTES); + } + else + { + pack_existing_pages((FSMPageData *) newLocation, + newAllocPages, + (FSMPageData *) oldLocation, + fsmrel->storedPages); + fsmrel->storedPages = newAllocPages; + } } - - /* - * There is space available, but evidently it's before the place - * where the page entry needs to go. Compact the list and try - * again. This will require us to redo the search for the - * appropriate place. Furthermore, compact_fsm_page_list deletes - * empty end chunks, so we may need to repeat the action of - * grabbing a new end chunk. - */ - compact_fsm_page_list(fsmrel); - if (lookup_fsm_page_entry(fsmrel, page, &chunk, &chunkRelIndex)) - elog(ERROR, "insert_fsm_page_entry: entry already exists!"); + else if (newChunkIndex != oldChunkIndex) + { + /* + * No compression needed, but must copy the data up + */ + memmove(newLocation, oldLocation, curChunks * CHUNKBYTES); + } + fsmrel->firstChunk = newChunkIndex; } + Assert(nextChunkIndex <= FreeSpaceMap->totalChunks); + FreeSpaceMap->usedChunks = nextChunkIndex; } /* - * Add one chunk to a FSMRelation's chunk list, if possible. + * Push all FSMRels physically after afterRel to the end of the storage arena. * - * Returns TRUE if successful, FALSE if no space available. Note that on - * success, the new chunk is easily accessible via fsmrel->lastChunk. + * We sometimes have to do this when deletion or truncation of a relation + * causes the allocations of remaining rels to expand markedly. We must + * temporarily push existing data down to the end so that we can move it + * back up in an orderly fashion. 
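A concrete, invented case of the push: in a 40-chunk arena holding rel A at chunks 0-9, B at 10-19, and C at 20-25, suppose B's allocation grows so that its new start is chunk 15. B's ten chunks would then extend through chunk 24, on top of C's still-unmoved data; push_fsm_rels_after first parks C against the end of the arena (chunks 34-39), after which B can safely be moved down and C is copied back up into its proper place on its own iteration of the compaction loop.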
*/ -static bool -append_fsm_chunk(FSMRelation *fsmrel) +static void +push_fsm_rels_after(FSMRelation *afterRel) { - FSMChunk *newChunk; - - /* Remove a chunk from the freelist */ - if ((newChunk = FreeSpaceMap->freeChunks) == NULL) - return false; /* can't do it */ - FreeSpaceMap->freeChunks = newChunk->next; - FreeSpaceMap->numFreeChunks--; - - /* Initialize chunk to empty */ - newChunk->next = NULL; - newChunk->numPages = 0; + int nextChunkIndex = FreeSpaceMap->totalChunks; + FSMRelation *fsmrel; - /* Link it into FSMRelation */ - if (fsmrel->relChunks == NULL) - fsmrel->relChunks = newChunk; - else - fsmrel->lastChunk->next = newChunk; - fsmrel->lastChunk = newChunk; - fsmrel->numChunks++; + FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks; - return true; + for (fsmrel = FreeSpaceMap->lastRel; + fsmrel != NULL; + fsmrel = fsmrel->priorPhysical) + { + int chunkCount; + int newChunkIndex; + int oldChunkIndex; + char *newLocation; + char *oldLocation; + + if (fsmrel == afterRel) + break; + + chunkCount = fsm_current_chunks(fsmrel); + nextChunkIndex -= chunkCount; + newChunkIndex = nextChunkIndex; + oldChunkIndex = fsmrel->firstChunk; + if (newChunkIndex < oldChunkIndex) + { + /* trouble... */ + elog(PANIC, "push_fsm_rels_after: out of room"); + } + else if (newChunkIndex > oldChunkIndex) + { + /* need to move it */ + newLocation = FreeSpaceMap->arena + newChunkIndex * CHUNKBYTES; + oldLocation = FreeSpaceMap->arena + oldChunkIndex * CHUNKBYTES; + memmove(newLocation, oldLocation, chunkCount * CHUNKBYTES); + fsmrel->firstChunk = newChunkIndex; + } + } + Assert(nextChunkIndex >= 0); } /* - * Auxiliary routine for insert_fsm_page_entry: try to push entries to the - * right to insert at chunk/chunkRelIndex. Return TRUE if successful. - * Note that the FSMRelation's own fields are not updated. + * Pack a set of per-page freespace data into a smaller amount of space. + * + * The method is to compute a low-resolution histogram of the free space + * amounts, then determine which histogram bin contains the break point. + * We then keep all pages above that bin, none below it, and just enough + * of the pages in that bin to fill the output area exactly. 
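The histogram trick avoids sorting: two linear passes find a cutoff such that keeping every page above it, none below it, and a counted remainder from the boundary bin yields exactly the requested number of survivors. A self-contained sketch of just that selection arithmetic, with BLCKSZ and the 64-bin resolution matching the patch but invented input data (and the same keep < n precondition the patch asserts):

#include <string.h>
#include <stdio.h>

#define BLCKSZ 8192
#define HISTOGRAM_BINS 64

/* Decide the cutoff for keeping `keep` of the `n` free-space amounts:
 * everything in bins above the breakpoint bin is kept, nothing below
 * it, and *binct entries from the breakpoint bin itself.
 * Assumes keep < n and every avail < BLCKSZ, as the patch does. */
static void
find_thresholds(const unsigned *avail, int n, int keep,
                unsigned *thresholdL, unsigned *thresholdU, int *binct)
{
    int histogram[HISTOGRAM_BINS];
    int above = 0;
    int i;

    memset(histogram, 0, sizeof(histogram));
    for (i = 0; i < n; i++)
        histogram[avail[i] / (BLCKSZ / HISTOGRAM_BINS)]++;

    /* scan from the fullest bin downward; since keep < n the running
     * sum must eventually exceed keep, so i stays >= 0 */
    for (i = HISTOGRAM_BINS - 1; i >= 0; i--)
    {
        int sum = above + histogram[i];

        if (sum > keep)
            break;              /* bin i is the breakpoint */
        above = sum;
    }
    *thresholdL = i * (BLCKSZ / HISTOGRAM_BINS);
    *thresholdU = (i + 1) * (BLCKSZ / HISTOGRAM_BINS);
    *binct = keep - above;      /* how many to take from the bp bin */
}

int
main(void)
{
    unsigned avail[] = {100, 4000, 7000, 4100, 50};
    unsigned lo, hi;
    int binct;

    find_thresholds(avail, 5, 2, &lo, &hi, &binct);
    /* pages with avail >= hi are always kept; those in [lo, hi) fill
     * the remaining binct slots on a first-seen basis */
    printf("keep all >= %u, plus %d from [%u, %u)\n", hi, binct, lo, hi);
    return 0;
}

With this data the breakpoint is bin 31, so the two pages at 7000 and 4100 bytes survive and binct is 0: exactly the two best pages, found without a sort.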
*/ -static bool -push_fsm_page_entry(BlockNumber page, Size spaceAvail, - FSMChunk *chunk, int chunkRelIndex) +#define HISTOGRAM_BINS 64 + +static void +pack_incoming_pages(FSMPageData *newLocation, int newPages, + PageFreeSpaceInfo *pageSpaces, int nPages) { - int i; + int histogram[HISTOGRAM_BINS]; + int above, + binct, + i; + Size thresholdL, + thresholdU; + + Assert(newPages < nPages); /* else I shouldn't have been called */ + /* Build histogram */ + MemSet(histogram, 0, sizeof(histogram)); + for (i = 0; i < nPages; i++) + { + Size avail = pageSpaces[i].avail; - if (chunk->numPages >= CHUNKPAGES) + if (avail >= BLCKSZ) + elog(ERROR, "pack_incoming_pages: bogus freespace amount"); + avail /= (BLCKSZ/HISTOGRAM_BINS); + histogram[avail]++; + } + /* Find the breakpoint bin */ + above = 0; + for (i = HISTOGRAM_BINS-1; i >= 0; i--) { - if (chunk->next == NULL) - return false; /* no space */ - /* try to push chunk's last item to next chunk */ - if (!push_fsm_page_entry(chunk->pages[CHUNKPAGES - 1], - chunk->bytes[CHUNKPAGES - 1], - chunk->next, 0)) - return false; - /* successfully pushed it */ - chunk->numPages--; + int sum = above + histogram[i]; + + if (sum > newPages) + break; + above = sum; } - for (i = chunk->numPages; i > chunkRelIndex; i--) + Assert(i >= 0); + thresholdL = i * BLCKSZ/HISTOGRAM_BINS; /* low bound of bp bin */ + thresholdU = (i+1) * BLCKSZ/HISTOGRAM_BINS; /* hi bound */ + binct = newPages - above; /* number to take from bp bin */ + /* And copy the appropriate data */ + for (i = 0; i < nPages; i++) { - chunk->pages[i] = chunk->pages[i - 1]; - chunk->bytes[i] = chunk->bytes[i - 1]; + BlockNumber page = pageSpaces[i].blkno; + Size avail = pageSpaces[i].avail; + + /* Check caller provides sorted data */ + if (i > 0 && page <= pageSpaces[i-1].blkno) + elog(ERROR, "RecordIndexFreeSpace: data not in page order"); + /* Save this page? */ + if (avail >= thresholdU || + (avail >= thresholdL && (--binct >= 0))) + { + FSMPageSetPageNum(newLocation, page); + FSMPageSetSpace(newLocation, avail); + newLocation++; + newPages--; + } } - chunk->numPages++; - chunk->pages[chunkRelIndex] = page; - chunk->bytes[chunkRelIndex] = (ItemLength) spaceAvail; - return true; + Assert(newPages == 0); } /* - * Delete one page entry from a FSMRelation's list. Compact free space - * in the list, but only if a chunk can be returned to the freelist. + * Pack a set of per-page freespace data into a smaller amount of space. + * + * This is algorithmically identical to pack_incoming_pages(), but accepts + * a different input representation. Also, we assume the input data has + * previously been checked for validity (size in bounds, pages in order). + * + * Note: it is possible for the source and destination arrays to overlap. + * The caller is responsible for making sure newLocation is at lower addresses + * so that we can copy data moving forward in the arrays without problem. 
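The overlap caveat above is the usual forward-copy argument: the filter writes output entry k only after reading input entry j >= k, so in-place packing is safe whenever the destination starts at or below the source. A minimal integer-array illustration (plain ints, not the patch's FSMPageData representation):

#include <stdio.h>

int
main(void)
{
    /* keep the even values, writing into the same array's prefix */
    int data[] = {2, 1, 4, 3, 6};
    int *dst = data;            /* dst <= src: forward copy is safe */
    int i, kept = 0;

    for (i = 0; i < 5; i++)
    {
        if (data[i] % 2 == 0)
            dst[kept++] = data[i];  /* never clobbers an unread slot */
    }
    for (i = 0; i < kept; i++)
        printf("%d ", data[i]);     /* prints: 2 4 6 */
    printf("\n");
    return 0;
}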
*/ static void -delete_fsm_page_entry(FSMRelation *fsmrel, FSMChunk *chunk, int chunkRelIndex) +pack_existing_pages(FSMPageData *newLocation, int newPages, + FSMPageData *oldLocation, int oldPages) { - int i, - lim; + int histogram[HISTOGRAM_BINS]; + int above, + binct, + i; + Size thresholdL, + thresholdU; + + Assert(newPages < oldPages); /* else I shouldn't have been called */ + /* Build histogram */ + MemSet(histogram, 0, sizeof(histogram)); + for (i = 0; i < oldPages; i++) + { + Size avail = FSMPageGetSpace(oldLocation + i); + + /* Shouldn't happen, but test to protect against stack clobber */ + if (avail >= BLCKSZ) + elog(ERROR, "pack_existing_pages: bogus freespace amount"); + avail /= (BLCKSZ/HISTOGRAM_BINS); + histogram[avail]++; + } + /* Find the breakpoint bin */ + above = 0; + for (i = HISTOGRAM_BINS-1; i >= 0; i--) + { + int sum = above + histogram[i]; - Assert(chunk && chunkRelIndex >= 0 && chunkRelIndex < chunk->numPages); - /* Compact out space in this chunk */ - lim = --chunk->numPages; - for (i = chunkRelIndex; i < lim; i++) + if (sum > newPages) + break; + above = sum; + } + Assert(i >= 0); + thresholdL = i * BLCKSZ/HISTOGRAM_BINS; /* low bound of bp bin */ + thresholdU = (i+1) * BLCKSZ/HISTOGRAM_BINS; /* hi bound */ + binct = newPages - above; /* number to take from bp bin */ + /* And copy the appropriate data */ + for (i = 0; i < oldPages; i++) { - chunk->pages[i] = chunk->pages[i + 1]; - chunk->bytes[i] = chunk->bytes[i + 1]; + BlockNumber page = FSMPageGetPageNum(oldLocation + i); + Size avail = FSMPageGetSpace(oldLocation + i); + + /* Save this page? */ + if (avail >= thresholdU || + (avail >= thresholdL && (--binct >= 0))) + { + FSMPageSetPageNum(newLocation, page); + FSMPageSetSpace(newLocation, avail); + newLocation++; + newPages--; + } } - /* Compact the whole list if a chunk can be freed */ - fsmrel->numPages--; - if (fsmrel->numPages <= (fsmrel->numChunks - 1) * CHUNKPAGES) - compact_fsm_page_list(fsmrel); + Assert(newPages == 0); } /* - * Remove any pages with free space less than the current threshold for the - * FSMRelation, compact out free slots in non-last chunks, and return any - * completely freed chunks to the freelist. + * Calculate number of chunks "requested" by a rel. + * + * Rel's lastPageCount and isIndex settings must be up-to-date when called. + * + * See notes at top of file for details. */ -static void -compact_fsm_page_list(FSMRelation *fsmrel) +static int +fsm_calc_request(FSMRelation *fsmrel) { - Size threshold = fsmrel->threshold; - FSMChunk *srcChunk, - *dstChunk; - int srcIndex, - dstIndex, - dstPages, - dstChunkCnt; - - srcChunk = dstChunk = fsmrel->relChunks; - srcIndex = dstIndex = 0; - dstPages = 0; /* total pages kept */ - dstChunkCnt = 1; /* includes current dstChunk */ - - while (srcChunk != NULL) - { - int srcPages = srcChunk->numPages; + int chunkCount; - while (srcIndex < srcPages) - { - if ((Size) srcChunk->bytes[srcIndex] >= threshold) - { - if (dstIndex >= CHUNKPAGES) - { - /* - * At this point srcChunk must be pointing to a later - * chunk, so it's OK to overwrite dstChunk->numPages. 
- */ - dstChunk->numPages = dstIndex; - dstChunk = dstChunk->next; - dstChunkCnt++; - dstIndex = 0; - } - dstChunk->pages[dstIndex] = srcChunk->pages[srcIndex]; - dstChunk->bytes[dstIndex] = srcChunk->bytes[srcIndex]; - dstIndex++; - dstPages++; - } - srcIndex++; - } - srcChunk = srcChunk->next; - srcIndex = 0; - } + /* Convert page count to chunk count */ + if (fsmrel->isIndex) + chunkCount = (fsmrel->lastPageCount - 1) / INDEXCHUNKPAGES + 1; + else + chunkCount = (fsmrel->lastPageCount - 1) / CHUNKPAGES + 1; + /* "Request" is anything beyond our one guaranteed chunk */ + if (chunkCount <= 0) + return 0; + else + return chunkCount - 1; +} + +/* + * Calculate target allocation (number of chunks) for a rel + * + * Parameter is the result from fsm_calc_request(). The global sumRequests + * and numRels totals must be up-to-date already. + * + * See notes at top of file for details. + */ +static int +fsm_calc_target_allocation(int myRequest) +{ + double spareChunks; + int extra; - if (dstPages == 0) + spareChunks = FreeSpaceMap->totalChunks - FreeSpaceMap->numRels; + Assert(spareChunks > 0); + if (spareChunks >= FreeSpaceMap->sumRequests) { - /* No chunks to be kept at all */ - fsmrel->nextPage = 0; - fsmrel->numPages = 0; - fsmrel->numChunks = 0; - fsmrel->relChunks = NULL; - fsmrel->lastChunk = NULL; - free_chunk_chain(dstChunk); + /* We aren't oversubscribed, so allocate exactly the request */ + extra = myRequest; } else { - /* we deliberately don't change nextPage here */ - fsmrel->numPages = dstPages; - fsmrel->numChunks = dstChunkCnt; - dstChunk->numPages = dstIndex; - free_chunk_chain(dstChunk->next); - dstChunk->next = NULL; - fsmrel->lastChunk = dstChunk; + extra = (int) rint(spareChunks * myRequest / FreeSpaceMap->sumRequests); + if (extra < 0) /* shouldn't happen, but make sure */ + extra = 0; } + return 1 + extra; } /* - * Acquire some free space by raising the thresholds of all FSMRelations. - * - * Note there is no guarantee as to how much space will be freed by a single - * invocation; caller may repeat if necessary. 
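Worked numbers for the two calculations above, assuming CHUNKPAGES is 16 -- a guess consistent with the postgresql.conf.sample comment "min max_fsm_relations*16" later in this patch, though the real constant is defined earlier in freespace.c. A 100-page heap requests (100-1)/16 + 1 - 1 = 6 chunks beyond its one guaranteed chunk; if 1200 spare chunks face 2400 summed requests, the 2:1 oversubscription halves that to a target of 1 + 3 = 4 chunks:

#include <math.h>
#include <stdio.h>

#define CHUNKPAGES 16   /* assumed value; see note above */

static int
calc_request(int lastPageCount)
{
    /* ceiling division, then subtract the one guaranteed chunk */
    int chunkCount = (lastPageCount - 1) / CHUNKPAGES + 1;

    return (chunkCount <= 0) ? 0 : chunkCount - 1;
}

static int
calc_target(int myRequest, double spareChunks, double sumRequests)
{
    int extra;

    if (spareChunks >= sumRequests)
        extra = myRequest;      /* not oversubscribed: grant in full */
    else
        extra = (int) rint(spareChunks * myRequest / sumRequests);
    if (extra < 0)              /* shouldn't happen, but make sure */
        extra = 0;
    return 1 + extra;
}

int
main(void)
{
    int request = calc_request(100);                  /* 6 chunks */
    int target = calc_target(request, 1200.0, 2400.0);

    /* at 2:1 oversubscription the 6-chunk request yields target 4 */
    printf("request %d -> target %d\n", request, target);
    return 0;
}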
+ * Calculate number of chunks actually used to store current data */ -static void -acquire_fsm_free_space(void) +static int +fsm_current_chunks(FSMRelation *fsmrel) { - FSMRelation *fsmrel; + int chunkCount; + + /* Convert page count to chunk count */ + if (fsmrel->isIndex) + chunkCount = (fsmrel->storedPages - 1) / INDEXCHUNKPAGES + 1; + else + chunkCount = (fsmrel->storedPages - 1) / CHUNKPAGES + 1; + /* Make sure storedPages==0 produces right answer */ + if (chunkCount < 0) + chunkCount = 0; + return chunkCount; +} - for (fsmrel = FreeSpaceMap->relList; fsmrel; fsmrel = fsmrel->nextRel) +/* + * Calculate current actual allocation (number of chunks) for a rel + */ +static int +fsm_current_allocation(FSMRelation *fsmrel) +{ + if (fsmrel->nextPhysical != NULL) + { + return fsmrel->nextPhysical->firstChunk - fsmrel->firstChunk; + } + else if (fsmrel == FreeSpaceMap->lastRel) { - fsmrel->threshold *= 2; - /* Limit thresholds to BLCKSZ so they can't get ridiculously large */ - if (fsmrel->threshold > BLCKSZ) - fsmrel->threshold = BLCKSZ; - /* Release any pages that don't meet the new threshold */ - compact_fsm_page_list(fsmrel); + return FreeSpaceMap->usedChunks - fsmrel->firstChunk; + } + else + { + /* it's not in the storage-order list */ + Assert(fsmrel->firstChunk < 0 && fsmrel->storedPages == 0); + return 0; } } @@ -1097,55 +1492,54 @@ DumpFreeSpace(void) FSMRelation *fsmrel; FSMRelation *prevrel = NULL; int relNum = 0; - FSMChunk *chunk; - int chunkRelIndex; - int nChunks; int nPages; - for (fsmrel = FreeSpaceMap->relList; fsmrel; fsmrel = fsmrel->nextRel) + for (fsmrel = FreeSpaceMap->usageList; fsmrel; fsmrel = fsmrel->nextUsage) { relNum++; - fprintf(stderr, "Map %d: rel %u/%u useCount %d threshold %u nextPage %d\nMap= ", + fprintf(stderr, "Map %d: rel %u/%u isIndex %d avgRequest %u lastPageCount %d nextPage %d\nMap= ", relNum, fsmrel->key.tblNode, fsmrel->key.relNode, - fsmrel->useCount, fsmrel->threshold, fsmrel->nextPage); - nChunks = nPages = 0; - for (chunk = fsmrel->relChunks; chunk; chunk = chunk->next) + (int) fsmrel->isIndex, fsmrel->avgRequest, + fsmrel->lastPageCount, fsmrel->nextPage); + if (fsmrel->isIndex) + { + IndexFSMPageData *page; + + page = (IndexFSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + for (nPages = 0; nPages < fsmrel->storedPages; nPages++) + { + fprintf(stderr, " %u", + IndexFSMPageGetPageNum(page)); + page++; + } + } + else { - int numPages = chunk->numPages; + FSMPageData *page; - nChunks++; - for (chunkRelIndex = 0; chunkRelIndex < numPages; chunkRelIndex++) + page = (FSMPageData *) + (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); + for (nPages = 0; nPages < fsmrel->storedPages; nPages++) { - nPages++; fprintf(stderr, " %u:%u", - chunk->pages[chunkRelIndex], - chunk->bytes[chunkRelIndex]); + FSMPageGetPageNum(page), + FSMPageGetSpace(page)); + page++; } } fprintf(stderr, "\n"); - /* Cross-check local counters and list links */ - if (nPages != fsmrel->numPages) - fprintf(stderr, "DumpFreeSpace: %d pages in rel, but numPages = %d\n", - nPages, fsmrel->numPages); - if (nChunks != fsmrel->numChunks) - fprintf(stderr, "DumpFreeSpace: %d chunks in rel, but numChunks = %d\n", - nChunks, fsmrel->numChunks); - if (prevrel != fsmrel->priorRel) + /* Cross-check list links */ + if (prevrel != fsmrel->priorUsage) fprintf(stderr, "DumpFreeSpace: broken list links\n"); prevrel = fsmrel; } - if (prevrel != FreeSpaceMap->relListTail) + if (prevrel != FreeSpaceMap->usageListTail) fprintf(stderr, "DumpFreeSpace: broken list 
links\n"); /* Cross-check global counters */ if (relNum != FreeSpaceMap->numRels) fprintf(stderr, "DumpFreeSpace: %d rels in list, but numRels = %d\n", relNum, FreeSpaceMap->numRels); - nChunks = 0; - for (chunk = FreeSpaceMap->freeChunks; chunk; chunk = chunk->next) - nChunks++; - if (nChunks != FreeSpaceMap->numFreeChunks) - fprintf(stderr, "DumpFreeSpace: %d chunks in list, but numFreeChunks = %d\n", - nChunks, FreeSpaceMap->numFreeChunks); } #endif /* FREESPACE_DEBUG */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index b20873ba48..710b7d9917 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.61 2002/09/20 19:56:01 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.62 2003/03/04 21:51:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -410,7 +410,7 @@ smgrtruncate(int16 which, Relation reln, BlockNumber nblocks) * for the about-to-be-deleted blocks. We want to be sure it * won't return bogus block numbers later on. */ - MultiRecordFreeSpace(&reln->rd_node, nblocks, 0, NULL); + FreeSpaceMapTruncateRel(&reln->rd_node, nblocks); newblks = (*(smgrsw[which].smgr_truncate)) (reln, nblocks); if (newblks == InvalidBlockNumber) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2582843829..37e1192b6d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -5,7 +5,7 @@ * command, configuration file, and command line options. * See src/backend/utils/misc/README for more information. * - * $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.115 2003/02/23 23:27:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.116 2003/03/04 21:51:21 tgl Exp $ * * Copyright 2000 by PostgreSQL Global Development Group * Written by Peter Eisentraut . 
@@ -644,11 +644,11 @@ static struct config_int { {"max_fsm_relations", PGC_POSTMASTER}, &MaxFSMRelations, - 1000, 10, INT_MAX, NULL, NULL + 1000, 100, INT_MAX, NULL, NULL }, { {"max_fsm_pages", PGC_POSTMASTER}, &MaxFSMPages, - 10000, 1000, INT_MAX, NULL, NULL + 20000, 1000, INT_MAX, NULL, NULL }, { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index f40c2a0035..40b143c7ca 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -48,10 +48,11 @@ # Shared Memory Size # #shared_buffers = 64 # min max_connections*2 or 16, 8KB each -#max_fsm_relations = 1000 # min 10, fsm is free space map, ~40 bytes -#max_fsm_pages = 10000 # min 1000, fsm is free space map, ~6 bytes #max_locks_per_transaction = 64 # min 10 #wal_buffers = 8 # min 4, typically 8KB each +# fsm = free space map +#max_fsm_relations = 1000 # min 100, ~50 bytes each +#max_fsm_pages = 20000 # min max_fsm_relations*16, 6 bytes each # # Non-shared Memory Sizes diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 428898bdac..05cf77d761 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: freespace.h,v 1.8 2002/09/20 19:56:01 tgl Exp $ + * $Id: freespace.h,v 1.9 2003/03/04 21:51:22 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,6 +28,7 @@ typedef struct PageFreeSpaceInfo } PageFreeSpaceInfo; +/* GUC variables */ extern int MaxFSMRelations; extern int MaxFSMPages; @@ -39,19 +40,26 @@ extern void InitFreeSpaceMap(void); extern int FreeSpaceShmemSize(void); extern BlockNumber GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded); -extern void RecordFreeSpace(RelFileNode *rel, BlockNumber page, - Size spaceAvail); extern BlockNumber RecordAndGetPageWithFreeSpace(RelFileNode *rel, BlockNumber oldPage, Size oldSpaceAvail, Size spaceNeeded); -extern void MultiRecordFreeSpace(RelFileNode *rel, - BlockNumber minPage, - int nPages, - PageFreeSpaceInfo *pageSpaces); +extern Size GetAvgFSMRequestSize(RelFileNode *rel); +extern void RecordRelationFreeSpace(RelFileNode *rel, + int nPages, + PageFreeSpaceInfo *pageSpaces); + +extern BlockNumber GetFreeIndexPage(RelFileNode *rel); +extern void RecordIndexFreeSpace(RelFileNode *rel, + int nPages, + BlockNumber *pages); + +extern void FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks); extern void FreeSpaceMapForgetRel(RelFileNode *rel); extern void FreeSpaceMapForgetDatabase(Oid dbid); +extern void PrintFreeSpaceMapStatistics(int elevel); + #ifdef FREESPACE_DEBUG extern void DumpFreeSpace(void); #endif -- 2.11.0
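Tying the new defaults in guc.c and postgresql.conf.sample back to the costs documented in the runtime.sgml hunk -- six bytes per page slot, roughly fifty bytes per relation slot -- the shipped settings consume about 170000 bytes (~166KB) of shared memory, and max_fsm_pages = 20000 clears the documented 16 * max_fsm_relations floor with room to spare. A back-of-the-envelope check:

#include <stdio.h>

int
main(void)
{
    int max_fsm_pages = 20000;      /* new default */
    int max_fsm_relations = 1000;   /* default */

    /* per-slot costs as quoted in the documentation hunk */
    long bytes = (long) max_fsm_pages * 6 + (long) max_fsm_relations * 50;

    if (max_fsm_pages < 16 * max_fsm_relations)
        printf("max_fsm_pages is below the documented floor!\n");
    printf("FSM shared memory: about %ld bytes (%.0fKB)\n",
           bytes, bytes / 1024.0);
    return 0;
}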