/*-------------------------------------------------------------------------
 *
 * hio.c
 *    POSTGRES heap access method input/output code.
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/heap/hio.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/hio.h"
#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"


/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 *
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 */
void
RelationPutHeapTuple(Relation relation,
                     Buffer buffer,
                     HeapTuple tuple)
{
    Page        pageHeader;
    OffsetNumber offnum;
    ItemId      itemId;
    Item        item;

    /* Add the tuple to the page */
    pageHeader = BufferGetPage(buffer);

    offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
                         tuple->t_len, InvalidOffsetNumber, false, true);

    if (offnum == InvalidOffsetNumber)
        elog(PANIC, "failed to add tuple to page");

    /* Update tuple->t_self to the actual position where it was stored */
    ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

    /* Insert the correct position into CTID of the stored tuple, too */
    itemId = PageGetItemId(pageHeader, offnum);
    item = PageGetItem(pageHeader, itemId);
    ((HeapTupleHeader) item)->t_ctid = tuple->t_self;
}
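
/*
 * Illustrative sketch, not part of this module's logic: roughly how a caller
 * such as heap_insert is expected to combine RelationGetBufferForTuple (below)
 * with RelationPutHeapTuple.  WAL and visibility-map details are elided, and
 * "heaptup", "options", "bistate" and "vmbuffer" stand for the caller's own
 * variables.
 *
 *      buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
 *                                         InvalidBuffer, options, bistate,
 *                                         &vmbuffer);
 *
 *      START_CRIT_SECTION();
 *      RelationPutHeapTuple(relation, buffer, heaptup);
 *      MarkBufferDirty(buffer);
 *      ... emit WAL record if the relation is logged ...
 *      END_CRIT_SECTION();
 *
 *      UnlockReleaseBuffer(buffer);
 */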

/*
 * Read in a buffer, using bulk-insert strategy if bistate isn't NULL.
 */
static Buffer
ReadBufferBI(Relation relation, BlockNumber targetBlock,
             BulkInsertState bistate)
{
    Buffer      buffer;

    /* If not bulk-insert, exactly like ReadBuffer */
    if (!bistate)
        return ReadBuffer(relation, targetBlock);

    /* If we have the desired block already pinned, re-pin and return it */
    if (bistate->current_buf != InvalidBuffer)
    {
        if (BufferGetBlockNumber(bistate->current_buf) == targetBlock)
        {
            IncrBufferRefCount(bistate->current_buf);
            return bistate->current_buf;
        }
        /* ... else drop the old buffer */
        ReleaseBuffer(bistate->current_buf);
        bistate->current_buf = InvalidBuffer;
    }

    /* Perform a read using the buffer strategy */
    buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
                                RBM_NORMAL, bistate->strategy);

    /* Save the selected block as target for future inserts */
    IncrBufferRefCount(buffer);
    bistate->current_buf = buffer;

    return buffer;
}
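
/*
 * A minimal usage sketch for the bulk-insert path, assuming the
 * GetBulkInsertState/FreeBulkInsertState helpers from heapam: a bulk loader
 * (e.g. COPY) creates one BulkInsertState and passes it to every insertion,
 * which lets ReadBufferBI keep the current target page pinned and use a
 * BULKWRITE buffer access strategy instead of churning the shared buffer
 * cache.
 *
 *      BulkInsertState bistate = GetBulkInsertState();
 *
 *      while (... more tuples to load ...)
 *          heap_insert(relation, tup, mycid, options, bistate);
 *
 *      FreeBulkInsertState(bistate);
 */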

/*
 * RelationGetBufferForTuple
 *
 *  Returns pinned and exclusive-locked buffer of a page in given relation
 *  with free space >= given len.
 *
 *  If otherBuffer is not InvalidBuffer, then it references a previously
 *  pinned buffer of another page in the same relation; on return, this
 *  buffer will also be exclusive-locked.  (This case is used by heap_update;
 *  the otherBuffer contains the tuple being updated.)
 *
 *  The reason for passing otherBuffer is that if two backends are doing
 *  concurrent heap_update operations, a deadlock could occur if they try
 *  to lock the same two buffers in opposite orders.  To ensure that this
 *  can't happen, we impose the rule that buffers of a relation must be
 *  locked in increasing page number order.  This is most conveniently done
 *  by having RelationGetBufferForTuple lock them both, with suitable care
 *  for ordering.
 *
 *  NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *  same buffer we select for insertion of the new tuple (this could only
 *  happen if space is freed in that page after heap_update finds there's not
 *  enough there).  In that case, the page will be pinned and locked only once.
 *
 *  We normally use the FSM to help us find free space.  However, if
 *  HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to
 *  the end of the relation if the tuple won't fit on the current target page.
 *  This can save some cycles when we know the relation is new and doesn't
 *  contain useful amounts of free space.
 *
 *  HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a
 *  relation, if the caller holds exclusive lock and is careful to invalidate
 *  the relation's smgr_targblock before the first insertion --- that ensures
 *  that all insertions will occur into newly added pages and not be intermixed
 *  with tuples from other transactions.  That way, a crash can't risk losing
 *  any committed data of other transactions.  (See heap_insert's comments
 *  for additional constraints needed for safe usage of this behavior.)
 *
 *  The caller can also provide a BulkInsertState object to optimize many
 *  insertions into the same relation.  This keeps a pin on the current
 *  insertion target page (to save pin/unpin cycles) and also passes a
 *  BULKWRITE buffer selection strategy object to the buffer manager.
 *  Passing NULL for bistate selects the default behavior.
 *
 *  We always try to avoid filling existing pages further than the fillfactor.
 *  This is OK since this routine is not consulted when updating a tuple and
 *  keeping it on the same page, which is the scenario fillfactor is meant
 *  to reserve space for.
 *
 *  ereport(ERROR) is allowed here, so this routine *must* be called
 *  before any (unlogged) changes are made in the buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
                          Buffer otherBuffer, int options,
                          struct BulkInsertStateData * bistate,
                          Buffer *vmbuffer)
{
    bool        use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
    Buffer      buffer = InvalidBuffer;
    Page        page;
    Size        pageFreeSpace,
                saveFreeSpace;
    BlockNumber targetBlock,
                otherBlock;
    bool        needLock;

    len = MAXALIGN(len);        /* be conservative */

    /* Bulk insert is not supported for updates, only inserts. */
    Assert(otherBuffer == InvalidBuffer || !bistate);

    /*
     * If we're going to fail for an oversize tuple, do it right away
     */
    if (len > MaxHeapTupleSize)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("row is too big: size %lu, maximum size %lu",
                        (unsigned long) len,
                        (unsigned long) MaxHeapTupleSize)));

    /* Compute desired extra freespace due to fillfactor option */
    saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                   HEAP_DEFAULT_FILLFACTOR);

    if (otherBuffer != InvalidBuffer)
        otherBlock = BufferGetBlockNumber(otherBuffer);
    else
        otherBlock = InvalidBlockNumber;    /* just to keep compiler quiet */

    /*
     * We first try to put the tuple on the same page we last inserted a tuple
     * on, as cached in the BulkInsertState or relcache entry.  If that
     * doesn't work, we ask the Free Space Map to locate a suitable page.
     * Since the FSM's info might be out of date, we have to be prepared to
     * loop around and retry multiple times. (To ensure this isn't an infinite
     * loop, we must update the FSM with the correct amount of free space on
     * each page that proves not to be suitable.)  If the FSM has no record of
     * a page with enough free space, we give up and extend the relation.
     *
     * When use_fsm is false, we either put the tuple onto the existing target
     * page or extend the relation.
     */
    if (len + saveFreeSpace > MaxHeapTupleSize)
    {
        /* can't fit, don't bother asking FSM */
        targetBlock = InvalidBlockNumber;
        use_fsm = false;
    }
    else if (bistate && bistate->current_buf != InvalidBuffer)
        targetBlock = BufferGetBlockNumber(bistate->current_buf);
    else
        targetBlock = RelationGetTargetBlock(relation);

    if (targetBlock == InvalidBlockNumber && use_fsm)
    {
        /*
         * We have no cached target page, so ask the FSM for an initial
         * target.
         */
        targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);

        /*
         * If the FSM knows nothing of the rel, try the last page before we
         * give up and extend.  This avoids one-tuple-per-page syndrome during
         * bootstrapping or in a recently-started system.
         */
        if (targetBlock == InvalidBlockNumber)
        {
            BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

            if (nblocks > 0)
                targetBlock = nblocks - 1;
        }
    }

    while (targetBlock != InvalidBlockNumber)
    {
        /*
         * Read and exclusive-lock the target block, as well as the other
         * block if one was given, taking suitable care with lock ordering and
         * the possibility they are the same block.
         *
         * If the page-level all-visible flag is set, caller will need to
         * clear both that and the corresponding visibility map bit.  However,
         * by the time we return, we'll have x-locked the buffer, and we don't
         * want to do any I/O while in that state.  So we check the bit here
         * before taking the lock, and pin the page if it appears necessary.
         * Checking without the lock creates a risk of getting the wrong
         * answer, so we'll have to recheck after acquiring the lock.
         */
        if (otherBuffer == InvalidBuffer)
        {
            /* easy case */
            buffer = ReadBufferBI(relation, targetBlock, bistate);
            if (PageIsAllVisible(BufferGetPage(buffer)))
                visibilitymap_pin(relation, targetBlock, vmbuffer);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        else if (otherBlock == targetBlock)
        {
            /* also easy case */
            buffer = otherBuffer;
            if (PageIsAllVisible(BufferGetPage(buffer)))
                visibilitymap_pin(relation, targetBlock, vmbuffer);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        else if (otherBlock < targetBlock)
        {
            /* lock other buffer first */
            buffer = ReadBuffer(relation, targetBlock);
            if (PageIsAllVisible(BufferGetPage(buffer)))
                visibilitymap_pin(relation, targetBlock, vmbuffer);
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        else
        {
            /* lock target buffer first */
            buffer = ReadBuffer(relation, targetBlock);
            if (PageIsAllVisible(BufferGetPage(buffer)))
                visibilitymap_pin(relation, targetBlock, vmbuffer);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
        }

        /*
         * If the page is all visible but we don't have the right visibility
         * map page pinned, then give up our locks, go get the pin, and
         * re-lock.  This is pretty painful, but hopefully shouldn't happen
         * often.  Note that there's a small possibility that we didn't pin
         * the page above but still have the correct page pinned anyway,
         * either because we've already made a previous pass through this
         * loop, or because caller passed us the right page anyway.
         *
         * Note also that it's possible that by the time we get the pin and
         * retake the buffer locks, the visibility map bit will have been
         * cleared by some other backend anyway.  In that case, we'll have
         * done a bit of extra work for no gain, but there's no real harm
         * done.
         */
        if (PageIsAllVisible(BufferGetPage(buffer))
            && !visibilitymap_pin_ok(targetBlock, *vmbuffer))
        {
            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
            if (otherBlock != targetBlock)
                LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
            visibilitymap_pin(relation, targetBlock, vmbuffer);
            if (otherBuffer != InvalidBuffer && otherBlock < targetBlock)
                LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
            if (otherBuffer != InvalidBuffer && otherBlock > targetBlock)
                LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
        }

        /*
         * Now we can check to see if there's enough free space here. If so,
         * we're done.
         */
        page = BufferGetPage(buffer);
        pageFreeSpace = PageGetHeapFreeSpace(page);
        if (len + saveFreeSpace <= pageFreeSpace)
        {
            /* use this page as future insert target, too */
            RelationSetTargetBlock(relation, targetBlock);
            return buffer;
        }

        /*
         * Not enough space, so we must give up our page locks and pin (if
         * any) and prepare to look elsewhere.  We don't care which order we
         * unlock the two buffers in, so this can be slightly simpler than the
         * code above.
         */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        if (otherBuffer == InvalidBuffer)
            ReleaseBuffer(buffer);
        else if (otherBlock != targetBlock)
        {
            LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
            ReleaseBuffer(buffer);
        }

        /* Without FSM, always fall out of the loop and extend */
        if (!use_fsm)
            break;

        /*
         * Update FSM as to condition of this page, and ask for another page
         * to try.
         */
        targetBlock = RecordAndGetPageWithFreeSpace(relation,
                                                    targetBlock,
                                                    pageFreeSpace,
                                                    len + saveFreeSpace);
    }

    /*
     * Have to extend the relation.
     *
     * We have to use a lock to ensure no one else is extending the rel at the
     * same time, else we will both try to initialize the same new page.  We
     * can skip locking for new or temp relations, however, since no one else
     * could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(relation);

    if (needLock)
        LockRelationForExtension(relation, ExclusiveLock);

    /*
     * XXX This does an lseek - rather expensive - but at the moment it is the
     * only way to accurately determine how many blocks are in a relation.  Is
     * it worth keeping an accurate file length in shared memory someplace,
     * rather than relying on the kernel to do it for us?
     */
    buffer = ReadBufferBI(relation, P_NEW, bistate);

    /*
     * We can be certain that locking the otherBuffer first is OK, since it
     * must have a lower page number.
     */
    if (otherBuffer != InvalidBuffer)
        LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

    /*
     * Now acquire lock on the new page.
     */
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

    /*
     * Release the file-extension lock; it's now OK for someone else to extend
     * the relation some more.  Note that we cannot release this lock before
     * we have buffer lock on the new page, or we risk a race condition
     * against vacuumlazy.c --- see comments therein.
     */
    if (needLock)
        UnlockRelationForExtension(relation, ExclusiveLock);

    /*
     * We need to initialize the empty new page.  Double-check that it really
     * is empty (this should never happen, but if it does we don't want to
     * risk wiping out valid data).
     */
    page = BufferGetPage(buffer);

    if (!PageIsNew(page))
        elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
             BufferGetBlockNumber(buffer),
             RelationGetRelationName(relation));

    PageInit(page, BufferGetPageSize(buffer), 0);

    if (len > PageGetHeapFreeSpace(page))
    {
        /* We should not get here given the test at the top */
        elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
    }

    /*
     * Remember the new page as our target for future insertions.
     *
     * XXX should we enter the new page into the free space map immediately,
     * or just keep it for this backend's exclusive use in the short run
     * (until VACUUM sees it)?  Seems to depend on whether you expect the
     * current backend to make more insertions or not, which is probably a
     * good bet most of the time.  So for now, don't add it to FSM yet.
     */
    RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));

    return buffer;
}
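
/*
 * A rough sketch, not taken verbatim from any caller, of how the returned
 * buffer and the vmbuffer pin are meant to be used together: after placing
 * the tuple, the caller is responsible for clearing PD_ALL_VISIBLE and the
 * corresponding visibility map bit, using the vmbuffer pinned above so that
 * no visibility map I/O is needed while the heap page is exclusive-locked.
 * "heaptup", "options", "bistate" and "vmbuffer" stand for the caller's own
 * variables, and the three-argument visibilitymap_clear is assumed to match
 * the crash-safe visibility map API used by this tree.
 *
 *      buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
 *                                         InvalidBuffer, options, bistate,
 *                                         &vmbuffer);
 *      ...
 *      if (PageIsAllVisible(BufferGetPage(buffer)))
 *      {
 *          PageClearAllVisible(BufferGetPage(buffer));
 *          visibilitymap_clear(relation,
 *                              BufferGetBlockNumber(buffer),
 *                              vmbuffer);
 *      }
 */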