-rw-r--r--   src/backend/access/hash/README      |  18
-rw-r--r--   src/backend/access/hash/hashovfl.c  |  29
-rw-r--r--   src/backend/access/hash/hashpage.c  | 175
-rw-r--r--   src/include/access/hash.h           |   3
4 files changed, 135 insertions, 90 deletions
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 3ff70cde3ce..2c8e448dae7 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.4 2003/11/29 19:51:40 pgsql Exp $
+$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.4.4.1 2007/04/19 20:24:28 tgl Exp $
 
 This directory contains an implementation of hash indexing for Postgres.
 
@@ -72,6 +72,18 @@ index, and preparing to allocate additional overflow pages after those
 bucket pages. hashm_spares[] entries before S cannot change anymore,
 since that would require moving already-created bucket pages.
 
+The last page nominally used by the index is always determinable from
+hashm_spares[S]. To avoid complaints from smgr, the logical EOF as seen by
+the filesystem and smgr must always be greater than or equal to this page.
+We have to allow the case "greater than" because it's possible that during
+an index extension we crash after allocating filesystem space and before
+updating the metapage. Note that on filesystems that allow "holes" in
+files, it's entirely likely that pages before the logical EOF are not yet
+allocated: when we allocate a new splitpoint's worth of bucket pages, we
+physically zero the last such page to force the EOF up, and the first such
+page will be used immediately, but the intervening pages are not written
+until needed.
+
 Since overflow pages may be recycled if enough tuples are deleted from
 their bucket, we need a way to keep track of currently-free overflow
 pages. The state of each overflow page (0 = available, 1 = not available)
@@ -305,6 +317,10 @@ we can just error out without any great harm being done.
 Free space management
 ---------------------
 
+(Question: why is this so complicated? Why not just have a linked list
+of free pages with the list head in the metapage? It's not like we
+avoid needing to modify the metapage with all this.)
+
 Free space management consists of two sub-algorithms, one for reserving
 an overflow page to add to a bucket chain, and one for returning an empty
 overflow page to the free pool.
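To make the splitpoint arithmetic concrete, the block layout described in the README can be modeled in a few lines of standalone C. This is only a sketch: the spares[] values are invented, and hash_log2/bucket_to_blkno stand in for the backend's _hash_log2 function and BUCKET_TO_BLKNO macro rather than being the real code.

/*
 * Standalone model of the block layout described above: block 0 is the
 * metapage, bucket pages are laid out splitpoint by splitpoint, and
 * spares[s] counts the overflow/bitmap pages allocated before the buckets
 * of splitpoint s+1.  The spares[] values below are invented.
 */
#include <stdio.h>

/* smallest i such that 2^i >= num (same behavior as _hash_log2) */
static unsigned hash_log2(unsigned num)
{
    unsigned i = 0, limit = 1;

    while (limit < num)
    {
        i++;
        limit <<= 1;
    }
    return i;
}

/* the arithmetic of the BUCKET_TO_BLKNO macro, written out as a function */
static unsigned bucket_to_blkno(const unsigned *spares, unsigned bucket)
{
    unsigned ovfl_before = bucket ? spares[hash_log2(bucket + 1) - 1] : 0;

    return bucket + ovfl_before + 1;    /* the +1 skips the metapage */
}

int main(void)
{
    /* cumulative overflow-page counts per splitpoint (illustrative only) */
    unsigned spares[] = {0, 0, 1, 3, 3};
    unsigned S = 4;                     /* current splitpoint */
    unsigned b;

    for (b = 0; b < (1u << S); b++)
        printf("bucket %2u -> block %2u\n", b, bucket_to_blkno(spares, b));

    /* last page nominally used: all bucket pages plus spares[S] others */
    printf("last nominally used block = %u\n", (1u << S) + spares[S]);
    return 0;
}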
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 7aae76423ef..de3f3ef9c21 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.45.4.1 2006/11/19 21:33:37 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.45.4.2 2007/04/19 20:24:28 tgl Exp $
  *
  * NOTES
  *    Overflow pages look like ordinary relation pages.
@@ -271,19 +271,12 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	blkno = bitno_to_blkno(metap, bit);
 
 	/*
-	 * We have to fetch the page with P_NEW to ensure smgr's idea of the
+	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
 	 * relation length stays in sync with ours. XXX It's annoying to do this
 	 * with metapage write lock held; would be better to use a lock that
-	 * doesn't block incoming searches. Best way to fix it would be to stop
-	 * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
-	 * smgr relation length to track where new overflow pages come from;
-	 * then we could release the metapage before we do the smgrextend.
-	 * FIXME later (not in beta...)
+	 * doesn't block incoming searches.
 	 */
-	newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-	if (BufferGetBlockNumber(newbuf) != blkno)
-		elog(ERROR, "unexpected hash relation size: %u, should be %u",
-			 BufferGetBlockNumber(newbuf), blkno);
+	newbuf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
 
 	metap->hashm_spares[splitnum]++;
 
@@ -507,21 +500,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
 
 	/*
 	 * It is okay to write-lock the new bitmap page while holding metapage
-	 * write lock, because no one else could be contending for the new
-	 * page.
-	 * Also, the metapage lock makes it safe to extend the index using P_NEW,
-	 * which we want to do to ensure the smgr's idea of the relation size
-	 * stays in step with ours.
+	 * write lock, because no one else could be contending for the new page.
+	 * Also, the metapage lock makes it safe to extend the index using
+	 * _hash_getnewbuf.
 	 *
 	 * There is some loss of concurrency in possibly doing I/O for the new
 	 * page while holding the metapage lock, but this path is taken so
 	 * seldom that it's not worth worrying about.
 	 */
-	buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-	if (BufferGetBlockNumber(buf) != blkno)
-		elog(ERROR, "unexpected hash relation size: %u, should be %u",
-			 BufferGetBlockNumber(buf), blkno);
-
+	buf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
 	pg = BufferGetPage(buf);
 
 	/* initialize the page */
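Both hashovfl.c call sites now go through _hash_getnewbuf, whose contract is: a page before the filesystem EOF is simply read, the page exactly at EOF is added with an explicit extend, and anything beyond that is an error. The same rule can be sketched against a plain file with POSIX calls; the file name, block size, and get_block helper below are assumptions for the example, not backend code.

/*
 * Standalone sketch of the "extend only at the logical EOF" rule that
 * _hash_getnewbuf enforces, using a plain file and POSIX calls instead of
 * the buffer manager.  BLOCKSZ and the file name are arbitrary.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define BLOCKSZ 8192

/*
 * Read block blkno if it exists; if blkno equals the current number of
 * blocks, extend the file by one zeroed block; anything beyond that is
 * treated as an error, as in the patched code.
 */
static void get_block(int fd, unsigned blkno, char *page)
{
    struct stat st;
    off_t       nblocks;

    if (fstat(fd, &st) != 0)
    {
        perror("fstat");
        exit(1);
    }
    nblocks = st.st_size / BLOCKSZ;

    if (blkno > nblocks)
    {
        fprintf(stderr, "access to noncontiguous block %u (EOF is %ld)\n",
                blkno, (long) nblocks);
        exit(1);
    }
    if (blkno == nblocks)
    {
        /* the "P_NEW" case: add exactly one block at the end */
        memset(page, 0, BLOCKSZ);
        if (pwrite(fd, page, BLOCKSZ, (off_t) blkno * BLOCKSZ) != BLOCKSZ)
        {
            perror("pwrite");
            exit(1);
        }
    }
    else if (pread(fd, page, BLOCKSZ, (off_t) blkno * BLOCKSZ) != BLOCKSZ)
    {
        perror("pread");
        exit(1);
    }
}

int main(void)
{
    char page[BLOCKSZ];
    int  fd = open("demo_index", O_RDWR | O_CREAT, 0600);

    if (fd < 0)
    {
        perror("open");
        return 1;
    }
    get_block(fd, 0, page);     /* extends the empty file to one block */
    get_block(fd, 1, page);     /* extends again */
    get_block(fd, 0, page);     /* plain read, no extension */
    close(fd);
    return 0;
}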
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 07b4ab870fd..aa58ffb0eb9 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.47.4.1 2006/11/19 21:33:37 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.47.4.2 2007/04/19 20:24:28 tgl Exp $
  *
  * NOTES
  *    Postgres hash pages look like ordinary relation pages. The opaque
@@ -35,7 +35,8 @@
 #include "utils/lsyscache.h"
 
 
-static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
+static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
+								uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
 							  Bucket obucket, Bucket nbucket,
 							  BlockNumber start_oblkno,
@@ -103,14 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").
  *
- *		blkno == P_NEW is allowed, but it is caller's responsibility to
- *		ensure that only one process can extend the index at a time.
+ *		P_NEW is disallowed because this routine should only be used
+ *		to access pages that are known to be before the filesystem EOF.
+ *		Extending the index should be done with _hash_getnewbuf.
  */
 Buffer
 _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
 	Buffer		buf;
 
+	if (blkno == P_NEW)
+		elog(ERROR, "hash AM does not use P_NEW");
+
 	buf = ReadBuffer(rel, blkno);
 
 	if (access != HASH_NOLOCK)
@@ -121,6 +126,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 }
 
 /*
+ *	_hash_getnewbuf() -- Get a new page at the end of the index.
+ *
+ *		This has the same API as _hash_getbuf, except that we are adding
+ *		a page to the index, and hence expect the page to be past the
+ *		logical EOF.  (However, we have to support the case where it isn't,
+ *		since a prior try might have crashed after extending the filesystem
+ *		EOF but before updating the metapage to reflect the added page.)
+ *
+ *		It is caller's responsibility to ensure that only one process can
+ *		extend the index at a time.
+ *
+ *		All call sites should call _hash_pageinit on the returned page.
+ *		Also, it's difficult to imagine why access would not be HASH_WRITE.
+ */
+Buffer
+_hash_getnewbuf(Relation rel, BlockNumber blkno, int access)
+{
+	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+	Buffer		buf;
+
+	if (blkno == P_NEW)
+		elog(ERROR, "hash AM does not use P_NEW");
+	if (blkno > nblocks)
+		elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
+			 RelationGetRelationName(rel));
+
+	/* smgr insists we use P_NEW to extend the relation */
+	if (blkno == nblocks)
+	{
+		buf = ReadBuffer(rel, P_NEW);
+		if (BufferGetBlockNumber(buf) != blkno)
+			elog(ERROR, "unexpected hash relation size: %u, should be %u",
+				 BufferGetBlockNumber(buf), blkno);
+	}
+	else
+		buf = ReadBuffer(rel, blkno);
+
+	if (access != HASH_NOLOCK)
+		LockBuffer(buf, access);
+
+	/* ref count and lock type are correct */
+	return buf;
+}
+
+/*
  * _hash_relbuf() -- release a locked buffer.
  *
  * Lock and pin (refcount) are both dropped. Note that either read or
@@ -252,12 +302,11 @@ _hash_metapinit(Relation rel)
 
 	/*
 	 * We initialize the metapage, the first two bucket pages, and the
-	 * first bitmap page in sequence, using P_NEW to cause smgrextend()
-	 * calls to occur. This ensures that the smgr level has the right
-	 * idea of the physical index length.
+	 * first bitmap page in sequence, using _hash_getnewbuf to cause
+	 * smgrextend() calls to occur. This ensures that the smgr level
+	 * has the right idea of the physical index length.
 	 */
-	metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-	Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
+	metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
 	pg = BufferGetPage(metabuf);
 	_hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -311,8 +360,7 @@
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-		Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
+		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -360,7 +408,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	Bucket		old_bucket;
 	Bucket		new_bucket;
 	uint32		spare_ndx;
-	BlockNumber firstblock = InvalidBlockNumber;
 	BlockNumber start_oblkno;
 	BlockNumber start_nblkno;
 	uint32		maxbucket;
@@ -414,38 +461,14 @@
 		goto fail;
 
 	/*
-	 * If the split point is increasing (hashm_maxbucket's log base 2
-	 * increases), we need to allocate a new batch of bucket pages.
-	 */
-	new_bucket = metap->hashm_maxbucket + 1;
-	spare_ndx = _hash_log2(new_bucket + 1);
-	if (spare_ndx > metap->hashm_ovflpoint)
-	{
-		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
-		/*
-		 * The number of buckets in the new splitpoint is equal to the
-		 * total number already in existence, i.e. new_bucket. Currently
-		 * this maps one-to-one to blocks required, but someday we may need
-		 * a more complicated calculation here.
-		 */
-		firstblock = _hash_alloc_buckets(rel, new_bucket);
-		if (firstblock == InvalidBlockNumber)
-			goto fail;			/* can't split due to BlockNumber overflow */
-	}
-
-	/*
 	 * Determine which bucket is to be split, and attempt to lock the old
 	 * bucket. If we can't get the lock, give up.
 	 *
 	 * The lock protects us against other backends, but not against our own
 	 * backend. Must check for active scans separately.
-	 *
-	 * Ideally we would lock the new bucket too before proceeding, but if we
-	 * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
-	 * isn't correct yet. For simplicity we update the metapage first and
-	 * then lock. This should be okay because no one else should be
-	 * trying to lock the new bucket yet...
 	 */
+	new_bucket = metap->hashm_maxbucket + 1;
+
 	old_bucket = (new_bucket & metap->hashm_lowmask);
 
 	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -457,6 +480,45 @@
 		goto fail;
 
 	/*
+	 * Likewise lock the new bucket (should never fail).
+	 *
+	 * Note: it is safe to compute the new bucket's blkno here, even though
+	 * we may still need to update the BUCKET_TO_BLKNO mapping. This is
+	 * because the current value of hashm_spares[hashm_ovflpoint] correctly
+	 * shows where we are going to put a new splitpoint's worth of buckets.
+	 */
+	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+	if (_hash_has_active_scan(rel, new_bucket))
+		elog(ERROR, "scan in progress on supposedly new bucket");
+
+	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
+		elog(ERROR, "could not get lock on supposedly new bucket");
+
+	/*
+	 * If the split point is increasing (hashm_maxbucket's log base 2
+	 * increases), we need to allocate a new batch of bucket pages.
+	 */
+	spare_ndx = _hash_log2(new_bucket + 1);
+	if (spare_ndx > metap->hashm_ovflpoint)
+	{
+		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+		/*
+		 * The number of buckets in the new splitpoint is equal to the
+		 * total number already in existence, i.e. new_bucket. Currently
+		 * this maps one-to-one to blocks required, but someday we may need
+		 * a more complicated calculation here.
+		 */
+		if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+		{
+			/* can't split due to BlockNumber overflow */
+			_hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+			_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+			goto fail;
+		}
+	}
+
+	/*
 	 * Okay to proceed with split. Update the metapage bucket mapping
 	 * info.
 	 */
@@ -481,20 +543,6 @@
 		metap->hashm_ovflpoint = spare_ndx;
 	}
 
-	/* now we can compute the new bucket's primary block number */
-	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
-
-	/* if we added a splitpoint, should match result of _hash_alloc_buckets */
-	if (firstblock != InvalidBlockNumber &&
-		firstblock != start_nblkno)
-		elog(PANIC, "unexpected hash relation size: %u, should be %u",
-			 firstblock, start_nblkno);
-
-	Assert(!_hash_has_active_scan(rel, new_bucket));
-
-	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
-		elog(PANIC, "could not get lock on supposedly new bucket");
-
 	/*
 	 * Copy bucket mapping info now; this saves re-accessing the meta page
 	 * inside _hash_splitbucket's inner loop. Note that once we drop the
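The splitpoint test that moved inside the new-bucket lock is easy to trace on its own. The toy driver below is not backend code; it assumes the two-bucket, splitpoint-1 state that _hash_metapinit leaves behind, and reports for a series of splits when a new splitpoint begins and how many bucket pages would be allocated for it, mirroring the comment that the batch size equals new_bucket.

/*
 * Standalone illustration of the splitpoint check above: a split that
 * creates bucket new_bucket crosses into a new splitpoint exactly when
 * hash_log2(new_bucket + 1) exceeds the current ovflpoint, and the batch
 * allocated at that moment is new_bucket pages.  Toy driver only.
 */
#include <stdio.h>

static unsigned hash_log2(unsigned num)
{
    unsigned i = 0, limit = 1;

    while (limit < num)
    {
        i++;
        limit <<= 1;
    }
    return i;
}

int main(void)
{
    unsigned maxbucket = 1;     /* assumed initial state: buckets 0 and 1 */
    unsigned ovflpoint = 1;     /* assumed matching initial splitpoint */
    int      split;

    for (split = 0; split < 10; split++)
    {
        unsigned new_bucket = maxbucket + 1;
        unsigned spare_ndx = hash_log2(new_bucket + 1);

        if (spare_ndx > ovflpoint)
        {
            printf("split creating bucket %u starts splitpoint %u: "
                   "allocate %u bucket pages\n",
                   new_bucket, spare_ndx, new_bucket);
            ovflpoint = spare_ndx;
        }
        else
            printf("split creating bucket %u stays in splitpoint %u\n",
                   new_bucket, ovflpoint);

        maxbucket = new_bucket;
    }
    return 0;
}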
@@ -558,23 +606,16 @@ fail:
  * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
  * so it may not be worth worrying about.
  *
- * Returns the first block number in the new splitpoint's range, or
- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ * Returns TRUE if successful, or FALSE if allocation failed due to
+ * BlockNumber overflow.
  */
-static BlockNumber
-_hash_alloc_buckets(Relation rel, uint32 nblocks)
+static bool
+_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
-	BlockNumber firstblock;
 	BlockNumber lastblock;
 	BlockNumber endblock;
 	char		zerobuf[BLCKSZ];
 
-	/*
-	 * Since we hold metapage lock, no one else is either splitting or
-	 * allocating a new page in _hash_getovflpage(); hence it's safe to
-	 * assume that the relation length isn't changing under us.
-	 */
-	firstblock = RelationGetNumberOfBlocks(rel);
 	lastblock = firstblock + nblocks - 1;
 
 	/*
@@ -582,12 +623,12 @@
 	 * extend the index anymore.
 	 */
 	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
-		return InvalidBlockNumber;
-
-	/* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+		return false;
 
 	MemSet(zerobuf, 0, sizeof(zerobuf));
 
+	RelationOpenSmgr(rel);
+
 	/*
 	 * XXX If the extension results in creation of new segment files,
 	 * we have to make sure that each non-last file is correctly filled out to
@@ -604,7 +645,7 @@
 
 	smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
 
-	return firstblock;
+	return true;
 }
 
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 6d4668bdbe4..fe417c60bda 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.59 2004/12/31 22:03:21 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.59.4.1 2007/04/19 20:24:28 tgl Exp $
  *
  * NOTES
  *		modeled after Margo Seltzer's hash implementation for unix.
@@ -281,6 +281,7 @@ extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
 extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
 extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
+extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, int access);
 extern void _hash_relbuf(Relation rel, Buffer buf);
 extern void _hash_dropbuf(Relation rel, Buffer buf);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
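Finally, the two details that _hash_alloc_buckets relies on, the BlockNumber wraparound check and forcing the EOF up by zeroing only the last block of the new batch (leaving a hole before it), can be sketched in standalone form. The file name, block size, and alloc_buckets helper are assumptions for illustration, not the backend routine.

/*
 * Standalone sketch of the allocation step: reject uint32 wraparound in the
 * block range, then extend a plain file by writing one zeroed block at the
 * end of the batch; the intervening blocks stay unwritten until used.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define BLOCKSZ        8192
#define INVALID_BLOCK  0xFFFFFFFFu

static int alloc_buckets(int fd, uint32_t firstblock, uint32_t nblocks)
{
    uint32_t lastblock = firstblock + nblocks - 1;
    char     zerobuf[BLOCKSZ];

    /* overflow in block number calculation: cannot extend any further */
    if (lastblock < firstblock || lastblock == INVALID_BLOCK)
        return 0;

    memset(zerobuf, 0, sizeof(zerobuf));

    /* zero just the last page; earlier pages of the batch stay as a hole */
    if (pwrite(fd, zerobuf, BLOCKSZ, (off_t) lastblock * BLOCKSZ) != BLOCKSZ)
        return 0;
    return 1;
}

int main(void)
{
    int fd = open("demo_index", O_RDWR | O_CREAT, 0600);

    if (fd < 0)
        return 1;
    if (!alloc_buckets(fd, 4, 4))            /* a 4-block batch at blocks 4..7 */
        fprintf(stderr, "allocation failed\n");
    if (!alloc_buckets(fd, 0xFFFFFFF0u, 32)) /* wraps around: rejected */
        fprintf(stderr, "allocation failed (BlockNumber overflow)\n");
    close(fd);
    return 0;
}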