Diffstat (limited to 'src/backend/access/hash/hashpage.c')
-rw-r--r--	src/backend/access/hash/hashpage.c | 150
1 file changed, 132 insertions(+), 18 deletions(-)
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 696d4bf616a..0f643836a1c 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.60 2006/10/04 00:29:48 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -32,9 +32,11 @@
 #include "access/hash.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
+#include "storage/smgr.h"
 #include "utils/lsyscache.h"
 
 
+static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
 				  Bucket obucket, Bucket nbucket,
 				  BlockNumber start_oblkno,
@@ -102,21 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").
  *
- *		XXX P_NEW is not used because, unlike the tree structures, we
- *		need the bucket blocks to be at certain block numbers.
+ *		blkno == P_NEW is allowed, but it is caller's responsibility to
+ *		ensure that only one process can extend the index at a time.
  *
- *		All call sites should call either _hash_pageinit or _hash_checkpage
+ *		All call sites should call either _hash_checkpage or _hash_pageinit
  *		on the returned page, depending on whether the block is expected
- *		to be new or not.
+ *		to be valid or not.
  */
 Buffer
 _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
 	Buffer		buf;
 
-	if (blkno == P_NEW)
-		elog(ERROR, "hash AM does not use P_NEW");
-
 	buf = ReadBuffer(rel, blkno);
 
 	if (access != HASH_NOLOCK)
@@ -237,7 +236,14 @@ _hash_metapinit(Relation rel)
 	if (ffactor < 10)
 		ffactor = 10;
 
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	/*
+	 * We initialize the metapage, the first two bucket pages, and the
+	 * first bitmap page in sequence, using P_NEW to cause smgrextend()
+	 * calls to occur.  This ensures that the smgr level has the right
+	 * idea of the physical index length.
+	 */
+	metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
 	pg = BufferGetPage(metabuf);
 	_hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -290,7 +296,8 @@
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
+		buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+		Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -303,8 +310,7 @@
 	}
 
 	/*
-	 * Initialize first bitmap page.  Can't do this until we create the first
-	 * two buckets, else smgr will complain.
+	 * Initialize first bitmap page
 	 */
 	_hash_initbitmap(rel, metap, 3);
 
@@ -339,6 +345,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	Bucket		old_bucket;
 	Bucket		new_bucket;
 	uint32		spare_ndx;
+	BlockNumber firstblock = InvalidBlockNumber;
 	BlockNumber start_oblkno;
 	BlockNumber start_nblkno;
 	uint32		maxbucket;
@@ -377,6 +384,40 @@
 		goto fail;
 
 	/*
+	 * Can't split anymore if maxbucket has reached its maximum possible value.
+	 *
+	 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
+	 * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
+	 * to half that because of overflow looping in _hash_log2() and
+	 * insufficient space in hashm_spares[].  It's moot anyway because an
+	 * index with 2^32 buckets would certainly overflow BlockNumber and
+	 * hence _hash_alloc_buckets() would fail, but if we supported buckets
+	 * smaller than a disk block then this would be an independent constraint.
+	 */
+	if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
+		goto fail;
+
+	/*
+	 * If the split point is increasing (hashm_maxbucket's log base 2
+	 * increases), we need to allocate a new batch of bucket pages.
+	 */
+	new_bucket = metap->hashm_maxbucket + 1;
+	spare_ndx = _hash_log2(new_bucket + 1);
+	if (spare_ndx > metap->hashm_ovflpoint)
+	{
+		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+		/*
+		 * The number of buckets in the new splitpoint is equal to the
+		 * total number already in existence, i.e. new_bucket.  Currently
+		 * this maps one-to-one to blocks required, but someday we may need
+		 * a more complicated calculation here.
+		 */
+		firstblock = _hash_alloc_buckets(rel, new_bucket);
+		if (firstblock == InvalidBlockNumber)
+			goto fail;			/* can't split due to BlockNumber overflow */
+	}
+
+	/*
 	 * Determine which bucket is to be split, and attempt to lock the old
 	 * bucket.  If we can't get the lock, give up.
 	 *
@@ -389,7 +430,6 @@
 	 * lock.  This should be okay because no one else should be trying to lock
 	 * the new bucket yet...
 	 */
-	new_bucket = metap->hashm_maxbucket + 1;
 	old_bucket = (new_bucket & metap->hashm_lowmask);
 
 	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -425,14 +465,9 @@
 	 * increases), we need to adjust the hashm_spares[] array and
 	 * hashm_ovflpoint so that future overflow pages will be created beyond
 	 * this new batch of bucket pages.
-	 *
-	 * XXX should initialize new bucket pages to prevent out-of-order page
-	 * creation?  Don't wanna do it right here though.
 	 */
-	spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
 	if (spare_ndx > metap->hashm_ovflpoint)
 	{
-		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
 		metap->hashm_ovflpoint = spare_ndx;
 	}
@@ -440,6 +475,12 @@
 	/* now we can compute the new bucket's primary block number */
 	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
 
+	/* if we added a splitpoint, should match result of _hash_alloc_buckets */
+	if (firstblock != InvalidBlockNumber &&
+		firstblock != start_nblkno)
+		elog(PANIC, "unexpected hash relation size: %u, should be %u",
+			 firstblock, start_nblkno);
+
 	Assert(!_hash_has_active_scan(rel, new_bucket));
 
 	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
@@ -488,6 +529,79 @@ fail:
 
 
 /*
+ * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
+ *
+ * This does not need to initialize the new bucket pages; we'll do that as
+ * each one is used by _hash_expandtable().  But we have to extend the logical
+ * EOF to the end of the splitpoint; otherwise the first overflow page
+ * allocated beyond the splitpoint will represent a noncontiguous access,
+ * which can confuse md.c (and will probably be forbidden by future changes
+ * to md.c).
+ *
+ * We do this by writing a page of zeroes at the end of the splitpoint range.
+ * We expect that the filesystem will ensure that the intervening pages read
+ * as zeroes too.  On many filesystems this "hole" will not be allocated
+ * immediately, which means that the index file may end up more fragmented
+ * than if we forced it all to be allocated now; but since we don't scan
+ * hash indexes sequentially anyway, that probably doesn't matter.
+ *
+ * XXX It's annoying that this code is executed with the metapage lock held.
+ * We need to interlock against _hash_getovflpage() adding a new overflow page
+ * concurrently, but it'd likely be better to use LockRelationForExtension
+ * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
+ * so it may not be worth worrying about.
+ *
+ * Returns the first block number in the new splitpoint's range, or
+ * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ */
+static BlockNumber
+_hash_alloc_buckets(Relation rel, uint32 nblocks)
+{
+	BlockNumber firstblock;
+	BlockNumber lastblock;
+	BlockNumber endblock;
+	char		zerobuf[BLCKSZ];
+
+	/*
+	 * Since we hold metapage lock, no one else is either splitting or
+	 * allocating a new page in _hash_getovflpage(); hence it's safe to
+	 * assume that the relation length isn't changing under us.
+	 */
+	firstblock = RelationGetNumberOfBlocks(rel);
+	lastblock = firstblock + nblocks - 1;
+
+	/*
+	 * Check for overflow in block number calculation; if so, we cannot
+	 * extend the index anymore.
+	 */
+	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
+		return InvalidBlockNumber;
+
+	/* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+
+	MemSet(zerobuf, 0, sizeof(zerobuf));
+
+	/*
+	 * XXX If the extension results in creation of new segment files,
+	 * we have to make sure that each non-last file is correctly filled out to
+	 * RELSEG_SIZE blocks.  This ought to be done inside mdextend, but
+	 * changing the smgr API seems best left for development cycle not late
+	 * beta.  Temporary fix for bug #2737.
+	 */
+#ifndef LET_OS_MANAGE_FILESIZE
+	for (endblock = firstblock | (RELSEG_SIZE - 1);
+		 endblock < lastblock;
+		 endblock += RELSEG_SIZE)
+		smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
+#endif
+
+	smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
+
+	return firstblock;
+}
+
+
+/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
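A note on the splitpoint arithmetic added to _hash_expandtable(): a new splitpoint opens exactly when new_bucket reaches a power of two, and the number of bucket pages to pre-allocate then equals new_bucket. The standalone sketch below is illustrative only, not part of the patch; hash_log2() is a local stand-in for PostgreSQL's _hash_log2() (smallest i such that 2^i >= num).

#include <stdio.h>
#include <stdint.h>

/* Stand-in for _hash_log2(): smallest i such that 2^i >= num. */
static uint32_t
hash_log2(uint32_t num)
{
	uint32_t	i = 0;
	uint32_t	limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

int
main(void)
{
	uint32_t	maxbucket;

	for (maxbucket = 0; maxbucket < 16; maxbucket++)
	{
		uint32_t	new_bucket = maxbucket + 1;
		uint32_t	spare_ndx = hash_log2(new_bucket + 1);

		/* hashm_ovflpoint before the split would be hash_log2(new_bucket) */
		if (spare_ndx > hash_log2(new_bucket))
			printf("bucket %u opens splitpoint %u: pre-allocate %u pages\n",
				   new_bucket, spare_ndx, new_bucket);
	}
	return 0;
}

Compiled as C99, this prints splitpoints 1 through 4 opening at buckets 1, 2, 4, and 8, each doubling the number of bucket pages; that stepwise growth is why the patch can assert spare_ndx == metap->hashm_ovflpoint + 1.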
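The BlockNumber overflow test in _hash_alloc_buckets() works because BlockNumber is a 32-bit unsigned type: if firstblock + nblocks - 1 wraps past 2^32 - 1, the sum compares less than firstblock. A minimal sketch of just that check (alloc_range_end() and the sample block numbers are invented for illustration):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t BlockNumber;
#define InvalidBlockNumber	((BlockNumber) 0xFFFFFFFF)

/* Returns the last block of the range, or InvalidBlockNumber on wraparound. */
static BlockNumber
alloc_range_end(BlockNumber firstblock, uint32_t nblocks)
{
	BlockNumber lastblock = firstblock + nblocks - 1;

	/*
	 * Unsigned arithmetic wraps modulo 2^32, so a sum smaller than the
	 * starting block means overflow; InvalidBlockNumber itself is reserved
	 * and equally unusable.
	 */
	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
		return InvalidBlockNumber;
	return lastblock;
}

int
main(void)
{
	printf("%u\n", alloc_range_end(100, 16));			/* 115: fits */
	printf("%u\n", alloc_range_end(0xFFFFFFF0u, 0x20));	/* wraps: 4294967295 */
	return 0;
}

In the second call the 32-bit sum wraps to 0x0000000F, which is less than the starting block, so the failure is detected and, in the patch, _hash_expandtable() simply takes its goto fail path.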
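The temporary fix for bug #2737 relies on RELSEG_SIZE being a power of two: OR-ing (RELSEG_SIZE - 1) into firstblock yields the last block number of the segment file containing firstblock, and stepping by RELSEG_SIZE then lands on each intervening segment's last block. A sketch of that block arithmetic under assumed values (RELSEG_SIZE = 131072, i.e. the stock 1 GB segments of 8 kB blocks; the block numbers are made up):

#include <stdio.h>
#include <stdint.h>

#define RELSEG_SIZE 131072		/* assumed value; must be a power of two */

int
main(void)
{
	uint32_t	firstblock = 200000;	/* hypothetical current EOF */
	uint32_t	lastblock = 500000;		/* hypothetical end of new splitpoint */
	uint32_t	endblock;

	/*
	 * Visit the last block of every non-last segment file in the range;
	 * the patch writes a zero page at each so md.c sees full segments.
	 */
	for (endblock = firstblock | (RELSEG_SIZE - 1);
		 endblock < lastblock;
		 endblock += RELSEG_SIZE)
		printf("zero-fill block %u (end of segment %u)\n",
			   endblock, endblock / RELSEG_SIZE);

	printf("final extend at block %u (segment %u)\n",
		   lastblock, lastblock / RELSEG_SIZE);
	return 0;
}

With these inputs the loop fills out blocks 262143 and 393215, completing segment files 1 and 2, before the final smgrextend() at block 500000 leaves segment 3 at its correct partial length.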