diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2003-09-01 20:26:34 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2003-09-01 20:26:34 +0000 |
commit | 65c2d427fb7436c314259778cf56551412cde730 (patch) | |
tree | 444d38bce6ab1e281504cf7c110148fc75387f6b /src/include/access/hash.h | |
parent | eaeb8621f8cba39de0d3ee6c296ab22731b4a183 (diff) | |
download | postgresql-65c2d427fb7436c314259778cf56551412cde730.tar.gz postgresql-65c2d427fb7436c314259778cf56551412cde730.zip |
Preliminary cleanup for hash index code (doesn't attack the locking problem
yet). Fix a couple of bugs that would only appear if multiple bitmap pages
are used, including a buffer reference leak and incorrect computation of bit
indexes. Get rid of 'overflow address' concept, which accomplished nothing
except obfuscating the code and creating a risk of failure due to limited
range of offset field. Rename some misleadingly-named fields and routines,
and improve documentation.
Diffstat (limited to 'src/include/access/hash.h')
-rw-r--r-- | src/include/access/hash.h | 166 |
1 files changed, 61 insertions, 105 deletions
diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 821f8348e8e..83aae20c1ca 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: hash.h,v 1.49 2003/08/04 02:40:10 momjian Exp $ + * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $ * * NOTES * modeled after Margo Seltzer's hash implementation for unix. @@ -24,43 +24,18 @@ #include "fmgr.h" /* - * An overflow page is a spare page allocated for storing data whose - * bucket doesn't have room to store it. We use overflow pages rather - * than just splitting the bucket because there is a linear order in - * the way we split buckets. In other words, if there isn't enough space - * in the bucket itself, put it in an overflow page. - * - * Overflow page addresses are stored in form: (Splitnumber, Page offset). - * - * A splitnumber is the number of the generation where the table doubles - * in size. The ovflpage's offset within the splitnumber; offsets start - * at 1. - * - * We convert the stored bitmap address into a page address with the - * macro OADDR_OF(S, O) where S is the splitnumber and O is the page - * offset. + * Mapping from hash bucket number to physical block number of bucket's + * starting page. Beware of multiple evaluations of argument! Also notice + * macro's implicit dependency on "metap". */ typedef uint32 Bucket; -typedef bits16 OverflowPageAddress; -typedef uint32 SplitNumber; -typedef uint32 PageOffset; - -/* A valid overflow address will always have a page offset >= 1 */ -#define InvalidOvflAddress 0 - -#define SPLITSHIFT 11 -#define SPLITMASK 0x7FF -#define SPLITNUM(N) ((SplitNumber)(((uint32)(N)) >> SPLITSHIFT)) -#define OPAGENUM(N) ((PageOffset)((N) & SPLITMASK)) -#define OADDR_OF(S,O) ((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O))) #define BUCKET_TO_BLKNO(B) \ - ((Bucket) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) -#define OADDR_TO_BLKNO(B) \ - ((BlockNumber) \ - (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B)))); + ((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) /* + * Special space for hash index pages. + * * hasho_flag tells us which type of page we're looking at. For * example, knowing overflow pages from bucket pages is necessary * information when you're deleting tuples from a page. If all the @@ -69,7 +44,6 @@ typedef uint32 PageOffset; * the tuples are deleted from a bucket page, no additional action is * necessary. */ - #define LH_UNUSED_PAGE (0) #define LH_OVERFLOW_PAGE (1 << 0) #define LH_BUCKET_PAGE (1 << 1) @@ -78,9 +52,9 @@ typedef uint32 PageOffset; typedef struct HashPageOpaqueData { - bits16 hasho_flag; /* is this page a bucket or ovfl */ + bits16 hasho_flag; /* page type code, see above */ Bucket hasho_bucket; /* bucket number this pg belongs to */ - OverflowPageAddress hasho_oaddr; /* ovfl address of this ovfl pg */ + bits16 hasho_oaddr; /* no longer used; delete someday */ BlockNumber hasho_nextblkno; /* next ovfl blkno */ BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */ } HashPageOpaqueData; @@ -91,10 +65,8 @@ typedef HashPageOpaqueData *HashPageOpaque; * ScanOpaqueData is used to remember which buffers we're currently * examining in the scan. We keep these buffers locked and pinned and * recorded in the opaque entry of the scan in order to avoid doing a - * ReadBuffer() for every tuple in the index. This avoids semop() calls, - * which are expensive. + * ReadBuffer() for every tuple in the index. */ - typedef struct HashScanOpaqueData { Buffer hashso_curbuf; @@ -113,60 +85,55 @@ typedef HashScanOpaqueData *HashScanOpaque; #define HASH_VERSION 0 /* - * NCACHED is used to set the array sizeof spares[] & bitmaps[]. + * Spares[] holds the number of overflow pages currently allocated at or + * before a certain splitpoint. For example, if spares[3] = 7 then there are + * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The + * value in spares[ovflpoint] increases as overflow pages are added at the + * end of the index. Once ovflpoint increases (ie, we have actually allocated + * the bucket pages belonging to that splitpoint) the number of spares at the + * prior splitpoint cannot change anymore. * - * Spares[] is used to hold the number overflow pages currently - * allocated at a certain splitpoint. For example, if spares[3] = 7 - * then there are a maximum of 7 ovflpages available at splitpoint 3. - * The value in spares[] will change as ovflpages are added within - * a splitpoint. + * ovflpages that have been recycled for reuse can be found by looking at + * bitmaps that are stored within ovflpages dedicated for the purpose. + * The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the + * number of currently existing bitmaps. * - * Within a splitpoint, one can find which ovflpages are available and - * which are used by looking at a bitmaps that are stored on the ovfl - * pages themselves. There is at least one bitmap for every splitpoint's - * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages - * that hold the ovflpage bitmaps. - * - * The reason that the size is restricted to NCACHED (32) is because - * the bitmaps are 16 bits: upper 5 represent the splitpoint, lower 11 - * indicate the page number within the splitpoint. Since there are - * only 5 bits to store the splitpoint, there can only be 32 splitpoints. - * Both spares[] and bitmaps[] use splitpoints as there indices, so there - * can only be 32 of them. + * The limitation on the size of spares[] comes from the fact that there's + * no point in having more than 2^32 buckets with only uint32 hashcodes. + * There is no particularly good reason for bitmaps[] to be the same size, + * but we're stuck with that until we want to force an initdb. (With 8K + * block size, 32 bitmaps limit us to 8 Gb of overflow space...) */ - -#define NCACHED 32 - +#define HASH_MAX_SPLITPOINTS 32 +#define HASH_MAX_BITMAPS 32 typedef struct HashMetaPageData { PageHeaderData hashm_phdr; /* pad for page header (do not use) */ uint32 hashm_magic; /* magic no. for hash tables */ uint32 hashm_version; /* version ID */ - uint32 hashm_nkeys; /* number of keys stored in the table */ - uint16 hashm_ffactor; /* fill factor */ - uint16 hashm_bsize; /* bucket size (bytes) - must be a power + uint32 hashm_ntuples; /* number of tuples stored in the table */ + uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */ + uint16 hashm_bsize; /* index page size (bytes) - must be a power * of 2 */ - uint16 hashm_bshift; /* bucket shift */ - uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a - * power of 2 */ + uint16 hashm_bshift; /* log2(bsize) */ + uint16 hashm_bmsize; /* bitmap array size (bytes) - must be + * exactly half of hashm_bsize */ uint32 hashm_maxbucket; /* ID of maximum bucket in use */ uint32 hashm_highmask; /* mask to modulo into entire table */ uint32 hashm_lowmask; /* mask to modulo into lower half of table */ - uint32 hashm_ovflpoint;/* pageno. from which ovflpgs being + uint32 hashm_ovflpoint;/* splitpoint from which ovflpgs being * allocated */ - uint32 hashm_lastfreed; /* last ovflpage freed */ - uint32 hashm_nmaps; /* Initial number of bitmaps */ - uint32 hashm_spares[NCACHED]; /* spare pages available at - * splitpoints */ - BlockNumber hashm_mapp[NCACHED]; /* blknumbers of ovfl page maps */ + uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */ + uint32 hashm_nmaps; /* number of bitmap pages */ + uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before + * each splitpoint */ + BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */ RegProcedure hashm_procid; /* hash procedure id from pg_proc */ } HashMetaPageData; typedef HashMetaPageData *HashMetaPage; -extern bool BuildingHash; - typedef struct HashItemData { IndexTupleData hash_itup; @@ -178,31 +145,33 @@ typedef HashItemData *HashItem; * Constants */ #define DEFAULT_FFACTOR 300 -#define SPLITMAX 8 #define BYTE_TO_BIT 3 /* 2^3 bits/byte */ -#define INT_TO_BYTE 2 /* 2^2 bytes/int */ -#define INT_TO_BIT 5 /* 2^5 bits/int */ #define ALL_SET ((uint32) ~0) /* - * bitmap pages do not contain tuples. they do contain the standard + * Bitmap pages do not contain tuples. They do contain the standard * page headers and trailers; however, everything in between is a - * giant bit array. the number of bits that fit on a page obviously - * depends on the page size and the header/trailer overhead. + * giant bit array. The number of bits that fit on a page obviously + * depends on the page size and the header/trailer overhead. In the + * present implementation, we use exactly half of a page for bitmap, + * so that we have a power-of-2 bits per page. + * + * The fact that the metapage has separate bsize and bmsize fields, + * but only one bshift field, is a design error that ought to be fixed. */ #define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) +#define BMPG_SHIFT(metap) ((metap)->hashm_bshift - 1 + BYTE_TO_BIT) +#define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) #define HashPageGetBitmap(pg) \ ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData)))) /* - * The number of bits in an ovflpage bitmap which - * tells which ovflpages are empty versus in use (NOT the number of - * bits in an overflow page *address* bitmap). + * The number of bits in an ovflpage bitmap word. */ -#define BITS_PER_MAP 32 /* Number of bits in ovflpage bitmap */ +#define BITS_PER_MAP 32 /* Number of bits in uint32 */ -/* Given the address of the beginning of a big map, clear/set the nth bit */ +/* Given the address of the beginning of a bit map, clear/set the nth bit */ #define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) #define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) @@ -214,17 +183,8 @@ typedef HashItemData *HashItem; #define HASH_WRITE 1 /* - * In general, the hash code tries to localize its knowledge about page - * layout to a couple of routines. However, we need a special value to - * indicate "no page number" in those places where we expect page numbers. - */ - -#define P_NONE 0 - -/* * Strategy number. There's only one valid strategy for hashing: equality. */ - #define HTEqualStrategyNumber 1 #define HTMaxStrategyNumber 1 @@ -233,9 +193,11 @@ typedef HashItemData *HashItem; * us with an amproc procudure for hashing a key of the new type. * Since we only have one such proc in amproc, it's number 1. */ - #define HASHPROC 1 + +extern bool BuildingHash; + /* public routines */ extern Datum hashbuild(PG_FUNCTION_ARGS); @@ -276,36 +238,32 @@ extern Datum hash_any(register const unsigned char *k, register int keylen); /* hashinsert.c */ extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem); - /* hashovfl.c */ -extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf); -extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf); -extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum, - int32 nbits, int32 ndx); +extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); +extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf); +extern void _hash_initbitmap(Relation rel, HashMetaPage metap, + BlockNumber blkno); extern void _hash_squeezebucket(Relation rel, HashMetaPage metap, Bucket bucket); - /* hashpage.c */ extern void _hash_metapinit(Relation rel); extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access); extern void _hash_relbuf(Relation rel, Buffer buf, int access); extern void _hash_wrtbuf(Relation rel, Buffer buf); extern void _hash_wrtnorelbuf(Buffer buf); -extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access, +extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access); extern void _hash_pageinit(Page page, Size size); extern void _hash_pagedel(Relation rel, ItemPointer tid); extern void _hash_expandtable(Relation rel, Buffer metabuf); - /* hashscan.c */ extern void _hash_regscan(IndexScanDesc scan); extern void _hash_dropscan(IndexScanDesc scan); extern void _hash_adjscans(Relation rel, ItemPointer tid); extern void AtEOXact_hash(void); - /* hashsearch.c */ extern void _hash_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, HashMetaPage metap); @@ -314,7 +272,6 @@ extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf); - /* hashutil.c */ extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup); extern void _hash_freeskey(ScanKey skey); @@ -324,7 +281,6 @@ extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key); extern uint32 _hash_log2(uint32 num); extern void _hash_checkpage(Page page, int flags); - /* hash.c */ extern void hash_redo(XLogRecPtr lsn, XLogRecord *record); extern void hash_undo(XLogRecPtr lsn, XLogRecord *record); |