gooderp18绿色标准版
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

464 line
18KB

  1. /*-------------------------------------------------------------------------
  2. *
  3. * hash.h
  4. * header file for postgres hash access method implementation
  5. *
  6. *
  7. * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  8. * Portions Copyright (c) 1994, Regents of the University of California
  9. *
  10. * src/include/access/hash.h
  11. *
  12. * NOTES
  13. * modeled after Margo Seltzer's hash implementation for unix.
  14. *
  15. *-------------------------------------------------------------------------
  16. */
  17. #ifndef HASH_H
  18. #define HASH_H
  19. #include "access/amapi.h"
  20. #include "access/itup.h"
  21. #include "access/sdir.h"
  22. #include "fmgr.h"
  23. #include "lib/stringinfo.h"
  24. #include "storage/bufmgr.h"
  25. #include "storage/lockdefs.h"
  26. #include "utils/hashutils.h"
  27. #include "utils/hsearch.h"
  28. #include "utils/relcache.h"
  29. /*
  30. * Mapping from hash bucket number to physical block number of bucket's
  31. * starting page. Beware of multiple evaluations of argument!
  32. */
  33. typedef uint32 Bucket;
  34. #define InvalidBucket ((Bucket) 0xFFFFFFFF)
  35. #define BUCKET_TO_BLKNO(metap,B) \
  36. ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
  37. /*
  38. * Special space for hash index pages.
  39. *
  40. * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
  41. * Additional bits in the flag word are used for more transient purposes.
  42. *
  43. * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
  44. * However, we ensure that each used page type has a distinct bit so that
  45. * we can OR together page types for uses such as the allowable-page-types
  46. * argument of _hash_checkpage().
  47. */
  48. #define LH_UNUSED_PAGE (0)
  49. #define LH_OVERFLOW_PAGE (1 << 0)
  50. #define LH_BUCKET_PAGE (1 << 1)
  51. #define LH_BITMAP_PAGE (1 << 2)
  52. #define LH_META_PAGE (1 << 3)
  53. #define LH_BUCKET_BEING_POPULATED (1 << 4)
  54. #define LH_BUCKET_BEING_SPLIT (1 << 5)
  55. #define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
  56. #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
  57. #define LH_PAGE_TYPE \
  58. (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
  59. /*
  60. * In an overflow page, hasho_prevblkno stores the block number of the previous
  61. * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
  62. * hashm_maxbucket value as of the last time the bucket was last split, or
  63. * else as of the time the bucket was created. The latter convention is used
  64. * to determine whether a cached copy of the metapage is too stale to be used
  65. * without needing to lock or pin the metapage.
  66. *
  67. * hasho_nextblkno is always the block number of the next page in the
  68. * bucket chain, or InvalidBlockNumber if there are no more such pages.
  69. */
  70. typedef struct HashPageOpaqueData
  71. {
  72. BlockNumber hasho_prevblkno; /* see above */
  73. BlockNumber hasho_nextblkno; /* see above */
  74. Bucket hasho_bucket; /* bucket number this pg belongs to */
  75. uint16 hasho_flag; /* page type code + flag bits, see above */
  76. uint16 hasho_page_id; /* for identification of hash indexes */
  77. } HashPageOpaqueData;
  78. typedef HashPageOpaqueData *HashPageOpaque;
  79. #define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
  80. #define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
  81. #define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
  82. #define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
  83. /*
  84. * The page ID is for the convenience of pg_filedump and similar utilities,
  85. * which otherwise would have a hard time telling pages of different index
  86. * types apart. It should be the last 2 bytes on the page. This is more or
  87. * less "free" due to alignment considerations.
  88. */
  89. #define HASHO_PAGE_ID 0xFF80
  90. typedef struct HashScanPosItem /* what we remember about each match */
  91. {
  92. ItemPointerData heapTid; /* TID of referenced heap item */
  93. OffsetNumber indexOffset; /* index item's location within page */
  94. } HashScanPosItem;
  95. typedef struct HashScanPosData
  96. {
  97. Buffer buf; /* if valid, the buffer is pinned */
  98. BlockNumber currPage; /* current hash index page */
  99. BlockNumber nextPage; /* next overflow page */
  100. BlockNumber prevPage; /* prev overflow or bucket page */
  101. /*
  102. * The items array is always ordered in index order (ie, increasing
  103. * indexoffset). When scanning backwards it is convenient to fill the
  104. * array back-to-front, so we start at the last slot and fill downwards.
  105. * Hence we need both a first-valid-entry and a last-valid-entry counter.
  106. * itemIndex is a cursor showing which entry was last returned to caller.
  107. */
  108. int firstItem; /* first valid index in items[] */
  109. int lastItem; /* last valid index in items[] */
  110. int itemIndex; /* current index in items[] */
  111. HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
  112. } HashScanPosData;
  113. #define HashScanPosIsPinned(scanpos) \
  114. ( \
  115. AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
  116. !BufferIsValid((scanpos).buf)), \
  117. BufferIsValid((scanpos).buf) \
  118. )
  119. #define HashScanPosIsValid(scanpos) \
  120. ( \
  121. AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
  122. !BufferIsValid((scanpos).buf)), \
  123. BlockNumberIsValid((scanpos).currPage) \
  124. )
  125. #define HashScanPosInvalidate(scanpos) \
  126. do { \
  127. (scanpos).buf = InvalidBuffer; \
  128. (scanpos).currPage = InvalidBlockNumber; \
  129. (scanpos).nextPage = InvalidBlockNumber; \
  130. (scanpos).prevPage = InvalidBlockNumber; \
  131. (scanpos).firstItem = 0; \
  132. (scanpos).lastItem = 0; \
  133. (scanpos).itemIndex = 0; \
  134. } while (0)
  135. /*
  136. * HashScanOpaqueData is private state for a hash index scan.
  137. */
  138. typedef struct HashScanOpaqueData
  139. {
  140. /* Hash value of the scan key, ie, the hash key we seek */
  141. uint32 hashso_sk_hash;
  142. /* remember the buffer associated with primary bucket */
  143. Buffer hashso_bucket_buf;
  144. /*
  145. * remember the buffer associated with primary bucket page of bucket being
  146. * split. it is required during the scan of the bucket which is being
  147. * populated during split operation.
  148. */
  149. Buffer hashso_split_bucket_buf;
  150. /* Whether scan starts on bucket being populated due to split */
  151. bool hashso_buc_populated;
  152. /*
  153. * Whether scanning bucket being split? The value of this parameter is
  154. * referred only when hashso_buc_populated is true.
  155. */
  156. bool hashso_buc_split;
  157. /* info about killed items if any (killedItems is NULL if never used) */
  158. int *killedItems; /* currPos.items indexes of killed items */
  159. int numKilled; /* number of currently stored items */
  160. /*
  161. * Identify all the matching items on a page and save them in
  162. * HashScanPosData
  163. */
  164. HashScanPosData currPos; /* current position data */
  165. } HashScanOpaqueData;
  166. typedef HashScanOpaqueData *HashScanOpaque;
  167. /*
  168. * Definitions for metapage.
  169. */
  170. #define HASH_METAPAGE 0 /* metapage is always block 0 */
  171. #define HASH_MAGIC 0x6440640
  172. #define HASH_VERSION 4
  173. /*
  174. * spares[] holds the number of overflow pages currently allocated at or
  175. * before a certain splitpoint. For example, if spares[3] = 7 then there are
  176. * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The
  177. * value in spares[ovflpoint] increases as overflow pages are added at the
  178. * end of the index. Once ovflpoint increases (ie, we have actually allocated
  179. * the bucket pages belonging to that splitpoint) the number of spares at the
  180. * prior splitpoint cannot change anymore.
  181. *
  182. * ovflpages that have been recycled for reuse can be found by looking at
  183. * bitmaps that are stored within ovflpages dedicated for the purpose.
  184. * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
  185. * number of currently existing bitmaps.
  186. *
  187. * The limitation on the size of spares[] comes from the fact that there's
  188. * no point in having more than 2^32 buckets with only uint32 hashcodes.
  189. * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
  190. * adjusted in such a way to accommodate multi phased allocation of buckets
  191. * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
  192. *
  193. * There is no particular upper limit on the size of mapp[], other than
  194. * needing to fit into the metapage. (With 8K block size, 1024 bitmaps
  195. * limit us to 256 GB of overflow space...). For smaller block size we
  196. * can not use 1024 bitmaps as it will lead to the meta page data crossing
  197. * the block size boundary. So we use BLCKSZ to determine the maximum number
  198. * of bitmaps.
  199. */
  200. #define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024)
  201. #define HASH_SPLITPOINT_PHASE_BITS 2
  202. #define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
  203. #define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
  204. #define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
  205. /* defines max number of splitpoint phases a hash index can have */
  206. #define HASH_MAX_SPLITPOINT_GROUP 32
  207. #define HASH_MAX_SPLITPOINTS \
  208. (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
  209. HASH_SPLITPOINT_PHASES_PER_GRP) + \
  210. HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
  211. typedef struct HashMetaPageData
  212. {
  213. uint32 hashm_magic; /* magic no. for hash tables */
  214. uint32 hashm_version; /* version ID */
  215. double hashm_ntuples; /* number of tuples stored in the table */
  216. uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */
  217. uint16 hashm_bsize; /* index page size (bytes) */
  218. uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a power
  219. * of 2 */
  220. uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */
  221. uint32 hashm_maxbucket; /* ID of maximum bucket in use */
  222. uint32 hashm_highmask; /* mask to modulo into entire table */
  223. uint32 hashm_lowmask; /* mask to modulo into lower half of table */
  224. uint32 hashm_ovflpoint; /* splitpoint from which ovflpgs being
  225. * allocated */
  226. uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */
  227. uint32 hashm_nmaps; /* number of bitmap pages */
  228. RegProcedure hashm_procid; /* hash function id from pg_proc */
  229. uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each
  230. * splitpoint */
  231. BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */
  232. } HashMetaPageData;
  233. typedef HashMetaPageData *HashMetaPage;
  234. /*
  235. * Maximum size of a hash index item (it's okay to have only one per page)
  236. */
  237. #define HashMaxItemSize(page) \
  238. MAXALIGN_DOWN(PageGetPageSize(page) - \
  239. SizeOfPageHeaderData - \
  240. sizeof(ItemIdData) - \
  241. MAXALIGN(sizeof(HashPageOpaqueData)))
  242. #define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT
  243. #define HASH_MIN_FILLFACTOR 10
  244. #define HASH_DEFAULT_FILLFACTOR 75
  245. /*
  246. * Constants
  247. */
  248. #define BYTE_TO_BIT 3 /* 2^3 bits/byte */
  249. #define ALL_SET ((uint32) ~0)
  250. /*
  251. * Bitmap pages do not contain tuples. They do contain the standard
  252. * page headers and trailers; however, everything in between is a
  253. * giant bit array. The number of bits that fit on a page obviously
  254. * depends on the page size and the header/trailer overhead. We require
  255. * the number of bits per page to be a power of 2.
  256. */
  257. #define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize)
  258. #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT)
  259. #define BMPG_SHIFT(metap) ((metap)->hashm_bmshift)
  260. #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1)
  261. #define HashPageGetBitmap(page) \
  262. ((uint32 *) PageGetContents(page))
  263. #define HashGetMaxBitmapSize(page) \
  264. (PageGetPageSize((Page) page) - \
  265. (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
  266. #define HashPageGetMeta(page) \
  267. ((HashMetaPage) PageGetContents(page))
  268. /*
  269. * The number of bits in an ovflpage bitmap word.
  270. */
  271. #define BITS_PER_MAP 32 /* Number of bits in uint32 */
  272. /* Given the address of the beginning of a bit map, clear/set the nth bit */
  273. #define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
  274. #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
  275. #define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
  276. /*
  277. * page-level and high-level locking modes (see README)
  278. */
  279. #define HASH_READ BUFFER_LOCK_SHARE
  280. #define HASH_WRITE BUFFER_LOCK_EXCLUSIVE
  281. #define HASH_NOLOCK (-1)
  282. /*
  283. * When a new operator class is declared, we require that the user supply
  284. * us with an amproc function for hashing a key of the new type, returning
  285. * a 32-bit hash value. We call this the "standard" hash function. We
  286. * also allow an optional "extended" hash function which accepts a salt and
  287. * returns a 64-bit hash value. This is highly recommended but, for reasons
  288. * of backward compatibility, optional.
  289. *
  290. * When the salt is 0, the low 32 bits of the value returned by the extended
  291. * hash function should match the value that would have been returned by the
  292. * standard hash function.
  293. */
  294. #define HASHSTANDARD_PROC 1
  295. #define HASHEXTENDED_PROC 2
  296. #define HASHNProcs 2
  297. /* public routines */
  298. extern IndexBuildResult *hashbuild(Relation heap, Relation index,
  299. struct IndexInfo *indexInfo);
  300. extern void hashbuildempty(Relation index);
  301. extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
  302. ItemPointer ht_ctid, Relation heapRel,
  303. IndexUniqueCheck checkUnique,
  304. struct IndexInfo *indexInfo);
  305. extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
  306. extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
  307. extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
  308. extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
  309. ScanKey orderbys, int norderbys);
  310. extern void hashendscan(IndexScanDesc scan);
  311. extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
  312. IndexBulkDeleteResult *stats,
  313. IndexBulkDeleteCallback callback,
  314. void *callback_state);
  315. extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
  316. IndexBulkDeleteResult *stats);
  317. extern bytea *hashoptions(Datum reloptions, bool validate);
  318. extern bool hashvalidate(Oid opclassoid);
  319. /* private routines */
  320. /* hashinsert.c */
  321. extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
  322. extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
  323. Size itemsize, IndexTuple itup);
  324. extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
  325. OffsetNumber *itup_offsets, uint16 nitups);
  326. /* hashovfl.c */
  327. extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
  328. extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
  329. Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
  330. Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
  331. extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
  332. extern void _hash_squeezebucket(Relation rel,
  333. Bucket bucket, BlockNumber bucket_blkno,
  334. Buffer bucket_buf,
  335. BufferAccessStrategy bstrategy);
  336. extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);
  337. /* hashpage.c */
  338. extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
  339. int access, int flags);
  340. extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
  341. BlockNumber blkno, int flags);
  342. extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
  343. bool force_refresh);
  344. extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
  345. int access,
  346. HashMetaPage *cachedmetap);
  347. extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
  348. extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
  349. uint32 flag, bool initpage);
  350. extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
  351. ForkNumber forkNum);
  352. extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
  353. int access, int flags,
  354. BufferAccessStrategy bstrategy);
  355. extern void _hash_relbuf(Relation rel, Buffer buf);
  356. extern void _hash_dropbuf(Relation rel, Buffer buf);
  357. extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
  358. extern uint32 _hash_init(Relation rel, double num_tuples,
  359. ForkNumber forkNum);
  360. extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
  361. RegProcedure procid, uint16 ffactor, bool initpage);
  362. extern void _hash_pageinit(Page page, Size size);
  363. extern void _hash_expandtable(Relation rel, Buffer metabuf);
  364. extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
  365. Bucket obucket, uint32 maxbucket, uint32 highmask,
  366. uint32 lowmask);
  367. /* hashsearch.c */
  368. extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
  369. extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
  370. /* hashsort.c */
  371. typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
  372. extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
  373. extern void _h_spooldestroy(HSpool *hspool);
  374. extern void _h_spool(HSpool *hspool, ItemPointer self,
  375. Datum *values, bool *isnull);
  376. extern void _h_indexbuild(HSpool *hspool, Relation heapRel);
  377. /* hashutil.c */
  378. extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
  379. extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
  380. extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
  381. extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
  382. uint32 highmask, uint32 lowmask);
  383. extern uint32 _hash_log2(uint32 num);
  384. extern uint32 _hash_spareindex(uint32 num_bucket);
  385. extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
  386. extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
  387. extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
  388. extern bool _hash_convert_tuple(Relation index,
  389. Datum *user_values, bool *user_isnull,
  390. Datum *index_values, bool *index_isnull);
  391. extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
  392. extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
  393. extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
  394. extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
  395. extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
  396. uint32 lowmask, uint32 maxbucket);
  397. extern void _hash_kill_items(IndexScanDesc scan);
  398. /* hash.c */
  399. extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
  400. Buffer bucket_buf, BlockNumber bucket_blkno,
  401. BufferAccessStrategy bstrategy,
  402. uint32 maxbucket, uint32 highmask, uint32 lowmask,
  403. double *tuples_removed, double *num_index_tuples,
  404. bool split_cleanup,
  405. IndexBulkDeleteCallback callback, void *callback_state);
  406. #endif /* HASH_H */
上海开阖软件有限公司 沪ICP备12045867号-1