# Makefile for access/hash
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $
+# $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.15 2008/03/16 23:15:08 tgl Exp $
#
#-------------------------------------------------------------------------
include $(top_builddir)/src/Makefile.global
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
- hashsearch.o hashutil.o
+ hashsearch.o hashsort.o hashutil.o
include $(top_srcdir)/src/backend/common.mk
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.100 2008/03/16 23:15:08 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
#include "access/hash.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
+#include "optimizer/cost.h"
#include "optimizer/plancat.h"
/* Working state for hashbuild and its callback */
typedef struct
{
- double indtuples;
+ HSpool *spool; /* NULL if not using spooling */
+ double indtuples; /* # tuples accepted into index */
} HashBuildState;
static void hashbuildCallback(Relation index,
IndexBuildResult *result;
BlockNumber relpages;
double reltuples;
+ uint32 num_buckets;
HashBuildState buildstate;
/*
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
- /* estimate the number of rows currently present in the table */
+ /* Estimate the number of rows currently present in the table */
estimate_rel_size(heap, NULL, &relpages, &reltuples);
- /* initialize the hash index metadata page and initial buckets */
- _hash_metapinit(index, reltuples);
+ /* Initialize the hash index metadata page and initial buckets */
+ num_buckets = _hash_metapinit(index, reltuples);
- /* build the index */
+ /*
+ * If we just insert the tuples into the index in scan order, then
+ * (assuming their hash codes are pretty random) there will be no locality
+ * of access to the index, and if the index is bigger than available RAM
+ * then we'll thrash horribly. To prevent that scenario, we can sort the
+ * tuples by (expected) bucket number. However, such a sort is useless
+ * overhead when the index does fit in RAM. We choose to sort if the
+ * initial index size exceeds effective_cache_size.
+ *
+ * NOTE: this test will need adjustment if a bucket is ever different
+ * from one page.
+ */
+ if (num_buckets >= (uint32) effective_cache_size)
+ buildstate.spool = _h_spoolinit(index, num_buckets);
+ else
+ buildstate.spool = NULL;
+
+ /* prepare to build the index */
buildstate.indtuples = 0;
/* do the heap scan */
reltuples = IndexBuildHeapScan(heap, index, indexInfo,
hashbuildCallback, (void *) &buildstate);
+ if (buildstate.spool)
+ {
+ /* sort the tuples and insert them into the index */
+ _h_indexbuild(buildstate.spool);
+ _h_spooldestroy(buildstate.spool);
+ }
+
/*
* Return statistics
*/
return;
}
- _hash_doinsert(index, itup);
+ /* Either spool the tuple for sorting, or just put it into the index */
+ if (buildstate->spool)
+ _h_spool(itup, buildstate->spool);
+ else
+ _hash_doinsert(index, itup);
buildstate->indtuples += 1;
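
The threshold test in hashbuild() above compares the initial bucket count against effective_cache_size, both measured in pages (one bucket currently occupies one page). A rough worked example of that test, assuming the stock 8kB block size and the 8.3-era default effective_cache_size of 128MB; the numbers are illustrative and not taken from the patch:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* 128MB of cache at 8kB per page = 16384 pages */
        uint32_t    effective_cache_size = (128 * 1024) / 8;
        /* hypothetical initial bucket count from _hash_metapinit's estimate */
        uint32_t    num_buckets = 20000;

        /* same comparison as hashbuild(): one bucket == one page at present */
        if (num_buckets >= effective_cache_size)
            printf("index larger than cache: spool and sort before inserting\n");
        else
            printf("index fits in cache: insert tuples in heap-scan order\n");
        return 0;
    }
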
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.74 2008/03/16 23:15:08 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
* the initial buckets, and the initial bitmap page.
*
* The initial number of buckets is dependent on num_tuples, an estimate
- * of the number of tuples to be loaded into the index initially.
+ * of the number of tuples to be loaded into the index initially. The
+ * chosen number of buckets is returned.
*
* We are fairly cavalier about locking here, since we know that no one else
* could be accessing this index. In particular the rule about not holding
* multiple buffer locks is ignored.
*/
-void
+uint32
_hash_metapinit(Relation rel, double num_tuples)
{
HashMetaPage metap;
metap->hashm_ovflpoint = log2_num_buckets;
metap->hashm_firstfree = 0;
+ /*
+ * Release buffer lock on the metapage while we initialize buckets.
+ * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
+ * won't accomplish anything. It's a bad idea to hold buffer locks
+ * for long intervals in any case, since that can block the bgwriter.
+ */
+ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+
/*
* Initialize the first N buckets
*/
for (i = 0; i < num_buckets; i++)
{
+ /* Allow interrupts, in case N is huge */
+ CHECK_FOR_INTERRUPTS();
+
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
_hash_wrtbuf(rel, buf);
}
+ /* Now reacquire buffer lock on metapage */
+ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+
/*
* Initialize first bitmap page
*/
/* all done */
_hash_wrtbuf(rel, metabuf);
+
+ return num_buckets;
}
/*
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * hashsort.c
+ * Sort tuples for insertion into a new hash index.
+ *
+ * When building a very large hash index, we pre-sort the tuples by bucket
+ * number to improve locality of access to the index, and thereby avoid
+ * thrashing. We use tuplesort.c to sort the given index tuples into order.
+ *
+ * Note: if the number of rows in the table has been underestimated,
+ * bucket splits may occur during the index build. In that case we'd
+ * be inserting into two or more buckets for each possible masked-off
+ * hash code value. That's no big problem though, since we'll still have
+ * plenty of locality of access.
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashsort.c,v 1.1 2008/03/16 23:15:08 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "miscadmin.h"
+#include "utils/tuplesort.h"
+
+
+/*
+ * Status record for spooling/sorting phase.
+ */
+struct HSpool
+{
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation index;
+};
+
+
+/*
+ * create and initialize a spool structure
+ */
+HSpool *
+_h_spoolinit(Relation index, uint32 num_buckets)
+{
+ HSpool *hspool = (HSpool *) palloc0(sizeof(HSpool));
+ uint32 hash_mask;
+
+ hspool->index = index;
+
+ /*
+ * Determine the bitmask for hash code values. Since there are currently
+ * num_buckets buckets in the index, the appropriate mask can be computed
+ * as follows.
+ *
+ * Note: at present, the passed-in num_buckets is always a power of 2,
+ * so we could just compute num_buckets - 1. We prefer not to assume
+ * that here, though.
+ */
+ hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
+
+ /*
+ * We size the sort area as maintenance_work_mem rather than work_mem to
+ * speed index creation. This should be OK since a single backend can't
+ * run multiple index creations in parallel.
+ */
+ hspool->sortstate = tuplesort_begin_index_hash(index,
+ hash_mask,
+ maintenance_work_mem,
+ false);
+
+ return hspool;
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+void
+_h_spooldestroy(HSpool *hspool)
+{
+ tuplesort_end(hspool->sortstate);
+ pfree(hspool);
+}
+
+/*
+ * spool an index entry into the sort file.
+ */
+void
+_h_spool(IndexTuple itup, HSpool *hspool)
+{
+ tuplesort_putindextuple(hspool->sortstate, itup);
+}
+
+/*
+ * given a spool loaded by successive calls to _h_spool,
+ * create an entire index.
+ */
+void
+_h_indexbuild(HSpool *hspool)
+{
+ IndexTuple itup;
+ bool should_free;
+
+ tuplesort_performsort(hspool->sortstate);
+
+ while ((itup = tuplesort_getindextuple(hspool->sortstate,
+ true, &should_free)) != NULL)
+ {
+ _hash_doinsert(hspool->index, itup);
+ if (should_free)
+ pfree(itup);
+ }
+}
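
The mask computed in _h_spoolinit() is the smallest all-ones value that covers num_buckets, i.e. (1 << _hash_log2(num_buckets)) - 1. A small standalone sketch of that arithmetic; ceil_log2 here is a stand-in for the backend's _hash_log2 and is shown only for illustration:

    #include <stdio.h>
    #include <stdint.h>

    /* stand-in for _hash_log2(): smallest i such that (1 << i) >= num */
    static uint32_t
    ceil_log2(uint32_t num)
    {
        uint32_t    i = 0;
        uint32_t    limit = 1;

        while (limit < num)
        {
            limit <<= 1;
            i++;
        }
        return i;
    }

    int
    main(void)
    {
        uint32_t    num_buckets = 1000;     /* hypothetical, not a power of 2 */
        uint32_t    hash_mask = (((uint32_t) 1) << ceil_log2(num_buckets)) - 1;

        /* prints 0x3ff (1023): hash codes are sorted on their low 10 bits */
        printf("hash_mask = 0x%x\n", hash_mask);
        return 0;
    }
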
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.114 2008/01/01 19:45:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.115 2008/03/16 23:15:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* work_mem.
*/
btKbytes = isdead ? work_mem : maintenance_work_mem;
- btspool->sortstate = tuplesort_begin_index(index, isunique,
- btKbytes, false);
+ btspool->sortstate = tuplesort_begin_index_btree(index, isunique,
+ btKbytes, false);
return btspool;
}
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.81 2008/01/01 19:45:55 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.82 2008/03/16 23:15:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include <limits.h>
+#include "access/hash.h"
#include "access/heapam.h"
#include "access/nbtree.h"
#include "catalog/pg_amop.h"
/*
* These variables are specific to the IndexTuple case; they are set by
- * tuplesort_begin_index and used only by the IndexTuple routines.
+ * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
*/
- Relation indexRel;
+ Relation indexRel; /* index being built */
+
+ /* These are specific to the index_btree subcase: */
ScanKey indexScanKey;
bool enforceUnique; /* complain if we find duplicate tuples */
+ /* These are specific to the index_hash subcase: */
+ uint32 hash_mask; /* mask for sortable part of hash code */
+
/*
* These variables are specific to the Datum case; they are set by
* tuplesort_begin_datum and used only by the DatumTuple routines.
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
static void reversedirection_heap(Tuplesortstate *state);
-static int comparetup_index(const SortTuple *a, const SortTuple *b,
+static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state);
+static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
static void writetup_index(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_index(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
-static void reversedirection_index(Tuplesortstate *state);
+static void reversedirection_index_btree(Tuplesortstate *state);
+static void reversedirection_index_hash(Tuplesortstate *state);
static int comparetup_datum(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
}
Tuplesortstate *
-tuplesort_begin_index(Relation indexRel,
- bool enforceUnique,
- int workMem, bool randomAccess)
+tuplesort_begin_index_btree(Relation indexRel,
+ bool enforceUnique,
+ int workMem, bool randomAccess)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
MemoryContext oldcontext;
state->nKeys = RelationGetNumberOfAttributes(indexRel);
- state->comparetup = comparetup_index;
+ state->comparetup = comparetup_index_btree;
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->reversedirection = reversedirection_index;
+ state->reversedirection = reversedirection_index_btree;
state->indexRel = indexRel;
- /* see comments below about btree dependence of this code... */
state->indexScanKey = _bt_mkscankey_nodata(indexRel);
state->enforceUnique = enforceUnique;
return state;
}
+Tuplesortstate *
+tuplesort_begin_index_hash(Relation indexRel,
+ uint32 hash_mask,
+ int workMem, bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(state->sortcontext);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c",
+ hash_mask,
+ workMem, randomAccess ? 't' : 'f');
+#endif
+
+ state->nKeys = 1; /* Only one sort column, the hash code */
+
+ state->comparetup = comparetup_index_hash;
+ state->copytup = copytup_index;
+ state->writetup = writetup_index;
+ state->readtup = readtup_index;
+ state->reversedirection = reversedirection_index_hash;
+
+ state->indexRel = indexRel;
+ state->hash_mask = hash_mask;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return state;
+}
+
Tuplesortstate *
tuplesort_begin_datum(Oid datumType,
Oid sortOperator, bool nullsFirstFlag,
/*
* Routines specialized for IndexTuple case
*
- * NOTE: actually, these are specialized for the btree case; it's not
- * clear whether you could use them for a non-btree index. Possibly
- * you'd need to make another set of routines if you needed to sort
- * according to another kind of index.
+ * The btree and hash cases require separate comparison functions, but the
+ * IndexTuple representation is the same so the copy/write/read support
+ * functions can be shared.
*/
static int
-comparetup_index(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+comparetup_index_btree(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state)
{
/*
* This is similar to _bt_tuplecompare(), but we have already done the
return 0;
}
+static int
+comparetup_index_hash(const SortTuple *a, const SortTuple *b,
+ Tuplesortstate *state)
+{
+ /*
+ * It's slightly annoying to redo the hash function each time, although
+ * most hash functions ought to be cheap. Is it worth having a variant
+ * tuple storage format so we can store the hash code?
+ */
+ uint32 hash1;
+ uint32 hash2;
+ IndexTuple tuple1;
+ IndexTuple tuple2;
+
+ /* Allow interrupting long sorts */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Compute hash codes and mask off bits we don't want to sort by */
+ Assert(!a->isnull1);
+ Assert(!b->isnull1);
+
+ hash1 = _hash_datum2hashkey(state->indexRel, a->datum1) & state->hash_mask;
+ hash2 = _hash_datum2hashkey(state->indexRel, b->datum1) & state->hash_mask;
+
+ if (hash1 > hash2)
+ return 1;
+ else if (hash1 < hash2)
+ return -1;
+
+ /*
+ * If hash values are equal, we sort on ItemPointer. This does not affect
+ * validity of the finished index, but it offers cheap insurance against
+ * performance problems with bad qsort implementations that have trouble
+ * with large numbers of equal keys.
+ */
+ tuple1 = (IndexTuple) a->tuple;
+ tuple2 = (IndexTuple) b->tuple;
+
+ {
+ BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
+ BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
+
+ if (blk1 != blk2)
+ return (blk1 < blk2) ? -1 : 1;
+ }
+ {
+ OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
+ OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
+
+ if (pos1 != pos2)
+ return (pos1 < pos2) ? -1 : 1;
+ }
+
+ return 0;
+}
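
To see why the ItemPointer tiebreak matters, note that only the masked-off low bits of the hash code participate in the comparison, so many distinct hash codes compare as equal. A short illustration with made-up hash codes and a 10-bit mask:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint32_t    hash_mask = 0x3ff;      /* 10-bit mask, as for ~1000 buckets */
        uint32_t    hash_a = 0x12345678;    /* made-up hash codes */
        uint32_t    hash_b = 0xabcd0678;

        /* both mask to 0x278, so they sort into the same initial bucket and
         * comparetup_index_hash() falls through to the block/offset comparison */
        printf("a & mask = 0x%x, b & mask = 0x%x, equal = %d\n",
               hash_a & hash_mask, hash_b & hash_mask,
               (hash_a & hash_mask) == (hash_b & hash_mask));
        return 0;
    }
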
+
static void
copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
{
}
static void
-reversedirection_index(Tuplesortstate *state)
+reversedirection_index_btree(Tuplesortstate *state)
{
ScanKey scanKey = state->indexScanKey;
int nkey;
}
}
+static void
+reversedirection_index_hash(Tuplesortstate *state)
+{
+ /* We don't support reversing direction in a hash index sort */
+ elog(ERROR, "reversedirection_index_hash is not implemented");
+}
+
/*
* Routines specialized for DatumTuple case
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.85 2008/03/15 20:46:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.86 2008/03/16 23:15:08 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access);
-extern void _hash_metapinit(Relation rel, double num_tuples);
+extern uint32 _hash_metapinit(Relation rel, double num_tuples);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+/* hashsort.c */
+typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
+
+extern HSpool *_h_spoolinit(Relation index, uint32 num_buckets);
+extern void _h_spooldestroy(HSpool *hspool);
+extern void _h_spool(IndexTuple itup, HSpool *hspool);
+extern void _h_indexbuild(HSpool *hspool);
+
/* hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.28 2008/01/01 19:45:59 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.29 2008/03/16 23:15:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* rather than forming actual HeapTuples (which'd have to be converted to
* MinimalTuples).
*
- * Yet a third slightly different interface supports sorting bare Datums.
+ * The IndexTuple case is itself broken into two subcases, one for btree
+ * indexes and one for hash indexes; the latter variant actually sorts
+ * the tuples by hash code. The API is the same except for the "begin"
+ * routine.
+ *
+ * Yet another slightly different interface supports sorting bare Datums.
*/
extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
Oid *sortOperators, bool *nullsFirstFlags,
int workMem, bool randomAccess);
-extern Tuplesortstate *tuplesort_begin_index(Relation indexRel,
- bool enforceUnique,
- int workMem, bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_index_btree(Relation indexRel,
+ bool enforceUnique,
+ int workMem, bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_index_hash(Relation indexRel,
+ uint32 hash_mask,
+ int workMem, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
Oid sortOperator, bool nullsFirstFlag,
int workMem, bool randomAccess);
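
As the header comment notes, the hash subcase shares the put/perform/get interface with the btree subcase; only the begin routine differs. A minimal caller sketch of that lifecycle, restating what _h_spoolinit, _h_spool and _h_indexbuild above already do; it only makes sense in backend context, and the index relation, hash_mask and tuple source are assumed to be set up elsewhere:

    #include "postgres.h"

    #include "access/hash.h"
    #include "miscadmin.h"
    #include "utils/tuplesort.h"

    /* Sketch of the hash-index sort lifecycle, mirroring hashsort.c */
    static void
    sort_and_insert(Relation index, uint32 hash_mask)
    {
        Tuplesortstate *sortstate;
        IndexTuple      itup;
        bool            should_free;

        sortstate = tuplesort_begin_index_hash(index, hash_mask,
                                               maintenance_work_mem, false);

        /* feed tuples here, normally from the heap-scan callback:
         *     tuplesort_putindextuple(sortstate, itup);
         */

        tuplesort_performsort(sortstate);

        while ((itup = tuplesort_getindextuple(sortstate,
                                               true, &should_free)) != NULL)
        {
            _hash_doinsert(index, itup);
            if (should_free)
                pfree(itup);
        }
        tuplesort_end(sortstate);
    }
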