Write some real documentation about the index access method API.

author Tom Lane

Sun, 13 Feb 2005 03:04:15 +0000 (03:04 +0000)

committer Tom Lane

Sun, 13 Feb 2005 03:04:15 +0000 (03:04 +0000)
author Tom Lane
Sun, 13 Feb 2005 03:04:15 +0000 (03:04 +0000)
committer Tom Lane
Sun, 13 Feb 2005 03:04:15 +0000 (03:04 +0000)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index b74f6ea9f1fb5f16b133fac373264ccf34276386..7cfca6f1182c206820d3cd182bd1fd49cdd96ffb 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1,6 +1,6 @@
  
  
  
@@ -289,9 +289,10 @@
    
  
    
-   The catalog pg_am stores information about index access
-   methods.  There is one row for each index access method supported by
-   the system.
+   The catalog pg_am stores information about index
+   access methods.  There is one row for each index access method supported by
+   the system.  The contents of this catalog are discussed in detail in
+   <xref linkend="indexam">.
    
  
    @@ -453,20 +454,6 @@
    
   
@@ -453,20 +454,6 @@
     
    
  
-   
-    An index access method that supports multiple columns (has
-    amcanmulticol true) must
-    support indexing null values in columns after the first, because the planner
-    will assume the index can be used for queries on just the first
-    column(s).  For example, consider an index on (a,b) and a query with
-    WHERE a = 4.  The system will assume the index can be used to scan for
-    rows with a = 4, which is wrong if the index omits rows where b is null.
-    It is, however, OK to omit rows where the first indexed column is null.
-    (GiST currently does so.)
-    amindexnulls should be set true only if the
-    index access method indexes all rows, including arbitrary combinations of null values.
-   
-
   
  
  
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml

index 21e8db881b247d468fb359026f591772a099b882..0198ca4af5f1e37710ea26a271a02fc0d77b2e62 100644 (file)
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -1,4 +1,4 @@
-
+
  
  
  
@@ -77,7 +77,7 @@
  
  
  
-<!entity indexcost  SYSTEM "indexcost.sgml">
+<!entity indexam    SYSTEM "indexam.sgml">
  
  
  
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml

new file mode 100644 (file)

index 0000000..bda539c
--- /dev/null
+++ b/doc/src/sgml/indexam.sgml
@@ -0,0 +1,837 @@
+
+
+
+ Index Access Method Interface Definition
+
+  
+   This chapter defines the interface between the core
+   PostgreSQL system and index access
+   methods, which manage individual index types.  The core system
+   knows nothing about indexes beyond what is specified here, so it is
+   possible to develop entirely new index types by writing add-on code.
+  
+
+  
+   All indexes in PostgreSQL are what are known
+   technically as secondary indexes; that is, the index is
+   physically separate from the table file that it describes.  Each index
+   is stored as its own physical relation and so is described
+   by an entry in the pg_class catalog.  The contents of an
+   index are entirely under the control of its index access method.  In
+   practice, all index access methods divide indexes into standard-size
+   pages so that they can use the regular storage manager and buffer manager
+   to access the index contents.  (All the existing index access methods
+   furthermore use the standard page layout described in <xref
+   linkend="storage-page-layout">, and they all use the same format for index
+   tuple headers; but these decisions are not forced on an access method.)
+  
+
+  
+   An index is effectively a mapping from some data key values to
+   tuple identifiers, or TIDs, of row versions
+   (tuples) in the index's parent table.  A TID consists of a
+   block number and an item number within that block (see <xref
+   linkend="storage-page-layout">).  This is sufficient
+   information to fetch a particular row version from the table.
+   Indexes are not directly aware that under MVCC, there may be multiple
+   extant versions of the same logical row; to an index, each tuple is
+   an independent object that needs its own index entry.  Thus, an
+   update of a row always creates all-new index entries for the row, even if
+   the key values did not change.  Index entries for dead tuples are
+   reclaimed (by vacuuming) when the dead tuples themselves are reclaimed.
+  
+
+ 
+  Catalog Entries for Indexes
+
+  
+   Each index access method is described by a row in the
+   pg_am system catalog (see
+   <xref linkend="catalog-pg-am">).  The principal contents of a
+   pg_am row are references to
+   pg_proc
+   entries that identify the index access
+   functions supplied by the access method.  The APIs for these functions
+   are defined later in this chapter.  In addition, the
+   pg_am row specifies a few fixed properties of
+   the access method, such as whether it can support multi-column indexes.
+   There is not currently any special support
+   for creating or deleting pg_am entries;
+   anyone able to write a new access method is expected to be competent
+   to insert an appropriate row for themselves.
+  
+
+  
+   To be useful, an index access method must also have one or more
+   operator classes defined in
+   pg_opclass,
+   pg_amop, and
+   pg_amproc.
+   These entries allow the planner
+   to determine what kinds of query qualifications can be used with
+   indexes of this access method.  Operator classes are described
+   in <xref linkend="xindex">, which is prerequisite material for reading
+   this chapter.
+  
+
+  
+   An individual index is defined by a 
+   pg_class
+   entry that describes it as a physical relation, plus a
+   pg_index
+   entry that shows the logical content of the index — that is, the set
+   of index columns it has and the semantics of those columns, as captured by
+   the associated operator classes.  The index columns (key values) can be
+   either simple columns of the underlying table or expressions over the table
+   rows.  The index access method normally has no interest in where the index
+   key values come from (it is always handed precomputed key values) but it
+   will be very interested in the operator class information in
+   pg_index.  Both of these catalog entries can be
+   accessed as part of the Relation data structure that is
+   passed to all operations on the index.
+  
+
+  
+   Some of the flag columns of pg_am have nonobvious
+   implications.  The requirements of amcanunique
+   are discussed in <xref linkend="index-unique-checks">, and those of
+   amconcurrent in <xref linkend="index-locking">.
+   The amcanmulticol flag asserts that the
+   access method supports multi-column indexes, while
+   amindexnulls asserts that index entries are
+   created for NULL key values.  Since most indexable operators are
+   strict and hence cannot return TRUE for NULL inputs,
+   it is at first sight attractive to not store index entries for NULLs:
+   they could never be returned by an index scan anyway.  However, this
+   argument fails for a full-table index scan (one with no scan keys);
+   such a scan should include null rows.  In practice this means that
+   indexes that support ordered scans (have amorderstrategy
+   nonzero) must index nulls, since the planner might decide to use such a
+   scan as a substitute for sorting.  Another restriction is that an index
+   access method that supports multiple index columns must
+   support indexing null values in columns after the first, because the planner
+   will assume the index can be used for queries on just the first
+   column(s).  For example, consider an index on (a,b) and a query with
+   WHERE a = 4.  The system will assume the index can be
+   used to scan for rows with a = 4, which is wrong if the
+   index omits rows where b is null.
+   It is, however, OK to omit rows where the first indexed column is null.
+   (GiST currently does so.)  Thus,
+   amindexnulls should be set true only if the
+   index access method indexes all rows, including arbitrary combinations of
+   null values.
+  
+
+ 
+
+ 
+  Index Access Method Functions
+
+  
+   The index construction and maintenance functions that an index access
+   method must provide are:
+  
+
+  
+
+void
+ambuild (Relation heapRelation,
+         Relation indexRelation,
+         IndexInfo *indexInfo);
+
+   Build a new index.  The index relation has been physically created,
+   but is empty.  It must be filled in with whatever fixed data the
+   access method requires, plus entries for all tuples already existing
+   in the table.  Ordinarily the ambuild function will call
+   IndexBuildHeapScan() to scan the table for existing tuples
+   and compute the keys that need to be inserted into the index.
+  
+
+  
+
+InsertIndexResult
+aminsert (Relation indexRelation,
+          Datum *datums,
+          char *nulls,
+          ItemPointer heap_tid,
+          Relation heapRelation,
+          bool check_uniqueness);
+
+   Insert a new tuple into an existing index.  The datums and
+   nulls arrays give the key values to be indexed, and
+   heap_tid is the TID to be indexed.
+   If the access method supports unique indexes (its
+   pg_am.amcanunique flag is true) then
+   check_uniqueness may be true, in which case the access method
+   must verify that there is no conflicting row; this is the only situation in
+   which the access method normally needs the heapRelation
+   parameter.  See <xref linkend="index-unique-checks"> for details.
+   The result is a struct that must be pfree'd by the caller.  (The result
+   struct is really quite useless and should be removed...)
+  
+
+  
+
+IndexBulkDeleteResult *
+ambulkdelete (Relation indexRelation,
+              IndexBulkDeleteCallback callback,
+              void *callback_state);
+
+   Delete tuple(s) from the index.  This is a bulk delete operation
+   that is intended to be implemented by scanning the whole index and checking
+   each entry to see if it should be deleted.
+   The passed-in callback function may be called, in the style
+   callback(TID, callback_state) returns bool,
+   to determine whether any particular index entry, as identified by its
+   referenced TID, is to be deleted.  Must return either NULL or a palloc'd
+   struct containing statistics about the effects of the deletion operation.
+  
+
+  
+
+IndexBulkDeleteResult *
+amvacuumcleanup (Relation indexRelation,
+                 IndexVacuumCleanupInfo *info,
+                 IndexBulkDeleteResult *stats);
+
+   Clean up after a VACUUM operation (one or more
+   ambulkdelete calls).  An index access method does not have
+   to provide this function (if it does not, the entry in pg_am must
+   be zero).  If it is provided, it is typically used for bulk cleanup
+   such as reclaiming empty index pages.  info
+   provides some additional arguments such as a message level for statistical
+   reports, and stats is whatever the last
+   ambulkdelete call returned.  amvacuumcleanup
+   may replace or modify this struct before returning it.  If the result
+   is not NULL it must be a palloc'd struct.  The statistics it contains
+   will be reported by VACUUM if VERBOSE is given.
+  
+
+  
+   The purpose of an index, of course, is to support scans for tuples matching
+   an indexable WHERE condition, often called a
+   qualifier or scan key.  The semantics of
+   index scanning are described more fully in <xref linkend="index-scanning">,
+   below.  The scan-related functions that an index access method must provide
+   are:
+  
+
+  
+
+IndexScanDesc
+ambeginscan (Relation indexRelation,
+             int nkeys,
+             ScanKey key);
+
+   Begin a new scan.  The key array (of length nkeys)
+   describes the scan key(s) for the index scan.  The result must be a
+   palloc'd struct. For implementation reasons the index access method
+   must create this struct by calling
+   RelationGetIndexScan().  In most cases
+   ambeginscan itself does little beyond making that call;
+   the interesting parts of indexscan startup are in amrescan.
+  
+
+  
+
+boolean
+amgettuple (IndexScanDesc scan,
+            ScanDirection direction);
+
+   Fetch the next tuple in the given scan, moving in the given
+   direction (forward or backward in the index).  Returns TRUE if a tuple was
+   obtained, FALSE if no matching tuples remain.  In the TRUE case the tuple
+   TID is stored into the scan structure.  Note that
+   success means only that the index contains an entry that matches
+   the scan keys, not that the tuple necessarily still exists in the heap or
+   will pass the caller's snapshot test.
+  
+
+  
+
+void
+amrescan (IndexScanDesc scan,
+          ScanKey key);
+
+   Restart the given scan, possibly with new scan keys (to continue using
+   the old keys, NULL is passed for key).  Note that it is not
+   possible for the number of keys to be changed.  In practice the restart
+   feature is used when a new outer tuple is selected by a nestloop join
+   and so a new key comparison value is needed, but the scan key structure
+   remains the same.  This function is also called by
+   RelationGetIndexScan(), so it is used for initial setup
+   of an indexscan as well as rescanning.
+  
+
+  
+
+void
+amendscan (IndexScanDesc scan);
+
+   End a scan and release resources.  The scan struct itself
+   should not be freed, but any locks or pins taken internally by the
+   access method must be released.
+  
+
+  
+
+void
+ammarkpos (IndexScanDesc scan);
+
+   Mark current scan position.  The access method need only support one
+   remembered scan position per scan.
+  
+
+  
+
+void
+amrestrpos (IndexScanDesc scan);
+
+   Restore the scan to the most recently marked position.
+  
+
+  
+
+void
+amcostestimate (Query *root,
+                RelOptInfo *rel,
+                IndexOptInfo *index,
+                List *indexQuals,
+                Cost *indexStartupCost,
+                Cost *indexTotalCost,
+                Selectivity *indexSelectivity,
+                double *indexCorrelation);
+
+   Estimate the costs of an index scan.  This function is described fully
+   in <xref linkend="index-cost-estimation">, below.
+  
+
+  
+   By convention, the pg_proc entry for any index
+   access method function should show the correct number of arguments,
+   but declare them all as type internal (since most of the arguments
+   have types that are not known to SQL, and we don't want users calling
+   the functions directly anyway).  The return type is declared as
+   void, internal, or boolean as appropriate.
+  
+
+ 
+
+ 
+  Index Scanning
+
+  
+   In an index scan, the index access method is responsible for regurgitating
+   the TIDs of all the tuples it has been told about that match the
+   scan keys.  The access method is not involved in
+   actually fetching those tuples from the index's parent table, nor in
+   determining whether they pass the scan's time qualification test or other
+   conditions.
+  
+
+  
+   A scan key is the internal representation of a WHERE clause of
+   the form index_key operator
+   constant, where the index key is one of the columns of the
+   index and the operator is one of the members of the operator class
+   associated with that index column.  An index scan has zero or more scan
+   keys, which are implicitly ANDed — the returned tuples are expected
+   to satisfy all the indicated conditions.
+  
+
+  
+   The operator class may indicate that the index is lossy for a
+   particular operator; this implies that the index scan will return all the
+   entries that pass the scan key, plus possibly additional entries that do
+   not.  The core system's indexscan machinery will then apply that operator
+   again to the heap tuple to verify whether or not it really should be
+   selected.  For non-lossy operators, the index scan must return exactly the
+   set of matching entries, as there is no recheck.
+  
+
+  
+   Note that it is entirely up to the access method to ensure that it
+   correctly finds all and only the entries passing all the given scan keys.
+   Also, the core system will simply hand off all the WHERE
+   clauses that match the index keys and operator classes, without any
+   semantic analysis to determine whether they are redundant or
+   contradictory.  As an example, given
+   WHERE x > 4 AND x > 14 where x is a b-tree
+   indexed column, it is left to the b-tree amrescan function
+   to realize that the first scan key is redundant and can be discarded.
+   The extent of preprocessing needed during amrescan will
+   depend on the extent to which the index access method needs to reduce
+   the scan keys to a normalized form.
+  
+
+  
+   The amgettuple function has a direction argument,
+   which can be either ForwardScanDirection (the normal case)
+   or BackwardScanDirection.  If the first call after
+   amrescan specifies BackwardScanDirection, then the
+   set of matching index entries is to be scanned back-to-front rather than in
+   the normal front-to-back direction, so amgettuple must return
+   the last matching tuple in the index, rather than the first one as it
+   normally would.  (This will only occur for access
+   methods that advertise they support ordered scans by setting
+   pg_am.amorderstrategy nonzero.)  After the
+   first call, amgettuple must be prepared to advance the scan in
+   either direction from the most recently returned entry.
+  
+
+  
+   The access method must support marking a position in a scan
+   and later returning to the marked position.  The same position may be
+   restored multiple times.  However, only one position need be remembered
+   per scan; a new ammarkpos call overrides the previously
+   marked position.
+  
+
+  
+   Both the scan position and the mark position (if any) must be maintained
+   consistently in the face of concurrent insertions or deletions in the
+   index.  It is OK if a freshly-inserted entry is not returned by a scan that
+   would have found the entry if it had existed when the scan started, or if
+   the scan returns such an entry upon rescanning or backing
+   up even though it had not been returned the first time through.  Similarly,
+   a concurrent delete may or may not be reflected in the results of a scan.
+   What is important is that insertions or deletions not cause the scan to
+   miss or multiply return entries that were not themselves being inserted or
+   deleted.  (For an index type that does not set
+   pg_am.amconcurrent, it is sufficient to
+   handle these cases for insertions or deletions performed by the same
+   backend that's doing the scan.  But when amconcurrent is
+   true, insertions or deletions from other backends must be handled as well.)
+  
+
+ 
+
+ 
+  Index Locking Considerations
+
+  
+   An index access method can choose whether it supports concurrent updates
+   of the index by multiple processes.  If the method's
+   pg_am.amconcurrent flag is true, then
+   the core PostgreSQL system obtains
+   AccessShareLock on the index during an index scan, and
+   RowExclusiveLock when updating the index.  Since these lock
+   types do not conflict, the access method is responsible for handling any
+   fine-grained locking it may need.  An exclusive lock on the index as a whole
+   will be taken only during index creation, destruction, or
+   REINDEX.  When amconcurrent is false,
+   PostgreSQL still obtains
+   AccessShareLock during index scans, but it obtains
+   AccessExclusiveLock during any update.  This ensures that
+   updaters have sole use of the index.  Note that this implicitly assumes
+   that index scans are read-only; an access method that might modify the
+   index during a scan will still have to do its own locking to handle the
+   case of concurrent scans.
+  
+
+  
+   Recall that a backend's own locks never conflict; therefore, even a
+   non-concurrent index type must be prepared to handle the case where
+   a backend is inserting or deleting entries in an index that it is itself
+   scanning.  (This is of course necessary to support an UPDATE
+   that uses the index to find the rows to be updated.)
+  
+
+  
+   Building an index type that supports concurrent updates usually requires
+   extensive and subtle analysis of the required behavior.  For the b-tree
+   and hash index types, you can read about the design decisions involved in
+   src/backend/access/nbtree/README and
+   src/backend/access/hash/README.
+  
+
+  
+   Aside from the index's own internal consistency requirements, concurrent
+   updates create issues about consistency between the parent table (the
+   heap) and the index.  Because
+   PostgreSQL separates accesses 
+   and updates of the heap from those of the index, there are windows in
+   which the index may be inconsistent with the heap.  We handle this problem
+   with the following rules:
+
+    
+     
+      
+       A new heap entry is made before making its index entries.  (Therefore
+       a concurrent index scan is likely to fail to see the heap entry.
+       This is okay because the index reader would be uninterested in an
+       uncommitted row anyway.  But see <xref linkend="index-unique-checks">.)
+      
+     
+     
+      
+       When a heap entry is to be deleted (by VACUUM), all its
+       index entries must be removed first.
+      
+     
+     
+      
+       For concurrent index types, an indexscan must maintain a pin
+       on the index page holding the item last returned by
+       amgettuple, and ambulkdelete cannot delete
+       entries from pages that are pinned by other backends.  The need
+       for this rule is explained below.
+      
+     
+    
+
+   If an index is concurrent then it is possible for an index reader to
+   see an index entry just before it is removed by VACUUM, and
+   then to arrive at the corresponding heap entry after that was removed by
+   VACUUM.  (With a nonconcurrent index, this is not possible
+   because of the conflicting index-level locks that will be taken out.)
+   This creates no serious problems if that item
+   number is still unused when the reader reaches it, since an empty
+   item slot will be ignored by heap_fetch().  But what if a
+   third backend has already re-used the item slot for something else?
+   When using an MVCC-compliant snapshot, there is no problem because
+   the new occupant of the slot is certain to be too new to pass the
+   snapshot test.  However, with a non-MVCC-compliant snapshot (such as
+   SnapshotNow), it would be possible to accept and return
+   a row that does not in fact match the scan keys.  We could defend
+   against this scenario by requiring the scan keys to be rechecked
+   against the heap row in all cases, but that is too expensive.  Instead,
+   we use a pin on an index page as a proxy to indicate that the reader
+   may still be in flight from the index entry to the matching
+   heap entry.  Making ambulkdelete block on such a pin ensures
+   that VACUUM cannot delete the heap entry before the reader
+   is done with it.  This solution costs little in runtime, and adds blocking
+   overhead only in the rare cases where there actually is a conflict.
+  
+
+  
+   This solution requires that index scans be synchronous: we have
+   to fetch each heap tuple immediately after scanning the corresponding index
+   entry.  This is expensive for a number of reasons.  An
+   asynchronous scan in which we collect many TIDs from the index,
+   and only visit the heap tuples sometime later, requires much less index
+   locking overhead and may allow a more efficient heap access pattern.
+   Per the above analysis, we must use the synchronous approach for
+   non-MVCC-compliant snapshots, but an asynchronous scan would be safe
+   for a query using an MVCC snapshot.  This possibility is not exploited
+   as of PostgreSQL 8.0, but it is likely to be
+   investigated soon.
+  
+
+ 
+
+ 
+  Index Uniqueness Checks
+
+  
+   PostgreSQL enforces SQL uniqueness constraints
+   using unique indexes, which are indexes that disallow
+   multiple entries with identical keys.  An access method that supports this
+   feature sets pg_am.amcanunique true.
+   (At present, only b-tree supports it.)
+  
+
+  
+   Because of MVCC, it is always necessary to allow duplicate entries to
+   exist physically in an index: the entries might refer to successive
+   versions of a single logical row.  The behavior we actually want to
+   enforce is that no MVCC snapshot could include two rows with equal
+   index keys.  This breaks down into the following cases that must be
+   checked when inserting a new row into a unique index:
+
+    
+     
+      
+       If a conflicting valid row has been deleted by the current transaction,
+       it's okay.  (In particular, since an UPDATE always deletes the old row
+       version before inserting the new version, this will allow an UPDATE on
+       a row without changing the key.)
+      
+     
+     
+      
+       If a conflicting row has been inserted by an as-yet-uncommitted
+       transaction, the would-be inserter must wait to see if that transaction
+       commits.  If it rolls back then there is no conflict.  If it commits
+       without deleting the conflicting row again, there is a uniqueness
+       violation.  (In practice we just wait for the other transaction to
+       end and then redo the visibility check in toto.)
+      
+     
+     
+      
+       Similarly, if a conflicting valid row has been deleted by an
+       as-yet-uncommitted transaction, the would-be inserter must wait
+       for that transaction to commit or abort, and then repeat the test.
+      
+     
+    
+  
+
+  
+   We require the index access method to apply these tests itself, which
+   means that it must reach into the heap to check the commit status of
+   any row that is shown to have a duplicate key according to the index
+   contents.  This is without a doubt ugly and non-modular, but it saves
+   redundant work: if we did a separate probe then the index lookup for
+   a conflicting row would be essentially repeated while finding the place to
+   insert the new row's index entry.  What's more, there is no obvious way
+   to avoid race conditions unless the conflict check is an integral part
+   of insertion of the new index entry.
+  
+
+  
+   The main limitation of this scheme is that it has no convenient way
+   to support deferred uniqueness checks.
+  
+
+ 
+
+ 
+  Index Cost Estimation Functions
+
+  
+   The amcostestimate function is given a list of WHERE clauses that have
+   been determined to be usable with the index.  It must return estimates
+   of the cost of accessing the index and the selectivity of the WHERE
+   clauses (that is, the fraction of parent-table rows that will be
+   retrieved during the index scan).  For simple cases, nearly all the
+   work of the cost estimator can be done by calling standard routines
+   in the optimizer; the point of having an amcostestimate function is
+   to allow index access methods to provide index-type-specific knowledge,
+   in case it is possible to improve on the standard estimates.
+  
+
+  
+   Each amcostestimate function must have the signature:
+
+
+void
+amcostestimate (Query *root,
+                RelOptInfo *rel,
+                IndexOptInfo *index,
+                List *indexQuals,
+                Cost *indexStartupCost,
+                Cost *indexTotalCost,
+                Selectivity *indexSelectivity,
+                double *indexCorrelation);
+
+
+   The first four parameters are inputs:
+
+   
+    
+     root
+     
+      
+       The query being processed.
+      
+     
+    
+
+    
+     rel
+     
+      
+       The relation the index is on.
+      
+     
+    
+
+    
+     index
+     
+      
+       The index itself.
+      
+     
+    
+
+    
+     indexQuals
+     
+      
+       List of index qual clauses (implicitly ANDed);
+       a NIL list indicates no qualifiers are available.
+       Note that the list contains expression trees, not ScanKeys.
+      
+     
+    
+   
+  
+
+  
+   The last four parameters are pass-by-reference outputs:
+
+   
+    
+     *indexStartupCost
+     
+      
+       Set to cost of index start-up processing
+      
+     
+    
+
+    
+     *indexTotalCost
+     
+      
+       Set to total cost of index processing
+      
+     
+    
+
+    
+     *indexSelectivity
+     
+      
+       Set to index selectivity
+      
+     
+    
+
+    
+     *indexCorrelation
+     
+      
+       Set to correlation coefficient between index scan order and
+       underlying table's order
+      
+     
+    
+   
+  
+
+  
+   Note that cost estimate functions must be written in C, not in SQL or
+   any available procedural language, because they must access internal
+   data structures of the planner/optimizer.
+  
+
+  
+   The index access costs should be computed in the units used by
+   src/backend/optimizer/path/costsize.c: a sequential disk block fetch
+   has cost 1.0, a nonsequential fetch has cost random_page_cost, and
+   the cost of processing one index row should usually be taken as
+   cpu_index_tuple_cost (which is a user-adjustable optimizer parameter).
+   In addition, an appropriate multiple of cpu_operator_cost should be charged
+   for any comparison operators invoked during index processing (especially
+   evaluation of the indexQuals themselves).
+  
+
+  
+   The access costs should include all disk and CPU costs associated with
+   scanning the index itself, but NOT the costs of retrieving or processing
+   the parent-table rows that are identified by the index.
+  
+
+  
+   The start-up cost is the part of the total scan cost that must be expended
+   before we can begin to fetch the first row.  For most indexes this can
+   be taken as zero, but an index type with a high start-up cost might want
+   to set it nonzero.
+  
+
+  
+   The indexSelectivity should be set to the estimated fraction of the parent
+   table rows that will be retrieved during the index scan.  In the case
+   of a lossy index, this will typically be higher than the fraction of
+   rows that actually pass the given qual conditions.
+  
+
+  
+   The indexCorrelation should be set to the correlation (ranging between
+   -1.0 and 1.0) between the index order and the table order.  This is used
+   to adjust the estimate for the cost of fetching rows from the parent
+   table.
+  
+
+  
+   Cost Estimation
+   
+    A typical cost estimator will proceed as follows:
+   
+
+   
+    
+     Estimate and return the fraction of parent-table rows that will be visited
+     based on the given qual conditions.  In the absence of any index-type-specific
+     knowledge, use the standard optimizer function clauselist_selectivity():
+
+
+*indexSelectivity = clauselist_selectivity(root, indexQuals,
+                                           rel->relid, JOIN_INNER);
+
+    
+   
+
+   
+    
+     Estimate the number of index rows that will be visited during the
+     scan.  For many index types this is the same as indexSelectivity times
+     the number of rows in the index, but it might be more.  (Note that the
+     index's size in pages and rows is available from the IndexOptInfo struct.)
+    
+   
+
+   
+    
+     Estimate the number of index pages that will be retrieved during the scan.
+     This might be just indexSelectivity times the index's size in pages.
+    
+   
+
+   
+    
+     Compute the index access cost.  A generic estimator might do this:
+
+
+    /*
+     * Our generic assumption is that the index pages will be read
+     * sequentially, so they have cost 1.0 each, not random_page_cost.
+     * Also, we charge for evaluation of the indexquals at each index row.
+     * All the costs are assumed to be paid incrementally during the scan.
+     */
+    cost_qual_eval(&index_qual_cost, indexQuals);
+    *indexStartupCost = index_qual_cost.startup;
+    *indexTotalCost = numIndexPages +
+        (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples;
+
+    
+   
+
+   
+    
+     Estimate the index correlation.  For a simple ordered index on a single
+     field, this can be retrieved from pg_statistic.  If the correlation
+     is not known, the conservative estimate is zero (no correlation).
+    
+   
+  
+
+  
+   Examples of cost estimator functions can be found in
+   src/backend/utils/adt/selfuncs.c.
+  
+ 
+
+
+


diff --git a/doc/src/sgml/indexcost.sgml b/doc/src/sgml/indexcost.sgml

deleted file mode 100644 (file)

index 9758e8e..0000000


--- a/doc/src/sgml/indexcost.sgml
+++ /dev/null
@@ -1,285 +0,0 @@
-
-
- 
-  Index Cost Estimation Functions
-
-  
-   Author
-
-   
-    Written by Tom Lane ([email protected]) on 2000-01-24
-   
-  
-
-   
-    
-     This must eventually become part of a much larger chapter about
-     writing new index access methods.
-    
-   
-
-  
-   Every index access method must provide a cost estimation function for
-   use by the planner/optimizer.  The procedure OID of this function is
-   given in the amcostestimate field of the access
-   method's pg_am entry.
-
-   
-    
-     Prior to PostgreSQL 7.0, a different
-     scheme was used for registering 
-     index-specific cost estimation functions.
-    
-   
-  
-
-  
-   The amcostestimate function is given a list of WHERE clauses that have
-   been determined to be usable with the index.  It must return estimates
-   of the cost of accessing the index and the selectivity of the WHERE
-   clauses (that is, the fraction of main-table rows that will be
-   retrieved during the index scan).  For simple cases, nearly all the
-   work of the cost estimator can be done by calling standard routines
-   in the optimizer; the point of having an amcostestimate function is
-   to allow index access methods to provide index-type-specific knowledge,
-   in case it is possible to improve on the standard estimates.
-  
-
-  
-   Each amcostestimate function must have the signature:
-
-   
-void
-amcostestimate (Query *root,
-                RelOptInfo *rel,
-                IndexOptInfo *index,
-                List *indexQuals,
-                Cost *indexStartupCost,
-                Cost *indexTotalCost,
-                Selectivity *indexSelectivity,
-                double *indexCorrelation);
-   
-
-   The first four parameters are inputs:
-
-   
-    
-     root
-     
-      
-       The query being processed.
-      
-     
-    
-
-    
-     rel
-     
-      
-       The relation the index is on.
-      
-     
-    
-
-    
-     index
-     
-      
-       The index itself.
-      
-     
-    
-
-    
-     indexQuals
-     
-      
-       List of index qual clauses (implicitly ANDed);
-       a NIL list indicates no qualifiers are available.
-      
-     
-    
-   
-  
-
-  
-   The last four parameters are pass-by-reference outputs:
-
-   
-    
-     *indexStartupCost
-     
-      
-       Set to cost of index start-up processing
-      
-     
-    
-
-    
-     *indexTotalCost
-     
-      
-       Set to total cost of index processing
-      
-     
-    
-
-    
-     *indexSelectivity
-     
-      
-       Set to index selectivity
-      
-     
-    
-
-    
-     *indexCorrelation
-     
-      
-       Set to correlation coefficient between index scan order and
-       underlying table's order
-      
-     
-    
-   
-  
-
-  
-   Note that cost estimate functions must be written in C, not in SQL or
-   any available procedural language, because they must access internal
-   data structures of the planner/optimizer.
-  
-
-  
-   The index access costs should be computed in the units used by
-   src/backend/optimizer/path/costsize.c: a sequential disk block fetch
-   has cost 1.0, a nonsequential fetch has cost random_page_cost, and
-   the cost of processing one index row should usually be taken as
-   cpu_index_tuple_cost (which is a user-adjustable optimizer parameter).
-   In addition, an appropriate multiple of cpu_operator_cost should be charged
-   for any comparison operators invoked during index processing (especially
-   evaluation of the indexQuals themselves).
-  
-
-  
-   The access costs should include all disk and CPU costs associated with
-   scanning the index itself, but NOT the costs of retrieving or processing
-   the main-table rows that are identified by the index.
-  
-
-  
-   The start-up cost is the part of the total scan cost that must be expended
-   before we can begin to fetch the first row.  For most indexes this can
-   be taken as zero, but an index type with a high start-up cost might want
-   to set it nonzero.
-  
-
-  
-   The indexSelectivity should be set to the estimated fraction of the main
-   table rows that will be retrieved during the index scan.  In the case
-   of a lossy index, this will typically be higher than the fraction of
-   rows that actually pass the given qual conditions.
-  
-
-  
-   The indexCorrelation should be set to the correlation (ranging between
-   -1.0 and 1.0) between the index order and the table order.  This is used
-   to adjust the estimate for the cost of fetching rows from the main
-   table.
-  
-
-  
-   Cost Estimation
-   
-    A typical cost estimator will proceed as follows:
-   
-
-   
-    
-     Estimate and return the fraction of main-table rows that will be visited
-     based on the given qual conditions.  In the absence of any index-type-specific
-     knowledge, use the standard optimizer function clauselist_selectivity():
-
-     
-*indexSelectivity = clauselist_selectivity(root, indexQuals,
-                                           rel->relid, JOIN_INNER);
-     
-    
-   
-
-   
-    
-     Estimate the number of index rows that will be visited during the
-     scan.  For many index types this is the same as indexSelectivity times
-     the number of rows in the index, but it might be more.  (Note that the
-     index's size in pages and rows is available from the IndexOptInfo struct.)
-    
-   
-
-   
-    
-     Estimate the number of index pages that will be retrieved during the scan.
-     This might be just indexSelectivity times the index's size in pages.
-    
-   
-
-   
-    
-     Compute the index access cost.  A generic estimator might do this:
-
-     
-    /*
-     * Our generic assumption is that the index pages will be read
-     * sequentially, so they have cost 1.0 each, not random_page_cost.
-     * Also, we charge for evaluation of the indexquals at each index row.
-     * All the costs are assumed to be paid incrementally during the scan.
-     */
-    cost_qual_eval(&index_qual_cost, indexQuals);
-    *indexStartupCost = index_qual_cost.startup;
-    *indexTotalCost = numIndexPages +
-        (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples;
-     
-    
-   
-
-   
-    
-     Estimate the index correlation.  For a simple ordered index on a single
-     field, this can be retrieved from pg_statistic.  If the correlation
-     is not known, the conservative estimate is zero (no correlation).
-    
-   
-  
-
-  
-   Examples of cost estimator functions can be found in
-   src/backend/utils/adt/selfuncs.c.
-  
-
-  
-   By convention, the pg_proc entry for an
-   amcostestimate function should show
-   eight arguments all declared as internal (since none of them have
-   types that are known to SQL), and the return type is void.
-  
- 
-
-


diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml

index 8ec6262226805f9374ade128977536224b814831..a7ba58ce01f8bd28d7430b5822de5ce055df657a 100644 (file)


--- a/doc/src/sgml/postgres.sgml
+++ b/doc/src/sgml/postgres.sgml
@@ -1,5 +1,5 @@
 
 
 
@@ -235,7 +235,7 @@ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.73 2005/01/10 00:04:38 tgl Exp
   &nls;
   &plhandler;
   &geqo;
-  &indexcost;
+  &indexam;
   &gist;
   &storage;
   &bki;


diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml

index 63b2f40592289ffd26aa67a0de8303c476e46e58..0b254324485b6324e3d75e379302223979fbb8db 100644 (file)


--- a/doc/src/sgml/xindex.sgml
+++ b/doc/src/sgml/xindex.sgml
@@ -1,5 +1,5 @@
 
 
 
@@ -43,7 +43,7 @@ $PostgreSQL: pgsql/doc/src/sgml/xindex.sgml,v 1.38 2005/01/23 00:30:18 momjian E
    described in pg_am.  It is possible to add a
    new index method by defining the required interface routines and
    then creating a row in pg_am — but that is
-   far beyond the scope of this chapter.
+   beyond the scope of this chapter (see the chapter on the index access method API).
   
 
   
@@ -514,7 +514,7 @@ CREATE OPERATOR < (
    
     
      Although PostgreSQL can cope with
-     functions having the same name as long as they have different
+     functions having the same SQL name as long as they have different
      argument data types, C can only cope with one global function
      having a given name.  So we shouldn't name the C function
      something simple like abs_eq.  Usually it's
@@ -525,14 +525,12 @@ CREATE OPERATOR < (
 
    
     
-     We could have made the PostgreSQL name
+     We could have made the SQL name
      of the function abs_eq, relying on
      PostgreSQL to distinguish it by
-     argument data types from any other
-     PostgreSQL function of the same name.
+     argument data types from any other SQL function of the same name.
      To keep the example simple, we make the function have the same
-     names at the C level and PostgreSQL
-     level.
+     names at the C level and SQL level.
     
    
   
+   linkend="storage-page-layout">, and they all use the same format for index
+   tuple headers; but these decisions are not forced on an access method.)
+  
+
+  
+   An index is effectively a mapping from some data key values to
+   tuple identifiers, or TIDs, of row versions
+   (tuples) in the index's parent table.  A TID consists of a
+   block number and an item number within that block (see <xref
+   linkend="storage-page-layout">).  This is sufficient
+   information to fetch a particular row version from the table.
+   Indexes are not directly aware that under MVCC, there may be multiple
+   extant versions of the same logical row; to an index, each tuple is
+   an independent object that needs its own index entry.  Thus, an
+   update of a row always creates all-new index entries for the row, even if
+   the key values did not change.  Index entries for dead tuples are
+   reclaimed (by vacuuming) when the dead tuples themselves are reclaimed.
+  
+
+ 
+  Catalog Entries for Indexes
+
+  
+   Each index access method is described by a row in the
+   pg_am system catalog (described in
+   the system catalogs chapter).  The principal contents of a
+   pg_am row are references to
+   pg_proc
+   entries that identify the index access
+   functions supplied by the access method.  The APIs for these functions
+   are defined later in this chapter.  In addition, the
+   pg_am row specifies a few fixed properties of
+   the access method, such as whether it can support multi-column indexes.
+   There is not currently any special support
+   for creating or deleting pg_am entries;
+   anyone able to write a new access method is expected to be competent
+   to insert an appropriate row for themselves.
+  
+
+  
+   To be useful, an index access method must also have one or more
+   operator classes defined in
+   pg_opclass,
+   pg_amop, and
+   pg_amproc.
+   These entries allow the planner
+   to determine what kinds of query qualifications can be used with
+   indexes of this access method.  Operator classes are described
+   in the chapter on index extensions (xindex.sgml), which is prerequisite material for reading
+   this chapter.
+  
+
+  
+   An individual index is defined by a 
+   pg_class
+   entry that describes it as a physical relation, plus a
+   pg_index
+   entry that shows the logical content of the index — that is, the set
+   of index columns it has and the semantics of those columns, as captured by
+   the associated operator classes.  The index columns (key values) can be
+   either simple columns of the underlying table or expressions over the table
+   rows.  The index access method normally has no interest in where the index
+   key values come from (it is always handed precomputed key values) but it
+   will be very interested in the operator class information in
+   pg_index.  Both of these catalog entries can be
+   accessed as part of the Relation data structure that is
+   passed to all operations on the index.
+  
+
+  
+   Some of the flag columns of pg_am have nonobvious
+   implications.  The requirements of amcanunique
+   are discussed in the section "Index Uniqueness Checks", and those of
+   amconcurrent in the section "Index Locking Considerations".
+   The amcanmulticol flag asserts that the
+   access method supports multi-column indexes, while
+   amindexnulls asserts that index entries are
+   created for NULL key values.  Since most indexable operators are
+   strict and hence cannot return TRUE for NULL inputs,
+   it is at first sight attractive to not store index entries for NULLs:
+   they could never be returned by an index scan anyway.  However, this
+   argument fails for a full-table index scan (one with no scan keys);
+   such a scan should include null rows.  In practice this means that
+   indexes that support ordered scans (have amorderstrategy
+   nonzero) must index nulls, since the planner might decide to use such a
+   scan as a substitute for sorting.  Another restriction is that an index
+   access method that supports multiple index columns must
+   support indexing null values in columns after the first, because the planner
+   will assume the index can be used for queries on just the first
+   column(s).  For example, consider an index on (a,b) and a query with
+   WHERE a = 4.  The system will assume the index can be
+   used to scan for rows with a = 4, which is wrong if the
+   index omits rows where b is null.
+   It is, however, OK to omit rows where the first indexed column is null.
+   (GiST currently does so.)  Thus,
+   amindexnulls should be set true only if the
+   index access method indexes all rows, including arbitrary combinations of
+   null values.
+  
+
+ 
+
+ 
+  Index Access Method Functions
+
+  
+   The index construction and maintenance functions that an index access
+   method must provide are:
+  
+
+  
+
+void
+ambuild (Relation heapRelation,
+         Relation indexRelation,
+         IndexInfo *indexInfo);
+
+   Build a new index.  The index relation has been physically created,
+   but is empty.  It must be filled in with whatever fixed data the
+   access method requires, plus entries for all tuples already existing
+   in the table.  Ordinarily the ambuild function will call
+   IndexBuildHeapScan() to scan the table for existing tuples
+   and compute the keys that need to be inserted into the index.
+  
+
+  
+
+InsertIndexResult
+aminsert (Relation indexRelation,
+          Datum *datums,
+          char *nulls,
+          ItemPointer heap_tid,
+          Relation heapRelation,
+          bool check_uniqueness);
+
+   Insert a new tuple into an existing index.  The datums and
+   nulls arrays give the key values to be indexed, and
+   heap_tid is the TID to be indexed.
+   If the access method supports unique indexes (its
+   pg_am.amcanunique flag is true) then
+   check_uniqueness may be true, in which case the access method
+   must verify that there is no conflicting row; this is the only situation in
+   which the access method normally needs the heapRelation
+   parameter.  See the section "Index Uniqueness Checks" for details.
+   The result is a struct that must be pfree'd by the caller.  (The result
+   struct is really quite useless and should be removed...)
+  
+
+  
+
+IndexBulkDeleteResult *
+ambulkdelete (Relation indexRelation,
+              IndexBulkDeleteCallback callback,
+              void *callback_state);
+
+   Delete tuple(s) from the index.  This is a bulk delete operation
+   that is intended to be implemented by scanning the whole index and checking
+   each entry to see if it should be deleted.
+   The passed-in callback function may be called, in the style
+   callback(TID, callback_state) returns bool,
+   to determine whether any particular index entry, as identified by its
+   referenced TID, is to be deleted.  Must return either NULL or a palloc'd
+   struct containing statistics about the effects of the deletion operation.
+  
+
+  
+
+IndexBulkDeleteResult *
+amvacuumcleanup (Relation indexRelation,
+                 IndexVacuumCleanupInfo *info,
+                 IndexBulkDeleteResult *stats);
+
+   Clean up after a VACUUM operation (one or more
+   ambulkdelete calls).  An index access method does not have
+   to provide this function (if it is omitted, its entry in pg_am
+   must be zero).  If it is provided, it is typically used for bulk cleanup
+   such as reclaiming empty index pages.  info
+   provides some additional arguments such as a message level for statistical
+   reports, and stats is whatever the last
+   ambulkdelete call returned.  amvacuumcleanup
+   may replace or modify this struct before returning it.  If the result
+   is not NULL it must be a palloc'd struct.  The statistics it contains
+   will be reported by VACUUM if VERBOSE is given.
+  
+
+  
+   The purpose of an index, of course, is to support scans for tuples matching
+   an indexable WHERE condition, often called a
+   qualifier or scan key.  The semantics of
+   index scanning are described more fully in the section "Index Scanning",
+   below.  The scan-related functions that an index access method must provide
+   are:
+  
+
+  
+
+IndexScanDesc
+ambeginscan (Relation indexRelation,
+             int nkeys,
+             ScanKey key);
+
+   Begin a new scan.  The key array (of length nkeys)
+   describes the scan key(s) for the index scan.  The result must be a
+   palloc'd struct. For implementation reasons the index access method
+   must create this struct by calling
+   RelationGetIndexScan().  In most cases
+   ambeginscan itself does little beyond making that call;
+   the interesting parts of indexscan startup are in amrescan.
+  
+
+  
+
+bool
+amgettuple (IndexScanDesc scan,
+            ScanDirection direction);
+
+   Fetch the next tuple in the given scan, moving in the given
+   direction (forward or backward in the index).  Returns TRUE if a tuple was
+   obtained, FALSE if no matching tuples remain.  In the TRUE case the tuple
+   TID is stored into the scan structure.  Note that
+   success means only that the index contains an entry that matches
+   the scan keys, not that the tuple necessarily still exists in the heap or
+   will pass the caller's snapshot test.
+  
+
+  
+
+void
+amrescan (IndexScanDesc scan,
+          ScanKey key);
+
+   Restart the given scan, possibly with new scan keys (to continue using
+   the old keys, NULL is passed for key).  Note that it is not
+   possible for the number of keys to be changed.  In practice the restart
+   feature is used when a new outer tuple is selected by a nestloop join
+   and so a new key comparison value is needed, but the scan key structure
+   remains the same.  This function is also called by
+   RelationGetIndexScan(), so it is used for initial setup
+   of an indexscan as well as rescanning.
+  
+
+  
+
+void
+amendscan (IndexScanDesc scan);
+
+   End a scan and release resources.  The scan struct itself
+   should not be freed, but any locks or pins taken internally by the
+   access method must be released.
+  
+
+  
+
+void
+ammarkpos (IndexScanDesc scan);
+
+   Mark current scan position.  The access method need only support one
+   remembered scan position per scan.
+  
+
+  
+
+void
+amrestrpos (IndexScanDesc scan);
+
+   Restore the scan to the most recently marked position.
+  
+
+  
+
+void
+amcostestimate (Query *root,
+                RelOptInfo *rel,
+                IndexOptInfo *index,
+                List *indexQuals,
+                Cost *indexStartupCost,
+                Cost *indexTotalCost,
+                Selectivity *indexSelectivity,
+                double *indexCorrelation);
+
+   Estimate the costs of an index scan.  This function is described fully
+   in the section "Index Cost Estimation Functions", below.
+  
+
+  
+   By convention, the pg_proc entry for any index
+   access method function should show the correct number of arguments,
+   but declare them all as type internal (since most of the arguments
+   have types that are not known to SQL, and we don't want users calling
+   the functions directly anyway).  The return type is declared as
+   void, internal, or boolean as appropriate.
+  
+
+ 
+
+ 
+  Index Scanning
+
+  
+   In an index scan, the index access method is responsible for regurgitating
+   the TIDs of all the tuples it has been told about that match the
+   scan keys.  The access method is not involved in
+   actually fetching those tuples from the index's parent table, nor in
+   determining whether they pass the scan's time qualification test or other
+   conditions.
+  
+
+  
+   A scan key is the internal representation of a WHERE clause of
+   the form index_key operator
+   constant, where the index key is one of the columns of the
+   index and the operator is one of the members of the operator class
+   associated with that index column.  An index scan has zero or more scan
+   keys, which are implicitly ANDed — the returned tuples are expected
+   to satisfy all the indicated conditions.
+  
+
+  
+   The operator class may indicate that the index is lossy for a
+   particular operator; this implies that the index scan will return all the
+   entries that pass the scan key, plus possibly additional entries that do
+   not.  The core system's indexscan machinery will then apply that operator
+   again to the heap tuple to verify whether or not it really should be
+   selected.  For non-lossy operators, the index scan must return exactly the
+   set of matching entries, as there is no recheck.
+  
+
+  
+   Note that it is entirely up to the access method to ensure that it
+   correctly finds all and only the entries passing all the given scan keys.
+   Also, the core system will simply hand off all the WHERE
+   clauses that match the index keys and operator classes, without any
+   semantic analysis to determine whether they are redundant or
+   contradictory.  As an example, given
+   WHERE x > 4 AND x > 14 where x is a b-tree
+   indexed column, it is left to the b-tree amrescan function
+   to realize that the first scan key is redundant and can be discarded.
+   The extent of preprocessing needed during amrescan will
+   depend on the extent to which the index access method needs to reduce
+   the scan keys to a normalized form.
+  
+
+  
+   The amgettuple function has a direction argument,
+   which can be either ForwardScanDirection (the normal case)
+   or  BackwardScanDirection.  If the first call after
+   amrescan specifies BackwardScanDirection, then the
+   set of matching index entries is to be scanned back-to-front rather than in
+   the normal front-to-back direction, so amgettuple must return
+   the last matching tuple in the index, rather than the first one as it
+   normally would.  (This will only occur for access
+   methods that advertise they support ordered scans by setting
+   pg_am.amorderstrategy nonzero.)  After the
+   first call, amgettuple must be prepared to advance the scan in
+   either direction from the most recently returned entry.
+  
+
+  
+   The access method must support marking a position in a scan
+   and later returning to the marked position.  The same position may be
+   restored multiple times.  However, only one position need be remembered
+   per scan; a new ammarkpos call overrides the previously
+   marked position.
+  
+
+  
+   Both the scan position and the mark position (if any) must be maintained
+   consistently in the face of concurrent insertions or deletions in the
+   index.  It is OK if a freshly-inserted entry is not returned by a scan that
+   would have found the entry if it had existed when the scan started, or for
+   the scan to return such an entry upon rescanning or backing
+   up even though it had not been returned the first time through.  Similarly,
+   a concurrent delete may or may not be reflected in the results of a scan.
+   What is important is that insertions or deletions not cause the scan to
+   miss or multiply return entries that were not themselves being inserted or
+   deleted.  (For an index type that does not set
+   pg_am.amconcurrent, it is sufficient to
+   handle these cases for insertions or deletions performed by the same
+   backend that's doing the scan.  But when amconcurrent is
+   true, insertions or deletions from other backends must be handled as well.)
+  
+
+ 
+
+ 
+  Index Locking Considerations
+
+  
+   An index access method can choose whether it supports concurrent updates
+   of the index by multiple processes.  If the method's
+   pg_am.amconcurrent flag is true, then
+   the core PostgreSQL system obtains
+   AccessShareLock on the index during an index scan, and
+   RowExclusiveLock when updating the index.  Since these lock
+   types do not conflict, the access method is responsible for handling any
+   fine-grained locking it may need.  An exclusive lock on the index as a whole
+   will be taken only during index creation, destruction, or
+   REINDEX.  When amconcurrent is false,
+   PostgreSQL still obtains
+   AccessShareLock during index scans, but it obtains
+   AccessExclusiveLock during any update.  This ensures that
+   updaters have sole use of the index.  Note that this implicitly assumes
+   that index scans are read-only; an access method that might modify the
+   index during a scan will still have to do its own locking to handle the
+   case of concurrent scans.
+  
+
+  
+   Recall that a backend's own locks never conflict; therefore, even a
+   non-concurrent index type must be prepared to handle the case where
+   a backend is inserting or deleting entries in an index that it is itself
+   scanning.  (This is of course necessary to support an UPDATE
+   that uses the index to find the rows to be updated.)
+  
+
+  
+   Building an index type that supports concurrent updates usually requires
+   extensive and subtle analysis of the required behavior.  For the b-tree
+   and hash index types, you can read about the design decisions involved in
+   src/backend/access/nbtree/README and
+   src/backend/access/hash/README.
+  
+
+  
+   Aside from the index's own internal consistency requirements, concurrent
+   updates create issues about consistency between the parent table (the
+   heap) and the index.  Because
+   PostgreSQL separates accesses 
+   and updates of the heap from those of the index, there are windows in
+   which the index may be inconsistent with the heap.  We handle this problem
+   with the following rules:
+
+    
+     
+      
+       A new heap entry is made before making its index entries.  (Therefore
+       a concurrent index scan is likely to fail to see the heap entry.
+       This is okay because the index reader would be uninterested in an
+       uncommitted row anyway.  But see the section "Index Uniqueness Checks".)
+      
+     
+     
+      
+       When a heap entry is to be deleted (by VACUUM), all its
+       index entries must be removed first.
+      
+     
+     
+      
+       For concurrent index types, an indexscan must maintain a pin
+       on the index page holding the item last returned by
+       amgettuple, and ambulkdelete cannot delete
+       entries from pages that are pinned by other backends.  The need
+       for this rule is explained below.
+      
+     
+    
+
+   If an index is concurrent then it is possible for an index reader to
+   see an index entry just before it is removed by VACUUM, and
+   then to arrive at the corresponding heap entry after that was removed by
+   VACUUM.  (With a nonconcurrent index, this is not possible
+   because of the conflicting index-level locks that will be taken out.)
+   This creates no serious problems if that item
+   number is still unused when the reader reaches it, since an empty
+   item slot will be ignored by heap_fetch().  But what if a
+   third backend has already re-used the item slot for something else?
+   When using an MVCC-compliant snapshot, there is no problem because
+   the new occupant of the slot is certain to be too new to pass the
+   snapshot test.  However, with a non-MVCC-compliant snapshot (such as
+   SnapshotNow), it would be possible to accept and return
+   a row that does not in fact match the scan keys.  We could defend
+   against this scenario by requiring the scan keys to be rechecked
+   against the heap row in all cases, but that is too expensive.  Instead,
+   we use a pin on an index page as a proxy to indicate that the reader
+   may still be in flight from the index entry to the matching
+   heap entry.  Making ambulkdelete block on such a pin ensures
+   that VACUUM cannot delete the heap entry before the reader
+   is done with it.  This solution costs little in runtime, and adds blocking
+   overhead only in the rare cases where there actually is a conflict.
+  
+
+  
+   This solution requires that index scans be synchronous: we have
+   to fetch each heap tuple immediately after scanning the corresponding index
+   entry.  This is expensive for a number of reasons.  An
+   asynchronous scan in which we collect many TIDs from the index,
+   and only visit the heap tuples sometime later, requires much less index
+   locking overhead and may allow a more efficient heap access pattern.
+   Per the above analysis, we must use the synchronous approach for
+   non-MVCC-compliant snapshots, but an asynchronous scan would be safe
+   for a query using an MVCC snapshot.  This possibility is not exploited
+   as of PostgreSQL 8.0, but it is likely to be
+   investigated soon.
+  
+
+ 
+
+ 
+  Index Uniqueness Checks
+
+  
+   PostgreSQL enforces SQL uniqueness constraints
+   using unique indexes, which are indexes that disallow
+   multiple entries with identical keys.  An access method that supports this
+   feature sets pg_am.amcanunique true.
+   (At present, only b-tree supports it.)
+  
+
+  
+   Because of MVCC, it is always necessary to allow duplicate entries to
+   exist physically in an index: the entries might refer to successive
+   versions of a single logical row.  The behavior we actually want to
+   enforce is that no MVCC snapshot could include two rows with equal
+   index keys.  This breaks down into the following cases that must be
+   checked when inserting a new row into a unique index:
+
+    
+     
+      
+       If a conflicting valid row has been deleted by the current transaction,
+       it's okay.  (In particular, since an UPDATE always deletes the old row
+       version before inserting the new version, this will allow an UPDATE on
+       a row without changing the key.)
+      
+     
+     
+      
+       If a conflicting row has been inserted by an as-yet-uncommitted
+       transaction, the would-be inserter must wait to see if that transaction
+       commits.  If it rolls back then there is no conflict.  If it commits
+       without deleting the conflicting row again, there is a uniqueness
+       violation.  (In practice we just wait for the other transaction to
+       end and then redo the visibility check in toto.)
+      
+     
+     
+      
+       Similarly, if a conflicting valid row has been deleted by an
+       as-yet-uncommitted transaction, the would-be inserter must wait
+       for that transaction to commit or abort, and then repeat the test.
+      
+     
+    
+  
+
+  
+   We require the index access method to apply these tests itself, which
+   means that it must reach into the heap to check the commit status of
+   any row that is shown to have a duplicate key according to the index
+   contents.  This is without a doubt ugly and non-modular, but it saves
+   redundant work: if we did a separate probe then the index lookup for
+   a conflicting row would be essentially repeated while finding the place to
+   insert the new row's index entry.  What's more, there is no obvious way
+   to avoid race conditions unless the conflict check is an integral part
+   of insertion of the new index entry.
+  
+
+  
+   The main limitation of this scheme is that it has no convenient way
+   to support deferred uniqueness checks.
+  
+
+ 
+
+ 
+  Index Cost Estimation Functions
+
+  
+   The amcostestimate function is given a list of WHERE clauses that have
+   been determined to be usable with the index.  It must return estimates
+   of the cost of accessing the index and the selectivity of the WHERE
+   clauses (that is, the fraction of parent-table rows that will be
+   retrieved during the index scan).  For simple cases, nearly all the
+   work of the cost estimator can be done by calling standard routines
+   in the optimizer; the point of having an amcostestimate function is
+   to allow index access methods to provide index-type-specific knowledge,
+   in case it is possible to improve on the standard estimates.
+  
+
+  
+   Each amcostestimate function must have the signature:
+
+
+void
+amcostestimate (Query *root,
+                RelOptInfo *rel,
+                IndexOptInfo *index,
+                List *indexQuals,
+                Cost *indexStartupCost,
+                Cost *indexTotalCost,
+                Selectivity *indexSelectivity,
+                double *indexCorrelation);
+
+
+   The first four parameters are inputs:
+
+   
+    
+     root
+     
+      
+       The query being processed.
+      
+     
+    
+
+    
+     rel
+     
+      
+       The relation the index is on.
+      
+     
+    
+
+    
+     index
+     
+      
+       The index itself.
+      
+     
+    
+
+    
+     indexQuals
+     
+      
+       List of index qual clauses (implicitly ANDed);
+       a NIL list indicates no qualifiers are available.
+       Note that the list contains expression trees, not ScanKeys.
+      
+     
+    
+   
+  
+
+  
+   The last four parameters are pass-by-reference outputs:
+
+   
+    
+     *indexStartupCost
+     
+      
+       Set to cost of index start-up processing
+      
+     
+    
+
+    
+     *indexTotalCost
+     
+      
+       Set to total cost of index processing
+      
+     
+    
+
+    
+     *indexSelectivity
+     
+      
+       Set to index selectivity
+      
+     
+    
+
+    
+     *indexCorrelation
+     
+      
+       Set to correlation coefficient between index scan order and
+       underlying table's order
+      
+     
+    
+   
+  
+
+  
+   Note that cost estimate functions must be written in C, not in SQL or
+   any available procedural language, because they must access internal
+   data structures of the planner/optimizer.
+  
+
+  
+   The index access costs should be computed in the units used by
+   src/backend/optimizer/path/costsize.c: a sequential disk block fetch
+   has cost 1.0, a nonsequential fetch has cost random_page_cost, and
+   the cost of processing one index row should usually be taken as
+   cpu_index_tuple_cost (which is a user-adjustable optimizer parameter).
+   In addition, an appropriate multiple of cpu_operator_cost should be charged
+   for any comparison operators invoked during index processing (especially
+   evaluation of the indexQuals themselves).
+  
+
+  
+   The access costs should include all disk and CPU costs associated with
+   scanning the index itself, but NOT the costs of retrieving or processing
+   the parent-table rows that are identified by the index.
+  
+
+  
+   The start-up cost is the part of the total scan cost that must be expended
+   before we can begin to fetch the first row.  For most indexes this can
+   be taken as zero, but an index type with a high start-up cost might want
+   to set it nonzero.
+  
+
+  
+   The indexSelectivity should be set to the estimated fraction of the parent
+   table rows that will be retrieved during the index scan.  In the case
+   of a lossy index, this will typically be higher than the fraction of
+   rows that actually pass the given qual conditions.
+  
+
+  
+   The indexCorrelation should be set to the correlation (ranging between
+   -1.0 and 1.0) between the index order and the table order.  This is used
+   to adjust the estimate for the cost of fetching rows from the parent
+   table.
+  
+
+  
+   Cost Estimation
+   
+    A typical cost estimator will proceed as follows:
+   
+
+   
+    
+     Estimate and return the fraction of parent-table rows that will be visited
+     based on the given qual conditions.  In the absence of any index-type-specific
+     knowledge, use the standard optimizer function clauselist_selectivity():
+
+
+*indexSelectivity = clauselist_selectivity(root, indexQuals,
+                                           rel->relid, JOIN_INNER);
+
+    
+   
+
+   
+    
+     Estimate the number of index rows that will be visited during the
+     scan.  For many index types this is the same as indexSelectivity times
+     the number of rows in the index, but it might be more.  (Note that the
+     index's size in pages and rows is available from the IndexOptInfo struct.)
+    
+   
+
+   
+    
+     Estimate the number of index pages that will be retrieved during the scan.
+     This might be just indexSelectivity times the index's size in pages.
+    
+   
+
+   
+    
+     Compute the index access cost.  A generic estimator might do this:
+
+
+    /*
+     * Our generic assumption is that the index pages will be read
+     * sequentially, so they have cost 1.0 each, not random_page_cost.
+     * Also, we charge for evaluation of the indexquals at each index row.
+     * All the costs are assumed to be paid incrementally during the scan.
+     */
+    cost_qual_eval(&index_qual_cost, indexQuals);
+    *indexStartupCost = index_qual_cost.startup;
+    *indexTotalCost = numIndexPages +
+        (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples;
+
+    
+   
+
+   
+    
+     Estimate the index correlation.  For a simple ordered index on a single
+     field, this can be retrieved from pg_statistic.  If the correlation
+     is not known, the conservative estimate is zero (no correlation).
+    
+   
+  
+
+  
+   Examples of cost estimator functions can be found in
+   src/backend/utils/adt/selfuncs.c.
+  
+ 
+
+
+
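+   An index is effectively a mapping from data key values to tuple
+   identifiers (TIDs) of row versions in the index's parent table.  A TID
+   consists of a block number and an item number within that block (see
+   the storage page layout documentation).  This is sufficient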
+   information to fetch a particular row version from the table.
+   Indexes are not directly aware that under MVCC, there may be multiple
+   extant versions of the same logical row; to an index, each tuple is
+   an independent object that needs its own index entry.  Thus, an
+   update of a row always creates all-new index entries for the row, even if
+   the key values did not change.  Index entries for dead tuples are
+   reclaimed (by vacuuming) when the dead tuples themselves are reclaimed.
+  
+
+ 
+  Catalog Entries for Indexes
+
+  
+   Each index access method is described by a row in the
+   pg_am system catalog (see the description of
+   pg_am in the system catalogs chapter).  The principal contents of a
+   pg_am row are references to
+   pg_proc
+   entries that identify the index access
+   functions supplied by the access method.  The APIs for these functions
+   are defined later in this chapter.  In addition, the
+   pg_am row specifies a few fixed properties of
+   the access method, such as whether it can support multi-column indexes.
+   There is not currently any special support
+   for creating or deleting pg_am entries;
+   anyone able to write a new access method is expected to be competent
+   to insert an appropriate row for themselves.
+  
+
+  
+   To be useful, an index access method must also have one or more
+   operator classes defined in
+   pg_opclass,
+   pg_amop, and
+   pg_amproc.
+   These entries allow the planner
+   to determine what kinds of query qualifications can be used with
+   indexes of this access method.  Operator classes are described
+   in the section on interfacing extensions to indexes, which is prerequisite material for reading
+   this chapter.
+  
+
+  
+   An individual index is defined by a 
+   pg_class
+   entry that describes it as a physical relation, plus a
+   pg_index
+   entry that shows the logical content of the index — that is, the set
+   of index columns it has and the semantics of those columns, as captured by
+   the associated operator classes.  The index columns (key values) can be
+   either simple columns of the underlying table or expressions over the table
+   rows.  The index access method normally has no interest in where the index
+   key values come from (it is always handed precomputed key values) but it
+   will be very interested in the operator class information in
+   pg_index.  Both of these catalog entries can be
+   accessed as part of the Relation data structure that is
+   passed to all operations on the index.
+  
+
+  
+   Some of the flag columns of pg_am have nonobvious
+   implications.  The requirements of amcanunique
+   are discussed in "Index Uniqueness Checks", and those of
+   amconcurrent in "Index Locking Considerations".
+   The amcanmulticol flag asserts that the
+   access method supports multi-column indexes, while
+   amindexnulls asserts that index entries are
+   created for NULL key values.  Since most indexable operators are
+   strict and hence cannot return TRUE for NULL inputs,
+   it is at first sight attractive to not store index entries for NULLs:
+   they could never be returned by an index scan anyway.  However, this
+   argument fails for a full-table index scan (one with no scan keys);
+   such a scan should include null rows.  In practice this means that
+   indexes that support ordered scans (have amorderstrategy
+   nonzero) must index nulls, since the planner might decide to use such a
+   scan as a substitute for sorting.  Another restriction is that an index
+   access method that supports multiple index columns must
+   support indexing null values in columns after the first, because the planner
+   will assume the index can be used for queries on just the first
+   column(s).  For example, consider an index on (a,b) and a query with
+   WHERE a = 4.  The system will assume the index can be
+   used to scan for rows with a = 4, which is wrong if the
+   index omits rows where b is null.
+   It is, however, OK to omit rows where the first indexed column is null.
+   (GiST currently does so.)  Thus,
+   amindexnulls should be set true only if the
+   index access method indexes all rows, including arbitrary combinations of
+   null values.
+  
+
+ 
+
+ 
+  Index Access Method Functions
+
+  
+   The index construction and maintenance functions that an index access
+   method must provide are:
+  
+
+  
+
+void
+ambuild (Relation heapRelation,
+         Relation indexRelation,
+         IndexInfo *indexInfo);
+
+   Build a new index.  The index relation has been physically created,
+   but is empty.  It must be filled in with whatever fixed data the
+   access method requires, plus entries for all tuples already existing
+   in the table.  Ordinarily the ambuild function will call
+   IndexBuildHeapScan() to scan the table for existing tuples
+   and compute the keys that need to be inserted into the index.
+  
+
+  
+
+InsertIndexResult
+aminsert (Relation indexRelation,
+          Datum *datums,
+          char *nulls,
+          ItemPointer heap_tid,
+          Relation heapRelation,
+          bool check_uniqueness);
+
+   Insert a new tuple into an existing index.  The datums and
+   nulls arrays give the key values to be indexed, and
+   heap_tid is the TID to be indexed.
+   If the access method supports unique indexes (its
+   pg_am.amcanunique flag is true) then
+   check_uniqueness may be true, in which case the access method
+   must verify that there is no conflicting row; this is the only situation in
+   which the access method normally needs the heapRelation
+   parameter.  See "Index Uniqueness Checks" for details.
+   The result is a struct that must be pfree'd by the caller.  (The result
+   struct is really quite useless and should be removed...)
+  
+
+  
+
+IndexBulkDeleteResult *
+ambulkdelete (Relation indexRelation,
+              IndexBulkDeleteCallback callback,
+              void *callback_state);
+
+   Delete tuple(s) from the index.  This is a bulk delete operation
+   that is intended to be implemented by scanning the whole index and checking
+   each entry to see if it should be deleted.
+   The passed-in callback function may be called, in the style
+   callback(TID, callback_state) returns bool,
+   to determine whether any particular index entry, as identified by its
+   referenced TID, is to be deleted.  Must return either NULL or a palloc'd
+   struct containing statistics about the effects of the deletion operation.
+  
+
+  
+
+IndexBulkDeleteResult *
+amvacuumcleanup (Relation indexRelation,
+                 IndexVacuumCleanupInfo *info,
+                 IndexBulkDeleteResult *stats);
+
+   Clean up after a VACUUM operation (one or more
+   ambulkdelete calls).  An index access method does not have
+   to provide this function (if it is omitted, the entry in pg_am must
+   be zero).  If it is provided, it is typically used for bulk cleanup
+   such as reclaiming empty index pages.  info
+   provides some additional arguments such as a message level for statistical
+   reports, and stats is whatever the last
+   ambulkdelete call returned.  amvacuumcleanup
+   may replace or modify this struct before returning it.  If the result
+   is not NULL it must be a palloc'd struct.  The statistics it contains
+   will be reported by VACUUM if VERBOSE is given.
+  
+
+  
+   The purpose of an index, of course, is to support scans for tuples matching
+   an indexable WHERE condition, often called a
+   qualifier or scan key.  The semantics of
+   index scanning are described more fully in "Index Scanning",
+   below.  The scan-related functions that an index access method must provide
+   are:
+  
+
+  
+
+IndexScanDesc
+ambeginscan (Relation indexRelation,
+             int nkeys,
+             ScanKey key);
+
+   Begin a new scan.  The key array (of length nkeys)
+   describes the scan key(s) for the index scan.  The result must be a
+   palloc'd struct. For implementation reasons the index access method
+   must create this struct by calling
+   RelationGetIndexScan().  In most cases
+   ambeginscan itself does little beyond making that call;
+   the interesting parts of indexscan startup are in amrescan.
+  
+
+  
+
+bool
+amgettuple (IndexScanDesc scan,
+            ScanDirection direction);
+
+   Fetch the next tuple in the given scan, moving in the given
+   direction (forward or backward in the index).  Returns TRUE if a tuple was
+   obtained, FALSE if no matching tuples remain.  In the TRUE case the tuple
+   TID is stored into the scan structure.  Note that
+   success means only that the index contains an entry that matches
+   the scan keys, not that the tuple necessarily still exists in the heap or
+   will pass the caller's snapshot test.
+  
+
+  
+
+void
+amrescan (IndexScanDesc scan,
+          ScanKey key);
+
+   Restart the given scan, possibly with new scan keys (to continue using
+   the old keys, NULL is passed for key).  Note that it is not
+   possible for the number of keys to be changed.  In practice the restart
+   feature is used when a new outer tuple is selected by a nestloop join
+   and so a new key comparison value is needed, but the scan key structure
+   remains the same.  This function is also called by
+   RelationGetIndexScan(), so it is used for initial setup
+   of an indexscan as well as rescanning.
+  
+
+  
+
+void
+amendscan (IndexScanDesc scan);
+
+   End a scan and release resources.  The scan struct itself
+   should not be freed, but any locks or pins taken internally by the
+   access method must be released.
+  
+
+  
+
+void
+ammarkpos (IndexScanDesc scan);
+
+   Mark current scan position.  The access method need only support one
+   remembered scan position per scan.
+  
+
+  
+
+void
+amrestrpos (IndexScanDesc scan);
+
+   Restore the scan to the most recently marked position.
+  
+
+  
+
+void
+amcostestimate (Query *root,
+                RelOptInfo *rel,
+                IndexOptInfo *index,
+                List *indexQuals,
+                Cost *indexStartupCost,
+                Cost *indexTotalCost,
+                Selectivity *indexSelectivity,
+                double *indexCorrelation);
+
+   Estimate the costs of an index scan.  This function is described fully
+   in "Index Cost Estimation Functions", below.
+  
+
+  
+   By convention, the pg_proc entry for any index
+   access method function should show the correct number of arguments,
+   but declare them all as type internal (since most of the arguments
+   have types that are not known to SQL, and we don't want users calling
+   the functions directly anyway).  The return type is declared as
+   void, internal, or boolean as appropriate.
+  
+
+ 
+
+ 
+  Index Scanning
+
+  
+   In an index scan, the index access method is responsible for regurgitating
+   the TIDs of all the tuples it has been told about that match the
+   scan keys.  The access method is not involved in
+   actually fetching those tuples from the index's parent table, nor in
+   determining whether they pass the scan's time qualification test or other
+   conditions.
+  
+
+  
+   A scan key is the internal representation of a WHERE clause of
+   the form index_key operator
+   constant, where the index key is one of the columns of the
+   index and the operator is one of the members of the operator class
+   associated with that index column.  An index scan has zero or more scan
+   keys, which are implicitly ANDed — the returned tuples are expected
+   to satisfy all the indicated conditions.
+  
+
+  
+   The operator class may indicate that the index is lossy for a
+   particular operator; this implies that the index scan will return all the
+   entries that pass the scan key, plus possibly additional entries that do
+   not.  The core system's indexscan machinery will then apply that operator
+   again to the heap tuple to verify whether or not it really should be
+   selected.  For non-lossy operators, the index scan must return exactly the
+   set of matching entries, as there is no recheck.
+  
+
+  
+   Note that it is entirely up to the access method to ensure that it
+   correctly finds all and only the entries passing all the given scan keys.
+   Also, the core system will simply hand off all the WHERE
+   clauses that match the index keys and operator classes, without any
+   semantic analysis to determine whether they are redundant or
+   contradictory.  As an example, given
+   WHERE x > 4 AND x > 14 where x is a b-tree
+   indexed column, it is left to the b-tree amrescan function
+   to realize that the first scan key is redundant and can be discarded.
+   The extent of preprocessing needed during amrescan will
+   depend on the extent to which the index access method needs to reduce
+   the scan keys to a normalized form.
+  
+
+  
+   The amgettuple function has a direction argument,
+   which can be either ForwardScanDirection (the normal case)
+   or BackwardScanDirection.  If the first call after
+   amrescan specifies BackwardScanDirection, then the
+   set of matching index entries is to be scanned back-to-front rather than in
+   the normal front-to-back direction, so amgettuple must return
+   the last matching tuple in the index, rather than the first one as it
+   normally would.  (This will only occur for access
+   methods that advertise they support ordered scans by setting
+   pg_am.amorderstrategy nonzero.)  After the
+   first call, amgettuple must be prepared to advance the scan in
+   either direction from the most recently returned entry.
+  
+
+  
+   The access method must support marking a position in a scan
+   and later returning to the marked position.  The same position may be
+   restored multiple times.  However, only one position need be remembered
+   per scan; a new ammarkpos call overrides the previously
+   marked position.
+  
+
+  
+   Both the scan position and the mark position (if any) must be maintained
+   consistently in the face of concurrent insertions or deletions in the
+   index.  It is OK if a freshly-inserted entry is not returned by a scan that
+   would have found the entry if it had existed when the scan started, or if
+   the scan returns such an entry upon rescanning or backing
+   up even though it had not been returned the first time through.  Similarly,
+   a concurrent delete may or may not be reflected in the results of a scan.
+   What is important is that insertions or deletions not cause the scan to
+   miss or multiply return entries that were not themselves being inserted or
+   deleted.  (For an index type that does not set
+   pg_am.amconcurrent, it is sufficient to
+   handle these cases for insertions or deletions performed by the same
+   backend that's doing the scan.  But when amconcurrent is
+   true, insertions or deletions from other backends must be handled as well.)
+  
+
+ 
+
+ 
+  Index Locking Considerations
+
+  
+   An index access method can choose whether it supports concurrent updates
+   of the index by multiple processes.  If the method's
+   pg_am.amconcurrent flag is true, then
+   the core PostgreSQL system obtains
+   AccessShareLock on the index during an index scan, and
+   RowExclusiveLock when updating the index.  Since these lock
+   types do not conflict, the access method is responsible for handling any
+   fine-grained locking it may need.  An exclusive lock on the index as a whole
+   will be taken only during index creation, destruction, or
+   REINDEX.  When amconcurrent is false,
+   PostgreSQL still obtains
+   AccessShareLock during index scans, but it obtains
+   AccessExclusiveLock during any update.  This ensures that
+   updaters have sole use of the index.  Note that this implicitly assumes
+   that index scans are read-only; an access method that might modify the
+   index during a scan will still have to do its own locking to handle the
+   case of concurrent scans.
+  
+
+  
+   Recall that a backend's own locks never conflict; therefore, even a
+   non-concurrent index type must be prepared to handle the case where
+   a backend is inserting or deleting entries in an index that it is itself
+   scanning.  (This is of course necessary to support an UPDATE
+   that uses the index to find the rows to be updated.)
+  
+
+  
+   Building an index type that supports concurrent updates usually requires
+   extensive and subtle analysis of the required behavior.  For the b-tree
+   and hash index types, you can read about the design decisions involved in
+   src/backend/access/nbtree/README and
+   src/backend/access/hash/README.
+  
+
+  
+   Aside from the index's own internal consistency requirements, concurrent
+   updates create issues about consistency between the parent table (the
+   heap) and the index.  Because
+   PostgreSQL separates accesses 
+   and updates of the heap from those of the index, there are windows in
+   which the index may be inconsistent with the heap.  We handle this problem
+   with the following rules:
+
+    
+     
+      
+       A new heap entry is made before making its index entries.  (Therefore
+       a concurrent index scan is likely to fail to see the heap entry.
+       This is okay because the index reader would be uninterested in an
+       uncommitted row anyway.  But see "Index Uniqueness Checks".)
+      
+     
+     
+      
+       When a heap entry is to be deleted (by VACUUM), all its
+       index entries must be removed first.
+      
+     
+     
+      
+       For concurrent index types, an indexscan must maintain a pin
+       on the index page holding the item last returned by
+       amgettuple, and ambulkdelete cannot delete
+       entries from pages that are pinned by other backends.  The need
+       for this rule is explained below.
+      
+     
+    
+
+   If an index is concurrent then it is possible for an index reader to
+   see an index entry just before it is removed by VACUUM, and
+   then to arrive at the corresponding heap entry after that was removed by
+   VACUUM.  (With a nonconcurrent index, this is not possible
+   because of the conflicting index-level locks that will be taken out.)
+   This creates no serious problems if that item
+   number is still unused when the reader reaches it, since an empty
+   item slot will be ignored by heap_fetch().  But what if a
+   third backend has already re-used the item slot for something else?
+   When using an MVCC-compliant snapshot, there is no problem because
+   the new occupant of the slot is certain to be too new to pass the
+   snapshot test.  However, with a non-MVCC-compliant snapshot (such as
+   SnapshotNow), it would be possible to accept and return
+   a row that does not in fact match the scan keys.  We could defend
+   against this scenario by requiring the scan keys to be rechecked
+   against the heap row in all cases, but that is too expensive.  Instead,
+   we use a pin on an index page as a proxy to indicate that the reader
+   may still be in flight from the index entry to the matching
+   heap entry.  Making ambulkdelete block on such a pin ensures
+   that VACUUM cannot delete the heap entry before the reader
+   is done with it.  This solution costs little in runtime, and adds blocking
+   overhead only in the rare cases where there actually is a conflict.
+  
+
+  
+   This solution requires that index scans be synchronous: we have
+   to fetch each heap tuple immediately after scanning the corresponding index
+   entry.  This is expensive for a number of reasons.  An
+   asynchronous scan in which we collect many TIDs from the index,
+   and only visit the heap tuples sometime later, requires much less index
+   locking overhead and may allow a more efficient heap access pattern.
+   Per the above analysis, we must use the synchronous approach for
+   non-MVCC-compliant snapshots, but an asynchronous scan would be safe
+   for a query using an MVCC snapshot.  This possibility is not exploited
+   as of PostgreSQL 8.0, but it is likely to be
+   investigated soon.
+  
+
+ 
+
+ 
+  Index Uniqueness Checks
+
+  
+   PostgreSQL enforces SQL uniqueness constraints
+   using unique indexes, which are indexes that disallow
+   multiple entries with identical keys.  An access method that supports this
+   feature sets pg_am.amcanunique true.
+   (At present, only b-tree supports it.)
+  
+
+  
+   Because of MVCC, it is always necessary to allow duplicate entries to
+   exist physically in an index: the entries might refer to successive
+   versions of a single logical row.  The behavior we actually want to
+   enforce is that no MVCC snapshot could include two rows with equal
+   index keys.  This breaks down into the following cases that must be
+   checked when inserting a new row into a unique index:
+
+    
+     
+      
+       If a conflicting valid row has been deleted by the current transaction,
+       it's okay.  (In particular, since an UPDATE always deletes the old row
+       version before inserting the new version, this will allow an UPDATE on
+       a row without changing the key.)
+      
+     
+     
+      
+       If a conflicting row has been inserted by an as-yet-uncommitted
+       transaction, the would-be inserter must wait to see if that transaction
+       commits.  If it rolls back then there is no conflict.  If it commits
+       without deleting the conflicting row again, there is a uniqueness
+       violation.  (In practice we just wait for the other transaction to
+       end and then redo the visibility check in toto.)
+      
+     
+     
+      
+       Similarly, if a conflicting valid row has been deleted by an
+       as-yet-uncommitted transaction, the would-be inserter must wait
+       for that transaction to commit or abort, and then repeat the test.
+      
+     
+    
+  
+
+  
+   We require the index access method to apply these tests itself, which
+   means that it must reach into the heap to check the commit status of
+   any row that is shown to have a duplicate key according to the index
+   contents.  This is without a doubt ugly and non-modular, but it saves
+   redundant work: if we did a separate probe then the index lookup for
+   a conflicting row would be essentially repeated while finding the place to
+   insert the new row's index entry.  What's more, there is no obvious way
+   to avoid race conditions unless the conflict check is an integral part
+   of insertion of the new index entry.
+  
+
+  
+   The main limitation of this scheme is that it has no convenient way
+   to support deferred uniqueness checks.
+  
+
+ 
+
+ 
+  Index Cost Estimation Functions
+
+  
+   The amcostestimate function is given a list of WHERE clauses that have
+   been determined to be usable with the index.  It must return estimates
+   of the cost of accessing the index and the selectivity of the WHERE
+   clauses (that is, the fraction of parent-table rows that will be
+   retrieved during the index scan).  For simple cases, nearly all the
+   work of the cost estimator can be done by calling standard routines
+   in the optimizer; the point of having an amcostestimate function is
+   to allow index access methods to provide index-type-specific knowledge,
+   in case it is possible to improve on the standard estimates.
+  
+
+  
+   Each amcostestimate function must have the signature:
+
+
+void
+amcostestimate (Query *root,
+                RelOptInfo *rel,
+                IndexOptInfo *index,
+                List *indexQuals,
+                Cost *indexStartupCost,
+                Cost *indexTotalCost,
+                Selectivity *indexSelectivity,
+                double *indexCorrelation);
+
+
+   The first four parameters are inputs:
+
+   
+    
+     root
+     
+      
+       The query being processed.
+      
+     
+    
+
+    
+     rel
+     
+      
+       The relation the index is on.
+      
+     
+    
+
+    
+     index
+     
+      
+       The index itself.
+      
+     
+    
+
+    
+     indexQuals
+     
+      
+       List of index qual clauses (implicitly ANDed);
+       a NIL list indicates no qualifiers are available.
+       Note that the list contains expression trees, not ScanKeys.
+      
+     
+    
+   
+  
+
+  
+   The last four parameters are pass-by-reference outputs:
+
+   
+    
+     *indexStartupCost
+     
+      
+       Set to cost of index start-up processing
+      
+     
+    
+
+    
+     *indexTotalCost
+     
+      
+       Set to total cost of index processing
+      
+     
+    
+
+    
+     *indexSelectivity
+     
+      
+       Set to index selectivity
+      
+     
+    
+
+    
+     *indexCorrelation
+     
+      
+       Set to correlation coefficient between index scan order and
+       underlying table's order
+      
+     
+    
+   
+  
+
+  
+   Note that cost estimate functions must be written in C, not in SQL or
+   any available procedural language, because they must access internal
+   data structures of the planner/optimizer.
+  
+
+  
+   The index access costs should be computed in the units used by
+   src/backend/optimizer/path/costsize.c: a sequential disk block fetch
+   has cost 1.0, a nonsequential fetch has cost random_page_cost, and
+   the cost of processing one index row should usually be taken as
+   cpu_index_tuple_cost (which is a user-adjustable optimizer parameter).
+   In addition, an appropriate multiple of cpu_operator_cost should be charged
+   for any comparison operators invoked during index processing (especially
+   evaluation of the indexQuals themselves).
+  
+
+  
+   The access costs should include all disk and CPU costs associated with
+   scanning the index itself, but NOT the costs of retrieving or processing
+   the parent-table rows that are identified by the index.
+  
+
+  
+   The start-up cost is the part of the total scan cost that must be expended
+   before we can begin to fetch the first row.  For most indexes this can
+   be taken as zero, but an index type with a high start-up cost might want
+   to set it nonzero.
+  
+
+  
+   The indexSelectivity should be set to the estimated fraction of the parent
+   table rows that will be retrieved during the index scan.  In the case
+   of a lossy index, this will typically be higher than the fraction of
+   rows that actually pass the given qual conditions.
+  
+
+  
+   The indexCorrelation should be set to the correlation (ranging between
+   -1.0 and 1.0) between the index order and the table order.  This is used
+   to adjust the estimate for the cost of fetching rows from the parent
+   table.
+  
+
+  
+   Cost Estimation
+   
+    A typical cost estimator will proceed as follows:
+   
+
+   
+    
+     Estimate and return the fraction of parent-table rows that will be visited
+     based on the given qual conditions.  In the absence of any index-type-specific
+     knowledge, use the standard optimizer function clauselist_selectivity():
+
+
+*indexSelectivity = clauselist_selectivity(root, indexQuals,
+                                           rel->relid, JOIN_INNER);
+
+    
+   
+
+   
+    
+     Estimate the number of index rows that will be visited during the
+     scan.  For many index types this is the same as indexSelectivity times
+     the number of rows in the index, but it might be more.  (Note that the
+     index's size in pages and rows is available from the IndexOptInfo struct.)
+    
+   
+
+   
+    
+     Estimate the number of index pages that will be retrieved during the scan.
+     This might be just indexSelectivity times the index's size in pages.
+    
+   
+
+   
+    
+     Compute the index access cost.  A generic estimator might do this:
+
+
+    /*
+     * Our generic assumption is that the index pages will be read
+     * sequentially, so they have cost 1.0 each, not random_page_cost.
+     * Also, we charge for evaluation of the indexquals at each index row.
+     * All the costs are assumed to be paid incrementally during the scan.
+     */
+    cost_qual_eval(&index_qual_cost, indexQuals);
+    *indexStartupCost = index_qual_cost.startup;
+    *indexTotalCost = numIndexPages +
+        (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples;
+
+    
+   
+
+   
+    
+     Estimate the index correlation.  For a simple ordered index on a single
+     field, this can be retrieved from pg_statistic.  If the correlation
+     is not known, the conservative estimate is zero (no correlation).
+    
+   
+  
+
+  
+   Examples of cost estimator functions can be found in
+   src/backend/utils/adt/selfuncs.c.
+  
+ 
+
+
+
diff --git a/doc/src/sgml/indexcost.sgml b/doc/src/sgml/indexcost.sgml

deleted file mode 100644 (file)

index 9758e8e..0000000
--- a/doc/src/sgml/indexcost.sgml
+++ /dev/null
@@ -1,285 +0,0 @@
-
-
- 
-  Index Cost Estimation Functions
-
-  
-   Author
-
-   
-    Written by Tom Lane (tgl@sss.pgh.pa.us) on 2000-01-24
-   
-  
-
-   
-    
-     This must eventually become part of a much larger chapter about
-     writing new index access methods.
-    
-   
-
-  
-   Every index access method must provide a cost estimation function for
-   use by the planner/optimizer.  The procedure OID of this function is
-   given in the amcostestimate field of the access
-   method's pg_am entry.
-
-   
-    
-     Prior to PostgreSQL 7.0, a different
-     scheme was used for registering 
-     index-specific cost estimation functions.
-    
-   
-  
-
-  
-   The amcostestimate function is given a list of WHERE clauses that have
-   been determined to be usable with the index.  It must return estimates
-   of the cost of accessing the index and the selectivity of the WHERE
-   clauses (that is, the fraction of main-table rows that will be
-   retrieved during the index scan).  For simple cases, nearly all the
-   work of the cost estimator can be done by calling standard routines
-   in the optimizer; the point of having an amcostestimate function is
-   to allow index access methods to provide index-type-specific knowledge,
-   in case it is possible to improve on the standard estimates.
-  
-
-  
-   Each amcostestimate function must have the signature:
-
-   
-void
-amcostestimate (Query *root,
-                RelOptInfo *rel,
-                IndexOptInfo *index,
-                List *indexQuals,
-                Cost *indexStartupCost,
-                Cost *indexTotalCost,
-                Selectivity *indexSelectivity,
-                double *indexCorrelation);
-   
-
-   The first four parameters are inputs:
-
-   
-    
-     root
-     
-      
-       The query being processed.
-      
-     
-    
-
-    
-     rel
-     
-      
-       The relation the index is on.
-      
-     
-    
-
-    
-     index
-     
-      
-       The index itself.
-      
-     
-    
-
-    
-     indexQuals
-     
-      
-       List of index qual clauses (implicitly ANDed);
-       a NIL list indicates no qualifiers are available.
-      
-     
-    
-   
-  
-
-  
-   The last four parameters are pass-by-reference outputs:
-
-   
-    
-     *indexStartupCost
-     
-      
-       Set to cost of index start-up processing
-      
-     
-    
-
-    
-     *indexTotalCost
-     
-      
-       Set to total cost of index processing
-      
-     
-    
-
-    
-     *indexSelectivity
-     
-      
-       Set to index selectivity
-      
-     
-    
-
-    
-     *indexCorrelation
-     
-      
-       Set to correlation coefficient between index scan order and
-       underlying table's order
-      
-     
-    
-   
-  
-
-  
-   Note that cost estimate functions must be written in C, not in SQL or
-   any available procedural language, because they must access internal
-   data structures of the planner/optimizer.
-  
-
-  
-   The index access costs should be computed in the units used by
-   src/backend/optimizer/path/costsize.c: a sequential disk block fetch
-   has cost 1.0, a nonsequential fetch has cost random_page_cost, and
-   the cost of processing one index row should usually be taken as
-   cpu_index_tuple_cost (which is a user-adjustable optimizer parameter).
-   In addition, an appropriate multiple of cpu_operator_cost should be charged
-   for any comparison operators invoked during index processing (especially
-   evaluation of the indexQuals themselves).
-  
-
-  
-   The access costs should include all disk and CPU costs associated with
-   scanning the index itself, but NOT the costs of retrieving or processing
-   the main-table rows that are identified by the index.
-  
-
-  
-   The start-up cost is the part of the total scan cost that must be expended
-   before we can begin to fetch the first row.  For most indexes this can
-   be taken as zero, but an index type with a high start-up cost might want
-   to set it nonzero.
-  
-
-  
-   The indexSelectivity should be set to the estimated fraction of the main
-   table rows that will be retrieved during the index scan.  In the case
-   of a lossy index, this will typically be higher than the fraction of
-   rows that actually pass the given qual conditions.
-  
-
-  
-   The indexCorrelation should be set to the correlation (ranging between
-   -1.0 and 1.0) between the index order and the table order.  This is used
-   to adjust the estimate for the cost of fetching rows from the main
-   table.
-  
-
-  
-   Cost Estimation
-   
-    A typical cost estimator will proceed as follows:
-   
-
-   
-    
-     Estimate and return the fraction of main-table rows that will be visited
-     based on the given qual conditions.  In the absence of any index-type-specific
-     knowledge, use the standard optimizer function clauselist_selectivity():
-
-     
-*indexSelectivity = clauselist_selectivity(root, indexQuals,
-                                           rel->relid, JOIN_INNER);
-     
-    
-   
-
-   
-    
-     Estimate the number of index rows that will be visited during the
-     scan.  For many index types this is the same as indexSelectivity times
-     the number of rows in the index, but it might be more.  (Note that the
-     index's size in pages and rows is available from the IndexOptInfo struct.)
-    
-   
-
-   
-    
-     Estimate the number of index pages that will be retrieved during the scan.
-     This might be just indexSelectivity times the index's size in pages.
-    
-   
-
-   
-    
-     Compute the index access cost.  A generic estimator might do this:
-
-     
-    /*
-     * Our generic assumption is that the index pages will be read
-     * sequentially, so they have cost 1.0 each, not random_page_cost.
-     * Also, we charge for evaluation of the indexquals at each index row.
-     * All the costs are assumed to be paid incrementally during the scan.
-     */
-    cost_qual_eval(&index_qual_cost, indexQuals);
-    *indexStartupCost = index_qual_cost.startup;
-    *indexTotalCost = numIndexPages +
-        (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples;
-     
-    
-   
-
-   
-    
-     Estimate the index correlation.  For a simple ordered index on a single
-     field, this can be retrieved from pg_statistic.  If the correlation
-     is not known, the conservative estimate is zero (no correlation).
-    
-   
-  
-
-  
-   Examples of cost estimator functions can be found in
-   src/backend/utils/adt/selfuncs.c.
-  
-
-  
-   By convention, the pg_proc entry for an
-   amcostestimate function should show
-   eight arguments all declared as internal (since none of them have
-   types that are known to SQL), and the return type is void.
-  
- 
-
-
diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml

index 8ec6262226805f9374ade128977536224b814831..a7ba58ce01f8bd28d7430b5822de5ce055df657a 100644 (file)
--- a/doc/src/sgml/postgres.sgml
+++ b/doc/src/sgml/postgres.sgml
@@ -1,5 +1,5 @@
  
  
  
@@ -235,7 +235,7 @@ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.73 2005/01/10 00:04:38 tgl Exp
   &nls;
   &plhandler;
   &geqo;
-  &indexcost;
+  &indexam;
   &gist;
   &storage;
   &bki;
@@ -235,7 +235,7 @@ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.73 2005/01/10 00:04:38 tgl Exp
    &nls;
    &plhandler;
    &geqo;
-  &indexcost;
+  &indexam;
    &gist;
    &storage;
    &bki;
diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml

index 63b2f40592289ffd26aa67a0de8303c476e46e58..0b254324485b6324e3d75e379302223979fbb8db 100644 (file)
--- a/doc/src/sgml/xindex.sgml
+++ b/doc/src/sgml/xindex.sgml
@@ -1,5 +1,5 @@
  
  
  
@@ -43,7 +43,7 @@ $PostgreSQL: pgsql/doc/src/sgml/xindex.sgml,v 1.38 2005/01/23 00:30:18 momjian E
     described in pg_am.  It is possible to add a
     new index method by defining the required interface routines and
     then creating a row in pg_am — but that is
-   far beyond the scope of this chapter.
+   beyond the scope of this chapter (see <xref linkend="indexam">).
    
  
    
@@ -514,7 +514,7 @@ CREATE OPERATOR < (
     
      
       Although PostgreSQL can cope with
-     functions having the same name as long as they have different
+     functions having the same SQL name as long as they have different
       argument data types, C can only cope with one global function
       having a given name.  So we shouldn't name the C function
       something simple like abs_eq.  Usually it's
@@ -525,14 +525,12 @@ CREATE OPERATOR < (
  
     
      
-     We could have made the PostgreSQL name
+     We could have made the SQL name
       of the function abs_eq, relying on
       PostgreSQL to distinguish it by
-     argument data types from any other
-     PostgreSQL function of the same name.
+     argument data types from any other SQL function of the same name.
       To keep the example simple, we make the function have the same
-     names at the C level and PostgreSQL
-     level.
+     names at the C level and SQL level.
author	Tom Lane
	Sun, 13 Feb 2005 03:04:15 +0000 (03:04 +0000)
committer	Tom Lane
	Sun, 13 Feb 2005 03:04:15 +0000 (03:04 +0000)
doc/src/sgml/catalogs.sgml		patch \| blob \| blame \| history
doc/src/sgml/filelist.sgml		patch \| blob \| blame \| history
doc/src/sgml/indexam.sgml	[new file with mode: 0644]	patch \| blob
doc/src/sgml/indexcost.sgml	[deleted file]	patch \| blob \| blame \| history
doc/src/sgml/postgres.sgml		patch \| blob \| blame \| history
doc/src/sgml/xindex.sgml		patch \| blob \| blame \| history