diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 41 |
1 files changed, 32 insertions, 9 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 7b88e977196..f95f67ad4b5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -40,6 +40,18 @@ * them. They will need to be re-read into shared buffers on first use after * the build finishes. * + * Since the index will never be used unless it is completely built, + * from a crash-recovery point of view there is no need to WAL-log the + * steps of the build. After completing the index build, we can just sync + * the whole file to disk using smgrimmedsync() before exiting this module. + * This can be seen to be sufficient for crash recovery by considering that + * it's effectively equivalent to what would happen if a CHECKPOINT occurred + * just after the index build. However, it is clearly not sufficient if the + * DBA is using the WAL log for PITR or replication purposes, since another + * machine would not be able to reconstruct the index from WAL. Therefore, + * we log the completed index pages to WAL if and only if WAL archiving is + * active. + * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. * @@ -204,7 +216,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; - wstate.btws_use_wal = RelationNeedsWAL(wstate.index); + + /* + * We need to log index creation in WAL iff WAL archiving/streaming is + * enabled UNLESS the index isn't WAL-logged anyway. + */ + wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; @@ -794,15 +811,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. + * If the index is WAL-logged, we must fsync it down to disk before it's + * safe to commit the transaction. (For a non-WAL-logged index we don't + * care since the index will be uninteresting after a crash anyway.) + * + * It's obvious that we must do this when not WAL-logging the build. It's + * less obvious that we have to do it even if we did WAL-log the index + * pages. The reason is that since we're building outside shared buffers, + * a CHECKPOINT occurring during the build has no way to flush the + * previously written data to disk (indeed it won't know the index even + * exists). A crash later on would replay WAL from the checkpoint, + * therefore it wouldn't replay our earlier WAL entries. If we do not + * fsync those pages here, they might still not be on disk when the crash + * occurs. */ - if (wstate->btws_use_wal) + if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); |