Skip to content

Commit 78d4f7b

Browse files
committed
Improve stability of btree page split on ERRORs
This improves the stability of VACUUM when processing btree indexes, which was previously able to trigger an assertion failure in _bt_lock_subtree_parent() when an error was previously thrown outside the scope of _bt_split() when splitting a btree page. VACUUM would consider the index as in a corrupted state as the right page would not be zeroed for the error thrown (allocation failure is one pattern). In a non-assert build, VACUUM is able to succeed, reporting what it sees as a corruption while attempting to fix the index. This would manifest as a LOG message, as of: LOG: failed to re-find parent key in index "idx" for deletion target page N CONTEXT: while vacuuming index "idx" of relation "public.tab" This commit improves the code to rely on two PGAlignedBlocks that are used as a temporary space for the left and right pages. The main change concerns the right page, whose contents are now copied into the "temporary" PGAlignedBlock page while its original space is zeroed. Its contents are moved from the PGAlignedBlock page back to the page once we enter in the critical section used for the split. This simplifies the split logic, as it is not necessary to zero the right page before throwing an error anymore. Hence errors can now be thrown outside the split code. For the left page, this shaves one allocation, with PageGetTempPage() being previously used. The previous logic originates from commit 8fa30f9, at a point where PGAlignedBlock did not exist yet. This could be argued as something that should be backpatched, but the lack of complaints indicates that it may not be necessary. Author: Konstantin Knizhnik <[email protected]> Discussion: https://postgr.es/m/[email protected]
1 parent 3fbcbf1 commit 78d4f7b

File tree

1 file changed

+31
-20
lines changed

1 file changed

+31
-20
lines changed

src/backend/access/nbtree/nbtinsert.c

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "miscadmin.h"
2626
#include "storage/lmgr.h"
2727
#include "storage/predicate.h"
28+
#include "utils/injection_point.h"
2829

2930
/* Minimum tree height for application of fastpath optimization */
3031
#define BTREE_FASTPATH_MIN_LEVEL 2
@@ -1472,6 +1473,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
14721473
Page origpage;
14731474
Page leftpage,
14741475
rightpage;
1476+
PGAlignedBlock leftpage_buf,
1477+
rightpage_buf;
14751478
BlockNumber origpagenumber,
14761479
rightpagenumber;
14771480
BTPageOpaque ropaque,
@@ -1545,8 +1548,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
15451548
firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
15461549
newitem, &newitemonleft);
15471550

1548-
/* Allocate temp buffer for leftpage */
1549-
leftpage = PageGetTempPage(origpage);
1551+
/* Use temporary buffer for leftpage */
1552+
leftpage = leftpage_buf.data;
15501553
_bt_pageinit(leftpage, BufferGetPageSize(buf));
15511554
lopaque = BTPageGetOpaque(leftpage);
15521555

@@ -1709,19 +1712,23 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
17091712

17101713
/*
17111714
* Acquire a new right page to split into, now that left page has a new
1712-
* high key. From here on, it's not okay to throw an error without
1713-
* zeroing rightpage first. This coding rule ensures that we won't
1714-
* confuse future VACUUM operations, which might otherwise try to re-find
1715-
* a downlink to a leftover junk page as the page undergoes deletion.
1715+
* high key.
17161716
*
1717-
* It would be reasonable to start the critical section just after the new
1718-
* rightpage buffer is acquired instead; that would allow us to avoid
1719-
* leftover junk pages without bothering to zero rightpage. We do it this
1720-
* way because it avoids an unnecessary PANIC when either origpage or its
1721-
* existing sibling page are corrupt.
1717+
* To not confuse future VACUUM operations, we zero the right page and
1718+
* work on an in-memory copy of it before writing WAL, then copy its
1719+
* contents back to the actual page once we start the critical section
1720+
* work. This simplifies the split work, so as there is no need to zero
1721+
* the right page before throwing an error.
17221722
*/
17231723
rbuf = _bt_allocbuf(rel, heaprel);
1724-
rightpage = BufferGetPage(rbuf);
1724+
rightpage = rightpage_buf.data;
1725+
1726+
/*
1727+
* Copy the contents of the right page into its temporary location, and
1728+
* zero the original space.
1729+
*/
1730+
memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1731+
memset(BufferGetPage(rbuf), 0, BLCKSZ);
17251732
rightpagenumber = BufferGetBlockNumber(rbuf);
17261733
/* rightpage was initialized by _bt_allocbuf */
17271734
ropaque = BTPageGetOpaque(rightpage);
@@ -1770,7 +1777,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
17701777
if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
17711778
false, false) == InvalidOffsetNumber)
17721779
{
1773-
memset(rightpage, 0, BufferGetPageSize(rbuf));
17741780
elog(ERROR, "failed to add high key to the right sibling"
17751781
" while splitting block %u of index \"%s\"",
17761782
origpagenumber, RelationGetRelationName(rel));
@@ -1818,7 +1824,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
18181824
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
18191825
false))
18201826
{
1821-
memset(rightpage, 0, BufferGetPageSize(rbuf));
18221827
elog(ERROR, "failed to add new item to the left sibling"
18231828
" while splitting block %u of index \"%s\"",
18241829
origpagenumber, RelationGetRelationName(rel));
@@ -1831,7 +1836,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
18311836
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
18321837
afterrightoff == minusinfoff))
18331838
{
1834-
memset(rightpage, 0, BufferGetPageSize(rbuf));
18351839
elog(ERROR, "failed to add new item to the right sibling"
18361840
" while splitting block %u of index \"%s\"",
18371841
origpagenumber, RelationGetRelationName(rel));
@@ -1845,7 +1849,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
18451849
{
18461850
if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
18471851
{
1848-
memset(rightpage, 0, BufferGetPageSize(rbuf));
18491852
elog(ERROR, "failed to add old item to the left sibling"
18501853
" while splitting block %u of index \"%s\"",
18511854
origpagenumber, RelationGetRelationName(rel));
@@ -1857,7 +1860,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
18571860
if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
18581861
afterrightoff == minusinfoff))
18591862
{
1860-
memset(rightpage, 0, BufferGetPageSize(rbuf));
18611863
elog(ERROR, "failed to add old item to the right sibling"
18621864
" while splitting block %u of index \"%s\"",
18631865
origpagenumber, RelationGetRelationName(rel));
@@ -1878,7 +1880,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
18781880
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
18791881
afterrightoff == minusinfoff))
18801882
{
1881-
memset(rightpage, 0, BufferGetPageSize(rbuf));
18821883
elog(ERROR, "failed to add new item to the right sibling"
18831884
" while splitting block %u of index \"%s\"",
18841885
origpagenumber, RelationGetRelationName(rel));
@@ -1894,11 +1895,11 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
18941895
if (!isrightmost)
18951896
{
18961897
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1898+
INJECTION_POINT("bt-split", NULL);
18971899
spage = BufferGetPage(sbuf);
18981900
sopaque = BTPageGetOpaque(spage);
18991901
if (sopaque->btpo_prev != origpagenumber)
19001902
{
1901-
memset(rightpage, 0, BufferGetPageSize(rbuf));
19021903
ereport(ERROR,
19031904
(errcode(ERRCODE_INDEX_CORRUPTED),
19041905
errmsg_internal("right sibling's left-link doesn't match: "
@@ -1941,9 +1942,19 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
19411942
* original. We need to do this before writing the WAL record, so that
19421943
* XLogInsert can WAL log an image of the page if necessary.
19431944
*/
1944-
PageRestoreTempPage(leftpage, origpage);
1945+
memcpy(origpage, leftpage, BLCKSZ);
19451946
/* leftpage, lopaque must not be used below here */
19461947

1948+
/*
1949+
* Move the contents of the right page from its temporary location to the
1950+
* destination buffer, before writing the WAL record. Unlike the left
1951+
* page, the right page and its opaque area are still needed to complete
1952+
* the update of the page, so reinitialize them.
1953+
*/
1954+
rightpage = BufferGetPage(rbuf);
1955+
memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1956+
ropaque = BTPageGetOpaque(rightpage);
1957+
19471958
MarkBufferDirty(buf);
19481959
MarkBufferDirty(rbuf);
19491960

0 commit comments

Comments
 (0)