Skip to content

Commit 9978599

Browse files
Robbie Ko, kdave
Robbie Ko
authored and committed
btrfs: reduce lock contention when eb cache miss for btree search
When crawling the btree, if an eb (extent buffer) cache miss occurs, we switch to taking the eb read lock and release all previously held locks (including the parent lock) to reduce lock contention.

If an eb cache miss occurs in a leaf and IO is needed, before this change we released locks only from level 2 and up, and read the leaf's content from disk while holding a lock on its parent (level 1), causing unnecessary lock contention on the parent. After this change we release locks from level 1 and up, but we lock level 0 and then read the leaf's content from disk.

Because we have prepared the check parameters and we hold the read lock of the eb, we can ensure that no race will occur during the check and cause unexpected errors.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent a9c50c9 commit 9978599

File tree

1 file changed

+70
-31
lines changed

1 file changed

+70
-31
lines changed

fs/btrfs/ctree.c

+70-31
Original file line number | Diff line number | Diff line change
@@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
15151515
struct btrfs_tree_parent_check check = { 0 };
15161516
u64 blocknr;
15171517
u64 gen;
1518-
struct extent_buffer *tmp;
1519-
int ret;
1518+
struct extent_buffer *tmp = NULL;
1519+
int ret = 0;
15201520
int parent_level;
1521-
bool unlock_up;
1521+
int err;
1522+
bool read_tmp = false;
1523+
bool tmp_locked = false;
1524+
bool path_released = false;
15221525

1523-
unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
15241526
blocknr = btrfs_node_blockptr(*eb_ret, slot);
15251527
gen = btrfs_node_ptr_generation(*eb_ret, slot);
15261528
parent_level = btrfs_header_level(*eb_ret);
@@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
15511553
*/
15521554
if (btrfs_verify_level_key(tmp,
15531555
parent_level - 1, &check.first_key, gen)) {
1554-
free_extent_buffer(tmp);
1555-
return -EUCLEAN;
1556+
ret = -EUCLEAN;
1557+
goto out;
15561558
}
15571559
*eb_ret = tmp;
1558-
return 0;
1560+
tmp = NULL;
1561+
ret = 0;
1562+
goto out;
15591563
}
15601564

15611565
if (p->nowait) {
1562-
free_extent_buffer(tmp);
1563-
return -EAGAIN;
1566+
ret = -EAGAIN;
1567+
goto out;
15641568
}
15651569

1566-
if (unlock_up)
1570+
if (!p->skip_locking) {
15671571
btrfs_unlock_up_safe(p, level + 1);
1568-
1569-
/* now we're allowed to do a blocking uptodate check */
1570-
ret = btrfs_read_extent_buffer(tmp, &check);
1571-
if (ret) {
1572-
free_extent_buffer(tmp);
1572+
tmp_locked = true;
1573+
btrfs_tree_read_lock(tmp);
15731574
btrfs_release_path(p);
1574-
return ret;
1575+
ret = -EAGAIN;
1576+
path_released = true;
15751577
}
15761578

1577-
if (unlock_up)
1578-
ret = -EAGAIN;
1579+
/* Now we're allowed to do a blocking uptodate check. */
1580+
err = btrfs_read_extent_buffer(tmp, &check);
1581+
if (err) {
1582+
ret = err;
1583+
goto out;
1584+
}
15791585

1586+
if (ret == 0) {
1587+
ASSERT(!tmp_locked);
1588+
*eb_ret = tmp;
1589+
tmp = NULL;
1590+
}
15801591
goto out;
15811592
} else if (p->nowait) {
1582-
return -EAGAIN;
1593+
ret = -EAGAIN;
1594+
goto out;
15831595
}
15841596

1585-
if (unlock_up) {
1597+
if (!p->skip_locking) {
15861598
btrfs_unlock_up_safe(p, level + 1);
15871599
ret = -EAGAIN;
1588-
} else {
1589-
ret = 0;
15901600
}
15911601

15921602
if (p->reada != READA_NONE)
15931603
reada_for_search(fs_info, p, level, slot, key->objectid);
15941604

1595-
tmp = read_tree_block(fs_info, blocknr, &check);
1605+
tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
15961606
if (IS_ERR(tmp)) {
1607+
ret = PTR_ERR(tmp);
1608+
tmp = NULL;
1609+
goto out;
1610+
}
1611+
read_tmp = true;
1612+
1613+
if (!p->skip_locking) {
1614+
ASSERT(ret == -EAGAIN);
1615+
tmp_locked = true;
1616+
btrfs_tree_read_lock(tmp);
15971617
btrfs_release_path(p);
1598-
return PTR_ERR(tmp);
1618+
path_released = true;
1619+
}
1620+
1621+
/* Now we're allowed to do a blocking uptodate check. */
1622+
err = btrfs_read_extent_buffer(tmp, &check);
1623+
if (err) {
1624+
ret = err;
1625+
goto out;
15991626
}
1627+
16001628
/*
16011629
* If the read above didn't mark this buffer up to date,
16021630
* it will never end up being up to date. Set ret to EIO now
16031631
* and give up so that our caller doesn't loop forever
16041632
* on our EAGAINs.
16051633
*/
1606-
if (!extent_buffer_uptodate(tmp))
1634+
if (!extent_buffer_uptodate(tmp)) {
16071635
ret = -EIO;
1636+
goto out;
1637+
}
16081638

1609-
out:
16101639
if (ret == 0) {
1640+
ASSERT(!tmp_locked);
16111641
*eb_ret = tmp;
1612-
} else {
1613-
free_extent_buffer(tmp);
1614-
btrfs_release_path(p);
1642+
tmp = NULL;
1643+
}
1644+
out:
1645+
if (tmp) {
1646+
if (tmp_locked)
1647+
btrfs_tree_read_unlock(tmp);
1648+
if (read_tmp && ret && ret != -EAGAIN)
1649+
free_extent_buffer_stale(tmp);
1650+
else
1651+
free_extent_buffer(tmp);
16151652
}
1653+
if (ret && !path_released)
1654+
btrfs_release_path(p);
16161655

16171656
return ret;
16181657
}
@@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21982237
}
21992238

22002239
err = read_block_for_search(root, p, &b, level, slot, key);
2201-
if (err == -EAGAIN)
2240+
if (err == -EAGAIN && !p->nowait)
22022241
goto again;
22032242
if (err) {
22042243
ret = err;
@@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
23252364
}
23262365

23272366
err = read_block_for_search(root, p, &b, level, slot, key);
2328-
if (err == -EAGAIN)
2367+
if (err == -EAGAIN && !p->nowait)
23292368
goto again;
23302369
if (err) {
23312370
ret = err;

0 commit comments

Comments
 (0)