Skip to content

Commit 82842fe

Browse files
Dave Chinnerdchinner
Dave Chinner
authored andcommitted
xfs: fix AGF vs inode cluster buffer deadlock
Lock order in XFS is AGI -> AGF, hence for operations involving inode unlinked list operations we always lock the AGI first. Inode unlinked list operations operate on the inode cluster buffer, so the lock order there is AGI -> inode cluster buffer. For O_TMPFILE operations, this now means the lock order set down in xfs_rename and xfs_link is AGI -> inode cluster buffer -> AGF as the unlinked ops are done before the directory modifications that may allocate space and lock the AGF. Unfortunately, we also now lock the inode cluster buffer when logging an inode so that we can attach the inode to the cluster buffer and pin it in memory. This creates a lock order of AGF -> inode cluster buffer in directory operations as we have to log the inode after we've allocated new space for it. This creates a lock inversion between the AGF and the inode cluster buffer. Because the inode cluster buffer is shared across multiple inodes, the inversion is not specific to individual inodes but can occur when inodes in the same cluster buffer are accessed in different orders. To fix this we need move all the inode log item cluster buffer interactions to the end of the current transaction. Unfortunately, xfs_trans_log_inode() calls are littered throughout the transactions with no thought to ordering against other items or locking. This makes it difficult to do anything that involves changing the call sites of xfs_trans_log_inode() to change locking orders. However, we do now have a mechanism that allows is to postpone dirty item processing to just before we commit the transaction: the ->iop_precommit method. This will be called after all the modifications are done and high level objects like AGI and AGF buffers have been locked and modified, thereby providing a mechanism that guarantees we don't lock the inode cluster buffer before those high level objects are locked. This change is largely moving the guts of xfs_trans_log_inode() to xfs_inode_item_precommit() and providing an extra flag context in the inode log item to track the dirty state of the inode in the current transaction. This also means we do a lot less repeated work in xfs_trans_log_inode() by only doing it once per transaction when all the work is done. Fixes: 298f7be ("xfs: pin inode backing buffer to the inode log item") Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com>
1 parent cb04211 commit 82842fe

File tree

4 files changed

+166
-106
lines changed

4 files changed

+166
-106
lines changed

fs/xfs/libxfs/xfs_log_format.h

+8-1
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 {
324324
#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
325325
#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
326326

327-
328327
/*
329328
* The timestamps are dirty, but not necessarily anything else in the inode
330329
* core. Unlike the other fields above this one must never make it to disk
@@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 {
333332
*/
334333
#define XFS_ILOG_TIMESTAMP 0x4000
335334

335+
/*
336+
* The version field has been changed, but not necessarily anything else of
337+
* interest. This must never make it to disk - it is used purely to ensure that
338+
* the inode item ->precommit operation can update the fsync flag triggers
339+
* in the inode item correctly.
340+
*/
341+
#define XFS_ILOG_IVERSION 0x8000
342+
336343
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
337344
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
338345
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \

fs/xfs/libxfs/xfs_trans_inode.c

+8-105
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,8 @@ xfs_trans_ijoin(
4040
iip->ili_lock_flags = lock_flags;
4141
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
4242

43-
/*
44-
* Get a log_item_desc to point at the new item.
45-
*/
43+
/* Reset the per-tx dirty context and add the item to the tx. */
44+
iip->ili_dirty_flags = 0;
4645
xfs_trans_add_item(tp, &iip->ili_item);
4746
}
4847

@@ -76,17 +75,10 @@ xfs_trans_ichgtime(
7675
/*
7776
* This is called to mark the fields indicated in fieldmask as needing to be
7877
* logged when the transaction is committed. The inode must already be
79-
* associated with the given transaction.
80-
*
81-
* The values for fieldmask are defined in xfs_inode_item.h. We always log all
82-
* of the core inode if any of it has changed, and we always log all of the
83-
* inline data/extents/b-tree root if any of them has changed.
84-
*
85-
* Grab and pin the cluster buffer associated with this inode to avoid RMW
86-
* cycles at inode writeback time. Avoid the need to add error handling to every
87-
* xfs_trans_log_inode() call by shutting down on read error. This will cause
88-
* transactions to fail and everything to error out, just like if we return a
89-
* read error in a dirty transaction and cancel it.
78+
* associated with the given transaction. All we do here is record where the
79+
* inode was dirtied and mark the transaction and inode log item dirty;
80+
* everything else is done in the ->precommit log item operation after the
81+
* changes in the transaction have been completed.
9082
*/
9183
void
9284
xfs_trans_log_inode(
@@ -96,26 +88,13 @@ xfs_trans_log_inode(
9688
{
9789
struct xfs_inode_log_item *iip = ip->i_itemp;
9890
struct inode *inode = VFS_I(ip);
99-
uint iversion_flags = 0;
10091

10192
ASSERT(iip);
10293
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
10394
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
10495

10596
tp->t_flags |= XFS_TRANS_DIRTY;
10697

107-
/*
108-
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
109-
* don't matter - we either will need an extra transaction in 24 hours
110-
* to log the timestamps, or will clear already cleared fields in the
111-
* worst case.
112-
*/
113-
if (inode->i_state & I_DIRTY_TIME) {
114-
spin_lock(&inode->i_lock);
115-
inode->i_state &= ~I_DIRTY_TIME;
116-
spin_unlock(&inode->i_lock);
117-
}
118-
11998
/*
12099
* First time we log the inode in a transaction, bump the inode change
121100
* counter if it is configured for this to occur. While we have the
@@ -128,86 +107,10 @@ xfs_trans_log_inode(
128107
if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
129108
if (IS_I_VERSION(inode) &&
130109
inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
131-
iversion_flags = XFS_ILOG_CORE;
132-
}
133-
134-
/*
135-
* If we're updating the inode core or the timestamps and it's possible
136-
* to upgrade this inode to bigtime format, do so now.
137-
*/
138-
if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
139-
xfs_has_bigtime(ip->i_mount) &&
140-
!xfs_inode_has_bigtime(ip)) {
141-
ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
142-
flags |= XFS_ILOG_CORE;
143-
}
144-
145-
/*
146-
* Inode verifiers do not check that the extent size hint is an integer
147-
* multiple of the rt extent size on a directory with both rtinherit
148-
* and extszinherit flags set. If we're logging a directory that is
149-
* misconfigured in this way, clear the hint.
150-
*/
151-
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
152-
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
153-
(ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
154-
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
155-
XFS_DIFLAG_EXTSZINHERIT);
156-
ip->i_extsize = 0;
157-
flags |= XFS_ILOG_CORE;
110+
flags |= XFS_ILOG_IVERSION;
158111
}
159112

160-
/*
161-
* Record the specific change for fdatasync optimisation. This allows
162-
* fdatasync to skip log forces for inodes that are only timestamp
163-
* dirty.
164-
*/
165-
spin_lock(&iip->ili_lock);
166-
iip->ili_fsync_fields |= flags;
167-
168-
if (!iip->ili_item.li_buf) {
169-
struct xfs_buf *bp;
170-
int error;
171-
172-
/*
173-
* We hold the ILOCK here, so this inode is not going to be
174-
* flushed while we are here. Further, because there is no
175-
* buffer attached to the item, we know that there is no IO in
176-
* progress, so nothing will clear the ili_fields while we read
177-
* in the buffer. Hence we can safely drop the spin lock and
178-
* read the buffer knowing that the state will not change from
179-
* here.
180-
*/
181-
spin_unlock(&iip->ili_lock);
182-
error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
183-
if (error) {
184-
xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
185-
return;
186-
}
187-
188-
/*
189-
* We need an explicit buffer reference for the log item but
190-
* don't want the buffer to remain attached to the transaction.
191-
* Hold the buffer but release the transaction reference once
192-
* we've attached the inode log item to the buffer log item
193-
* list.
194-
*/
195-
xfs_buf_hold(bp);
196-
spin_lock(&iip->ili_lock);
197-
iip->ili_item.li_buf = bp;
198-
bp->b_flags |= _XBF_INODES;
199-
list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
200-
xfs_trans_brelse(tp, bp);
201-
}
202-
203-
/*
204-
* Always OR in the bits from the ili_last_fields field. This is to
205-
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
206-
* in the eventual clearing of the ili_fields bits. See the big comment
207-
* in xfs_iflush() for an explanation of this coordination mechanism.
208-
*/
209-
iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
210-
spin_unlock(&iip->ili_lock);
113+
iip->ili_dirty_flags |= flags;
211114
}
212115

213116
int

fs/xfs/xfs_inode_item.c

+149
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
2929
return container_of(lip, struct xfs_inode_log_item, ili_item);
3030
}
3131

32+
static uint64_t
33+
xfs_inode_item_sort(
34+
struct xfs_log_item *lip)
35+
{
36+
return INODE_ITEM(lip)->ili_inode->i_ino;
37+
}
38+
39+
/*
40+
* Prior to finally logging the inode, we have to ensure that all the
41+
* per-modification inode state changes are applied. This includes VFS inode
42+
* state updates, format conversions, verifier state synchronisation and
43+
* ensuring the inode buffer remains in memory whilst the inode is dirty.
44+
*
45+
* We have to be careful when we grab the inode cluster buffer due to lock
46+
* ordering constraints. The unlinked inode modifications (xfs_iunlink_item)
47+
* require AGI -> inode cluster buffer lock order. The inode cluster buffer is
48+
* not locked until ->precommit, so it happens after everything else has been
49+
* modified.
50+
*
51+
* Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we
52+
* have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we
53+
* cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because
54+
* it can be called on a inode (e.g. via bumplink/droplink) before we take the
55+
* AGF lock modifying directory blocks.
56+
*
57+
* Rather than force a complete rework of all the transactions to call
58+
* xfs_trans_log_inode() once and once only at the end of every transaction, we
59+
* move the pinning of the inode cluster buffer to a ->precommit operation. This
60+
* matches how the xfs_iunlink_item locks the inode cluster buffer, and it
61+
* ensures that the inode cluster buffer locking is always done last in a
62+
* transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode
63+
* cluster buffer.
64+
*
65+
* If we return the inode number as the precommit sort key then we'll also
66+
* guarantee that the order all inode cluster buffer locking is the same all the
67+
* inodes and unlink items in the transaction.
68+
*/
69+
static int
70+
xfs_inode_item_precommit(
71+
struct xfs_trans *tp,
72+
struct xfs_log_item *lip)
73+
{
74+
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
75+
struct xfs_inode *ip = iip->ili_inode;
76+
struct inode *inode = VFS_I(ip);
77+
unsigned int flags = iip->ili_dirty_flags;
78+
79+
/*
80+
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
81+
* don't matter - we either will need an extra transaction in 24 hours
82+
* to log the timestamps, or will clear already cleared fields in the
83+
* worst case.
84+
*/
85+
if (inode->i_state & I_DIRTY_TIME) {
86+
spin_lock(&inode->i_lock);
87+
inode->i_state &= ~I_DIRTY_TIME;
88+
spin_unlock(&inode->i_lock);
89+
}
90+
91+
/*
92+
* If we're updating the inode core or the timestamps and it's possible
93+
* to upgrade this inode to bigtime format, do so now.
94+
*/
95+
if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
96+
xfs_has_bigtime(ip->i_mount) &&
97+
!xfs_inode_has_bigtime(ip)) {
98+
ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
99+
flags |= XFS_ILOG_CORE;
100+
}
101+
102+
/*
103+
* Inode verifiers do not check that the extent size hint is an integer
104+
* multiple of the rt extent size on a directory with both rtinherit
105+
* and extszinherit flags set. If we're logging a directory that is
106+
* misconfigured in this way, clear the hint.
107+
*/
108+
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
109+
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
110+
(ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
111+
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
112+
XFS_DIFLAG_EXTSZINHERIT);
113+
ip->i_extsize = 0;
114+
flags |= XFS_ILOG_CORE;
115+
}
116+
117+
/*
118+
* Record the specific change for fdatasync optimisation. This allows
119+
* fdatasync to skip log forces for inodes that are only timestamp
120+
* dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
121+
* to XFS_ILOG_CORE so that the actual on-disk dirty tracking
122+
* (ili_fields) correctly tracks that the version has changed.
123+
*/
124+
spin_lock(&iip->ili_lock);
125+
iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
126+
if (flags & XFS_ILOG_IVERSION)
127+
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
128+
129+
if (!iip->ili_item.li_buf) {
130+
struct xfs_buf *bp;
131+
int error;
132+
133+
/*
134+
* We hold the ILOCK here, so this inode is not going to be
135+
* flushed while we are here. Further, because there is no
136+
* buffer attached to the item, we know that there is no IO in
137+
* progress, so nothing will clear the ili_fields while we read
138+
* in the buffer. Hence we can safely drop the spin lock and
139+
* read the buffer knowing that the state will not change from
140+
* here.
141+
*/
142+
spin_unlock(&iip->ili_lock);
143+
error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
144+
if (error)
145+
return error;
146+
147+
/*
148+
* We need an explicit buffer reference for the log item but
149+
* don't want the buffer to remain attached to the transaction.
150+
* Hold the buffer but release the transaction reference once
151+
* we've attached the inode log item to the buffer log item
152+
* list.
153+
*/
154+
xfs_buf_hold(bp);
155+
spin_lock(&iip->ili_lock);
156+
iip->ili_item.li_buf = bp;
157+
bp->b_flags |= _XBF_INODES;
158+
list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
159+
xfs_trans_brelse(tp, bp);
160+
}
161+
162+
/*
163+
* Always OR in the bits from the ili_last_fields field. This is to
164+
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
165+
* in the eventual clearing of the ili_fields bits. See the big comment
166+
* in xfs_iflush() for an explanation of this coordination mechanism.
167+
*/
168+
iip->ili_fields |= (flags | iip->ili_last_fields);
169+
spin_unlock(&iip->ili_lock);
170+
171+
/*
172+
* We are done with the log item transaction dirty state, so clear it so
173+
* that it doesn't pollute future transactions.
174+
*/
175+
iip->ili_dirty_flags = 0;
176+
return 0;
177+
}
178+
32179
/*
33180
* The logged size of an inode fork is always the current size of the inode
34181
* fork. This means that when an inode fork is relogged, the size of the logged
@@ -662,6 +809,8 @@ xfs_inode_item_committing(
662809
}
663810

664811
static const struct xfs_item_ops xfs_inode_item_ops = {
812+
.iop_sort = xfs_inode_item_sort,
813+
.iop_precommit = xfs_inode_item_precommit,
665814
.iop_size = xfs_inode_item_size,
666815
.iop_format = xfs_inode_item_format,
667816
.iop_pin = xfs_inode_item_pin,

fs/xfs/xfs_inode_item.h

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ struct xfs_inode_log_item {
1717
struct xfs_log_item ili_item; /* common portion */
1818
struct xfs_inode *ili_inode; /* inode ptr */
1919
unsigned short ili_lock_flags; /* inode lock flags */
20+
unsigned int ili_dirty_flags; /* dirty in current tx */
2021
/*
2122
* The ili_lock protects the interactions between the dirty state and
2223
* the flush state of the inode log item. This allows us to do atomic

0 commit comments

Comments
 (0)