
Commit 3178308

Binary-Eater authored and Saeed Mahameed committed
net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs
Use a map structure for associating CQEs containing port timestamping information with the appropriate skb. Track the order of submitted WQEs using a FIFO. Check whether the port timestamping CQEs corresponding to the lookup values in the FIFO should be considered dropped due to elapsed time. Return the lookup value to a freelist after consuming the skb, and reuse the freed lookup in future WQE submissions.

The map structure uses an integer identifier as the key and returns the skb corresponding to that identifier. The identifier is embedded in the WQE submitted to the WQ for the transmit path when the SQ is a PTP (port timestamping) SQ, and can then be queried from a field in the CQE of the corresponding port timestamping CQ. In the port timestamping napi_poll context, the identifier is read from the polled CQE and used to look up the skb stored on the WQE submit path. The skb reference is removed from the map, filled in with the port HW timestamp information from the CQE, and eventually consumed.

The metadata freelist FIFO is an array of integer identifiers that can be pushed and popped in FIFO order. Its purpose is to track which identifier values can safely be used in a subsequent WQE submission; it must not contain identifiers that have not yet been reaped by processing a corresponding CQE completion on the port timestamping CQ.

The ts_cqe_pending_list structure is a combination of an array and a linked list. The array is pre-populated with the nodes that will be added to and removed from the head of the linked list. Each node contains the unique identifier value submitted in the WQEs and retrieved in the port timestamping CQEs. When a WQE is submitted, the array node corresponding to the identifier popped from the metadata freelist is added to the end of the CQE pending list and marked as "in-use". A node is removed from the linked list under one of two conditions. The first is that the corresponding port timestamping CQE is polled in the PTP napi_poll context. The second is that more than a second has elapsed since the DMA timestamp value corresponding to the WQE submission. Under the first condition, the "in-use" bit in the node is cleared and the resources corresponding to the WQE submission are released. The second condition, however, indicates that the port timestamping CQE will likely never be delivered. It is not impossible for the device to post a CQE after an arbitrarily long time, though it is highly improbable. To stay resilient to this improbable case, the resources related to the WQE submission are kept, the identifier value is not returned to the freelist, and the "in-use" bit is cleared on the node to indicate that it is no longer part of the linked list of "likely to be delivered" port timestamping CQE identifiers. A count of port timestamping CQEs considered highly unlikely to ever be delivered by the device is maintained; it is decremented in the unlikely event such a CQE is later polled in the PTP napi_poll context.

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
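The freelist bookkeeping is the easiest piece to see in isolation. Below is a minimal, self-contained userspace model of the metadata freelist described above: a power-of-two ring of identifiers with free-running push/pop counters masked on access. Names mirror the driver's structures, but this is an illustrative sketch, not the kernel implementation.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct metadata_fifo {
        uint8_t  cc;   /* consumer counter: next identifier to pop */
        uint8_t  pc;   /* producer counter: next slot to push into */
        uint8_t  mask; /* capacity - 1; capacity is a power of two */
        uint8_t *data;
};

static uint8_t metadata_fifo_pop(struct metadata_fifo *fifo)
{
        return fifo->data[fifo->cc++ & fifo->mask];
}

static void metadata_fifo_push(struct metadata_fifo *fifo, uint8_t metadata)
{
        fifo->data[fifo->pc++ & fifo->mask] = metadata;
}

int main(void)
{
        struct metadata_fifo fifo = { .mask = 7 };
        uint8_t id;

        fifo.data = calloc(fifo.mask + 1, sizeof(*fifo.data));
        /* Pre-populate with every identifier, as the driver's
         * mlx5e_ptp_alloc_traffic_db() does for db_sz entries. */
        for (id = 0; id <= fifo.mask; ++id)
                metadata_fifo_push(&fifo, id);

        /* xmit pops an identifier; the CQE handler pushes it back
         * once the skb has been consumed. */
        id = metadata_fifo_pop(&fifo);
        assert(id == 0);
        metadata_fifo_push(&fifo, id);
        return 0;
}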
1 parent b608dd6 commit 3178308

File tree: 7 files changed, +236 -81 lines

Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst (+6)
@@ -683,6 +683,12 @@ the software port.
       time protocol.
     - Error
 
+  * - `ptp_cq[i]_late_cqe`
+    - Number of times a CQE has been delivered on the PTP timestamping CQ when
+      the CQE was not expected since a certain amount of time had elapsed where
+      the device typically ensures not posting the CQE.
+    - Error
+
 .. [#ring_global] The corresponding ring and global counters do not share the
    same name (i.e. do not follow the common naming scheme).
drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c (+158, -57)
@@ -5,6 +5,8 @@
 #include "en/txrx.h"
 #include "en/params.h"
 #include "en/fs_tt_redirect.h"
+#include <linux/list.h>
+#include <linux/spinlock.h>
 
 struct mlx5e_ptp_fs {
         struct mlx5_flow_handle *l2_rule;
@@ -19,6 +21,48 @@ struct mlx5e_ptp_params {
         struct mlx5e_rq_param rq_param;
 };
 
+struct mlx5e_ptp_port_ts_cqe_tracker {
+        u8 metadata_id;
+        bool inuse : 1;
+        struct list_head entry;
+};
+
+struct mlx5e_ptp_port_ts_cqe_list {
+        struct mlx5e_ptp_port_ts_cqe_tracker *nodes;
+        struct list_head tracker_list_head;
+        /* Sync list operations in xmit and napi_poll contexts */
+        spinlock_t tracker_list_lock;
+};
+
+static inline void
+mlx5e_ptp_port_ts_cqe_list_add(struct mlx5e_ptp_port_ts_cqe_list *list, u8 metadata)
+{
+        struct mlx5e_ptp_port_ts_cqe_tracker *tracker = &list->nodes[metadata];
+
+        WARN_ON_ONCE(tracker->inuse);
+        tracker->inuse = true;
+        spin_lock(&list->tracker_list_lock);
+        list_add_tail(&tracker->entry, &list->tracker_list_head);
+        spin_unlock(&list->tracker_list_lock);
+}
+
+static void
+mlx5e_ptp_port_ts_cqe_list_remove(struct mlx5e_ptp_port_ts_cqe_list *list, u8 metadata)
+{
+        struct mlx5e_ptp_port_ts_cqe_tracker *tracker = &list->nodes[metadata];
+
+        WARN_ON_ONCE(!tracker->inuse);
+        tracker->inuse = false;
+        spin_lock(&list->tracker_list_lock);
+        list_del(&tracker->entry);
+        spin_unlock(&list->tracker_list_lock);
+}
+
+void mlx5e_ptpsq_track_metadata(struct mlx5e_ptpsq *ptpsq, u8 metadata)
+{
+        mlx5e_ptp_port_ts_cqe_list_add(ptpsq->ts_cqe_pending_list, metadata);
+}
+
 struct mlx5e_skb_cb_hwtstamp {
         ktime_t cqe_hwtstamp;
         ktime_t port_hwtstamp;
@@ -79,75 +123,88 @@ void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
         memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp));
 }
 
-#define PTP_WQE_CTR2IDX(val) ((val) & ptpsq->ts_cqe_ctr_mask)
-
-static bool mlx5e_ptp_ts_cqe_drop(struct mlx5e_ptpsq *ptpsq, u16 skb_ci, u16 skb_id)
+static struct sk_buff *
+mlx5e_ptp_metadata_map_lookup(struct mlx5e_ptp_metadata_map *map, u16 metadata)
 {
-        return (ptpsq->ts_cqe_ctr_mask && (skb_ci != skb_id));
+        return map->data[metadata];
 }
 
-static bool mlx5e_ptp_ts_cqe_ooo(struct mlx5e_ptpsq *ptpsq, u16 skb_id)
+static struct sk_buff *
+mlx5e_ptp_metadata_map_remove(struct mlx5e_ptp_metadata_map *map, u16 metadata)
 {
-        u16 skb_ci = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
-        u16 skb_pi = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_pc);
+        struct sk_buff *skb;
 
-        if (PTP_WQE_CTR2IDX(skb_id - skb_ci) >= PTP_WQE_CTR2IDX(skb_pi - skb_ci))
-                return true;
+        skb = map->data[metadata];
+        map->data[metadata] = NULL;
 
-        return false;
+        return skb;
 }
 
-static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_ci,
-                                             u16 skb_id, int budget)
+static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq,
+                                                 ktime_t port_tstamp)
 {
-        struct skb_shared_hwtstamps hwts = {};
-        struct sk_buff *skb;
+        struct mlx5e_ptp_port_ts_cqe_list *cqe_list = ptpsq->ts_cqe_pending_list;
+        ktime_t timeout = ns_to_ktime(MLX5E_PTP_TS_CQE_UNDELIVERED_TIMEOUT);
+        struct mlx5e_ptp_metadata_map *metadata_map = &ptpsq->metadata_map;
+        struct mlx5e_ptp_port_ts_cqe_tracker *pos, *n;
+
+        spin_lock(&cqe_list->tracker_list_lock);
+        list_for_each_entry_safe(pos, n, &cqe_list->tracker_list_head, entry) {
+                struct sk_buff *skb =
+                        mlx5e_ptp_metadata_map_lookup(metadata_map, pos->metadata_id);
+                ktime_t dma_tstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp;
 
-        ptpsq->cq_stats->resync_event++;
+                if (!dma_tstamp ||
+                    ktime_after(ktime_add(dma_tstamp, timeout), port_tstamp))
+                        break;
 
-        while (skb_ci != skb_id) {
-                skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
-                hwts.hwtstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp;
-                skb_tstamp_tx(skb, &hwts);
-                ptpsq->cq_stats->resync_cqe++;
-                napi_consume_skb(skb, budget);
-                skb_ci = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
+                metadata_map->undelivered_counter++;
+                WARN_ON_ONCE(!pos->inuse);
+                pos->inuse = false;
+                list_del(&pos->entry);
         }
+        spin_unlock(&cqe_list->tracker_list_lock);
 }
 
+#define PTP_WQE_CTR2IDX(val) ((val) & ptpsq->ts_cqe_ctr_mask)
+
 static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
                                     struct mlx5_cqe64 *cqe,
                                     int budget)
 {
-        u16 skb_id = PTP_WQE_CTR2IDX(be16_to_cpu(cqe->wqe_counter));
-        u16 skb_ci = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
+        struct mlx5e_ptp_port_ts_cqe_list *pending_cqe_list = ptpsq->ts_cqe_pending_list;
+        u8 metadata_id = PTP_WQE_CTR2IDX(be16_to_cpu(cqe->wqe_counter));
+        bool is_err_cqe = !!MLX5E_RX_ERR_CQE(cqe);
         struct mlx5e_txqsq *sq = &ptpsq->txqsq;
         struct sk_buff *skb;
         ktime_t hwtstamp;
 
-        if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-                skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
-                ptpsq->cq_stats->err_cqe++;
-                goto out;
+        if (likely(pending_cqe_list->nodes[metadata_id].inuse)) {
+                mlx5e_ptp_port_ts_cqe_list_remove(pending_cqe_list, metadata_id);
+        } else {
+                /* Reclaim space in the unlikely event CQE was delivered after
+                 * marking it late.
+                 */
+                ptpsq->metadata_map.undelivered_counter--;
+                ptpsq->cq_stats->late_cqe++;
         }
 
-        if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_ci, skb_id)) {
-                if (mlx5e_ptp_ts_cqe_ooo(ptpsq, skb_id)) {
-                        /* already handled by a previous resync */
-                        ptpsq->cq_stats->ooo_cqe_drop++;
-                        return;
-                }
-                mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_ci, skb_id, budget);
+        skb = mlx5e_ptp_metadata_map_remove(&ptpsq->metadata_map, metadata_id);
+
+        if (unlikely(is_err_cqe)) {
+                ptpsq->cq_stats->err_cqe++;
+                goto out;
         }
 
-        skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
         hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe));
         mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP,
                                       hwtstamp, ptpsq->cq_stats);
         ptpsq->cq_stats->cqe++;
 
+        mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
 out:
         napi_consume_skb(skb, budget);
+        mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id);
 }
 
 static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
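One detail worth highlighting from the hunk above: mlx5e_ptpsq_mark_ts_cqes_undelivered can stop at the first entry still inside the timeout window because the pending list is ordered by submission time. Here is a small userspace model of that scan; the one-second window matches the commit description, but the types and names are illustrative assumptions, not the driver's.

#include <stdint.h>
#include <stdio.h>

#define UNDELIVERED_TIMEOUT_NS 1000000000LL /* ~1 second, per the commit message */

struct tracker {
        int64_t dma_tstamp_ns; /* DMA timestamp recorded at WQE submission */
        int     inuse;
};

/* Walk submission-ordered trackers; mark those whose CQE is overdue.
 * Returns the number of entries marked as unlikely to be delivered. */
static int mark_undelivered(struct tracker *pending, int n, int64_t port_tstamp_ns)
{
        int marked = 0;

        for (int i = 0; i < n; ++i) {
                /* The first entry still inside the window ends the scan,
                 * just like the break in the list_for_each_entry_safe()
                 * walk above. */
                if (pending[i].dma_tstamp_ns + UNDELIVERED_TIMEOUT_NS > port_tstamp_ns)
                        break;
                pending[i].inuse = 0; /* off the "likely delivered" list */
                ++marked;
        }
        return marked;
}

int main(void)
{
        struct tracker pending[] = {
                { .dma_tstamp_ns = 0,            .inuse = 1 }, /* > 1s old: marked */
                { .dma_tstamp_ns = 1500000000LL, .inuse = 1 }, /* recent: kept */
        };

        printf("marked %d\n", mark_undelivered(pending, 2, 2400000000LL));
        return 0;
}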
@@ -291,36 +348,78 @@ static void mlx5e_ptp_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn)
 
 static int mlx5e_ptp_alloc_traffic_db(struct mlx5e_ptpsq *ptpsq, int numa)
 {
-        int wq_sz = mlx5_wq_cyc_get_size(&ptpsq->txqsq.wq);
-        struct mlx5_core_dev *mdev = ptpsq->txqsq.mdev;
+        struct mlx5e_ptp_metadata_fifo *metadata_freelist = &ptpsq->metadata_freelist;
+        struct mlx5e_ptp_metadata_map *metadata_map = &ptpsq->metadata_map;
+        struct mlx5e_ptp_port_ts_cqe_list *cqe_list;
+        int db_sz;
+        int md;
 
-        ptpsq->skb_fifo.fifo = kvzalloc_node(array_size(wq_sz, sizeof(*ptpsq->skb_fifo.fifo)),
-                                             GFP_KERNEL, numa);
-        if (!ptpsq->skb_fifo.fifo)
+        cqe_list = kvzalloc_node(sizeof(*ptpsq->ts_cqe_pending_list), GFP_KERNEL, numa);
+        if (!cqe_list)
                 return -ENOMEM;
+        ptpsq->ts_cqe_pending_list = cqe_list;
+
+        db_sz = min_t(u32, mlx5_wq_cyc_get_size(&ptpsq->txqsq.wq),
+                      1 << MLX5_CAP_GEN_2(ptpsq->txqsq.mdev,
+                                          ts_cqe_metadata_size2wqe_counter));
+        ptpsq->ts_cqe_ctr_mask = db_sz - 1;
+
+        cqe_list->nodes = kvzalloc_node(array_size(db_sz, sizeof(*cqe_list->nodes)),
+                                        GFP_KERNEL, numa);
+        if (!cqe_list->nodes)
+                goto free_cqe_list;
+        INIT_LIST_HEAD(&cqe_list->tracker_list_head);
+        spin_lock_init(&cqe_list->tracker_list_lock);
+
+        metadata_freelist->data =
+                kvzalloc_node(array_size(db_sz, sizeof(*metadata_freelist->data)),
+                              GFP_KERNEL, numa);
+        if (!metadata_freelist->data)
+                goto free_cqe_list_nodes;
+        metadata_freelist->mask = ptpsq->ts_cqe_ctr_mask;
+
+        for (md = 0; md < db_sz; ++md) {
+                cqe_list->nodes[md].metadata_id = md;
+                metadata_freelist->data[md] = md;
+        }
+        metadata_freelist->pc = db_sz;
+
+        metadata_map->data =
+                kvzalloc_node(array_size(db_sz, sizeof(*metadata_map->data)),
+                              GFP_KERNEL, numa);
+        if (!metadata_map->data)
+                goto free_metadata_freelist;
+        metadata_map->capacity = db_sz;
 
-        ptpsq->skb_fifo.pc = &ptpsq->skb_fifo_pc;
-        ptpsq->skb_fifo.cc = &ptpsq->skb_fifo_cc;
-        ptpsq->skb_fifo.mask = wq_sz - 1;
-        if (MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter))
-                ptpsq->ts_cqe_ctr_mask =
-                        (1 << MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter)) - 1;
         return 0;
+
+free_metadata_freelist:
+        kvfree(metadata_freelist->data);
+free_cqe_list_nodes:
+        kvfree(cqe_list->nodes);
+free_cqe_list:
+        kvfree(cqe_list);
+        return -ENOMEM;
 }
 
-static void mlx5e_ptp_drain_skb_fifo(struct mlx5e_skb_fifo *skb_fifo)
+static void mlx5e_ptp_drain_metadata_map(struct mlx5e_ptp_metadata_map *map)
 {
-        while (*skb_fifo->pc != *skb_fifo->cc) {
-                struct sk_buff *skb = mlx5e_skb_fifo_pop(skb_fifo);
+        int idx;
+
+        for (idx = 0; idx < map->capacity; ++idx) {
+                struct sk_buff *skb = map->data[idx];
 
                 dev_kfree_skb_any(skb);
         }
 }
 
-static void mlx5e_ptp_free_traffic_db(struct mlx5e_skb_fifo *skb_fifo)
+static void mlx5e_ptp_free_traffic_db(struct mlx5e_ptpsq *ptpsq)
 {
-        mlx5e_ptp_drain_skb_fifo(skb_fifo);
-        kvfree(skb_fifo->fifo);
+        mlx5e_ptp_drain_metadata_map(&ptpsq->metadata_map);
+        kvfree(ptpsq->metadata_map.data);
+        kvfree(ptpsq->metadata_freelist.data);
+        kvfree(ptpsq->ts_cqe_pending_list->nodes);
+        kvfree(ptpsq->ts_cqe_pending_list);
 }
 
 static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
@@ -348,8 +447,7 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
         if (err)
                 goto err_free_txqsq;
 
-        err = mlx5e_ptp_alloc_traffic_db(ptpsq,
-                                         dev_to_node(mlx5_core_dma_dev(c->mdev)));
+        err = mlx5e_ptp_alloc_traffic_db(ptpsq, dev_to_node(mlx5_core_dma_dev(c->mdev)));
         if (err)
                 goto err_free_txqsq;
 
@@ -366,7 +464,7 @@ static void mlx5e_ptp_close_txqsq(struct mlx5e_ptpsq *ptpsq)
         struct mlx5e_txqsq *sq = &ptpsq->txqsq;
         struct mlx5_core_dev *mdev = sq->mdev;
 
-        mlx5e_ptp_free_traffic_db(&ptpsq->skb_fifo);
+        mlx5e_ptp_free_traffic_db(ptpsq);
         cancel_work_sync(&sq->recover_work);
         mlx5e_ptp_destroy_sq(mdev, sq->sqn);
         mlx5e_free_txqsq_descs(sq);
@@ -534,7 +632,10 @@ static void mlx5e_ptp_build_params(struct mlx5e_ptp *c,
 
         /* SQ */
         if (test_bit(MLX5E_PTP_STATE_TX, c->state)) {
-                params->log_sq_size = orig->log_sq_size;
+                params->log_sq_size =
+                        min(MLX5_CAP_GEN_2(c->mdev, ts_cqe_metadata_size2wqe_counter),
+                            MLX5E_PTP_MAX_LOG_SQ_SIZE);
+                params->log_sq_size = min(params->log_sq_size, orig->log_sq_size);
                 mlx5e_ptp_build_sq_param(c->mdev, params, &cparams->txq_sq_param);
         }
         /* RQ */
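For context, the submit-side half of this scheme lives in en_tx.c and en/txrx.h, among the five changed files not shown on this page. The sketch below is built on assumptions: mlx5e_ptp_metadata_fifo_pop is inferred as the counterpart of the mlx5e_ptp_metadata_fifo_push call visible above, and wqe_embed_metadata is a hypothetical stand-in for the real WQE field write.

/* Hedged sketch of the xmit flow described in the commit message; only
 * the order of operations is taken from the text, not the real code. */
static void ptp_sq_submit_sketch(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb,
                                 struct mlx5e_tx_wqe *wqe)
{
        /* 1. Reserve an identifier no in-flight WQE is using. */
        u8 metadata = mlx5e_ptp_metadata_fifo_pop(&ptpsq->metadata_freelist);

        /* 2. Remember which skb the identifier stands for. */
        ptpsq->metadata_map.data[metadata] = skb;

        /* 3. Embed the identifier in the WQE so the port timestamping
         * CQE can echo it back through its wqe_counter field. */
        wqe_embed_metadata(wqe, metadata); /* hypothetical helper */

        /* 4. Track it as a CQE we expect to see; napi_poll removes it,
         * or the timeout scan marks it undelivered after ~1 second. */
        mlx5e_ptpsq_track_metadata(ptpsq, metadata);
}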
