static struct pci_driver mlx5_core_driver = {
.name = DRIVER_NAME,
.id_table = mlx5_core_pci_table,
.probe = init_one,
.remove = remove_one,
.shutdown = shutdown,
.err_handler = &mlx5_err_handler,
.sriov_configure = mlx5_core_sriov_configure,
};
5.1 mlx5分配ring buffer init_one() mlx5_load_one() alloc_comp_eqs() mlx5_create_map_eq() mlx5_buf_alloc() mlx5_buf_alloc_node() `mlx5_buf_list`指针 kzalloc() 分配DMA内存,即ring buffer mlx5_dma_zalloc_coherent_node() dma_zalloc_coherent() dma_alloc_coherent() 注意 :经过实际测试,发现调整ring buffer会增加内存开销,具体取决于每个驱动的实现、PAGESIZE、MTU。
场景 :128核CPU,512G内存,mlx5_core 5.8-3.0.7驱动,MTU 9000,PAGESIZE 4096,queue数量63。
结论 :将一个网卡的ring buffer从1024调整到8192大约增加6G内存,2个网卡增加12G内存。
评估公式 : PAGESIZE*page_count*buffer_count*queue_count
。PAGESIZE影响最小分配内存,MTU影响page个数。
如下以4k的PAGESIZE和9000MTU为例,ring buffer从1024调整到8192进行计算。
分配ring buffer 的核心逻辑主要在alloc_comp_eqs()
中。
为每个完成队列分配一个CPU中断映射表
将中断向量添加到CPU中断映射中
使用snprintf生成完成队列的名称,并调用mlx5_create_map_eq函数创建完成队列
enum {
MLX5_COMP_EQ_SIZE = 1024,
};
static int alloc_comp_eqs(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = &dev->priv.eq_table;
char name[MLX5_MAX_IRQ_NAME];
struct mlx5_eq *eq;
int ncomp_vec;
int nent;
int err;
int i;
INIT_LIST_HEAD(&table->comp_eqs_list);
ncomp_vec = table->num_comp_vectors;
nent = MLX5_COMP_EQ_SIZE;
#ifdef CONFIG_RFS_ACCEL
dev->rmap = alloc_irq_cpu_rmap(ncomp_vec);
if (!dev->rmap)
return -ENOMEM;
#endif
for (i = 0; i < ncomp_vec; i++) {
eq = kzalloc(sizeof(*eq), GFP_KERNEL);
if (!eq) {
err = -ENOMEM;
goto clean;
}
#ifdef CONFIG_RFS_ACCEL
irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev,
MLX5_EQ_VEC_COMP_BASE + i));
#endif
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
err = mlx5_create_map_eq(dev, eq,
i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
name, MLX5_EQ_TYPE_COMP);
if (err) {
kfree(eq);
goto clean;
}
mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->eqn);
eq->index = i;
spin_lock(&table->lock);
list_add_tail(&eq->list, &table->comp_eqs_list);
spin_unlock(&table->lock);
}
return 0;
clean:
free_comp_eqs(dev);
return err;
}
在alloc_comp_eqs()
中执行了如下代码分配内存,决定了后续函数分配内存的大小。
nent = MLX5_COMP_EQ_SIZE;
// 省略
err = mlx5_create_map_eq(dev, eq,
i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
name, MLX5_EQ_TYPE_COMP);
enum {
MLX5_NUM_SPARE_EQE = 0x80, // 十进制为128
MLX5_NUM_ASYNC_EQE = 0x1000,
MLX5_NUM_CMD_EQE = 32,
MLX5_NUM_PF_DRAIN = 64,
};
enum {
MLX5_EQE_SIZE = sizeof(struct mlx5_eqe),
MLX5_EQE_OWNER_INIT_VAL = 0x1,
};
union ev_data {
__be32 raw[7];
struct mlx5_eqe_cmd cmd;
struct mlx5_eqe_comp comp;
struct mlx5_eqe_qp_srq qp_srq;
struct mlx5_eqe_cq_err cq_err;
struct mlx5_eqe_port_state port;
struct mlx5_eqe_gpio gpio;
struct mlx5_eqe_congestion cong;
struct mlx5_eqe_stall_vl stall_vl;
struct mlx5_eqe_page_req req_pages;
struct mlx5_eqe_page_fault page_fault;
struct mlx5_eqe_vport_change vport_change;
struct mlx5_eqe_port_module port_module;
struct mlx5_eqe_pps pps;
struct mlx5_eqe_dct dct;
struct mlx5_eqe_temp_warning temp_warning;
} __packed;
struct mlx5_eqe {
u8 rsvd0;
u8 type;
u8 rsvd1;
u8 sub_type;
__be32 rsvd2[7];
union ev_data data;
__be16 rsvd3;
u8 signature;
u8 owner;
} __packed;
int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
int nent, u64 mask, const char *name,
enum mlx5_eq_type type)
{
struct mlx5_cq_table *cq_table = &eq->cq_table;
u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
struct mlx5_priv *priv = &dev->priv;
irq_handler_t handler;
__be64 *pas;
void *eqc;
int inlen;
u32 *in;
int err;
/* Init CQ table */
memset(cq_table, 0, sizeof(*cq_table));
spin_lock_init(&cq_table->lock);
INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
eq->type = type;
// 1024+128再roundup_pow_of_two后得到2048
eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
eq->cons_index = 0;
// 2048*64=131072字节
err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
if (err)
return err;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (type == MLX5_EQ_TYPE_PF)
handler = mlx5_eq_pf_int;
else
#endif
handler = mlx5_eq_int;
init_eq_buf(eq);
inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages;
in = kvzalloc(inlen, GFP_KERNEL);
if (!in) {
err = -ENOMEM;
goto err_buf;
}
pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
mlx5_fill_page_array(&eq->buf, pas);
MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
MLX5_SET64(create_eq_in, in, event_bitmask, mask);
eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
MLX5_SET(eqc, eqc, uar_page, priv->uar->index);
MLX5_SET(eqc, eqc, intr, vecidx);
MLX5_SET(eqc, eqc, log_page_size,
eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
if (err)
goto err_in;
snprintf(priv->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s",
name, pci_name(dev->pdev));
eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
eq->irqn = pci_irq_vector(dev->pdev, vecidx);
eq->dev = dev;
eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
err = request_irq(eq->irqn, handler, 0,
priv->irq_info[vecidx].name, eq);
if (err)
goto err_eq;
err = mlx5_debug_eq_add(dev, eq);
if (err)
goto err_irq;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (type == MLX5_EQ_TYPE_PF) {
err = init_pf_ctx(&eq->pf_ctx, name);
if (err)
goto err_irq;
} else
#endif
{
INIT_LIST_HEAD(&eq->tasklet_ctx.list);
INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
spin_lock_init(&eq->tasklet_ctx.lock);
tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
(unsigned long)&eq->tasklet_ctx);
}
/* EQs are created in ARMED state
*/
eq_update_ci(eq, 1);
kvfree(in);
return 0;
err_irq:
free_irq(eq->irqn, eq);
err_eq:
mlx5_cmd_destroy_eq(dev, eq->eqn);
err_in:
kvfree(in);
err_buf:
mlx5_buf_free(dev, &eq->buf);
return err;
}
eq->type = type;
// 1024+128再roundup_pow_of_two后得到2048
eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
eq->cons_index = 0;
// 2048*64=131072字节
err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
if (err)
return err;
// /home/kangxiaoning/workspace/kernel-4.19.90-2404.2.0/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
struct mlx5_frag_buf *buf, int node)
{
dma_addr_t t;
buf->size = size;
buf->npages = 1;
buf->page_shift = (u8)get_order(size) + PAGE_SHIFT;
buf->frags = kzalloc(sizeof(*buf->frags), GFP_KERNEL);
if (!buf->frags)
return -ENOMEM;
buf->frags->buf = mlx5_dma_zalloc_coherent_node(dev, size,
&t, node);
if (!buf->frags->buf)
goto err_out;
buf->frags->map = t;
while (t & ((1 << buf->page_shift) - 1)) {
--buf->page_shift;
buf->npages *= 2;
}
return 0;
err_out:
kfree(buf->frags);
return -ENOMEM;
}
从下面的收包逻辑可以判断,分配ring buffer过程中把RX的skb空间也分配了,收到包以后就从队列中取一个skb进行处理。
void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
{
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
struct mlx5e_wqe_frag_info *wi;
struct sk_buff *skb;
u32 cqe_bcnt;
u16 ci;
ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
wi = get_frag(rq, ci);
cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
/* do not return page to cache,
* it will be returned on XDP_TX completion.
*/
goto wq_cyc_pop;
}
goto free_wqe;
}
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
napi_gro_receive(rq->cq.napi, skb);
free_wqe:
mlx5e_free_rx_wqe(rq, wi, true);
wq_cyc_pop:
mlx5_wq_cyc_pop(wq);
}
5.2 mlx5初始化tasklet
int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
int nent, u64 mask, const char *name,
enum mlx5_eq_type type)
{
struct mlx5_cq_table *cq_table = &eq->cq_table;
u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
struct mlx5_priv *priv = &dev->priv;
irq_handler_t handler;
__be64 *pas;
void *eqc;
int inlen;
u32 *in;
int err;
/* Init CQ table */
memset(cq_table, 0, sizeof(*cq_table));
spin_lock_init(&cq_table->lock);
INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
eq->type = type;
eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
eq->cons_index = 0;
err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
if (err)
return err;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (type == MLX5_EQ_TYPE_PF)
handler = mlx5_eq_pf_int;
else
#endif
handler = mlx5_eq_int;
init_eq_buf(eq);
inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages;
in = kvzalloc(inlen, GFP_KERNEL);
if (!in) {
err = -ENOMEM;
goto err_buf;
}
pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
mlx5_fill_page_array(&eq->buf, pas);
MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
MLX5_SET64(create_eq_in, in, event_bitmask, mask);
eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
MLX5_SET(eqc, eqc, uar_page, priv->uar->index);
MLX5_SET(eqc, eqc, intr, vecidx);
MLX5_SET(eqc, eqc, log_page_size,
eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
if (err)
goto err_in;
snprintf(priv->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s",
name, pci_name(dev->pdev));
eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
eq->irqn = pci_irq_vector(dev->pdev, vecidx);
eq->dev = dev;
eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
err = request_irq(eq->irqn, handler, 0,
priv->irq_info[vecidx].name, eq);
if (err)
goto err_eq;
err = mlx5_debug_eq_add(dev, eq);
if (err)
goto err_irq;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (type == MLX5_EQ_TYPE_PF) {
err = init_pf_ctx(&eq->pf_ctx, name);
if (err)
goto err_irq;
} else
#endif
{
INIT_LIST_HEAD(&eq->tasklet_ctx.list);
INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
spin_lock_init(&eq->tasklet_ctx.lock);
tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
(unsigned long)&eq->tasklet_ctx);
}
/* EQs are created in ARMED state
*/
eq_update_ci(eq, 1);
kvfree(in);
return 0;
err_irq:
free_irq(eq->irqn, eq);
err_eq:
mlx5_cmd_destroy_eq(dev, eq->eqn);
err_in:
kvfree(in);
err_buf:
mlx5_buf_free(dev, &eq->buf);
return err;
}