OSDN Git Service

net/funeth: add the data path
author Dimitris Michailidis <d.michailidis@fungible.com>
Fri, 25 Feb 2022 02:59:00 +0000 (18:59 -0800)
committer David S. Miller <davem@davemloft.net>
Sun, 27 Feb 2022 10:51:23 +0000 (10:51 +0000)
Add the driver's data path. Tx handles skbs, XDP, and kTLS, Rx has skbs
and XDP. Also included are Rx and Tx queue creation/tear-down and
tracing.

Signed-off-by: Dimitris Michailidis <dmichail@fungible.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/fungible/funeth/funeth_rx.c [new file with mode: 0644]
drivers/net/ethernet/fungible/funeth/funeth_trace.h [new file with mode: 0644]
drivers/net/ethernet/fungible/funeth/funeth_tx.c [new file with mode: 0644]
drivers/net/ethernet/fungible/funeth/funeth_txrx.h [new file with mode: 0644]

diff --git a/drivers/net/ethernet/fungible/funeth/funeth_rx.c b/drivers/net/ethernet/fungible/funeth/funeth_rx.c
new file mode 100644 (file)
index 0000000..0f6a549
--- /dev/null
@@ -0,0 +1,826 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+
+#include <linux/bpf_trace.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/filter.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/skbuff.h>
+#include "funeth_txrx.h"
+#include "funeth.h"
+#include "fun_queue.h"
+
+#define CREATE_TRACE_POINTS
+#include "funeth_trace.h"
+
+/* Given the device's max supported MTU and pages of at least 4KB a packet can
+ * be scattered into at most 4 buffers.
+ */
+#define RX_MAX_FRAGS 4
+
+/* Per packet headroom in non-XDP mode. Present only for 1-frag packets. */
+#define FUN_RX_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
+
+/* We try to reuse pages for our buffers. To avoid frequent page ref writes we
+ * take EXTRA_PAGE_REFS references at once and then hand them out one per packet
+ * occupying the buffer. We take more once fewer than MIN_PAGE_REFS remain.
+ */
+#define EXTRA_PAGE_REFS 1000000
+#define MIN_PAGE_REFS 1000
+
+enum { /* bits of an Rx queue's xdp_flush, acted on after a polling pass */
+       FUN_XDP_FLUSH_REDIR = 1, /* xdp_do_flush() is needed */
+       FUN_XDP_FLUSH_TX = 2, /* fun_txq_wr_db() is needed on the XDP Tx queue */
+};
+
+/* Top up the page references we hold once they drop below MIN_PAGE_REFS. */
+static void refresh_refs(struct funeth_rxbuf *buf)
+{
+       if (likely(buf->pg_refs >= MIN_PAGE_REFS))
+               return;
+       buf->pg_refs += EXTRA_PAGE_REFS;
+       page_ref_add(buf->page, EXTRA_PAGE_REFS);
+}
+
+/* Offer a buffer to the Rx buffer cache. It is kept if the cache has room and
+ * the buffer's node is numa_mem_id(); else it's unmapped and our refs dropped.
+ */
+static void cache_offer(struct funeth_rxq *q, const struct funeth_rxbuf *buf)
+{
+       struct funeth_rx_cache *cache = &q->cache;
+       bool has_room = cache->prod_cnt - cache->cons_cnt <= cache->mask;
+
+       if (!has_room || buf->node != numa_mem_id()) {
+               dma_unmap_page_attrs(q->dma_dev, buf->dma_addr, PAGE_SIZE,
+                                    DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+               __page_frag_cache_drain(buf->page, buf->pg_refs);
+               return;
+       }
+       cache->bufs[cache->prod_cnt & cache->mask] = *buf;
+       cache->prod_cnt++;
+}
+
+/* Get a page from the Rx buffer cache into @rb. We only consider the oldest
+ * cached page and return true only if we own all its references.
+ */
+static bool cache_get(struct funeth_rxq *q, struct funeth_rxbuf *rb)
+{
+       struct funeth_rx_cache *c = &q->cache;
+       struct funeth_rxbuf *buf;
+
+       if (c->prod_cnt == c->cons_cnt)
+               return false;             /* empty cache */
+
+       buf = &c->bufs[c->cons_cnt & c->mask];
+       if (page_ref_count(buf->page) == buf->pg_refs) {
+               dma_sync_single_for_device(q->dma_dev, buf->dma_addr,
+                                          PAGE_SIZE, DMA_FROM_DEVICE);
+               *rb = *buf;
+               buf->page = NULL; /* slot is empty for fun_rxq_free_cache() */
+               refresh_refs(rb);
+               c->cons_cnt++;
+               return true;
+       }
+
+       /* Page can't be reused. If the cache is full drop this page. */
+       if (c->prod_cnt - c->cons_cnt > c->mask) {
+               dma_unmap_page_attrs(q->dma_dev, buf->dma_addr, PAGE_SIZE,
+                                    DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+               __page_frag_cache_drain(buf->page, buf->pg_refs);
+               buf->page = NULL;
+               c->cons_cnt++; /* frees the slot for the next cache_offer() */
+       }
+       return false;
+}
+
+/* Get a page for @rb from the Rx cache or else allocate and DMA-map one. */
+static int funeth_alloc_page(struct funeth_rxq *q, struct funeth_rxbuf *rb,
+                            int node, gfp_t gfp)
+{
+       struct page *p;
+
+       if (cache_get(q, rb))
+               return 0; /* @rb was filled in from the cache */
+
+       p = __alloc_pages_node(node, gfp | __GFP_NOWARN, 0);
+       if (unlikely(!p))
+               return -ENOMEM;
+
+       rb->dma_addr = dma_map_page(q->dma_dev, p, 0, PAGE_SIZE,
+                                   DMA_FROM_DEVICE);
+       if (unlikely(dma_mapping_error(q->dma_dev, rb->dma_addr))) {
+               FUN_QSTAT_INC(q, rx_map_err);
+               __free_page(p);
+               return -ENOMEM;
+       }
+
+       FUN_QSTAT_INC(q, rx_page_alloc);
+
+       rb->page = p;
+       rb->pg_refs = 1; /* the reference from the allocation */
+       refresh_refs(rb); /* 1 < MIN_PAGE_REFS so this takes the extra refs */
+       rb->node = page_is_pfmemalloc(p) ? -1 : page_to_nid(p);
+       return 0;
+}
+
+/* Unmap an Rx buffer's page, if any, and drop the references we hold. */
+static void funeth_free_page(struct funeth_rxq *q, struct funeth_rxbuf *rb)
+{
+       if (!rb->page)
+               return;
+       dma_unmap_page(q->dma_dev, rb->dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
+       __page_frag_cache_drain(rb->page, rb->pg_refs);
+       rb->page = NULL;
+}
+
+/* Run the XDP program assigned to an Rx queue on the packet in @frags.
+ * Return %NULL if the buffer is consumed, or the virtual address of the packet
+ * to turn into an skb. @ref_ok is the value returned by fun_gather_pkt().
+ */
+static void *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va,
+                        int ref_ok, struct funeth_txq *xdp_q)
+{
+       struct bpf_prog *xdp_prog;
+       struct xdp_buff xdp;
+       u32 act;
+
+       /* VA includes the headroom, frag size includes headroom + tailroom */
+       xdp_init_buff(&xdp, ALIGN(skb_frag_size(frags), FUN_EPRQ_PKT_ALIGN),
+                     &q->xdp_rxq);
+       xdp_prepare_buff(&xdp, buf_va, FUN_XDP_HEADROOM, skb_frag_size(frags) -
+                        (FUN_RX_TAILROOM + FUN_XDP_HEADROOM), false);
+
+       xdp_prog = READ_ONCE(q->xdp_prog);
+       act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+       switch (act) {
+       case XDP_PASS:
+               /* remove headroom, which may not be FUN_XDP_HEADROOM now */
+               skb_frag_size_set(frags, xdp.data_end - xdp.data);
+               skb_frag_off_add(frags, xdp.data - xdp.data_hard_start);
+               goto pass;
+       case XDP_TX:
+               if (unlikely(!ref_ok))
+                       goto pass; /* caller drops the packet */
+               if (!fun_xdp_tx(xdp_q, xdp.data, xdp.data_end - xdp.data))
+                       goto xdp_error;
+               FUN_QSTAT_INC(q, xdp_tx);
+               q->xdp_flush |= FUN_XDP_FLUSH_TX;
+               break;
+       case XDP_REDIRECT:
+               if (unlikely(!ref_ok))
+                       goto pass; /* caller drops the packet */
+               if (unlikely(xdp_do_redirect(q->netdev, &xdp, xdp_prog)))
+                       goto xdp_error;
+               FUN_QSTAT_INC(q, xdp_redir);
+               q->xdp_flush |= FUN_XDP_FLUSH_REDIR;
+               break;
+       default:
+               bpf_warn_invalid_xdp_action(q->netdev, xdp_prog, act);
+               fallthrough;
+       case XDP_ABORTED:
+               trace_xdp_exception(q->netdev, xdp_prog, act);
+xdp_error:
+               q->cur_buf->pg_refs++; /* return frags' page reference */
+               FUN_QSTAT_INC(q, xdp_err);
+               break;
+       case XDP_DROP:
+               q->cur_buf->pg_refs++; /* return frags' page reference */
+               FUN_QSTAT_INC(q, xdp_drops);
+               break;
+       }
+       return NULL; /* the packet was consumed or dropped */
+
+pass:
+       return xdp.data;
+}
+
+/* A CQE holds optional metadata and possibly packet data followed by a fixed
+ * completion structure placed at its end. Map the start address of a CQE to
+ * the start address of that fixed structure.
+ */
+static const void *cqe_to_info(const void *cqe)
+{
+       return (const u8 *)cqe + FUNETH_CQE_INFO_OFFSET;
+}
+
+/* Map a CQE's fixed completion structure back to the CQE's start address. */
+static const void *info_to_cqe(const void *cqe_info)
+{
+       return (const u8 *)cqe_info - FUNETH_CQE_INFO_OFFSET;
+}
+
+/* Map the L3 and L4 protocols the device parsed for a packet to the type of
+ * hash it provided for it.
+ */
+static enum pkt_hash_types cqe_to_pkt_hash_type(u16 pkt_parse)
+{
+       static const enum pkt_hash_types hash_types[] = {
+               PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3,
+               PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L4,
+               PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3,
+               PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3
+       };
+       u16 l4_bits, l3_bit;
+
+       /* The table index combines the TCP/UDP bits with the IP/IPv6 bit */
+       l4_bits = (pkt_parse >> FUN_ETH_RX_CV_OL4_PROT_S) & 6;
+       l3_bit = (pkt_parse >> (FUN_ETH_RX_CV_OL3_PROT_S + 1)) & 1;
+
+       return hash_types[l4_bits | l3_bit];
+}
+
+/* Each received packet can be scattered across several Rx buffers or can
+ * share a buffer with previously received packets depending on the buffer
+ * and packet sizes and the room available in the most recently used buffer.
+ *
+ * The rules are:
+ * - If the buffer at the head of an RQ has not been used it gets (part of) the
+ *   next incoming packet.
+ * - Otherwise, if the packet fully fits in the buffer's remaining space the
+ *   packet is written there.
+ * - Otherwise, the packet goes into the next Rx buffer.
+ *
+ * This function returns the Rx buffer for a packet or fragment thereof of the
+ * given length @len. If it isn't @buf it either reuses @buf's page in place or
+ * swaps in the spare page before advancing the queue to the next buffer.
+ *
+ * If called repeatedly with the remaining length of a packet it will walk
+ * through all the buffers containing the packet.
+ */
+static struct funeth_rxbuf *
+get_buf(struct funeth_rxq *q, struct funeth_rxbuf *buf, unsigned int len)
+{
+       if (q->buf_offset + len <= PAGE_SIZE || !q->buf_offset)
+               return buf;            /* @buf holds (part of) the packet */
+
+       /* The packet occupies part of the next buffer. Move there after
+        * replenishing the current buffer slot either with the spare page or
+        * by reusing the slot's existing page. Note that if a spare page isn't
+        * available and the current packet occupies @buf it is a multi-frag
+        * packet that will be dropped leaving @buf available for reuse.
+        */
+       if ((page_ref_count(buf->page) == buf->pg_refs &&
+            buf->node == numa_mem_id()) || !q->spare_buf.page) {
+               dma_sync_single_for_device(q->dma_dev, buf->dma_addr,
+                                          PAGE_SIZE, DMA_FROM_DEVICE);
+               refresh_refs(buf);
+       } else {
+               cache_offer(q, buf);
+               *buf = q->spare_buf;
+               q->spare_buf.page = NULL; /* the spare has been used up */
+               q->rqes[q->rq_cons & q->rq_mask] =
+                       FUN_EPRQ_RQBUF_INIT(buf->dma_addr);
+       }
+       q->buf_offset = 0; /* start at the beginning of the next buffer */
+       q->rq_cons++;
+       return &q->bufs[q->rq_cons & q->rq_mask];
+}
+
+/* Gather into @frags the page fragments making up the first Rx packet on @q.
+ * Its total length @tot_len includes optional head- and tail-rooms.
+ *
+ * Return 0 if the device retains ownership of at least some of the pages.
+ * In this case the caller may only copy the packet.
+ *
+ * A non-zero return value gives the caller permission to use references to the
+ * pages, e.g., attach them to skbs. Additionally, if the value is <0 at least
+ * one of the pages is PF_MEMALLOC.
+ *
+ * Regardless of outcome the caller is granted a reference to each of the pages.
+ */
+static int fun_gather_pkt(struct funeth_rxq *q, unsigned int tot_len,
+                         skb_frag_t *frags)
+{
+       struct funeth_rxbuf *buf = q->cur_buf;
+       unsigned int frag_len;
+       int ref_ok = 1;
+
+       for (;;) {
+               buf = get_buf(q, buf, tot_len);
+
+               /* We always keep the RQ full of buffers so before we can give
+                * one of our pages to the stack we require that we can obtain
+                * a replacement page. If we can't the packet will either be
+                * copied or dropped so we can retain ownership of the page and
+                * reuse it.
+                */
+               if (!q->spare_buf.page &&
+                   funeth_alloc_page(q, &q->spare_buf, numa_mem_id(),
+                                     GFP_ATOMIC | __GFP_MEMALLOC))
+                       ref_ok = 0;
+
+               frag_len = min_t(unsigned int, tot_len,
+                                PAGE_SIZE - q->buf_offset);
+               dma_sync_single_for_cpu(q->dma_dev,
+                                       buf->dma_addr + q->buf_offset,
+                                       frag_len, DMA_FROM_DEVICE);
+               buf->pg_refs--; /* the reference granted to the caller */
+               if (ref_ok)
+                       ref_ok |= buf->node; /* <0 if node is -1 (pfmemalloc) */
+
+               __skb_frag_set_page(frags, buf->page);
+               skb_frag_off_set(frags, q->buf_offset);
+               skb_frag_size_set(frags++, frag_len);
+
+               tot_len -= frag_len;
+               if (!tot_len)
+                       break;
+
+               q->buf_offset = PAGE_SIZE; /* makes get_buf() move on */
+       }
+       q->buf_offset = ALIGN(q->buf_offset + frag_len, FUN_EPRQ_PKT_ALIGN);
+       q->cur_buf = buf;
+       return ref_ok;
+}
+
+static bool rx_hwtstamp_enabled(const struct net_device *dev)
+{
+       const struct funeth_priv *priv = netdev_priv(dev);
+
+       return priv->hwtstamp_cfg.rx_filter == HWTSTAMP_FILTER_ALL; /* all Rx pkts */
+}
+
+/* Advance the CQ head and next-CQE pointer, flipping the phase on wrap. */
+static void advance_cq(struct funeth_rxq *q)
+{
+       if (likely(q->cq_head != q->cq_mask)) {
+               q->cq_head++;
+               q->next_cqe_info += FUNETH_CQE_SIZE;
+       } else {
+               q->cq_head = 0;
+               q->phase ^= 1;
+               q->next_cqe_info = cqe_to_info(q->cqes);
+       }
+       prefetch(q->next_cqe_info);
+}
+
+/* Process the packet represented by the head CQE of @q. Gather the packet's
+ * fragments, run it through the optional XDP program (enabled by a non-NULL
+ * @xdp_q), and if needed construct an skb and pass it to the stack via GRO.
+ */
+static void fun_handle_cqe_pkt(struct funeth_rxq *q, struct funeth_txq *xdp_q)
+{
+       const struct fun_eth_cqe *rxreq = info_to_cqe(q->next_cqe_info);
+       unsigned int i, tot_len, pkt_len = be32_to_cpu(rxreq->pkt_len);
+       struct net_device *ndev = q->netdev;
+       skb_frag_t frags[RX_MAX_FRAGS];
+       struct skb_shared_info *si;
+       unsigned int headroom;
+       gro_result_t gro_res;
+       struct sk_buff *skb;
+       int ref_ok;
+       void *va;
+       u16 cv;
+
+       u64_stats_update_begin(&q->syncp);
+       q->stats.rx_pkts++;
+       q->stats.rx_bytes += pkt_len;
+       u64_stats_update_end(&q->syncp);
+
+       advance_cq(q); /* rxreq keeps pointing at this packet's CQE */
+
+       /* account for head- and tail-room, present only for 1-buffer packets */
+       tot_len = pkt_len;
+       headroom = be16_to_cpu(rxreq->headroom);
+       if (likely(headroom))
+               tot_len += FUN_RX_TAILROOM + headroom;
+
+       ref_ok = fun_gather_pkt(q, tot_len, frags);
+       va = skb_frag_address(frags);
+       if (xdp_q && headroom == FUN_XDP_HEADROOM) {
+               va = fun_run_xdp(q, frags, va, ref_ok, xdp_q);
+               if (!va)
+                       return; /* XDP consumed the packet */
+               headroom = 0;   /* XDP_PASS trims it */
+       }
+       if (unlikely(!ref_ok))
+               goto no_mem;
+
+       if (likely(headroom)) {
+               /* headroom is either FUN_RX_HEADROOM or FUN_XDP_HEADROOM */
+               prefetch(va + headroom);
+               skb = napi_build_skb(va, ALIGN(tot_len, FUN_EPRQ_PKT_ALIGN));
+               if (unlikely(!skb))
+                       goto no_mem;
+
+               skb_reserve(skb, headroom);
+               __skb_put(skb, pkt_len);
+               skb->protocol = eth_type_trans(skb, ndev);
+       } else {
+               prefetch(va);
+               skb = napi_get_frags(q->napi);
+               if (unlikely(!skb))
+                       goto no_mem;
+
+               if (ref_ok < 0)
+                       skb->pfmemalloc = 1;
+
+               si = skb_shinfo(skb);
+               si->nr_frags = rxreq->nsgl;
+               for (i = 0; i < si->nr_frags; i++)
+                       si->frags[i] = frags[i];
+
+               skb->len = pkt_len;
+               skb->data_len = pkt_len; /* non-zero selects napi_gro_frags() */
+               skb->truesize += round_up(pkt_len, FUN_EPRQ_PKT_ALIGN);
+       }
+
+       skb_record_rx_queue(skb, q->qidx);
+       cv = be16_to_cpu(rxreq->pkt_cv);
+       if (likely((q->netdev->features & NETIF_F_RXHASH) && rxreq->hash))
+               skb_set_hash(skb, be32_to_cpu(rxreq->hash),
+                            cqe_to_pkt_hash_type(cv));
+       if (likely((q->netdev->features & NETIF_F_RXCSUM) && rxreq->csum)) {
+               FUN_QSTAT_INC(q, rx_cso);
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+               skb->csum_level = be16_to_cpu(rxreq->csum) - 1;
+       }
+       if (unlikely(rx_hwtstamp_enabled(q->netdev)))
+               skb_hwtstamps(skb)->hwtstamp = be64_to_cpu(rxreq->timestamp);
+
+       trace_funeth_rx(q, rxreq->nsgl, pkt_len, skb->hash, cv);
+
+       gro_res = skb->data_len ? napi_gro_frags(q->napi) :
+                                 napi_gro_receive(q->napi, skb);
+       if (gro_res == GRO_MERGED || gro_res == GRO_MERGED_FREE)
+               FUN_QSTAT_INC(q, gro_merged);
+       else if (gro_res == GRO_HELD)
+               FUN_QSTAT_INC(q, gro_pkts);
+       return;
+
+no_mem:
+       FUN_QSTAT_INC(q, rx_mem_drops);
+
+       /* Release the references we've been granted for the frag pages.
+        * We return the ref of the last frag and free the rest.
+        */
+       q->cur_buf->pg_refs++;
+       for (i = 0; i < rxreq->nsgl - 1; i++)
+               __free_page(skb_frag_page(frags + i));
+}
+
+/* Compare the phase tag of the CQE at the CQ's head with the expected @phase.
+ * Return 0 if they match, which indicates the CQE is new.
+ */
+static u16 cqe_phase_mismatch(const struct fun_cqe_info *ci, u16 phase)
+{
+       u16 cqe_phase = be16_to_cpu(ci->sf_p) & 1;
+
+       return cqe_phase ^ phase;
+}
+
+/* Walk through a CQ identifying and processing fresh CQEs up to the given
+ * budget, then do any flushes XDP asked for. Return the remaining budget.
+ */
+static int fun_process_cqes(struct funeth_rxq *q, int budget)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+       struct funeth_txq **xdpqs, *xdp_q = NULL;
+
+       xdpqs = rcu_dereference_bh(fp->xdpqs);
+       if (xdpqs)
+               xdp_q = xdpqs[smp_processor_id()]; /* indexed by CPU id */
+
+       while (budget && !cqe_phase_mismatch(q->next_cqe_info, q->phase)) {
+               /* access other descriptor fields after the phase check */
+               dma_rmb();
+
+               fun_handle_cqe_pkt(q, xdp_q);
+               budget--;
+       }
+
+       if (unlikely(q->xdp_flush)) {
+               if (q->xdp_flush & FUN_XDP_FLUSH_TX)
+                       fun_txq_wr_db(xdp_q); /* requested by XDP_TX */
+               if (q->xdp_flush & FUN_XDP_FLUSH_REDIR)
+                       xdp_do_flush(); /* requested by XDP_REDIRECT */
+               q->xdp_flush = 0;
+       }
+
+       return budget;
+}
+
+/* NAPI handler for Rx queues. Calls the CQE processing loop and writes RQ/CQ
+ * doorbells as needed. Returns the amount of @budget that was consumed.
+ */
+int fun_rxq_napi_poll(struct napi_struct *napi, int budget)
+{
+       struct fun_irq *irq = container_of(napi, struct fun_irq, napi);
+       struct funeth_rxq *q = irq->rxq;
+       int work_done = budget - fun_process_cqes(q, budget);
+       u32 cq_db_val = q->cq_head;
+
+       if (unlikely(work_done >= budget))
+               FUN_QSTAT_INC(q, rx_budget); /* used up the whole budget */
+       else if (napi_complete_done(napi, work_done))
+               cq_db_val |= q->irq_db_val; /* presumably re-arms the IRQ */
+
+       /* check whether to post new Rx buffers */
+       if (q->rq_cons - q->rq_cons_db >= q->rq_db_thres) {
+               u64_stats_update_begin(&q->syncp);
+               q->stats.rx_bufs += q->rq_cons - q->rq_cons_db;
+               u64_stats_update_end(&q->syncp);
+               q->rq_cons_db = q->rq_cons;
+               writel((q->rq_cons - 1) & q->rq_mask, q->rq_db);
+       }
+
+       writel(cq_db_val, q->cq_db);
+       return work_done;
+}
+
+/* Free the pages of an Rx queue's buffers, including the spare buffer. */
+static void fun_rxq_free_bufs(struct funeth_rxq *q)
+{
+       unsigned int idx;
+
+       for (idx = 0; idx <= q->rq_mask; idx++)
+               funeth_free_page(q, &q->bufs[idx]);
+
+       funeth_free_page(q, &q->spare_buf);
+
+       q->cur_buf = NULL;
+}
+
+/* Give every RQ slot a page and write its RQE; undo everything on failure. */
+static int fun_rxq_alloc_bufs(struct funeth_rxq *q, int node)
+{
+       unsigned int idx;
+
+       for (idx = 0; idx <= q->rq_mask; idx++) {
+               if (funeth_alloc_page(q, &q->bufs[idx], node, GFP_KERNEL)) {
+                       fun_rxq_free_bufs(q);
+                       return -ENOMEM;
+               }
+               q->rqes[idx] = FUN_EPRQ_RQBUF_INIT(q->bufs[idx].dma_addr);
+       }
+
+       q->cur_buf = q->bufs;
+       return 0;
+}
+
+/* Allocate the slots of a used-buffer cache of the given depth on @node. */
+static int fun_rxq_init_cache(struct funeth_rx_cache *c, unsigned int depth,
+                             int node)
+{
+       c->bufs = kvzalloc_node(depth * sizeof(*c->bufs), GFP_KERNEL, node);
+       c->mask = depth - 1;
+       return !c->bufs ? -ENOMEM : 0;
+}
+
+/* Free the pages held by an Rx queue's used-buffer cache, then its slots. */
+static void fun_rxq_free_cache(struct funeth_rxq *q)
+{
+       struct funeth_rx_cache *cache = &q->cache;
+       unsigned int idx;
+
+       for (idx = 0; idx <= cache->mask; idx++)
+               funeth_free_page(q, &cache->bufs[idx]);
+
+       kvfree(cache->bufs);
+       cache->bufs = NULL;
+}
+
+int fun_rxq_set_bpf(struct funeth_rxq *q, struct bpf_prog *prog)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+       struct fun_admin_epcq_req cmd;
+       u16 headroom;
+       int err;
+
+       headroom = prog ? FUN_XDP_HEADROOM : FUN_RX_HEADROOM;
+       if (headroom != q->headroom) { /* tell the device about the change */
+               cmd.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_EPCQ,
+                                                       sizeof(cmd));
+               cmd.u.modify =
+                       FUN_ADMIN_EPCQ_MODIFY_REQ_INIT(FUN_ADMIN_SUBOP_MODIFY,
+                                                      0, q->hw_cqid, headroom);
+               err = fun_submit_admin_sync_cmd(fp->fdev, &cmd.common, NULL, 0,
+                                               0);
+               if (err)
+                       return err; /* q->xdp_prog is left unchanged */
+               q->headroom = headroom;
+       }
+
+       WRITE_ONCE(q->xdp_prog, prog); /* read in fun_run_xdp() */
+       return 0;
+}
+
+/* Create an Rx queue's host-memory state. Return the queue or an ERR_PTR. */
+static struct funeth_rxq *fun_rxq_create_sw(struct net_device *dev,
+                                           unsigned int qidx,
+                                           unsigned int ncqe,
+                                           unsigned int nrqe,
+                                           struct fun_irq *irq)
+{
+       struct funeth_priv *fp = netdev_priv(dev);
+       struct funeth_rxq *q;
+       int err = -ENOMEM;
+       int numa_node;
+
+       numa_node = fun_irq_node(irq);
+       q = kzalloc_node(sizeof(*q), GFP_KERNEL, numa_node);
+       if (!q)
+               goto err;
+
+       q->qidx = qidx;
+       q->netdev = dev;
+       q->cq_mask = ncqe - 1;
+       q->rq_mask = nrqe - 1;
+       q->numa_node = numa_node;
+       q->rq_db_thres = nrqe / 4; /* min buffers per RQ doorbell write */
+       u64_stats_init(&q->syncp);
+       q->dma_dev = &fp->pdev->dev;
+
+       q->rqes = fun_alloc_ring_mem(q->dma_dev, nrqe, sizeof(*q->rqes),
+                                    sizeof(*q->bufs), false, numa_node,
+                                    &q->rq_dma_addr, (void **)&q->bufs, NULL);
+       if (!q->rqes)
+               goto free_q;
+
+       q->cqes = fun_alloc_ring_mem(q->dma_dev, ncqe, FUNETH_CQE_SIZE, 0,
+                                    false, numa_node, &q->cq_dma_addr, NULL,
+                                    NULL);
+       if (!q->cqes)
+               goto free_rqes;
+
+       err = fun_rxq_init_cache(&q->cache, nrqe, numa_node);
+       if (err)
+               goto free_cqes;
+
+       err = fun_rxq_alloc_bufs(q, numa_node);
+       if (err)
+               goto free_cache;
+
+       q->stats.rx_bufs = q->rq_mask;
+       q->init_state = FUN_QSTATE_INIT_SW;
+       return q;
+
+free_cache:
+       fun_rxq_free_cache(q);
+free_cqes:
+       dma_free_coherent(q->dma_dev, ncqe * FUNETH_CQE_SIZE, q->cqes,
+                         q->cq_dma_addr);
+free_rqes:
+       fun_free_ring_mem(q->dma_dev, nrqe, sizeof(*q->rqes), false, q->rqes,
+                         q->rq_dma_addr, q->bufs);
+free_q:
+       kfree(q);
+err:
+       netdev_err(dev, "Unable to allocate memory for Rx queue %u\n", qidx);
+       return ERR_PTR(err);
+}
+
+static void fun_rxq_free_sw(struct funeth_rxq *q)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+
+       fun_rxq_free_cache(q);
+       fun_rxq_free_bufs(q);
+       fun_free_ring_mem(q->dma_dev, q->rq_mask + 1, sizeof(*q->rqes), false,
+                         q->rqes, q->rq_dma_addr, q->bufs);
+       dma_free_coherent(q->dma_dev, (q->cq_mask + 1) * FUNETH_CQE_SIZE,
+                         q->cqes, q->cq_dma_addr);
+
+       /* Add key counters to the device's totals before the queue is gone. */
+       fp->rx_packets += q->stats.rx_pkts;
+       fp->rx_bytes   += q->stats.rx_bytes;
+       fp->rx_dropped += q->stats.rx_map_err + q->stats.rx_mem_drops;
+
+       kfree(q);
+}
+
+/* Create an Rx queue's RQ and CQ on the device. Return 0 or an error code. */
+int fun_rxq_create_dev(struct funeth_rxq *q, struct fun_irq *irq)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+       unsigned int ncqe = q->cq_mask + 1;
+       unsigned int nrqe = q->rq_mask + 1;
+       int err;
+
+       err = xdp_rxq_info_reg(&q->xdp_rxq, q->netdev, q->qidx,
+                              irq->napi.napi_id);
+       if (err)
+               goto out;
+
+       err = xdp_rxq_info_reg_mem_model(&q->xdp_rxq, MEM_TYPE_PAGE_SHARED,
+                                        NULL);
+       if (err)
+               goto xdp_unreg;
+
+       q->phase = 1; /* phase tag expected in the first pass over the CQ */
+       q->irq_cnt = 0;
+       q->cq_head = 0;
+       q->rq_cons = 0;
+       q->rq_cons_db = 0;
+       q->buf_offset = 0;
+       q->napi = &irq->napi;
+       q->irq_db_val = fp->cq_irq_db;
+       q->next_cqe_info = cqe_to_info(q->cqes);
+
+       q->xdp_prog = fp->xdp_prog;
+       q->headroom = fp->xdp_prog ? FUN_XDP_HEADROOM : FUN_RX_HEADROOM;
+
+       err = fun_sq_create(fp->fdev, FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR |
+                           FUN_ADMIN_EPSQ_CREATE_FLAG_RQ, 0,
+                           FUN_HCI_ID_INVALID, 0, nrqe, q->rq_dma_addr, 0, 0,
+                           0, 0, fp->fdev->kern_end_qid, PAGE_SHIFT,
+                           &q->hw_sqid, &q->rq_db);
+       if (err)
+               goto xdp_unreg;
+
+       err = fun_cq_create(fp->fdev, FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR |
+                           FUN_ADMIN_EPCQ_CREATE_FLAG_RQ, 0,
+                           q->hw_sqid, ilog2(FUNETH_CQE_SIZE), ncqe,
+                           q->cq_dma_addr, q->headroom, FUN_RX_TAILROOM, 0, 0,
+                           irq->irq_idx, 0, fp->fdev->kern_end_qid,
+                           &q->hw_cqid, &q->cq_db);
+       if (err)
+               goto free_rq;
+
+       irq->rxq = q; /* fun_rxq_napi_poll() finds the queue through this */
+       writel(q->rq_mask, q->rq_db); /* initial RQ doorbell write */
+       q->init_state = FUN_QSTATE_INIT_FULL;
+
+       netif_info(fp, ifup, q->netdev,
+                  "Rx queue %u, depth %u/%u, HW qid %u/%u, IRQ idx %u, node %d, headroom %u\n",
+                  q->qidx, ncqe, nrqe, q->hw_cqid, q->hw_sqid, irq->irq_idx,
+                  q->numa_node, q->headroom);
+       return 0;
+
+free_rq:
+       fun_destroy_sq(fp->fdev, q->hw_sqid);
+xdp_unreg:
+       xdp_rxq_info_unreg(&q->xdp_rxq);
+out:
+       netdev_err(q->netdev,
+                  "Failed to create Rx queue %u on device, error %d\n",
+                  q->qidx, err);
+       return err;
+}
+
+static void fun_rxq_free_dev(struct funeth_rxq *q)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+       struct fun_irq *irq;
+
+       if (q->init_state < FUN_QSTATE_INIT_FULL)
+               return; /* the queue has no device resources */
+
+       irq = container_of(q->napi, struct fun_irq, napi);
+       netif_info(fp, ifdown, q->netdev,
+                  "Freeing Rx queue %u (id %u/%u), IRQ %u\n",
+                  q->qidx, q->hw_cqid, q->hw_sqid, irq->irq_idx);
+
+       irq->rxq = NULL;
+       xdp_rxq_info_unreg(&q->xdp_rxq);
+       fun_destroy_sq(fp->fdev, q->hw_sqid);
+       fun_destroy_cq(fp->fdev, q->hw_cqid);
+       q->init_state = FUN_QSTATE_INIT_SW; /* host-side state is kept */
+}
+
+/* Create an Rx queue or advance an existing one to the target @state. A new
+ * queue first gets its host resources and, if it is still below @state, its
+ * device resources too. On success the queue is returned through @qp. A queue
+ * created by this call is freed again if creating it on the device fails.
+ */
+int funeth_rxq_create(struct net_device *dev, unsigned int qidx,
+                     unsigned int ncqe, unsigned int nrqe, struct fun_irq *irq,
+                     int state, struct funeth_rxq **qp)
+{
+       struct funeth_rxq *q = *qp;
+       int err;
+
+       if (!q) {
+               q = fun_rxq_create_sw(dev, qidx, ncqe, nrqe, irq);
+               if (IS_ERR(q))
+                       return PTR_ERR(q);
+       }
+
+       if (q->init_state < state) {
+               err = fun_rxq_create_dev(q, irq);
+               if (err) {
+                       if (!*qp)
+                               fun_rxq_free_sw(q);
+                       return err;
+               }
+       }
+
+       *qp = q;
+       return 0;
+}
+
+/* Free Rx queue resources until it reaches the target state. Return the queue
+ * or NULL if it has been destroyed.
+ */
+struct funeth_rxq *funeth_rxq_free(struct funeth_rxq *q, int state)
+{
+       if (state < FUN_QSTATE_INIT_FULL)
+               fun_rxq_free_dev(q);
+       if (state != FUN_QSTATE_DESTROYED)
+               return q;
+
+       fun_rxq_free_sw(q);
+       return NULL;
+}
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_trace.h b/drivers/net/ethernet/fungible/funeth/funeth_trace.h
new file mode 100644 (file)
index 0000000..9e58dfe
--- /dev/null
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM funeth
+
+#if !defined(_TRACE_FUNETH_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FUNETH_H
+
+#include <linux/tracepoint.h>
+
+#include "funeth_txrx.h"
+
+/* Tx descriptor write event. Records the device name, the Tx queue index, the
+ * index of the packet's first SQE, the packet length, and the number of gather
+ * list entries.
+ */
+TRACE_EVENT(funeth_tx,
+
+       TP_PROTO(const struct funeth_txq *txq,
+                u32 len,
+                u32 sqe_idx,
+                u32 ngle),
+
+       TP_ARGS(txq, len, sqe_idx, ngle),
+
+       TP_STRUCT__entry(
+               __field(u32, qidx)
+               __field(u32, len)
+               __field(u32, sqe_idx)
+               __field(u32, ngle)
+               __string(devname, txq->netdev->name)
+       ),
+
+       TP_fast_assign(
+               __entry->qidx = txq->qidx;
+               __entry->len = len;
+               __entry->sqe_idx = sqe_idx;
+               __entry->ngle = ngle;
+               __assign_str(devname, txq->netdev->name);
+       ),
+
+       TP_printk("%s: Txq %u, SQE idx %u, len %u, num GLEs %u",
+                 __get_str(devname), __entry->qidx, __entry->sqe_idx,
+                 __entry->len, __entry->ngle)
+);
+
+/* Tx reclaim event. Records the device name, the Tx queue index, the index of
+ * the first SQE being reclaimed, the number of SQEs the packet occupied, and
+ * the HW head value supplied by the caller.
+ */
+TRACE_EVENT(funeth_tx_free,
+
+       TP_PROTO(const struct funeth_txq *txq,
+                u32 sqe_idx,
+                u32 num_sqes,
+                u32 hw_head),
+
+       TP_ARGS(txq, sqe_idx, num_sqes, hw_head),
+
+       TP_STRUCT__entry(
+               __field(u32, qidx)
+               __field(u32, sqe_idx)
+               __field(u32, num_sqes)
+               __field(u32, hw_head)
+               __string(devname, txq->netdev->name)
+       ),
+
+       TP_fast_assign(
+               __entry->qidx = txq->qidx;
+               __entry->sqe_idx = sqe_idx;
+               __entry->num_sqes = num_sqes;
+               __entry->hw_head = hw_head;
+               __assign_str(devname, txq->netdev->name);
+       ),
+
+       TP_printk("%s: Txq %u, SQE idx %u, SQEs %u, HW head %u",
+                 __get_str(devname), __entry->qidx, __entry->sqe_idx,
+                 __entry->num_sqes, __entry->hw_head)
+);
+
+/* Rx packet event. Records the device name, the Rx queue index, the queue's
+ * cq_head at the time of the event, the number of RQEs, the packet length, the
+ * hash, and the cls_vec value (printed in hex as "CV").
+ */
+TRACE_EVENT(funeth_rx,
+
+       TP_PROTO(const struct funeth_rxq *rxq,
+                u32 num_rqes,
+                u32 pkt_len,
+                u32 hash,
+                u32 cls_vec),
+
+       TP_ARGS(rxq, num_rqes, pkt_len, hash, cls_vec),
+
+       TP_STRUCT__entry(
+               __field(u32, qidx)
+               __field(u32, cq_head)
+               __field(u32, num_rqes)
+               __field(u32, len)
+               __field(u32, hash)
+               __field(u32, cls_vec)
+               __string(devname, rxq->netdev->name)
+       ),
+
+       TP_fast_assign(
+               __entry->qidx = rxq->qidx;
+               __entry->cq_head = rxq->cq_head;
+               __entry->num_rqes = num_rqes;
+               __entry->len = pkt_len;
+               __entry->hash = hash;
+               __entry->cls_vec = cls_vec;
+               __assign_str(devname, rxq->netdev->name);
+       ),
+
+       TP_printk("%s: Rxq %u, CQ head %u, RQEs %u, len %u, hash %u, CV %#x",
+                 __get_str(devname), __entry->qidx, __entry->cq_head,
+                 __entry->num_rqes, __entry->len, __entry->hash,
+                 __entry->cls_vec)
+);
+
+#endif /* _TRACE_FUNETH_H */
+
+/* Below must be outside protection. */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE funeth_trace
+
+#include <trace/define_trace.h>
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_tx.c b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
new file mode 100644 (file)
index 0000000..46684af
--- /dev/null
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+
+#include <linux/dma-mapping.h>
+#include <linux/ip.h>
+#include <linux/pci.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <uapi/linux/udp.h>
+#include "funeth.h"
+#include "funeth_txrx.h"
+#include "funeth_trace.h"
+#include "fun_queue.h"
+
+#define FUN_XDP_CLEAN_THRES 32
+#define FUN_XDP_CLEAN_BATCH 16
+
+/* DMA-map a packet and return the (length, DMA_address) pairs for its
+ * segments. If a mapping error occurs -ENOMEM is returned.
+ *
+ * Entry 0 of @addr and @len describes the linear part of the skb and entry
+ * i + 1 describes page fragment i, so both arrays need room for nr_frags + 1
+ * entries. On failure every mapping made so far is undone before returning.
+ * Returns 0 on success.
+ */
+static int map_skb(const struct sk_buff *skb, struct device *dev,
+                  dma_addr_t *addr, unsigned int *len)
+{
+       const struct skb_shared_info *si;
+       const skb_frag_t *fp, *end;
+
+       /* the linear part goes into slot 0 */
+       *len = skb_headlen(skb);
+       *addr = dma_map_single(dev, skb->data, *len, DMA_TO_DEVICE);
+       if (dma_mapping_error(dev, *addr))
+               return -ENOMEM;
+
+       si = skb_shinfo(skb);
+       end = &si->frags[si->nr_frags];
+
+       /* each fragment advances both output cursors before being stored */
+       for (fp = si->frags; fp < end; fp++) {
+               *++len = skb_frag_size(fp);
+               *++addr = skb_frag_dma_map(dev, fp, 0, *len, DMA_TO_DEVICE);
+               if (dma_mapping_error(dev, *addr))
+                       goto unwind;
+       }
+       return 0;
+
+unwind:
+       /* fp is the fragment that failed and addr is its slot; walk back over
+        * the fragments that were mapped successfully.
+        */
+       while (fp-- > si->frags)
+               dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE);
+
+       /* addr is back at slot 1, so addr[-1] is the linear part's mapping */
+       dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE);
+       return -ENOMEM;
+}
+
+/* The HW writeback area sits immediately after the last descriptor of a Tx
+ * ring, so its address doubles as the address one past the end of the ring.
+ */
+static void *txq_end(const struct funeth_txq *q)
+{
+       void *ring_end = (void *)q->hw_wb;
+
+       return ring_end;
+}
+
+/* Number of bytes between @p, an address inside a Tx ring, and the end of
+ * that ring.
+ */
+static unsigned int txq_to_end(const struct funeth_txq *q, void *p)
+{
+       void *ring_end = txq_end(q);
+
+       return ring_end - p;
+}
+
+/* Tx descriptors consumed by a request: its length in 8-byte units (len8)
+ * rounded up to whole descriptors.
+ */
+static unsigned int tx_req_ndesc(const struct fun_eth_tx_req *req)
+{
+       const unsigned int units_per_desc = FUNETH_SQE_SIZE / 8;
+
+       return DIV_ROUND_UP(req->len8, units_per_desc);
+}
+
+/* Return the leading 16 bits of the word selected by tcp_flag_word(), kept in
+ * network byte order.
+ * NOTE(review): presumably these are the TCP data offset and flag bits, as the
+ * function name suggests; confirm against the tcp_flag_word() definition.
+ */
+static __be16 tcp_hdr_doff_flags(const struct tcphdr *th)
+{
+       return *(__be16 *)&tcp_flag_word(th);
+}
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+#include "funeth_ktls.h"
+
+/* Prepare an skb from a TLS Tx-offloaded socket for transmission.
+ *
+ * If the packet has no TCP payload it is returned unchanged. If its sequence
+ * number matches the one the offload context expects next, *tls_len is set to
+ * the payload length and the skb is returned unchanged. Otherwise the skb is
+ * passed to tls_encrypt_skb() and its result is returned, which may be NULL;
+ * *tls_len is left untouched in that case.
+ */
+static struct sk_buff *fun_tls_tx(struct sk_buff *skb, struct funeth_txq *q,
+                                 unsigned int *tls_len)
+{
+       const struct fun_ktls_tx_ctx *tls_ctx;
+       u32 datalen, seq;
+
+       /* TCP payload bytes, i.e., everything past the TCP header */
+       datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+       if (!datalen)
+               return skb;
+
+       if (likely(!tls_offload_tx_resync_pending(skb->sk))) {
+               seq = ntohl(tcp_hdr(skb)->seq);
+               tls_ctx = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
+
+               if (likely(tls_ctx->next_seq == seq)) {
+                       *tls_len = datalen;
+                       return skb;
+               }
+               /* request a resync only when seq is ahead of next_seq by less
+                * than a quarter of the sequence space (unsigned wraparound)
+                */
+               if (seq - tls_ctx->next_seq < U32_MAX / 4) {
+                       tls_offload_tx_resync_request(skb->sk, seq,
+                                                     tls_ctx->next_seq);
+               }
+       }
+
+       /* out of sync or resync pending: fall back to tls_encrypt_skb() */
+       FUN_QSTAT_INC(q, tx_tls_fallback);
+       skb = tls_encrypt_skb(skb);
+       if (!skb)
+               FUN_QSTAT_INC(q, tx_tls_drops);
+
+       return skb;
+}
+#endif
+
+/* Write as many descriptors as needed for the supplied skb starting at the
+ * current producer location. The caller has made certain enough descriptors
+ * are available.
+ *
+ * A non-zero @tls_len marks the request as a TLS one, appends a TLS
+ * sub-descriptor after the gather list, and advances the TLS context's
+ * expected sequence number by that amount.
+ *
+ * The producer count is not advanced here; the caller does that using the
+ * return value.
+ *
+ * Returns the number of descriptors written, 0 on error.
+ */
+static unsigned int write_pkt_desc(struct sk_buff *skb, struct funeth_txq *q,
+                                  unsigned int tls_len)
+{
+       unsigned int extra_bytes = 0, extra_pkts = 0;
+       unsigned int idx = q->prod_cnt & q->mask;
+       const struct skb_shared_info *shinfo;
+       unsigned int lens[MAX_SKB_FRAGS + 1];
+       dma_addr_t addrs[MAX_SKB_FRAGS + 1];
+       struct fun_eth_tx_req *req;
+       struct fun_dataop_gl *gle;
+       const struct tcphdr *th;
+       unsigned int ngle, i;
+       u16 flags;
+
+       if (unlikely(map_skb(skb, q->dma_dev, addrs, lens))) {
+               FUN_QSTAT_INC(q, tx_map_err);
+               return 0;
+       }
+
+       req = fun_tx_desc_addr(q, idx);
+       req->op = FUN_ETH_OP_TX;
+       req->len8 = 0;
+       req->flags = 0;
+       req->suboff8 = offsetof(struct fun_eth_tx_req, dataop);
+       req->repr_idn = 0;
+       req->encap_proto = 0;
+
+       shinfo = skb_shinfo(skb);
+       /* segmentation offload: encapsulated and plain packets differ in the
+        * flags and header offsets handed to the device
+        */
+       if (likely(shinfo->gso_size)) {
+               if (skb->encapsulation) {
+                       u16 ol4_ofst;
+
+                       flags = FUN_ETH_OUTER_EN | FUN_ETH_INNER_LSO |
+                               FUN_ETH_UPDATE_INNER_L4_CKSUM |
+                               FUN_ETH_UPDATE_OUTER_L3_LEN;
+                       if (shinfo->gso_type & (SKB_GSO_UDP_TUNNEL |
+                                               SKB_GSO_UDP_TUNNEL_CSUM)) {
+                               flags |= FUN_ETH_UPDATE_OUTER_L4_LEN |
+                                        FUN_ETH_OUTER_UDP;
+                               if (shinfo->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)
+                                       flags |= FUN_ETH_UPDATE_OUTER_L4_CKSUM;
+                               ol4_ofst = skb_transport_offset(skb);
+                       } else {
+                               ol4_ofst = skb_inner_network_offset(skb);
+                       }
+
+                       if (ip_hdr(skb)->version == 4)
+                               flags |= FUN_ETH_UPDATE_OUTER_L3_CKSUM;
+                       else
+                               flags |= FUN_ETH_OUTER_IPV6;
+
+                       if (skb->inner_network_header) {
+                               if (inner_ip_hdr(skb)->version == 4)
+                                       flags |= FUN_ETH_UPDATE_INNER_L3_CKSUM |
+                                                FUN_ETH_UPDATE_INNER_L3_LEN;
+                               else
+                                       flags |= FUN_ETH_INNER_IPV6 |
+                                                FUN_ETH_UPDATE_INNER_L3_LEN;
+                       }
+                       th = inner_tcp_hdr(skb);
+                       fun_eth_offload_init(&req->offload, flags,
+                                            shinfo->gso_size,
+                                            tcp_hdr_doff_flags(th), 0,
+                                            skb_inner_network_offset(skb),
+                                            skb_inner_transport_offset(skb),
+                                            skb_network_offset(skb), ol4_ofst);
+                       FUN_QSTAT_INC(q, tx_encap_tso);
+               } else {
+                       /* HW considers one set of headers as inner */
+                       flags = FUN_ETH_INNER_LSO |
+                               FUN_ETH_UPDATE_INNER_L4_CKSUM |
+                               FUN_ETH_UPDATE_INNER_L3_LEN;
+                       if (shinfo->gso_type & SKB_GSO_TCPV6)
+                               flags |= FUN_ETH_INNER_IPV6;
+                       else
+                               flags |= FUN_ETH_UPDATE_INNER_L3_CKSUM;
+                       th = tcp_hdr(skb);
+                       fun_eth_offload_init(&req->offload, flags,
+                                            shinfo->gso_size,
+                                            tcp_hdr_doff_flags(th), 0,
+                                            skb_network_offset(skb),
+                                            skb_transport_offset(skb), 0, 0);
+                       FUN_QSTAT_INC(q, tx_tso);
+               }
+
+               u64_stats_update_begin(&q->syncp);
+               q->stats.tx_cso += shinfo->gso_segs;
+               u64_stats_update_end(&q->syncp);
+
+               /* every segment after the first adds one packet and one more
+                * copy of the headers (up to and including TCP) to the totals
+                */
+               extra_pkts = shinfo->gso_segs - 1;
+               extra_bytes = (be16_to_cpu(req->offload.inner_l4_off) +
+                              __tcp_hdrlen(th)) * extra_pkts;
+       } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+               /* checksum offload only */
+               flags = FUN_ETH_UPDATE_INNER_L4_CKSUM;
+               if (skb->csum_offset == offsetof(struct udphdr, check))
+                       flags |= FUN_ETH_INNER_UDP;
+               fun_eth_offload_init(&req->offload, flags, 0, 0, 0, 0,
+                                    skb_checksum_start_offset(skb), 0, 0);
+               FUN_QSTAT_INC(q, tx_cso);
+       } else {
+               /* no offloads requested */
+               fun_eth_offload_init(&req->offload, 0, 0, 0, 0, 0, 0, 0, 0);
+       }
+
+       /* one gather entry for the linear part plus one per page fragment */
+       ngle = shinfo->nr_frags + 1;
+       req->len8 = (sizeof(*req) + ngle * sizeof(*gle)) / 8;
+       req->dataop = FUN_DATAOP_HDR_INIT(ngle, 0, ngle, 0, skb->len);
+
+       /* fill gather entries until done or the end of the ring is reached */
+       for (i = 0, gle = (struct fun_dataop_gl *)req->dataop.imm;
+            i < ngle && txq_to_end(q, gle); i++, gle++)
+               fun_dataop_gl_init(gle, 0, 0, lens[i], addrs[i]);
+
+       /* at the end of the ring: continue from the first descriptor */
+       if (txq_to_end(q, gle) == 0) {
+               gle = (struct fun_dataop_gl *)q->desc;
+               for ( ; i < ngle; i++, gle++)
+                       fun_dataop_gl_init(gle, 0, 0, lens[i], addrs[i]);
+       }
+
+       if (IS_ENABLED(CONFIG_TLS_DEVICE) && unlikely(tls_len)) {
+               /* the TLS sub-descriptor follows the last gather entry */
+               struct fun_eth_tls *tls = (struct fun_eth_tls *)gle;
+               struct fun_ktls_tx_ctx *tls_ctx;
+
+               req->len8 += FUNETH_TLS_SZ / 8;
+               req->flags = cpu_to_be16(FUN_ETH_TX_TLS);
+
+               tls_ctx = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
+               tls->tlsid = tls_ctx->tlsid;
+               tls_ctx->next_seq += tls_len;
+
+               u64_stats_update_begin(&q->syncp);
+               q->stats.tx_tls_bytes += tls_len;
+               q->stats.tx_tls_pkts += 1 + extra_pkts;
+               u64_stats_update_end(&q->syncp);
+       }
+
+       u64_stats_update_begin(&q->syncp);
+       q->stats.tx_bytes += skb->len + extra_bytes;
+       q->stats.tx_pkts += 1 + extra_pkts;
+       u64_stats_update_end(&q->syncp);
+
+       /* remember the skb at the index of its first descriptor */
+       q->info[idx].skb = skb;
+
+       trace_funeth_tx(q, skb->len, idx, req->dataop.ngather);
+       return tx_req_ndesc(req);
+}
+
+/* Free descriptors in a Tx queue. One descriptor always stays unused because
+ * HW treats head==tail as an empty ring, hence the count is relative to the
+ * ring mask rather than the ring size.
+ */
+static unsigned int fun_txq_avail(const struct funeth_txq *q)
+{
+       unsigned int in_use = q->prod_cnt - q->cons_cnt;
+
+       return q->mask - in_use;
+}
+
+/* Stop a queue if it can't handle another worst-case packet, i.e., if fewer
+ * than FUNETH_MAX_PKT_DESC descriptors are available. The tx_nstops counter
+ * is bumped only when the queue stays stopped after the recheck.
+ */
+static void fun_tx_check_stop(struct funeth_txq *q)
+{
+       if (likely(fun_txq_avail(q) >= FUNETH_MAX_PKT_DESC))
+               return;
+
+       netif_tx_stop_queue(q->ndq);
+
+       /* NAPI reclaim is freeing packets in parallel with us and we may race.
+        * We have stopped the queue but check again after synchronizing with
+        * reclaim.
+        */
+       smp_mb();
+       if (likely(fun_txq_avail(q) < FUNETH_MAX_PKT_DESC))
+               FUN_QSTAT_INC(q, tx_nstops);
+       else
+               netif_tx_start_queue(q->ndq); /* space appeared meanwhile */
+}
+
+/* Tell whether a stopped queue has drained enough to be restarted. The
+ * current threshold is a quarter of the ring mask worth of free descriptors.
+ */
+static bool fun_txq_may_restart(struct funeth_txq *q)
+{
+       unsigned int restart_thres = q->mask / 4;
+
+       return fun_txq_avail(q) >= restart_thres;
+}
+
+/* Transmit an skb on the Tx queue selected by its queue mapping.
+ *
+ * Always returns NETDEV_TX_OK: a packet that cannot be sent (TLS fallback
+ * returned no skb, or DMA mapping failed) is dropped rather than requeued.
+ */
+netdev_tx_t fun_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+       struct funeth_priv *fp = netdev_priv(netdev);
+       unsigned int qid = skb_get_queue_mapping(skb);
+       struct funeth_txq *q = fp->txqs[qid];
+       unsigned int tls_len = 0;
+       unsigned int ndesc;
+
+       if (IS_ENABLED(CONFIG_TLS_DEVICE) && skb->sk &&
+           tls_is_sk_tx_device_offloaded(skb->sk)) {
+               skb = fun_tls_tx(skb, q, &tls_len);
+               if (unlikely(!skb))
+                       goto dropped;
+       }
+
+       ndesc = write_pkt_desc(skb, q, tls_len);
+       if (unlikely(!ndesc)) {
+               dev_kfree_skb_any(skb);
+               goto dropped;
+       }
+
+       /* publish the descriptors, then stop the queue if it is nearly full */
+       q->prod_cnt += ndesc;
+       fun_tx_check_stop(q);
+
+       skb_tx_timestamp(skb);
+
+       /* ring the doorbell only when __netdev_tx_sent_queue() says so,
+        * otherwise count the deferral
+        */
+       if (__netdev_tx_sent_queue(q->ndq, skb->len, netdev_xmit_more()))
+               fun_txq_wr_db(q);
+       else
+               FUN_QSTAT_INC(q, tx_more);
+
+       return NETDEV_TX_OK;
+
+dropped:
+       /* A dropped packet may be the last one in a xmit_more train,
+        * ring the doorbell just in case.
+        */
+       if (!netdev_xmit_more())
+               fun_txq_wr_db(q);
+       return NETDEV_TX_OK;
+}
+
+/* Read the HW head index of a Tx queue from the writeback location in host
+ * memory. The value is stored big-endian; only its low 16 bits are returned.
+ */
+static u16 txq_hw_head(const struct funeth_txq *q)
+{
+       u64 wb = be64_to_cpu(*q->hw_wb);
+
+       return (u16)wb;
+}
+
+/* Unmap the Tx packet starting at the given descriptor index and
+ * return the number of Tx descriptors it occupied.
+ *
+ * The first gather entry is unmapped with dma_unmap_single() and the rest
+ * with dma_unmap_page(), mirroring how map_skb() created the mappings.
+ */
+static unsigned int unmap_skb(const struct funeth_txq *q, unsigned int idx)
+{
+       const struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);
+       unsigned int ngle = req->dataop.ngather;
+       struct fun_dataop_gl *gle;
+
+       if (ngle) {
+               gle = (struct fun_dataop_gl *)req->dataop.imm;
+               dma_unmap_single(q->dma_dev, be64_to_cpu(gle->sgl_data),
+                                be32_to_cpu(gle->sgl_len), DMA_TO_DEVICE);
+
+               /* remaining entries up to the end of the ring */
+               for (gle++; --ngle && txq_to_end(q, gle); gle++)
+                       dma_unmap_page(q->dma_dev, be64_to_cpu(gle->sgl_data),
+                                      be32_to_cpu(gle->sgl_len),
+                                      DMA_TO_DEVICE);
+
+               /* entries left over, if any, wrapped to the start of the ring */
+               for (gle = (struct fun_dataop_gl *)q->desc; ngle; ngle--, gle++)
+                       dma_unmap_page(q->dma_dev, be64_to_cpu(gle->sgl_data),
+                                      be32_to_cpu(gle->sgl_len),
+                                      DMA_TO_DEVICE);
+       }
+
+       return tx_req_ndesc(req);
+}
+
+/* Reclaim completed Tx descriptors and free their packets. Restart a stopped
+ * queue if we freed enough descriptors.
+ *
+ * @budget limits the number of packets reclaimed; a budget of 0 means no
+ * limit.
+ *
+ * Return true if we exhausted the budget while there is more work to be done.
+ */
+static bool fun_txq_reclaim(struct funeth_txq *q, int budget)
+{
+       unsigned int npkts = 0, nbytes = 0, ndesc = 0;
+       unsigned int head, limit, reclaim_idx;
+
+       /* budget may be 0, e.g., netpoll */
+       limit = budget ? budget : UINT_MAX;
+
+       /* the outer loop re-reads the HW head after each batch */
+       for (head = txq_hw_head(q), reclaim_idx = q->cons_cnt & q->mask;
+            head != reclaim_idx && npkts < limit; head = txq_hw_head(q)) {
+               /* The HW head is continually updated, ensure we don't read
+                * descriptor state before the head tells us to reclaim it.
+                * On the enqueue side the doorbell is an implicit write
+                * barrier.
+                */
+               rmb();
+
+               do {
+                       unsigned int pkt_desc = unmap_skb(q, reclaim_idx);
+                       struct sk_buff *skb = q->info[reclaim_idx].skb;
+
+                       trace_funeth_tx_free(q, reclaim_idx, pkt_desc, head);
+
+                       nbytes += skb->len;
+                       napi_consume_skb(skb, budget);
+                       ndesc += pkt_desc;
+                       reclaim_idx = (reclaim_idx + pkt_desc) & q->mask;
+                       npkts++;
+               } while (reclaim_idx != head && npkts < limit);
+       }
+
+       /* publish the freed descriptors before looking at the stopped state */
+       q->cons_cnt += ndesc;
+       netdev_tx_completed_queue(q->ndq, npkts, nbytes);
+       smp_mb(); /* pairs with the one in fun_tx_check_stop() */
+
+       if (unlikely(netif_tx_queue_stopped(q->ndq) &&
+                    fun_txq_may_restart(q))) {
+               netif_tx_wake_queue(q->ndq);
+               FUN_QSTAT_INC(q, tx_nrestarts);
+       }
+
+       return reclaim_idx != head;
+}
+
+/* The NAPI handler for Tx queues. Reclaims completed descriptors of the Tx
+ * queue attached to the IRQ that embeds @napi. Returns @budget if reclaim
+ * reported more pending work, otherwise completes NAPI, writes the queue's
+ * doorbell with the IRQ settings and the current consumer index, and
+ * returns 0.
+ */
+int fun_txq_napi_poll(struct napi_struct *napi, int budget)
+{
+       struct fun_irq *irq = container_of(napi, struct fun_irq, napi);
+       struct funeth_txq *q = irq->txq;
+       unsigned int db_val;
+
+       if (fun_txq_reclaim(q, budget))
+               return budget;               /* exhausted budget */
+
+       napi_complete(napi);                 /* exhausted pending work */
+       /* irq_db_val carries the IRQ arm flag and coalescing settings */
+       db_val = READ_ONCE(q->irq_db_val) | (q->cons_cnt & q->mask);
+       writel(db_val, q->db);
+       return 0;
+}
+
+/* Undo the DMA mapping of the XDP Tx packet at descriptor @idx. Only the
+ * first gather entry of the request is unmapped.
+ */
+static void fun_xdp_unmap(const struct funeth_txq *q, unsigned int idx)
+{
+       const struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);
+       const struct fun_dataop_gl *gle =
+               (const struct fun_dataop_gl *)req->dataop.imm;
+       dma_addr_t dma = be64_to_cpu(gle->sgl_data);
+       unsigned int len = be32_to_cpu(gle->sgl_len);
+
+       dma_unmap_single(q->dma_dev, dma, len, DMA_TO_DEVICE);
+}
+
+/* Reclaim up to @budget completed Tx descriptors from a TX XDP queue.
+ * Each reclaimed descriptor is treated as one packet: its buffer is unmapped
+ * and released with page_frag_free(). Returns the number of descriptors
+ * reclaimed.
+ */
+static unsigned int fun_xdpq_clean(struct funeth_txq *q, unsigned int budget)
+{
+       unsigned int npkts = 0, head, reclaim_idx;
+
+       /* the outer loop re-reads the HW head after each batch */
+       for (head = txq_hw_head(q), reclaim_idx = q->cons_cnt & q->mask;
+            head != reclaim_idx && npkts < budget; head = txq_hw_head(q)) {
+               /* The HW head is continually updated, ensure we don't read
+                * descriptor state before the head tells us to reclaim it.
+                * On the enqueue side the doorbell is an implicit write
+                * barrier.
+                */
+               rmb();
+
+               do {
+                       fun_xdp_unmap(q, reclaim_idx);
+                       page_frag_free(q->info[reclaim_idx].vaddr);
+
+                       trace_funeth_tx_free(q, reclaim_idx, 1, head);
+
+                       reclaim_idx = (reclaim_idx + 1) & q->mask;
+                       npkts++;
+               } while (reclaim_idx != head && npkts < budget);
+       }
+
+       q->cons_cnt += npkts;
+       return npkts;
+}
+
+/* Queue one XDP buffer for transmission on an XDP Tx queue.
+ *
+ * @q:    the XDP Tx queue
+ * @data: start of the packet data, DMA-mapped here for the device
+ * @len:  packet length in bytes
+ *
+ * When the queue is running low on descriptors a batch of completed ones is
+ * reclaimed first. The packet occupies a single descriptor with one gather
+ * entry. The doorbell is not written here; the caller is responsible for it.
+ *
+ * Returns true if the packet was queued, false if the ring is full or the
+ * DMA mapping failed. The matching counter (tx_xdp_full or tx_map_err) is
+ * incremented on failure.
+ */
+bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len)
+{
+       struct fun_eth_tx_req *req;
+       struct fun_dataop_gl *gle;
+       unsigned int idx;
+       dma_addr_t dma;
+
+       if (fun_txq_avail(q) < FUN_XDP_CLEAN_THRES)
+               fun_xdpq_clean(q, FUN_XDP_CLEAN_BATCH);
+
+       /* a full ring is the rare case, so the hint wraps the whole test */
+       if (unlikely(!fun_txq_avail(q))) {
+               FUN_QSTAT_INC(q, tx_xdp_full);
+               return false;
+       }
+
+       dma = dma_map_single(q->dma_dev, data, len, DMA_TO_DEVICE);
+       if (unlikely(dma_mapping_error(q->dma_dev, dma))) {
+               FUN_QSTAT_INC(q, tx_map_err);
+               return false;
+       }
+
+       idx = q->prod_cnt & q->mask;
+       req = fun_tx_desc_addr(q, idx);
+       req->op = FUN_ETH_OP_TX;
+       req->len8 = (sizeof(*req) + sizeof(*gle)) / 8;
+       req->flags = 0;
+       req->suboff8 = offsetof(struct fun_eth_tx_req, dataop);
+       req->repr_idn = 0;
+       req->encap_proto = 0;
+       fun_eth_offload_init(&req->offload, 0, 0, 0, 0, 0, 0, 0, 0);
+       req->dataop = FUN_DATAOP_HDR_INIT(1, 0, 1, 0, len);
+
+       gle = (struct fun_dataop_gl *)req->dataop.imm;
+       fun_dataop_gl_init(gle, 0, 0, len, dma);
+
+       /* remembered so the buffer can be released when it is reclaimed */
+       q->info[idx].vaddr = data;
+
+       u64_stats_update_begin(&q->syncp);
+       q->stats.tx_bytes += len;
+       q->stats.tx_pkts++;
+       u64_stats_update_end(&q->syncp);
+
+       trace_funeth_tx(q, len, idx, 1);
+       q->prod_cnt++;
+
+       return true;
+}
+
+/* Transmit a batch of XDP frames on the XDP Tx queue indexed by the current
+ * CPU. Stops at the first frame the queue does not accept and returns the
+ * number of frames queued. Returns -EINVAL for unknown flags, -ENETDOWN when
+ * there are no XDP queues, and -ENXIO when the CPU has no queue. The doorbell
+ * is written only if XDP_XMIT_FLUSH is set in @flags.
+ */
+int fun_xdp_xmit_frames(struct net_device *dev, int n,
+                       struct xdp_frame **frames, u32 flags)
+{
+       struct funeth_priv *fp = netdev_priv(dev);
+       struct funeth_txq **xdpqs;
+       struct funeth_txq *q;
+       int cpu, sent;
+
+       if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+               return -EINVAL;
+
+       xdpqs = rcu_dereference_bh(fp->xdpqs);
+       if (unlikely(!xdpqs))
+               return -ENETDOWN;
+
+       cpu = smp_processor_id();
+       if (unlikely(cpu >= fp->num_xdpqs))
+               return -ENXIO;
+
+       q = xdpqs[cpu];
+       for (sent = 0; sent < n; sent++) {
+               const struct xdp_frame *xdpf = frames[sent];
+
+               if (!fun_xdp_tx(q, xdpf->data, xdpf->len))
+                       break;
+       }
+
+       if (unlikely(flags & XDP_XMIT_FLUSH))
+               fun_txq_wr_db(q);
+       return sent;
+}
+
+/* Drop every packet still sitting in a Tx queue: unmap it, free its skb, and
+ * advance the consumer count past its descriptors, then reset the stack's
+ * queue state. Must only run once HW can no longer access the packets, e.g.,
+ * after the queue has been disabled.
+ */
+static void fun_txq_purge(struct funeth_txq *q)
+{
+       struct sk_buff *skb;
+       unsigned int idx;
+
+       while (q->prod_cnt != q->cons_cnt) {
+               idx = q->cons_cnt & q->mask;
+               skb = q->info[idx].skb;
+
+               q->cons_cnt += unmap_skb(q, idx);
+               dev_kfree_skb_any(skb);
+       }
+       netdev_tx_reset_queue(q->ndq);
+}
+
+/* Drop every packet still sitting in an XDP Tx queue, unmapping each buffer
+ * and releasing it with page_frag_free(). Each packet takes one descriptor.
+ */
+static void fun_xdpq_purge(struct funeth_txq *q)
+{
+       unsigned int idx;
+
+       for ( ; q->cons_cnt != q->prod_cnt; q->cons_cnt++) {
+               idx = q->cons_cnt & q->mask;
+
+               fun_xdp_unmap(q, idx);
+               page_frag_free(q->info[idx].vaddr);
+       }
+}
+
+/* Allocate the host side of a Tx queue: the queue structure plus its ring
+ * memory, both on the queue's NUMA node. The queue is left in the
+ * FUN_QSTATE_INIT_SW state. Returns the queue, or NULL after logging an error
+ * if memory could not be allocated.
+ */
+static struct funeth_txq *fun_txq_create_sw(struct net_device *dev,
+                                           unsigned int qidx,
+                                           unsigned int ndesc,
+                                           struct fun_irq *irq)
+{
+       struct funeth_priv *fp = netdev_priv(dev);
+       struct funeth_txq *txq;
+       int node;
+
+       /* skb Tx queues have an IRQ and use its node, XDP Tx queues have no
+        * IRQ and use the node of the CPU matching the queue index
+        */
+       node = irq ? fun_irq_node(irq) : cpu_to_node(qidx);
+
+       txq = kzalloc_node(sizeof(*txq), GFP_KERNEL, node);
+       if (!txq)
+               goto err;
+
+       txq->dma_dev = &fp->pdev->dev;
+       txq->desc = fun_alloc_ring_mem(txq->dma_dev, ndesc, FUNETH_SQE_SIZE,
+                                      sizeof(*txq->info), true, node,
+                                      &txq->dma_addr, (void **)&txq->info,
+                                      &txq->hw_wb);
+       if (!txq->desc)
+               goto free_q;
+
+       txq->netdev = dev;
+       txq->qidx = qidx;
+       txq->numa_node = node;
+       txq->mask = ndesc - 1;
+       u64_stats_init(&txq->syncp);
+       txq->init_state = FUN_QSTATE_INIT_SW;
+       return txq;
+
+free_q:
+       kfree(txq);
+err:
+       netdev_err(dev, "Can't allocate memory for %s queue %u\n",
+                  irq ? "Tx" : "XDP", qidx);
+       return NULL;
+}
+
+/* Release the host side of a Tx queue. The queue's packet, byte, and mapping
+ * error counters are folded into the device-wide totals before the queue
+ * structure is freed.
+ */
+static void fun_txq_free_sw(struct funeth_txq *q)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+
+       /* preserve the queue's counters in the per-device totals */
+       fp->tx_packets += q->stats.tx_pkts;
+       fp->tx_bytes   += q->stats.tx_bytes;
+       fp->tx_dropped += q->stats.tx_map_err;
+
+       fun_free_ring_mem(q->dma_dev, q->mask + 1, FUNETH_SQE_SIZE, true,
+                         q->desc, q->dma_addr, q->info);
+       kfree(q);
+}
+
+/* Allocate the device portion of a Tx queue: create the HW SQ, create and
+ * bind the Ethernet Tx resource to it, and, for queues with an IRQ, attach
+ * the queue to the IRQ and write the initial doorbell value. A NULL @irq
+ * denotes an XDP Tx queue. On success the queue reaches FUN_QSTATE_INIT_FULL
+ * and 0 is returned; otherwise the SQ is destroyed again if it was created
+ * and the error is returned.
+ */
+int fun_txq_create_dev(struct funeth_txq *q, struct fun_irq *irq)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+       unsigned int irq_idx, ndesc = q->mask + 1;
+       int err;
+
+       /* start from an empty ring */
+       q->irq = irq;
+       *q->hw_wb = 0;
+       q->prod_cnt = 0;
+       q->cons_cnt = 0;
+       irq_idx = irq ? irq->irq_idx : 0;
+
+       err = fun_sq_create(fp->fdev,
+                           FUN_ADMIN_EPSQ_CREATE_FLAG_HEAD_WB_ADDRESS |
+                           FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR, 0,
+                           FUN_HCI_ID_INVALID, ilog2(FUNETH_SQE_SIZE), ndesc,
+                           q->dma_addr, fp->tx_coal_count, fp->tx_coal_usec,
+                           irq_idx, 0, fp->fdev->kern_end_qid, 0,
+                           &q->hw_qid, &q->db);
+       if (err)
+               goto out;
+
+       /* a non-negative return is the id of the new Ethernet resource */
+       err = fun_create_and_bind_tx(fp, q->hw_qid);
+       if (err < 0)
+               goto free_devq;
+       q->ethid = err;
+
+       if (irq) {
+               irq->txq = q;
+               q->ndq = netdev_get_tx_queue(q->netdev, q->qidx);
+               q->irq_db_val = FUN_IRQ_SQ_DB(fp->tx_coal_usec,
+                                             fp->tx_coal_count);
+               writel(q->irq_db_val, q->db);
+       }
+
+       q->init_state = FUN_QSTATE_INIT_FULL;
+       netif_info(fp, ifup, q->netdev,
+                  "%s queue %u, depth %u, HW qid %u, IRQ idx %u, eth id %u, node %d\n",
+                  irq ? "Tx" : "XDP", q->qidx, ndesc, q->hw_qid, irq_idx,
+                  q->ethid, q->numa_node);
+       return 0;
+
+free_devq:
+       fun_destroy_sq(fp->fdev, q->hw_qid);
+out:
+       netdev_err(q->netdev,
+                  "Failed to create %s queue %u on device, error %d\n",
+                  irq ? "Tx" : "XDP", q->qidx, err);
+       return err;
+}
+
+/* Release the device portion of a Tx queue and return the queue to the
+ * FUN_QSTATE_INIT_SW state. Does nothing if the queue has not reached
+ * FUN_QSTATE_INIT_FULL. Packets still queued are purged after the device
+ * resources have been destroyed.
+ */
+static void fun_txq_free_dev(struct funeth_txq *q)
+{
+       struct funeth_priv *fp = netdev_priv(q->netdev);
+
+       if (q->init_state < FUN_QSTATE_INIT_FULL)
+               return;
+
+       netif_info(fp, ifdown, q->netdev,
+                  "Freeing %s queue %u (id %u), IRQ %u, ethid %u\n",
+                  q->irq ? "Tx" : "XDP", q->qidx, q->hw_qid,
+                  q->irq ? q->irq->irq_idx : 0, q->ethid);
+
+       fun_destroy_sq(fp->fdev, q->hw_qid);
+       fun_res_destroy(fp->fdev, FUN_ADMIN_OP_ETH, 0, q->ethid);
+
+       /* queues with an IRQ hold skbs, those without hold XDP buffers */
+       if (q->irq) {
+               q->irq->txq = NULL;
+               fun_txq_purge(q);
+       } else {
+               fun_xdpq_purge(q);
+       }
+
+       q->init_state = FUN_QSTATE_INIT_SW;
+}
+
+/* Bring a Tx queue up to the requested state. A queue that does not exist
+ * yet (*qp is NULL) is first allocated on the host; the device portion is
+ * then created unless the queue is already at or beyond @state. On success
+ * *qp holds the queue and 0 is returned. On failure the error is returned and
+ * a queue allocated by this call is freed again.
+ */
+int funeth_txq_create(struct net_device *dev, unsigned int qidx,
+                     unsigned int ndesc, struct fun_irq *irq, int state,
+                     struct funeth_txq **qp)
+{
+       struct funeth_txq *txq = *qp;
+       int rc;
+
+       if (!txq) {
+               txq = fun_txq_create_sw(dev, qidx, ndesc, irq);
+               if (!txq)
+                       return -ENOMEM;
+       }
+
+       if (txq->init_state < state) {
+               rc = fun_txq_create_dev(txq, irq);
+               if (rc) {
+                       /* only undo a SW allocation made by this call */
+                       if (!*qp)
+                               fun_txq_free_sw(txq);
+                       return rc;
+               }
+       }
+
+       *qp = txq;
+       return 0;
+}
+
+/* Tear a Tx queue down to the requested state. Device resources go away for
+ * any target below FUN_QSTATE_INIT_FULL; host resources are released only for
+ * FUN_QSTATE_DESTROYED, in which case NULL is returned instead of the queue.
+ * The queue must be already disconnected from the stack.
+ */
+struct funeth_txq *funeth_txq_free(struct funeth_txq *q, int state)
+{
+       if (state < FUN_QSTATE_INIT_FULL)
+               fun_txq_free_dev(q);
+
+       if (state != FUN_QSTATE_DESTROYED)
+               return q;
+
+       fun_txq_free_sw(q);
+       return NULL;
+}
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
new file mode 100644 (file)
index 0000000..7aed056
--- /dev/null
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+
+#ifndef _FUNETH_TXRX_H
+#define _FUNETH_TXRX_H
+
+#include <linux/netdevice.h>
+#include <linux/u64_stats_sync.h>
+
+/* Tx descriptor (SQE) size in bytes */
+#define FUNETH_SQE_SIZE 64U
+
+/* Size in bytes of the device headers (struct fun_eth_tx_req) per Tx packet */
+#define FUNETH_FUNOS_HDR_SZ (sizeof(struct fun_eth_tx_req))
+
+/* Number of gather list entries that fit in one Tx descriptor */
+#define FUNETH_GLE_PER_DESC (FUNETH_SQE_SIZE / sizeof(struct fun_dataop_gl))
+
+/* Max gather list size in bytes for an sk_buff: MAX_SKB_FRAGS + 1 entries. */
+#define FUNETH_MAX_GL_SZ ((MAX_SKB_FRAGS + 1) * sizeof(struct fun_dataop_gl))
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+# define FUNETH_TLS_SZ sizeof(struct fun_eth_tls)
+#else
+# define FUNETH_TLS_SZ 0
+#endif /* CONFIG_TLS_DEVICE */
+
+/* Max # of Tx descriptors for an sk_buff: header + max gather list + TLS. */
+#define FUNETH_MAX_GL_DESC \
+       DIV_ROUND_UP((FUNETH_FUNOS_HDR_SZ + FUNETH_MAX_GL_SZ + FUNETH_TLS_SZ), \
+                    FUNETH_SQE_SIZE)
+
+/* Max number of Tx descriptors for any packet. */
+#define FUNETH_MAX_PKT_DESC FUNETH_MAX_GL_DESC
+
+/* Rx CQ descriptor (CQE) size in bytes. */
+#define FUNETH_CQE_SIZE 64U
+
+/* Byte offset of cqe_info within a CQE; it occupies the CQE's last bytes. */
+#define FUNETH_CQE_INFO_OFFSET (FUNETH_CQE_SIZE - sizeof(struct fun_cqe_info))
+
+/* Construct the IRQ portion of a CQ doorbell. The resulting value arms the
+ * interrupt and carries the usec delay and pkts count moderation settings.
+ */
+#define FUN_IRQ_CQ_DB(usec, pkts) \
+       (FUN_DB_IRQ_ARM_F | ((usec) << FUN_DB_INTCOAL_USEC_S) | \
+        ((pkts) << FUN_DB_INTCOAL_ENTRIES_S))
+
+/* Same for SQ doorbells; the expansion is identical to FUN_IRQ_CQ_DB. */
+#define FUN_IRQ_SQ_DB(usec, pkts) \
+       (FUN_DB_IRQ_ARM_F | \
+        ((usec) << FUN_DB_INTCOAL_USEC_S) | \
+        ((pkts) << FUN_DB_INTCOAL_ENTRIES_S))
+
+/* Per packet tailroom for skb_shared_info. Present only for 1-frag packets. */
+#define FUN_RX_TAILROOM SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
+
+/* Per packet headroom for XDP. Used instead of XDP_PACKET_HEADROOM so that
+ * two packets fit per buffer with 4K pages and 1500B MTUs.
+ */
+#define FUN_XDP_HEADROOM 192
+
+/* Initialization state of a queue, ordered from least to most set up. */
+enum {
+       FUN_QSTATE_DESTROYED, /* does not exist at all */
+       FUN_QSTATE_INIT_SW,   /* exists in SW, not on the device */
+       FUN_QSTATE_INIT_FULL, /* exists both in SW and on device */
+};
+
+/* Initialization state of an interrupt. */
+enum {
+       FUN_IRQ_INIT,      /* initialized and in the XArray but inactive */
+       FUN_IRQ_REQUESTED, /* request_irq() done */
+       FUN_IRQ_ENABLED,   /* processing enabled */
+       FUN_IRQ_DISABLED,  /* processing disabled */
+};
+
+struct bpf_prog;
+
+struct funeth_txq_stats {  /* per Tx queue SW counters */
+       u64 tx_pkts;       /* # of Tx packets */
+       u64 tx_bytes;      /* total bytes of Tx packets */
+       u64 tx_cso;        /* # of packets with checksum offload */
+       u64 tx_tso;        /* # of non-encapsulated TSO super-packets */
+       u64 tx_encap_tso;  /* # of encapsulated TSO super-packets */
+       u64 tx_more;       /* # of doorbells elided due to xmit_more */
+       u64 tx_nstops;     /* # of times the queue has stopped */
+       u64 tx_nrestarts;  /* # of times the queue has restarted */
+       u64 tx_map_err;    /* # of packets dropped due to DMA mapping errors */
+       u64 tx_xdp_full;   /* # of XDP packets that could not be enqueued */
+       u64 tx_tls_pkts;   /* # of Tx TLS packets offloaded to HW */
+       u64 tx_tls_bytes;  /* Tx bytes of HW-handled TLS payload */
+       u64 tx_tls_fallback; /* # of attempted Tx TLS offloads punted to SW */
+       u64 tx_tls_drops;  /* # of attempted Tx TLS offloads dropped */
+};
+
+struct funeth_tx_info {      /* per Tx descriptor state */
+       union {              /* a single pointer, in skb or XDP flavor */
+               struct sk_buff *skb; /* associated packet */
+               void *vaddr;         /* start address for XDP */
+       };
+};
+
+struct funeth_txq {
+       /* RO cacheline of frequently accessed data */
+       u32 mask;               /* queue depth - 1 */
+       u32 hw_qid;             /* device ID of the queue */
+       void *desc;             /* base address of descriptor ring */
+       struct funeth_tx_info *info; /* per Tx descriptor state */
+       struct device *dma_dev; /* device for DMA mappings */
+       volatile __be64 *hw_wb; /* HW write-back location */
+       u32 __iomem *db;        /* SQ doorbell register address */
+       struct netdev_queue *ndq;
+       dma_addr_t dma_addr;    /* DMA address of descriptor ring */
+       /* producer R/W cacheline */
+       u16 qidx;               /* queue index within net_device */
+       u16 ethid;              /* ID of its FUN_ADMIN_OP_ETH resource */
+       u32 prod_cnt;           /* producer counter */
+       struct funeth_txq_stats stats; /* SW counters, see syncp */
+       /* shared R/W cacheline, primarily accessed by consumer */
+       u32 irq_db_val;         /* value written to IRQ doorbell */
+       u32 cons_cnt;           /* consumer (cleanup) counter */
+       struct net_device *netdev; /* associated net_device */
+       struct fun_irq *irq;    /* its interrupt, NULL for XDP queues */
+       int numa_node;
+       u8 init_state;          /* initialization state, FUN_QSTATE_* */
+       struct u64_stats_sync syncp; /* for stats, see FUN_QSTAT_* */
+};
+
+struct funeth_rxq_stats {  /* per Rx queue SW counters */
+       u64 rx_pkts;       /* # of received packets, including SW drops */
+       u64 rx_bytes;      /* total size of received packets */
+       u64 rx_cso;        /* # of packets with checksum offload */
+       u64 rx_bufs;       /* total # of Rx buffers provided to device */
+       u64 gro_pkts;      /* # of GRO superpackets */
+       u64 gro_merged;    /* # of pkts merged into existing GRO superpackets */
+       u64 rx_page_alloc; /* # of page allocations for Rx buffers */
+       u64 rx_budget;     /* # of NAPI polls that exhausted their budget */
+       u64 rx_mem_drops;  /* # of packets dropped due to memory shortage */
+       u64 rx_map_err;    /* # of page DMA mapping errors */
+       u64 xdp_drops;     /* # of XDP_DROPped packets */
+       u64 xdp_tx;        /* # of successful XDP transmits */
+       u64 xdp_redir;     /* # of successful XDP redirects */
+       u64 xdp_err;       /* # of packets dropped due to XDP errors */
+};
+
+struct funeth_rxbuf {          /* per Rx buffer state */
+       struct page *page;     /* associated page */
+       dma_addr_t dma_addr;   /* DMA address of page start */
+       int pg_refs;           /* # of page refs held by driver */
+       int node;              /* page node, or -1 if page is PF_MEMALLOC */
+};
+
+struct funeth_rx_cache {       /* cache of DMA-mapped previously used buffers */
+       struct funeth_rxbuf *bufs; /* base of cached buffer state ring */
+       unsigned int prod_cnt;     /* producer counter */
+       unsigned int cons_cnt;     /* consumer counter */
+       unsigned int mask;         /* cache depth - 1 */
+};
+
+/* An Rx queue consists of a CQ and an SQ used to provide Rx buffers. */
+struct funeth_rxq {
+       struct net_device *netdev; /* associated net_device */
+       struct napi_struct *napi;
+       struct device *dma_dev;    /* device for DMA mappings */
+       void *cqes;                /* base of CQ descriptor ring */
+       const void *next_cqe_info; /* fun_cqe_info of next CQE */
+       u32 __iomem *cq_db;        /* CQ doorbell register address */
+       unsigned int cq_head;      /* CQ head index */
+       unsigned int cq_mask;      /* CQ depth - 1 */
+       u16 phase;                 /* CQ phase tag */
+       u16 qidx;                  /* queue index within net_device */
+       unsigned int irq_db_val;   /* IRQ info for CQ doorbell */
+       struct fun_eprq_rqbuf *rqes; /* base of RQ descriptor ring */
+       struct funeth_rxbuf *bufs; /* base of Rx buffer state ring */
+       struct funeth_rxbuf *cur_buf; /* currently active buffer */
+       u32 __iomem *rq_db;        /* RQ doorbell register address */
+       unsigned int rq_cons;      /* RQ consumer counter */
+       unsigned int rq_mask;      /* RQ depth - 1 */
+       unsigned int buf_offset;   /* offset of next pkt in head buffer */
+       u8 xdp_flush;              /* XDP flush types needed at NAPI end */
+       u8 init_state;             /* initialization state, FUN_QSTATE_* */
+       u16 headroom;              /* per packet headroom */
+       unsigned int rq_cons_db;   /* value of rq_cons at last RQ db */
+       unsigned int rq_db_thres;  /* # of new buffers needed to write RQ db */
+       struct funeth_rxbuf spare_buf; /* spare for next buffer replacement */
+       struct funeth_rx_cache cache; /* used buffer cache */
+       struct bpf_prog *xdp_prog; /* optional XDP BPF program */
+       struct funeth_rxq_stats stats; /* SW counters, see syncp */
+       dma_addr_t cq_dma_addr;    /* DMA address of CQE ring */
+       dma_addr_t rq_dma_addr;    /* DMA address of RQE ring */
+       u16 irq_cnt;
+       u32 hw_cqid;               /* device ID of the queue's CQ */
+       u32 hw_sqid;               /* device ID of the queue's SQ */
+       int numa_node;
+       struct u64_stats_sync syncp; /* for stats, see FUN_QSTAT_* */
+       struct xdp_rxq_info xdp_rxq;
+};
+
+#define FUN_QSTAT_INC(q, counter) \
+       do { /* bump q->stats.counter inside a q->syncp update section */ \
+               u64_stats_update_begin(&(q)->syncp); \
+               (q)->stats.counter++; \
+               u64_stats_update_end(&(q)->syncp); \
+       } while (0)
+
+#define FUN_QSTAT_READ(q, seq, stats_copy) \
+       do { /* copy q->stats to stats_copy, again if fetch_retry says so */ \
+               seq = u64_stats_fetch_begin(&(q)->syncp); \
+               stats_copy = (q)->stats; \
+       } while (u64_stats_fetch_retry(&(q)->syncp, (seq)))
+
+#define FUN_INT_NAME_LEN (IFNAMSIZ + 16) /* size of fun_irq.name */
+
+struct fun_irq {
+       struct napi_struct napi;
+       struct funeth_txq *txq;   /* Tx queue on this IRQ, NULL if none */
+       struct funeth_rxq *rxq;   /* presumably the Rx queue on this IRQ */
+       u8 state;                 /* presumably one of FUN_IRQ_* */
+       u16 irq_idx;              /* index of MSI-X interrupt */
+       int irq;                  /* Linux IRQ vector */
+       cpumask_t affinity_mask;  /* IRQ affinity */
+       struct irq_affinity_notify aff_notify;
+       char name[FUN_INT_NAME_LEN];
+} ____cacheline_internodealigned_in_smp;
+
+/* Address of Tx descriptor idx; descriptors are FUNETH_SQE_SIZE bytes each. */
+static inline void *fun_tx_desc_addr(const struct funeth_txq *q,
+                                    unsigned int idx)
+{
+       return (u8 *)q->desc + idx * FUNETH_SQE_SIZE;
+}
+
+/* Write the producer counter, masked with q->mask, to the SQ doorbell. */
+static inline void fun_txq_wr_db(const struct funeth_txq *q)
+{
+       writel(q->prod_cnt & q->mask, q->db);
+}
+
+/* Memory node for the first CPU in the IRQ's affinity mask. */
+static inline int fun_irq_node(const struct fun_irq *p)
+{
+       return local_memory_node(cpu_to_node(cpumask_first(&p->affinity_mask)));
+}
+
+int fun_rxq_napi_poll(struct napi_struct *napi, int budget);
+int fun_txq_napi_poll(struct napi_struct *napi, int budget);
+netdev_tx_t fun_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len);
+int fun_xdp_xmit_frames(struct net_device *dev, int n,
+                       struct xdp_frame **frames, u32 flags);
+
+int funeth_txq_create(struct net_device *dev, unsigned int qidx,
+                     unsigned int ndesc, struct fun_irq *irq, int state,
+                     struct funeth_txq **qp);
+int fun_txq_create_dev(struct funeth_txq *q, struct fun_irq *irq);
+struct funeth_txq *funeth_txq_free(struct funeth_txq *q, int state);
+int funeth_rxq_create(struct net_device *dev, unsigned int qidx,
+                     unsigned int ncqe, unsigned int nrqe, struct fun_irq *irq,
+                     int state, struct funeth_rxq **qp);
+int fun_rxq_create_dev(struct funeth_rxq *q, struct fun_irq *irq);
+struct funeth_rxq *funeth_rxq_free(struct funeth_rxq *q, int state);
+int fun_rxq_set_bpf(struct funeth_rxq *q, struct bpf_prog *prog);
+
+#endif /* _FUNETH_TXRX_H */