
net: implement threaded-able napi poll loop support
author		Wei Wang <weiwan@google.com>
		Mon, 8 Feb 2021 19:34:09 +0000 (11:34 -0800)
committer	David S. Miller <davem@davemloft.net>
		Tue, 9 Feb 2021 23:27:28 +0000 (15:27 -0800)
This patch allows running each napi poll loop inside its own
kernel thread. The kthread is created during netif_napi_add()
if dev->threaded is set, and threaded mode is enabled in
napi_enable(). We will provide a way to set dev->threaded and
enable threaded mode without a device up/down in the following
patch.

Once threaded mode is enabled and the kthread is started,
napi_schedule() wakes up the kthread instead of raising the
softirq.

The threaded poll loop behaves much like net_rx_action(), but
it does not have to manipulate local IRQs and uses an explicit
scheduling point based on netdev_budget.
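
For illustration, a driver built against this patch alone could opt in
as sketched below. This is a minimal sketch: the my_* names are
hypothetical, dev->threaded is written directly because the runtime
knob only arrives in the follow-up patch, and netif_napi_add() is the
four-argument form used by this kernel:

struct my_netdev_priv {
	struct napi_struct napi;
};

static int my_poll(struct napi_struct *napi, int budget)
{
	/* process up to @budget packets, return the number handled */
	return 0;
}

static int my_open(struct net_device *dev)
{
	struct my_netdev_priv *priv = netdev_priv(dev);

	dev->threaded = 1;	/* request a kthread per napi instance */
	netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);	/* sets NAPI_STATE_THREADED */
	return 0;
}

If kthread creation fails in netif_napi_add(), dev->threaded is
cleared again and the napi transparently falls back to softirq mode.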

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/netdevice.h
net/core/dev.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e9e7ada..99fb4ec 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
        struct list_head        dev_list;
        struct hlist_node       napi_hash_node;
        unsigned int            napi_id;
+       struct task_struct      *thread;
 };
 
 enum {
@@ -358,6 +359,7 @@ enum {
        NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
+       NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
 };
 
 enum {
@@ -369,6 +371,7 @@ enum {
        NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
+       NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
 };
 
 enum gro_result {
@@ -503,20 +506,7 @@ static inline bool napi_complete(struct napi_struct *n)
  */
 void napi_disable(struct napi_struct *n);
 
-/**
- *     napi_enable - enable NAPI scheduling
- *     @n: NAPI context
- *
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
-       BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-       smp_mb__before_atomic();
-       clear_bit(NAPI_STATE_SCHED, &n->state);
-       clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
 
 /**
  *     napi_synchronize - wait until NAPI is not running
@@ -1827,6 +1817,8 @@ enum netdev_priv_flags {
  *
  *     @wol_enabled:   Wake-on-LAN is enabled
  *
+ *     @threaded:      napi threaded mode is enabled
+ *
  *     @net_notifier_list:     List of per-net netdev notifier block
  *                             that follow this device when it is moved
  *                             to another network namespace.
@@ -2145,6 +2137,7 @@ struct net_device {
        struct lock_class_key   *qdisc_running_key;
        bool                    proto_down;
        unsigned                wol_enabled:1;
+       unsigned                threaded:1;
 
        struct list_head        net_notifier_list;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 59751a2..1e35f4f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/kthread.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <net/net_namespace.h>
@@ -1494,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_notify_peers);
 
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+       int err = 0;
+
+       /* Create and wake up the kthread once to put it in
+        * TASK_INTERRUPTIBLE mode to avoid the blocked task
+        * warning and work with loadavg.
+        */
+       n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+                               n->dev->name, n->napi_id);
+       if (IS_ERR(n->thread)) {
+               err = PTR_ERR(n->thread);
+               pr_err("kthread_run failed with err %d\n", err);
+               n->thread = NULL;
+       }
+
+       return err;
+}
+
 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
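
/* For context, not part of this patch: kthread_run() above is
 * kthread_create() plus an immediate wake_up_process(). That single
 * wake-up lets the new thread enter napi_thread_wait() and park itself
 * in TASK_INTERRUPTIBLE; a created-but-never-woken kthread would sit
 * in TASK_UNINTERRUPTIBLE instead, inflating loadavg and eventually
 * tripping the hung-task watchdog, which is what the comment in
 * napi_kthread_create() guards against.
 */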
@@ -4265,6 +4287,21 @@ int gro_normal_batch __read_mostly = 8;
 static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
 {
+       struct task_struct *thread;
+
+       if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+               /* Paired with smp_mb__before_atomic() in
+                * napi_enable(). Use READ_ONCE() to guarantee
+                * a complete read on napi->thread. Only call
+                * wake_up_process() when it's not NULL.
+                */
+               thread = READ_ONCE(napi->thread);
+               if (thread) {
+                       wake_up_process(thread);
+                       return;
+               }
+       }
+
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
@@ -6728,6 +6765,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
        set_bit(NAPI_STATE_NPSVC, &napi->state);
        list_add_rcu(&napi->dev_list, &dev->napi_list);
        napi_hash_add(napi);
+       /* Create kthread for this napi if dev->threaded is set.
+        * Clear dev->threaded if kthread creation failed so that
+        * threaded mode will not be enabled in napi_enable().
+        */
+       if (dev->threaded && napi_kthread_create(napi))
+               dev->threaded = 0;
 }
 EXPORT_SYMBOL(netif_napi_add);
 
@@ -6745,9 +6788,28 @@ void napi_disable(struct napi_struct *n)
 
        clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
        clear_bit(NAPI_STATE_DISABLE, &n->state);
+       clear_bit(NAPI_STATE_THREADED, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
 
+/**
+ *     napi_enable - enable NAPI scheduling
+ *     @n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+       BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+       smp_mb__before_atomic();
+       clear_bit(NAPI_STATE_SCHED, &n->state);
+       clear_bit(NAPI_STATE_NPSVC, &n->state);
+       if (n->dev->threaded && n->thread)
+               set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
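
/* For context, not part of this patch: the smp_mb__before_atomic()
 * above is the barrier that the comment in ____napi_schedule() pairs
 * with. Once a remote CPU observes this napi as schedulable, it also
 * observes the thread pointer published by napi_kthread_create(); the
 * READ_ONCE() plus NULL check on the reader side cover the rest.
 */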
 static void flush_gro_hash(struct napi_struct *napi)
 {
        int i;
@@ -6773,6 +6835,11 @@ void __netif_napi_del(struct napi_struct *napi)
 
        flush_gro_hash(napi);
        napi->gro_bitmask = 0;
+
+       if (napi->thread) {
+               kthread_stop(napi->thread);
+               napi->thread = NULL;
+       }
 }
 EXPORT_SYMBOL(__netif_napi_del);
 
@@ -6867,6 +6934,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
        return work;
 }
 
+static int napi_thread_wait(struct napi_struct *napi)
+{
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+               if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+                       WARN_ON(!list_empty(&napi->poll_list));
+                       __set_current_state(TASK_RUNNING);
+                       return 0;
+               }
+
+               schedule();
+               set_current_state(TASK_INTERRUPTIBLE);
+       }
+       __set_current_state(TASK_RUNNING);
+       return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+       struct napi_struct *napi = data;
+       void *have;
+
+       while (!napi_thread_wait(napi)) {
+               for (;;) {
+                       bool repoll = false;
+
+                       local_bh_disable();
+
+                       have = netpoll_poll_lock(napi);
+                       __napi_poll(napi, &repoll);
+                       netpoll_poll_unlock(have);
+
+                       __kfree_skb_flush();
+                       local_bh_enable();
+
+                       if (!repoll)
+                               break;
+
+                       cond_resched();
+               }
+       }
+       return 0;
+}
+
 static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
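
/* For context, not part of this patch: net_rx_action(), beginning
 * above, is the softirq-mode counterpart the commit message compares
 * against. It must disable local IRQs to splice sd->poll_list and
 * bounds its work with netdev_budget and netdev_budget_usecs,
 * re-raising NET_RX_SOFTIRQ when either runs out. napi_threaded_poll()
 * owns its napi exclusively, so it only wraps __napi_poll() in
 * local_bh_disable()/local_bh_enable() and yields with cond_resched()
 * between repolls.
 */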