net: Add support for subordinate traffic classes to netdev_pick_tx

diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7..09a7cc2 100644
 
 #include "net-sysfs.h"
 
-/* Instead of increasing this, you should create a hash table. */
 #define MAX_GRO_SKBS 8
 
 /* This should be increased if a protocol with a bigger head is added. */
@@ -2068,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
                int i;
 
+               /* Walk through the TCs and see which one the txq falls into */
                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
                        if ((txq - tc->offset) < tc->count)
                                return i;
                }
 
+               /* didn't find it, just return -1 to indicate no match */
                return -1;
        }
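
For illustration, a hypothetical two-class table and how the lookup above resolves it (the subtraction relies on unsigned wrap-around, so a txq below tc->offset can never match):

/* Hypothetical mapping, for illustration only: two TCs of four queues each. */
static const struct netdev_tc_txq example_tc_to_txq[TC_MAX_QUEUE] = {
        { .count = 4, .offset = 0 },    /* tc 0 owns txqs 0-3 */
        { .count = 4, .offset = 4 },    /* tc 1 owns txqs 4-7 */
};

With this table, txq 5 yields (5 - 4) < 4 only for tc 1, so netdev_txq_to_tc() returns 1; txq 9 falls outside both windows and the lookup returns -1.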
 
@@ -2081,6 +2082,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 EXPORT_SYMBOL(netdev_txq_to_tc);
 
 #ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)            \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
@@ -2092,7 +2097,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
        int pos;
 
        if (dev_maps)
-               map = xmap_dereference(dev_maps->cpu_map[tci]);
+               map = xmap_dereference(dev_maps->attr_map[tci]);
        if (!map)
                return false;
 
@@ -2105,7 +2110,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
                        break;
                }
 
-               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                kfree_rcu(map, rcu);
                return false;
        }
@@ -2135,33 +2140,68 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
        return active;
 }
 
+static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
+                          struct xps_dev_maps *dev_maps, unsigned int nr_ids,
+                          u16 offset, u16 count, bool is_rxqs_map)
+{
+       bool active = false;
+       int i, j;
+
+       for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
+            j < nr_ids;)
+               active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+                                              count);
+       if (!active) {
+               if (is_rxqs_map) {
+                       RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+               } else {
+                       RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+
+                       for (i = offset + (count - 1); count--; i--)
+                               netdev_queue_numa_node_write(
+                                       netdev_get_tx_queue(dev, i),
+                                                       NUMA_NO_NODE);
+               }
+               kfree_rcu(dev_maps, rcu);
+       }
+}
+
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
                                   u16 count)
 {
+       const unsigned long *possible_mask = NULL;
        struct xps_dev_maps *dev_maps;
-       int cpu, i;
-       bool active = false;
+       unsigned int nr_ids;
+
+       if (!static_key_false(&xps_needed))
+               return;
 
        mutex_lock(&xps_map_mutex);
-       dev_maps = xmap_dereference(dev->xps_maps);
 
+       if (static_key_false(&xps_rxqs_needed)) {
+               dev_maps = xmap_dereference(dev->xps_rxqs_map);
+               if (dev_maps) {
+                       nr_ids = dev->num_rx_queues;
+                       clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+                                      offset, count, true);
+               }
+       }
+
+       dev_maps = xmap_dereference(dev->xps_cpus_map);
        if (!dev_maps)
                goto out_no_maps;
 
-       for_each_possible_cpu(cpu)
-               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
-                                              offset, count);
-
-       if (!active) {
-               RCU_INIT_POINTER(dev->xps_maps, NULL);
-               kfree_rcu(dev_maps, rcu);
-       }
-
-       for (i = offset + (count - 1); count--; i--)
-               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
-                                            NUMA_NO_NODE);
+       if (num_possible_cpus() > 1)
+               possible_mask = cpumask_bits(cpu_possible_mask);
+       nr_ids = nr_cpu_ids;
+       clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
+                      false);
 
 out_no_maps:
+       if (static_key_enabled(&xps_rxqs_needed))
+               static_key_slow_dec(&xps_rxqs_needed);
+
+       static_key_slow_dec(&xps_needed);
        mutex_unlock(&xps_map_mutex);
 }
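
The comma-operator for-loops used above and below walk the set bits of an attribute mask. A minimal equivalent of the loop in clean_xps_maps(), assuming netif_attrmask_next() returns the next set index (or nr_ids when none remain) and treats a NULL mask as "every index":

        int j = -1;

        while ((j = netif_attrmask_next(j, mask, nr_ids)) < nr_ids)
                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);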
 
@@ -2170,8 +2210,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 }
 
-static struct xps_map *expand_xps_map(struct xps_map *map,
-                                     int cpu, u16 index)
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+                                     u16 index, bool is_rxqs_map)
 {
        struct xps_map *new_map;
        int alloc_len = XPS_MIN_MAP_ALLOC;
@@ -2183,7 +2223,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
                return map;
        }
 
-       /* Need to add queue to this CPU's existing map */
+       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
        if (map) {
                if (pos < map->alloc_len)
                        return map;
@@ -2191,9 +2231,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
                alloc_len = map->alloc_len * 2;
        }
 
-       /* Need to allocate new map to store queue on this CPU's map */
-       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
-                              cpu_to_node(cpu));
+       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+        * map
+        */
+       if (is_rxqs_map)
+               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+       else
+               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+                                      cpu_to_node(attr_index));
        if (!new_map)
                return NULL;
 
@@ -2205,32 +2250,52 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
        return new_map;
 }
 
-int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
-                       u16 index)
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+                         u16 index, bool is_rxqs_map)
 {
+       const unsigned long *online_mask = NULL, *possible_mask = NULL;
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
-       int i, cpu, tci, numa_node_id = -2;
+       int i, j, tci, numa_node_id = -2;
        int maps_sz, num_tc = 1, tc = 0;
        struct xps_map *map, *new_map;
        bool active = false;
+       unsigned int nr_ids;
 
        if (dev->num_tc) {
+               /* Do not allow XPS on subordinate device directly */
                num_tc = dev->num_tc;
+               if (num_tc < 0)
+                       return -EINVAL;
+
+               /* If queue belongs to subordinate dev use its map */
+               dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
                tc = netdev_txq_to_tc(dev, index);
                if (tc < 0)
                        return -EINVAL;
        }
 
-       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
-       if (maps_sz < L1_CACHE_BYTES)
-               maps_sz = L1_CACHE_BYTES;
-
        mutex_lock(&xps_map_mutex);
+       if (is_rxqs_map) {
+               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+               dev_maps = xmap_dereference(dev->xps_rxqs_map);
+               nr_ids = dev->num_rx_queues;
+       } else {
+               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+               if (num_possible_cpus() > 1) {
+                       online_mask = cpumask_bits(cpu_online_mask);
+                       possible_mask = cpumask_bits(cpu_possible_mask);
+               }
+               dev_maps = xmap_dereference(dev->xps_cpus_map);
+               nr_ids = nr_cpu_ids;
+       }
 
-       dev_maps = xmap_dereference(dev->xps_maps);
+       if (maps_sz < L1_CACHE_BYTES)
+               maps_sz = L1_CACHE_BYTES;
 
        /* allocate memory for queue storage */
-       for_each_cpu_and(cpu, cpu_online_mask, mask) {
+       for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+            j < nr_ids;) {
                if (!new_dev_maps)
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
@@ -2238,73 +2303,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        return -ENOMEM;
                }
 
-               tci = cpu * num_tc + tc;
-               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
+               tci = j * num_tc + tc;
+               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
                                 NULL;
 
-               map = expand_xps_map(map, cpu, index);
+               map = expand_xps_map(map, j, index, is_rxqs_map);
                if (!map)
                        goto error;
 
-               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
        }
 
        if (!new_dev_maps)
                goto out_no_new_maps;
 
-       for_each_possible_cpu(cpu) {
+       static_key_slow_inc(&xps_needed);
+       if (is_rxqs_map)
+               static_key_slow_inc(&xps_rxqs_needed);
+
+       for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+            j < nr_ids;) {
                /* copy maps belonging to foreign traffic classes */
-               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
                        /* fill in the new device map from the old device map */
-                       map = xmap_dereference(dev_maps->cpu_map[tci]);
-                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+                       map = xmap_dereference(dev_maps->attr_map[tci]);
+                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
                }
 
                /* We need to explicitly update tci as previous loop
                 * could break out early if dev_maps is NULL.
                 */
-               tci = cpu * num_tc + tc;
+               tci = j * num_tc + tc;
 
-               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
-                       /* add queue to CPU maps */
+               if (netif_attr_test_mask(j, mask, nr_ids) &&
+                   netif_attr_test_online(j, online_mask, nr_ids)) {
+                       /* add tx-queue to CPU/rx-queue maps */
                        int pos = 0;
 
-                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;
 
                        if (pos == map->len)
                                map->queues[map->len++] = index;
 #ifdef CONFIG_NUMA
-                       if (numa_node_id == -2)
-                               numa_node_id = cpu_to_node(cpu);
-                       else if (numa_node_id != cpu_to_node(cpu))
-                               numa_node_id = -1;
+                       if (!is_rxqs_map) {
+                               if (numa_node_id == -2)
+                                       numa_node_id = cpu_to_node(j);
+                               else if (numa_node_id != cpu_to_node(j))
+                                       numa_node_id = -1;
+                       }
 #endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
-                       map = xmap_dereference(dev_maps->cpu_map[tci]);
-                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+                       map = xmap_dereference(dev_maps->attr_map[tci]);
+                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
                }
 
                /* copy maps belonging to foreign traffic classes */
                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
                        /* fill in the new device map from the old device map */
-                       map = xmap_dereference(dev_maps->cpu_map[tci]);
-                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+                       map = xmap_dereference(dev_maps->attr_map[tci]);
+                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
                }
        }
 
-       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+       if (is_rxqs_map)
+               rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
+       else
+               rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 
        /* Cleanup old maps */
        if (!dev_maps)
                goto out_no_old_maps;
 
-       for_each_possible_cpu(cpu) {
-               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
-                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
-                       map = xmap_dereference(dev_maps->cpu_map[tci]);
+       for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+            j < nr_ids;) {
+               for (i = num_tc, tci = j * num_tc; i--; tci++) {
+                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+                       map = xmap_dereference(dev_maps->attr_map[tci]);
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }
@@ -2317,19 +2394,23 @@ out_no_old_maps:
        active = true;
 
 out_no_new_maps:
-       /* update Tx queue numa node */
-       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
-                                    (numa_node_id >= 0) ? numa_node_id :
-                                    NUMA_NO_NODE);
+       if (!is_rxqs_map) {
+               /* update Tx queue numa node */
+               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+                                            (numa_node_id >= 0) ?
+                                            numa_node_id : NUMA_NO_NODE);
+       }
 
        if (!dev_maps)
                goto out_no_maps;
 
-       /* removes queue from unused CPUs */
-       for_each_possible_cpu(cpu) {
-               for (i = tc, tci = cpu * num_tc; i--; tci++)
+       /* removes tx-queue from unused CPUs/rx-queues */
+       for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+            j < nr_ids;) {
+               for (i = tc, tci = j * num_tc; i--; tci++)
                        active |= remove_xps_queue(dev_maps, tci, index);
-               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+               if (!netif_attr_test_mask(j, mask, nr_ids) ||
+                   !netif_attr_test_online(j, online_mask, nr_ids))
                        active |= remove_xps_queue(dev_maps, tci, index);
                for (i = num_tc - tc, tci++; --i; tci++)
                        active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2418,10 @@ out_no_new_maps:
 
        /* free map if not active */
        if (!active) {
-               RCU_INIT_POINTER(dev->xps_maps, NULL);
+               if (is_rxqs_map)
+                       RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+               else
+                       RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
                kfree_rcu(dev_maps, rcu);
        }
 
@@ -2347,11 +2431,12 @@ out_no_maps:
        return 0;
 error:
        /* remove any maps that we added */
-       for_each_possible_cpu(cpu) {
-               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
-                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+       for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+            j < nr_ids;) {
+               for (i = num_tc, tci = j * num_tc; i--; tci++) {
+                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        map = dev_maps ?
-                             xmap_dereference(dev_maps->cpu_map[tci]) :
+                             xmap_dereference(dev_maps->attr_map[tci]) :
                              NULL;
                        if (new_map && new_map != map)
                                kfree(new_map);
@@ -2363,14 +2448,34 @@ error:
        kfree(new_dev_maps);
        return -ENOMEM;
 }
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+                       u16 index)
+{
+       return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+}
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
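
netif_set_xps_queue() is now a thin cpumask wrapper around __netif_set_xps_queue(); a receive-queue based mapping is installed the same way by passing an rx-queue bitmap with is_rxqs_map set. A minimal sketch of such a caller (the real sysfs plumbing lives outside this file; the helper name is illustrative):

/* Hypothetical wrapper: steer tx-queue 'index' from the rx queues set in 'mask'. */
static int example_set_xps_rxqs(struct net_device *dev,
                                const unsigned long *mask, u16 index)
{
        return __netif_set_xps_queue(dev, mask, index, true);
}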
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+       struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+       /* Unbind any subordinate channels */
+       while (txq-- != &dev->_tx[0]) {
+               if (txq->sb_dev)
+                       netdev_unbind_sb_channel(dev, txq->sb_dev);
+       }
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
 #endif
+       netdev_unbind_all_sb_channels(dev);
+
+       /* Reset TC configuration of device */
        dev->num_tc = 0;
        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2399,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 #ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
 #endif
+       netdev_unbind_all_sb_channels(dev);
+
        dev->num_tc = num_tc;
        return 0;
 }
 EXPORT_SYMBOL(netdev_set_num_tc);
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+                             struct net_device *sb_dev)
+{
+       struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+       netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+       memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+       memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+       while (txq-- != &dev->_tx[0]) {
+               if (txq->sb_dev == sb_dev)
+                       txq->sb_dev = NULL;
+       }
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+                                struct net_device *sb_dev,
+                                u8 tc, u16 count, u16 offset)
+{
+       /* Make certain the sb_dev and dev are already configured */
+       if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+               return -EINVAL;
+
+       /* We cannot hand out queues we don't have */
+       if ((offset + count) > dev->real_num_tx_queues)
+               return -EINVAL;
+
+       /* Record the mapping */
+       sb_dev->tc_to_txq[tc].count = count;
+       sb_dev->tc_to_txq[tc].offset = offset;
+
+       /* Provide a way for Tx queue to find the tc_to_txq map or
+        * XPS map for itself.
+        */
+       while (count--)
+               netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+       return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+       /* Do not use a multiqueue device to represent a subordinate channel */
+       if (netif_is_multiqueue(dev))
+               return -ENODEV;
+
+       /* We allow channels 1 - 32767 to be used for subordinate channels.
+        * Channel 0 is meant to be "native" mode and used only to represent
+        * the main root device. We allow writing 0 to reset the device back
+        * to normal mode after being used as a subordinate channel.
+        */
+       if (channel > S16_MAX)
+               return -EINVAL;
+
+       dev->num_tc = -channel;
+
+       return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
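
The helpers above are intended to be used together by an offloading driver: mark a single-queue upper device as a subordinate channel, then hand it a slice of the lower device's tx queues. A minimal sketch, assuming the lower device already has traffic classes configured via netdev_set_num_tc() (function name and error handling are illustrative):

static int example_bind_sb(struct net_device *dev, struct net_device *sb_dev,
                           u16 offset, u16 count)
{
        int err;

        /* sb_dev must be single-queue; a non-zero channel marks it subordinate. */
        err = netdev_set_sb_channel(sb_dev, 1);
        if (err)
                return err;

        /* Hand tc 0 of sb_dev a window of tx queues on the real device. */
        err = netdev_bind_sb_channel_queue(dev, sb_dev, 0, count, offset);
        if (err)
                netdev_set_sb_channel(sb_dev, 0);

        return err;
}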
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2615,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach);
  * Returns a Tx hash based on the given packet descriptor and a Tx queue count
  * to be used as a distribution range.
  */
-static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
+static u16 skb_tx_hash(const struct net_device *dev,
+                      const struct net_device *sb_dev,
+                      struct sk_buff *skb)
 {
        u32 hash;
        u16 qoffset = 0;
        u16 qcount = dev->real_num_tx_queues;
 
+       if (dev->num_tc) {
+               u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+               qoffset = sb_dev->tc_to_txq[tc].offset;
+               qcount = sb_dev->tc_to_txq[tc].count;
+       }
+
        if (skb_rx_queue_recorded(skb)) {
                hash = skb_get_rx_queue(skb);
                while (unlikely(hash >= qcount))
                        hash -= qcount;
-               return hash;
-       }
-
-       if (dev->num_tc) {
-               u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-
-               qoffset = dev->tc_to_txq[tc].offset;
-               qcount = dev->tc_to_txq[tc].count;
+               return hash + qoffset;
        }
 
        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
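
With the subordinate device's tc_to_txq table consulted first, both branches now return a queue inside the class window [qoffset, qoffset + qcount). A small sketch of the recorded-rx-queue fold, with hypothetical numbers:

static u16 example_fold_rx_queue(u16 rxq, u16 qoffset, u16 qcount)
{
        /* e.g. qoffset = 8, qcount = 4: rxq 10 -> 2 -> txq 10, rxq 1 -> txq 9 */
        while (rxq >= qcount)
                rxq -= qcount;
        return rxq + qoffset;
}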
@@ -3376,32 +3549,64 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+                              struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+       struct xps_map *map;
+       int queue_index = -1;
+
+       if (dev->num_tc) {
+               tci *= dev->num_tc;
+               tci += netdev_get_prio_tc_map(dev, skb->priority);
+       }
+
+       map = rcu_dereference(dev_maps->attr_map[tci]);
+       if (map) {
+               if (map->len == 1)
+                       queue_index = map->queues[0];
+               else
+                       queue_index = map->queues[reciprocal_scale(
+                                               skb_get_hash(skb), map->len)];
+               if (unlikely(queue_index >= dev->real_num_tx_queues))
+                       queue_index = -1;
+       }
+       return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+                        struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
        struct xps_dev_maps *dev_maps;
-       struct xps_map *map;
+       struct sock *sk = skb->sk;
        int queue_index = -1;
 
+       if (!static_key_false(&xps_needed))
+               return -1;
+
        rcu_read_lock();
-       dev_maps = rcu_dereference(dev->xps_maps);
+       if (!static_key_false(&xps_rxqs_needed))
+               goto get_cpus_map;
+
+       dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
        if (dev_maps) {
-               unsigned int tci = skb->sender_cpu - 1;
+               int tci = sk_rx_queue_get(sk);
 
-               if (dev->num_tc) {
-                       tci *= dev->num_tc;
-                       tci += netdev_get_prio_tc_map(dev, skb->priority);
-               }
+               if (tci >= 0 && tci < dev->num_rx_queues)
+                       queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+                                                         tci);
+       }
 
-               map = rcu_dereference(dev_maps->cpu_map[tci]);
-               if (map) {
-                       if (map->len == 1)
-                               queue_index = map->queues[0];
-                       else
-                               queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
-                                                                          map->len)];
-                       if (unlikely(queue_index >= dev->real_num_tx_queues))
-                               queue_index = -1;
+get_cpus_map:
+       if (queue_index < 0) {
+               dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+               if (dev_maps) {
+                       unsigned int tci = skb->sender_cpu - 1;
+
+                       queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+                                                         tci);
                }
        }
        rcu_read_unlock();
@@ -3412,17 +3617,20 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 #endif
 }
 
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+                            struct net_device *sb_dev)
 {
        struct sock *sk = skb->sk;
        int queue_index = sk_tx_queue_get(sk);
 
+       sb_dev = sb_dev ? : dev;
+
        if (queue_index < 0 || skb->ooo_okay ||
            queue_index >= dev->real_num_tx_queues) {
-               int new_index = get_xps_queue(dev, skb);
+               int new_index = get_xps_queue(dev, sb_dev, skb);
 
                if (new_index < 0)
-                       new_index = skb_tx_hash(dev, skb);
+                       new_index = skb_tx_hash(dev, sb_dev, skb);
 
                if (queue_index != new_index && sk &&
                    sk_fullsock(sk) &&
@@ -3435,9 +3643,15 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
        return queue_index;
 }
 
+static u16 __netdev_pick_tx(struct net_device *dev,
+                           struct sk_buff *skb)
+{
+       return ___netdev_pick_tx(dev, skb, NULL);
+}
+
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
                                    struct sk_buff *skb,
-                                   void *accel_priv)
+                                   struct net_device *sb_dev)
 {
        int queue_index = 0;
 
@@ -3452,10 +3666,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
                const struct net_device_ops *ops = dev->netdev_ops;
 
                if (ops->ndo_select_queue)
-                       queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+                       queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
                                                            __netdev_pick_tx);
                else
-                       queue_index = __netdev_pick_tx(dev, skb);
+                       queue_index = ___netdev_pick_tx(dev, skb, sb_dev);
 
                queue_index = netdev_cap_txqueue(dev, queue_index);
        }
@@ -3467,7 +3681,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 /**
  *     __dev_queue_xmit - transmit a buffer
  *     @skb: buffer to transmit
- *     @accel_priv: private data used for L2 forwarding offload
+ *     @sb_dev: subordinate device used for L2 forwarding offload
  *
  *     Queue a buffer for transmission to a network device. The caller must
  *     have set the device and priority and built the buffer before calling
@@ -3490,7 +3704,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
  *      the BH enable code must have IRQs enabled so that it will not deadlock.
  *          --BLG
  */
-static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 {
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
@@ -3529,7 +3743,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
        else
                skb_dst_force(skb);
 
-       txq = netdev_pick_tx(dev, skb, accel_priv);
+       txq = netdev_pick_tx(dev, skb, sb_dev);
        q = rcu_dereference_bh(txq->qdisc);
 
        trace_net_dev_queue(skb);
@@ -3603,9 +3817,9 @@ int dev_queue_xmit(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_queue_xmit);
 
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 {
-       return __dev_queue_xmit(skb, accel_priv);
+       return __dev_queue_xmit(skb, sb_dev);
 }
 EXPORT_SYMBOL(dev_queue_xmit_accel);
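
Callers that used to hand an opaque accel_priv cookie now pass the subordinate net_device itself, so queue selection can consult its tc_to_txq and XPS state. A minimal sketch of an L2-forwarding transmit path, assuming sb_dev was bound with netdev_bind_sb_channel_queue() (names are illustrative):

static int example_sb_xmit(struct sk_buff *skb, struct net_device *lower_dev,
                           struct net_device *sb_dev)
{
        skb->dev = lower_dev;
        return dev_queue_xmit_accel(skb, sb_dev);
}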
 
@@ -4494,7 +4708,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
        return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+                                   struct packet_type **ppt_prev)
 {
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
@@ -4624,8 +4839,7 @@ skip_classify:
        if (pt_prev) {
                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
-               else
-                       ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+               *ppt_prev = pt_prev;
        } else {
 drop:
                if (!deliver_exact)
@@ -4643,6 +4857,18 @@ out:
        return ret;
 }
 
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+       struct net_device *orig_dev = skb->dev;
+       struct packet_type *pt_prev = NULL;
+       int ret;
+
+       ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+       if (pt_prev)
+               ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+       return ret;
+}
+
 /**
  *     netif_receive_skb_core - special purpose version of netif_receive_skb
  *     @skb: buffer to process
@@ -4663,13 +4889,67 @@ int netif_receive_skb_core(struct sk_buff *skb)
        int ret;
 
        rcu_read_lock();
-       ret = __netif_receive_skb_core(skb, false);
+       ret = __netif_receive_skb_one_core(skb, false);
        rcu_read_unlock();
 
        return ret;
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+                                                 struct packet_type *pt_prev,
+                                                 struct net_device *orig_dev)
+{
+       struct sk_buff *skb, *next;
+
+       if (!pt_prev)
+               return;
+       if (list_empty(head))
+               return;
+       if (pt_prev->list_func != NULL)
+               pt_prev->list_func(head, pt_prev, orig_dev);
+       else
+               list_for_each_entry_safe(skb, next, head, list)
+                       pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+       /* Fast-path assumptions:
+        * - There is no RX handler.
+        * - Only one packet_type matches.
+        * If either of these fails, we will end up doing some per-packet
+        * processing in-line, then handling the 'last ptype' for the whole
+        * sublist.  This can't cause out-of-order delivery to any single ptype,
+        * because the 'last ptype' must be constant across the sublist, and all
+        * other ptypes are handled per-packet.
+        */
+       /* Current (common) ptype of sublist */
+       struct packet_type *pt_curr = NULL;
+       /* Current (common) orig_dev of sublist */
+       struct net_device *od_curr = NULL;
+       struct list_head sublist;
+       struct sk_buff *skb, *next;
+
+       list_for_each_entry_safe(skb, next, head, list) {
+               struct net_device *orig_dev = skb->dev;
+               struct packet_type *pt_prev = NULL;
+
+               __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+               if (pt_curr != pt_prev || od_curr != orig_dev) {
+                       /* dispatch old sublist */
+                       list_cut_before(&sublist, head, &skb->list);
+                       __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+                       /* start new sublist */
+                       pt_curr = pt_prev;
+                       od_curr = orig_dev;
+               }
+       }
+
+       /* dispatch final sublist */
+       __netif_receive_skb_list_ptype(head, pt_curr, od_curr);
+}
+
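
The sublist dispatch above leans on list_cut_before(): every skb queued ahead of the current one is moved onto the sublist in a single operation. A minimal illustration, assuming the usual list.h semantics:

static void example_cut(struct list_head *head, struct sk_buff *skb)
{
        LIST_HEAD(sublist);

        /* head: A -> B -> C -> D, with skb == C */
        list_cut_before(&sublist, head, &skb->list);
        /* sublist now holds A -> B; head keeps C -> D */
}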
 static int __netif_receive_skb(struct sk_buff *skb)
 {
        int ret;
@@ -4687,14 +4967,44 @@ static int __netif_receive_skb(struct sk_buff *skb)
                 * context down to all allocation sites.
                 */
                noreclaim_flag = memalloc_noreclaim_save();
-               ret = __netif_receive_skb_core(skb, true);
+               ret = __netif_receive_skb_one_core(skb, true);
                memalloc_noreclaim_restore(noreclaim_flag);
        } else
-               ret = __netif_receive_skb_core(skb, false);
+               ret = __netif_receive_skb_one_core(skb, false);
 
        return ret;
 }
 
+static void __netif_receive_skb_list(struct list_head *head)
+{
+       unsigned long noreclaim_flag = 0;
+       struct sk_buff *skb, *next;
+       bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+       list_for_each_entry_safe(skb, next, head, list) {
+               if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+                       struct list_head sublist;
+
+                       /* Handle the previous sublist */
+                       list_cut_before(&sublist, head, &skb->list);
+                       if (!list_empty(&sublist))
+                               __netif_receive_skb_list_core(&sublist, pfmemalloc);
+                       pfmemalloc = !pfmemalloc;
+                       /* See comments in __netif_receive_skb */
+                       if (pfmemalloc)
+                               noreclaim_flag = memalloc_noreclaim_save();
+                       else
+                               memalloc_noreclaim_restore(noreclaim_flag);
+               }
+       }
+       /* Handle the remaining sublist */
+       if (!list_empty(head))
+               __netif_receive_skb_list_core(head, pfmemalloc);
+       /* Restore pflags */
+       if (pfmemalloc)
+               memalloc_noreclaim_restore(noreclaim_flag);
+}
+
 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 {
        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4769,6 +5079,50 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
        return ret;
 }
 
+static void netif_receive_skb_list_internal(struct list_head *head)
+{
+       struct bpf_prog *xdp_prog = NULL;
+       struct sk_buff *skb, *next;
+
+       list_for_each_entry_safe(skb, next, head, list) {
+               net_timestamp_check(netdev_tstamp_prequeue, skb);
+               if (skb_defer_rx_timestamp(skb))
+                       /* Handled, remove from list */
+                       list_del(&skb->list);
+       }
+
+       if (static_branch_unlikely(&generic_xdp_needed_key)) {
+               preempt_disable();
+               rcu_read_lock();
+               list_for_each_entry_safe(skb, next, head, list) {
+                       xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+                       if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
+                               /* Dropped, remove from list */
+                               list_del(&skb->list);
+               }
+               rcu_read_unlock();
+               preempt_enable();
+       }
+
+       rcu_read_lock();
+#ifdef CONFIG_RPS
+       if (static_key_false(&rps_needed)) {
+               list_for_each_entry_safe(skb, next, head, list) {
+                       struct rps_dev_flow voidflow, *rflow = &voidflow;
+                       int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+                       if (cpu >= 0) {
+                               enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+                               /* Handled, remove from list */
+                               list_del(&skb->list);
+                       }
+               }
+       }
+#endif
+       __netif_receive_skb_list(head);
+       rcu_read_unlock();
+}
+
 /**
  *     netif_receive_skb - process receive buffer from network
  *     @skb: buffer to process
@@ -4792,6 +5146,28 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
+/**
+ *     netif_receive_skb_list - process many receive buffers from network
+ *     @head: list of skbs to process.
+ *
+ *     Since return value of netif_receive_skb() is normally ignored, and
+ *     wouldn't be meaningful for a list, this function returns void.
+ *
+ *     This function may only be called from softirq context and interrupts
+ *     should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+       struct sk_buff *skb;
+
+       if (list_empty(head))
+               return;
+       list_for_each_entry(skb, head, list)
+               trace_netif_receive_skb_list_entry(skb);
+       netif_receive_skb_list_internal(head);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
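
A NAPI driver can now batch completed buffers and hand the whole list to the stack in one call instead of looping over netif_receive_skb(). A minimal sketch of such a poll routine; the ring structure and its helper are hypothetical:

static int example_poll_rx(struct example_ring *ring, int budget)
{
        LIST_HEAD(rx_list);
        struct sk_buff *skb;
        int work = 0;

        while (work < budget && (skb = example_ring_next_skb(ring)) != NULL) {
                list_add_tail(&skb->list, &rx_list);
                work++;
        }

        netif_receive_skb_list(&rx_list);
        return work;
}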
 DEFINE_PER_CPU(struct work_struct, flush_works);
 
 /* Network device is going away, flush any packets still pending */
@@ -4875,42 +5251,45 @@ out:
        return netif_receive_skb_internal(skb);
 }
 
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
+                                  bool flush_old)
 {
-       struct sk_buff *skb, *prev = NULL;
-
-       /* scan list and build reverse chain */
-       for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
-               skb->prev = prev;
-               prev = skb;
-       }
-
-       for (skb = prev; skb; skb = prev) {
-               skb->next = NULL;
+       struct list_head *head = &napi->gro_hash[index].list;
+       struct sk_buff *skb, *p;
 
+       list_for_each_entry_safe_reverse(skb, p, head, list) {
                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
                        return;
-
-               prev = skb->prev;
+               list_del_init(&skb->list);
                napi_gro_complete(skb);
                napi->gro_count--;
+               napi->gro_hash[index].count--;
        }
+}
+
+/* napi->gro_hash[].list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+       u32 i;
 
-       napi->gro_list = NULL;
+       for (i = 0; i < GRO_HASH_BUCKETS; i++)
+               __napi_gro_flush_chain(napi, i, flush_old);
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+                                         struct sk_buff *skb)
 {
-       struct sk_buff *p;
        unsigned int maclen = skb->dev->hard_header_len;
        u32 hash = skb_get_hash_raw(skb);
+       struct list_head *head;
+       struct sk_buff *p;
 
-       for (p = napi->gro_list; p; p = p->next) {
+       head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
+       list_for_each_entry(p, head, list) {
                unsigned long diffs;
 
                NAPI_GRO_CB(p)->flush = 0;
@@ -4933,6 +5312,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
                                       maclen);
                NAPI_GRO_CB(p)->same_flow = !diffs;
        }
+
+       return head;
 }
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4975,20 +5356,41 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
        }
 }
 
+static void gro_flush_oldest(struct list_head *head)
+{
+       struct sk_buff *oldest;
+
+       oldest = list_last_entry(head, struct sk_buff, list);
+
+       /* We are called with head length >= MAX_GRO_SKBS, so this is
+        * impossible.
+        */
+       if (WARN_ON_ONCE(!oldest))
+               return;
+
+       /* Do not adjust napi->gro_count, caller is adding a new SKB to
+        * the chain.
+        */
+       list_del(&oldest->list);
+       napi_gro_complete(oldest);
+}
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
-       struct sk_buff **pp = NULL;
+       u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+       struct list_head *head = &offload_base;
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
-       struct list_head *head = &offload_base;
-       int same_flow;
+       struct list_head *gro_head;
+       struct sk_buff *pp = NULL;
        enum gro_result ret;
+       int same_flow;
        int grow;
 
        if (netif_elide_gro(skb->dev))
                goto normal;
 
-       gro_list_prepare(napi, skb);
+       gro_head = gro_list_prepare(napi, skb);
 
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
@@ -5022,7 +5424,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
                        NAPI_GRO_CB(skb)->csum_valid = 0;
                }
 
-               pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+               pp = ptype->callbacks.gro_receive(gro_head, skb);
                break;
        }
        rcu_read_unlock();
@@ -5039,12 +5441,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
        if (pp) {
-               struct sk_buff *nskb = *pp;
-
-               *pp = nskb->next;
-               nskb->next = NULL;
-               napi_gro_complete(nskb);
+               list_del_init(&pp->list);
+               napi_gro_complete(pp);
                napi->gro_count--;
+               napi->gro_hash[hash].count--;
        }
 
        if (same_flow)
@@ -5053,26 +5453,17 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
        if (NAPI_GRO_CB(skb)->flush)
                goto normal;
 
-       if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-               struct sk_buff *nskb = napi->gro_list;
-
-               /* locate the end of the list to select the 'oldest' flow */
-               while (nskb->next) {
-                       pp = &nskb->next;
-                       nskb = *pp;
-               }
-               *pp = NULL;
-               nskb->next = NULL;
-               napi_gro_complete(nskb);
+       if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+               gro_flush_oldest(gro_head);
        } else {
                napi->gro_count++;
+               napi->gro_hash[hash].count++;
        }
        NAPI_GRO_CB(skb)->count = 1;
        NAPI_GRO_CB(skb)->age = jiffies;
        NAPI_GRO_CB(skb)->last = skb;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
-       skb->next = napi->gro_list;
-       napi->gro_list = skb;
+       list_add(&skb->list, gro_head);
        ret = GRO_HELD;
 
 pull:
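
GRO flows are now spread across GRO_HASH_BUCKETS per-NAPI lists keyed by the low bits of the flow hash, with each bucket capped at MAX_GRO_SKBS independently. The bucket selection reduces to a mask, as in this sketch (assuming GRO_HASH_BUCKETS is a power of two):

static struct list_head *example_gro_bucket(struct napi_struct *napi,
                                            struct sk_buff *skb)
{
        u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);

        return &napi->gro_hash[bucket].list;
}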
@@ -5478,7 +5869,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
                                 NAPIF_STATE_IN_BUSY_POLL)))
                return false;
 
-       if (n->gro_list) {
+       if (n->gro_count) {
                unsigned long timeout = 0;
 
                if (work_done)
@@ -5687,7 +6078,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
         */
-       if (napi->gro_list && !napi_disable_pending(napi) &&
+       if (napi->gro_count && !napi_disable_pending(napi) &&
            !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
                __napi_schedule_irqoff(napi);
 
@@ -5697,11 +6088,16 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
 {
+       int i;
+
        INIT_LIST_HEAD(&napi->poll_list);
        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        napi->timer.function = napi_watchdog;
        napi->gro_count = 0;
-       napi->gro_list = NULL;
+       for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+               INIT_LIST_HEAD(&napi->gro_hash[i].list);
+               napi->gro_hash[i].count = 0;
+       }
        napi->skb = NULL;
        napi->poll = poll;
        if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +6130,19 @@ void napi_disable(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_disable);
 
+static void flush_gro_hash(struct napi_struct *napi)
+{
+       int i;
+
+       for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+               struct sk_buff *skb, *n;
+
+               list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
+                       kfree_skb(skb);
+               napi->gro_hash[i].count = 0;
+       }
+}
+
 /* Must be called in process context */
 void netif_napi_del(struct napi_struct *napi)
 {
@@ -5743,8 +6152,7 @@ void netif_napi_del(struct napi_struct *napi)
        list_del_init(&napi->dev_list);
        napi_free_frags(napi);
 
-       kfree_skb_list(napi->gro_list);
-       napi->gro_list = NULL;
+       flush_gro_hash(napi);
        napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
@@ -5787,7 +6195,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
                goto out_unlock;
        }
 
-       if (n->gro_list) {
+       if (n->gro_count) {
                /* flush too old packets
                 * If HZ < 1000, flush all packets.
                 */