net_sched: sch_fq: do not assume EDT packets are ordered
author    Eric Dumazet <edumazet@google.com>
          Sat, 4 May 2019 23:48:53 +0000 (16:48 -0700)
committer David S. Miller <davem@davemloft.net>
          Tue, 7 May 2019 19:09:25 +0000 (12:09 -0700)

The TCP stack makes sure packets for a given flow carry monotonically
increasing skb->tstamp, but we want to allow UDP packets to use EDT
(Earliest Departure Time) as well, so that QUIC servers can use
in-kernel pacing.
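
As an illustration (not part of this patch), a UDP sender such as a QUIC
server can hand a per-packet departure time to the kernel with the
SO_TXTIME socket option and an SCM_TXTIME control message carrying a
CLOCK_MONOTONIC timestamp, which fq then uses for pacing. A minimal
sketch follows; the destination address, port and the 1 ms offset are
made-up values and error handling is omitted:

#include <arpa/inet.h>
#include <linux/net_tstamp.h>	/* struct sock_txtime */
#include <linux/types.h>
#include <string.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>

#ifndef SCM_TXTIME		/* asm-generic value, for older libc headers */
#define SO_TXTIME 61
#define SCM_TXTIME SO_TXTIME
#endif

int main(void)
{
	struct sock_txtime cfg = {
		.clockid = CLOCK_MONOTONIC,	/* fq paces on CLOCK_MONOTONIC */
		.flags	 = 0,
	};
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(4433),	/* illustrative port */
	};
	char payload[1200] = { 0 };
	char control[CMSG_SPACE(sizeof(__u64))] = { 0 };
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	struct msghdr msg = {
		.msg_name	= &dst,
		.msg_namelen	= sizeof(dst),
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;
	struct timespec now;
	__u64 txtime;
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));

	/* ask the fq qdisc to release this datagram roughly 1 ms from now */
	clock_gettime(CLOCK_MONOTONIC, &now);
	txtime = (__u64)now.tv_sec * 1000000000ULL + now.tv_nsec + 1000000;

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SCM_TXTIME;
	cm->cmsg_len   = CMSG_LEN(sizeof(txtime));
	memcpy(CMSG_DATA(cm), &txtime, sizeof(txtime));

	sendmsg(fd, &msg, 0);
	close(fd);
	return 0;
}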

This patch adds a per-flow rb-tree on which out-of-order packets can
be stored. We still use the linear list for the typical case where
packets are queued with monotonically increasing skb->tstamp, since
enqueueing to and dequeuing from a standard list is O(1).
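
For illustration only, the following userspace sketch mimics the
queueing scheme described above: in-order packets are appended to a
plain FIFO in O(1), out-of-order ones go into a second structure sorted
by departure time (the real patch uses an rbtree keyed on time_to_send;
this toy uses a sorted list to stay short), and dequeue picks whichever
candidate is due first, like fq_peek() in the diff below:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	uint64_t time_to_send;
	struct pkt *next;
};

struct flow {
	struct pkt *head, *tail;	/* fast path: already in EDT order */
	struct pkt *ooo;		/* slow path: kept sorted on insert */
};

static void flow_enqueue(struct flow *f, uint64_t tts)
{
	struct pkt *p = calloc(1, sizeof(*p));

	p->time_to_send = tts;
	if (!f->head || tts >= f->tail->time_to_send) {
		/* common case: append in O(1) */
		if (f->tail)
			f->tail->next = p;
		else
			f->head = p;
		f->tail = p;
	} else {
		/* out-of-order EDT: sorted insert (an rbtree in the patch) */
		struct pkt **pp = &f->ooo;

		while (*pp && (*pp)->time_to_send <= tts)
			pp = &(*pp)->next;
		p->next = *pp;
		*pp = p;
	}
}

static struct pkt *flow_dequeue(struct flow *f)
{
	struct pkt *p;

	/* like fq_peek(): pick the earlier of the two candidates */
	if (f->ooo && (!f->head || f->ooo->time_to_send < f->head->time_to_send)) {
		p = f->ooo;
		f->ooo = p->next;
	} else {
		p = f->head;
		if (p) {
			f->head = p->next;
			if (!f->head)
				f->tail = NULL;
		}
	}
	return p;
}

int main(void)
{
	struct flow f = { 0 };
	uint64_t in[] = { 10, 20, 30, 15, 40 };	/* 15 arrives out of order */
	struct pkt *p;
	size_t i;

	for (i = 0; i < sizeof(in) / sizeof(in[0]); i++)
		flow_enqueue(&f, in[i]);

	while ((p = flow_dequeue(&f))) {	/* prints 10 15 20 30 40 */
		printf("%llu\n", (unsigned long long)p->time_to_send);
		free(p);
	}
	return 0;
}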

Note that the ability to store packets in arbitrary EDT order will
later allow us to implement a per-TCP-socket mechanism that adds
delays (possibly with jitter) and reordering, making it easy to build
convenient network emulators.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/sched/sch_fq.c

index d107c74..ee13836 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
 #include <net/tcp_states.h>
 #include <net/tcp.h>
 
+struct fq_skb_cb {
+       u64             time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+       qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+       return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
 /*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotonically increasing time_to_send, they are placed in O(1)
+ * in linear list (head,tail), otherwise are placed in an rbtree (t_root).
  */
 struct fq_flow {
+       struct rb_root  t_root;
        struct sk_buff  *head;          /* list of skbs for this flow : first skb */
        union {
                struct sk_buff *tail;   /* last skb in the list */
@@ -298,6 +311,8 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
                q->stat_allocation_errors++;
                return &q->internal;
        }
+       /* f->t_root is already zeroed after kmem_cache_zalloc() */
+
        fq_flow_set_detached(f);
        f->sk = sk;
        if (skb->sk)
@@ -312,14 +327,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
        return f;
 }
 
+static struct sk_buff *fq_peek(struct fq_flow *flow)
+{
+       struct sk_buff *skb = skb_rb_first(&flow->t_root);
+       struct sk_buff *head = flow->head;
+
+       if (!skb)
+               return head;
+
+       if (!head)
+               return skb;
+
+       if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+               return skb;
+       return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+                         struct sk_buff *skb)
+{
+       if (skb == flow->head) {
+               flow->head = skb->next;
+       } else {
+               rb_erase(&skb->rbnode, &flow->t_root);
+               skb->dev = qdisc_dev(sch);
+       }
+}
 
 /* remove one skb from head of flow queue */
 static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 {
-       struct sk_buff *skb = flow->head;
+       struct sk_buff *skb = fq_peek(flow);
 
        if (skb) {
-               flow->head = skb->next;
+               fq_erase_head(sch, flow, skb);
                skb_mark_not_on_list(skb);
                flow->qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
@@ -330,15 +371,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 {
-       struct sk_buff *head = flow->head;
+       struct rb_node **p, *parent;
+       struct sk_buff *head, *aux;
 
-       skb->next = NULL;
-       if (!head)
-               flow->head = skb;
-       else
-               flow->tail->next = skb;
+       fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
+
+       head = flow->head;
+       if (!head ||
+           fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
+               if (!head)
+                       flow->head = skb;
+               else
+                       flow->tail->next = skb;
+               flow->tail = skb;
+               skb->next = NULL;
+               return;
+       }
 
-       flow->tail = skb;
+       p = &flow->t_root.rb_node;
+       parent = NULL;
+
+       while (*p) {
+               parent = *p;
+               aux = rb_to_skb(parent);
+               if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+                       p = &parent->rb_right;
+               else
+                       p = &parent->rb_left;
+       }
+       rb_link_node(&skb->rbnode, parent, p);
+       rb_insert_color(&skb->rbnode, &flow->t_root);
 }
 
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -450,9 +512,9 @@ begin:
                goto begin;
        }
 
-       skb = f->head;
+       skb = fq_peek(f);
        if (skb) {
-               u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+               u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
                                             f->time_next_packet);
 
                if (now < time_next_packet) {
@@ -533,6 +595,15 @@ out:
 
 static void fq_flow_purge(struct fq_flow *flow)
 {
+       struct rb_node *p = rb_first(&flow->t_root);
+
+       while (p) {
+               struct sk_buff *skb = rb_to_skb(p);
+
+               p = rb_next(p);
+               rb_erase(&skb->rbnode, &flow->t_root);
+               rtnl_kfree_skbs(skb, skb);
+       }
        rtnl_kfree_skbs(flow->head, flow->tail);
        flow->head = NULL;
        flow->qlen = 0;