1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
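/* Editor's note -- an illustrative, stand-alone sketch, not part of the
 * original sch_cake.c.  The weighted DRR scheme described above is easier to
 * see in miniature: each tin earns a quantum of bytes when visited and may
 * send while its deficit stays positive, so long-term shares follow the
 * ratio of the quanta.  All names below are hypothetical and the real tin
 * selection in cake_dequeue() is considerably more involved.
 */
struct drr_tin_sketch {
	int deficit;	/* bytes this tin may still send in the current round */
	int quantum;	/* bytes credited each time the tin is visited */
	int backlog;	/* bytes queued in this tin */
};

static int __maybe_unused drr_sketch_pick_tin(struct drr_tin_sketch *tins,
					      int ntins, int pkt_len)
{
	int i, t = 0, busy = 0;

	for (i = 0; i < ntins; i++)
		busy |= tins[i].backlog > 0;
	if (!busy)
		return -1;	/* nothing queued anywhere */

	for (;;) {
		if (tins[t].backlog > 0 && tins[t].deficit > 0) {
			tins[t].deficit -= pkt_len;
			tins[t].backlog -= pkt_len;
			return t;	/* this tin transmits */
		}
		if (tins[t].backlog > 0)
			tins[t].deficit += tins[t].quantum; /* top up, move on */
		t = (t + 1) % ntins;
	}
}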
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:   codel initial drop rate
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91         u64     interval;
92         u64     target;
93         u64     mtu_time;
94         u32     p_inc;
95         u32     p_dec;
96 };
97
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:              codel dropping frequency
100  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
101  * @drop_next:          time to drop next packet, or when we dropped last
102  * @blue_timer:         Blue time to next drop
103  * @p_drop:             BLUE drop probability (0.32 fxp)
104  * @dropping:           set if in dropping state
105  * @ecn_marked:         set if marked
106  */
107 struct cobalt_vars {
108         u32     count;
109         u32     rec_inv_sqrt;
110         ktime_t drop_next;
111         ktime_t blue_timer;
112         u32     p_drop;
113         bool    dropping;
114         bool    ecn_marked;
115 };
116
117 enum {
118         CAKE_SET_NONE = 0,
119         CAKE_SET_SPARSE,
120         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121         CAKE_SET_BULK,
122         CAKE_SET_DECAYING
123 };
124
125 struct cake_flow {
126         /* this stuff is all needed per-flow at dequeue time */
127         struct sk_buff    *head;
128         struct sk_buff    *tail;
129         struct list_head  flowchain;
130         s32               deficit;
131         u32               dropped;
132         struct cobalt_vars cvars;
133         u16               srchost; /* index into cake_host table */
134         u16               dsthost;
135         u8                set;
136 }; /* please try to keep this structure <= 64 bytes */
137
138 struct cake_host {
139         u32 srchost_tag;
140         u32 dsthost_tag;
141         u16 srchost_bulk_flow_count;
142         u16 dsthost_bulk_flow_count;
143 };
144
145 struct cake_heap_entry {
146         u16 t:3, b:10;
147 };
148
149 struct cake_tin_data {
150         struct cake_flow flows[CAKE_QUEUES];
151         u32     backlogs[CAKE_QUEUES];
152         u32     tags[CAKE_QUEUES]; /* for set association */
153         u16     overflow_idx[CAKE_QUEUES];
154         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155         u16     flow_quantum;
156
157         struct cobalt_params cparams;
158         u32     drop_overlimit;
159         u16     bulk_flow_count;
160         u16     sparse_flow_count;
161         u16     decaying_flow_count;
162         u16     unresponsive_flow_count;
163
164         u32     max_skblen;
165
166         struct list_head new_flows;
167         struct list_head old_flows;
168         struct list_head decaying_flows;
169
170         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171         ktime_t time_next_packet;
172         u64     tin_rate_ns;
173         u64     tin_rate_bps;
174         u16     tin_rate_shft;
175
176         u16     tin_quantum_prio;
177         u16     tin_quantum_band;
178         s32     tin_deficit;
179         u32     tin_backlog;
180         u32     tin_dropped;
181         u32     tin_ecn_mark;
182
183         u32     packets;
184         u64     bytes;
185
186         u32     ack_drops;
187
188         /* moving averages */
189         u64 avge_delay;
190         u64 peak_delay;
191         u64 base_delay;
192
193         /* hash function stats */
194         u32     way_directs;
195         u32     way_hits;
196         u32     way_misses;
197         u32     way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199
200 struct cake_sched_data {
201         struct tcf_proto __rcu *filter_list; /* optional external classifier */
202         struct tcf_block *block;
203         struct cake_tin_data *tins;
204
205         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206         u16             overflow_timeout;
207
208         u16             tin_cnt;
209         u8              tin_mode;
210         u8              flow_mode;
211         u8              ack_filter;
212         u8              atm_mode;
213
214         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
215         u16             rate_shft;
216         ktime_t         time_next_packet;
217         ktime_t         failsafe_next_packet;
218         u64             rate_ns;
219         u64             rate_bps;
220         u16             rate_flags;
221         s16             rate_overhead;
222         u16             rate_mpu;
223         u64             interval;
224         u64             target;
225
226         /* resource tracking */
227         u32             buffer_used;
228         u32             buffer_max_used;
229         u32             buffer_limit;
230         u32             buffer_config_limit;
231
232         /* indices for dequeue */
233         u16             cur_tin;
234         u16             cur_flow;
235
236         struct qdisc_watchdog watchdog;
237         const u8        *tin_index;
238         const u8        *tin_order;
239
240         /* bandwidth capacity estimate */
241         ktime_t         last_packet_time;
242         ktime_t         avg_window_begin;
243         u64             avg_packet_interval;
244         u64             avg_window_bytes;
245         u64             avg_peak_bandwidth;
246         ktime_t         last_reconfig_time;
247
248         /* packet length stats */
249         u32             avg_netoff;
250         u16             max_netlen;
251         u16             max_adjlen;
252         u16             min_netlen;
253         u16             min_adjlen;
254 };
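/* Editor's worked example, not part of the original sch_cake.c: the
 * "time_next = time_this + ((len * rate_ns) >> rate_shft)" comments above
 * describe a fixed-point time-per-byte.  If rate_ns holds nanoseconds per
 * byte scaled up by 2^rate_shft, then an 80 ns/byte link (100 Mbit/s) with
 * rate_shft = 20 would store rate_ns ~= 80 << 20, and a 1500-byte packet
 * advances the shaper by (1500 * (80 << 20)) >> 20 = 120000 ns = 120 us.
 * The helper below simply restates that arithmetic with hypothetical names.
 */
static u64 __maybe_unused shaper_sketch_pkt_duration_ns(u32 len, u64 rate_ns,
							u16 rate_shft)
{
	return ((u64)len * rate_ns) >> rate_shft;
}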
255
256 enum {
257         CAKE_FLAG_OVERHEAD         = BIT(0),
258         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
259         CAKE_FLAG_INGRESS          = BIT(2),
260         CAKE_FLAG_WASH             = BIT(3),
261         CAKE_FLAG_SPLIT_GSO        = BIT(4),
262         CAKE_FLAG_FWMARK           = BIT(5)
263 };
264
265 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
266  * obtain the best features of each.  Codel is excellent on flows which
267  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
268  * unresponsive flows.
269  */
270
271 struct cobalt_skb_cb {
272         ktime_t enqueue_time;
273         u32     adjusted_len;
274 };
275
276 static u64 us_to_ns(u64 us)
277 {
278         return us * NSEC_PER_USEC;
279 }
280
281 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
282 {
283         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
284         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
285 }
286
287 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
288 {
289         return get_cobalt_cb(skb)->enqueue_time;
290 }
291
292 static void cobalt_set_enqueue_time(struct sk_buff *skb,
293                                     ktime_t now)
294 {
295         get_cobalt_cb(skb)->enqueue_time = now;
296 }
297
298 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
299
300 /* Diffserv lookup tables */
301
302 static const u8 precedence[] = {
303         0, 0, 0, 0, 0, 0, 0, 0,
304         1, 1, 1, 1, 1, 1, 1, 1,
305         2, 2, 2, 2, 2, 2, 2, 2,
306         3, 3, 3, 3, 3, 3, 3, 3,
307         4, 4, 4, 4, 4, 4, 4, 4,
308         5, 5, 5, 5, 5, 5, 5, 5,
309         6, 6, 6, 6, 6, 6, 6, 6,
310         7, 7, 7, 7, 7, 7, 7, 7,
311 };
312
313 static const u8 diffserv8[] = {
314         2, 5, 1, 2, 4, 2, 2, 2,
315         0, 2, 1, 2, 1, 2, 1, 2,
316         5, 2, 4, 2, 4, 2, 4, 2,
317         3, 2, 3, 2, 3, 2, 3, 2,
318         6, 2, 3, 2, 3, 2, 3, 2,
319         6, 2, 2, 2, 6, 2, 6, 2,
320         7, 2, 2, 2, 2, 2, 2, 2,
321         7, 2, 2, 2, 2, 2, 2, 2,
322 };
323
324 static const u8 diffserv4[] = {
325         0, 2, 0, 0, 2, 0, 0, 0,
326         1, 0, 0, 0, 0, 0, 0, 0,
327         2, 0, 2, 0, 2, 0, 2, 0,
328         2, 0, 2, 0, 2, 0, 2, 0,
329         3, 0, 2, 0, 2, 0, 2, 0,
330         3, 0, 0, 0, 3, 0, 3, 0,
331         3, 0, 0, 0, 0, 0, 0, 0,
332         3, 0, 0, 0, 0, 0, 0, 0,
333 };
334
335 static const u8 diffserv3[] = {
336         0, 0, 0, 0, 2, 0, 0, 0,
337         1, 0, 0, 0, 0, 0, 0, 0,
338         0, 0, 0, 0, 0, 0, 0, 0,
339         0, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 0, 0, 0, 0,
341         0, 0, 0, 0, 2, 0, 2, 0,
342         2, 0, 0, 0, 0, 0, 0, 0,
343         2, 0, 0, 0, 0, 0, 0, 0,
344 };
345
346 static const u8 besteffort[] = {
347         0, 0, 0, 0, 0, 0, 0, 0,
348         0, 0, 0, 0, 0, 0, 0, 0,
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354         0, 0, 0, 0, 0, 0, 0, 0,
355 };
356
357 /* tin priority order for stats dumping */
358
359 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
360 static const u8 bulk_order[] = {1, 0, 2, 3};
361
362 #define REC_INV_SQRT_CACHE (16)
363 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
364
365 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
366  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
367  *
368  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
369  */
370
371 static void cobalt_newton_step(struct cobalt_vars *vars)
372 {
373         u32 invsqrt, invsqrt2;
374         u64 val;
375
376         invsqrt = vars->rec_inv_sqrt;
377         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
378         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
379
380         val >>= 2; /* avoid overflow in following multiply */
381         val = (val * invsqrt) >> (32 - 2 + 1);
382
383         vars->rec_inv_sqrt = val;
384 }
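/* Editor's note -- a self-check sketch, not part of the original sch_cake.c.
 * One way to see that the Q0.32 arithmetic above is right is to feed it a
 * perfect square: for count = 4 the true 1/sqrt(count) is 0.5, which is
 * 0x80000000 in Q0.32, and a Newton step from that value must return it
 * unchanged (invsqrt^2 = 0.25, 3 - 4 * 0.25 = 2, (invsqrt / 2) * 2 = 0.5).
 */
static bool __maybe_unused cobalt_newton_sketch_selfcheck(void)
{
	struct cobalt_vars v = {
		.count        = 4,
		.rec_inv_sqrt = 0x80000000U,	/* 0.5 in Q0.32 */
	};

	cobalt_newton_step(&v);
	return v.rec_inv_sqrt == 0x80000000U;	/* fixed point of the iteration */
}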
385
386 static void cobalt_invsqrt(struct cobalt_vars *vars)
387 {
388         if (vars->count < REC_INV_SQRT_CACHE)
389                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
390         else
391                 cobalt_newton_step(vars);
392 }
393
394 /* There is a big difference in timing between the accurate values placed in
395  * the cache and the approximations given by a single Newton step for small
396  * count values, particularly when stepping from count 1 to 2 or vice versa.
397  * Above 16, a single Newton step gives sufficient accuracy in either
398  * direction, given the precision stored.
399  *
400  * The magnitude of the error when stepping up to count 2 is such as to give
401  * the value that *should* have been produced at count 4.
402  */
403
404 static void cobalt_cache_init(void)
405 {
406         struct cobalt_vars v;
407
408         memset(&v, 0, sizeof(v));
409         v.rec_inv_sqrt = ~0U;
410         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
411
412         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
413                 cobalt_newton_step(&v);
414                 cobalt_newton_step(&v);
415                 cobalt_newton_step(&v);
416                 cobalt_newton_step(&v);
417
418                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
419         }
420 }
421
422 static void cobalt_vars_init(struct cobalt_vars *vars)
423 {
424         memset(vars, 0, sizeof(*vars));
425
426         if (!cobalt_rec_inv_sqrt_cache[0]) {
427                 cobalt_cache_init();
428                 cobalt_rec_inv_sqrt_cache[0] = ~0;
429         }
430 }
431
432 /* CoDel control_law is t + interval/sqrt(count)
433  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
434  * both sqrt() and divide operation.
435  */
436 static ktime_t cobalt_control(ktime_t t,
437                               u64 interval,
438                               u32 rec_inv_sqrt)
439 {
440         return ktime_add_ns(t, reciprocal_scale(interval,
441                                                 rec_inv_sqrt));
442 }
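/* Editor's worked example, not part of the original sch_cake.c: with the
 * usual 100 ms interval and count = 4, rec_inv_sqrt is 0.5 in Q0.32
 * (0x80000000), so reciprocal_scale(interval, rec_inv_sqrt) multiplies
 * 100,000,000 ns by 0.5 and the next signalling time lands 50 ms after 't',
 * i.e. interval/sqrt(count), exactly as the control law above states.
 */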
443
444 /* Call this when a packet had to be dropped due to queue overflow.  Returns
445  * true if the BLUE state was quiescent before but active after this call.
446  */
447 static bool cobalt_queue_full(struct cobalt_vars *vars,
448                               struct cobalt_params *p,
449                               ktime_t now)
450 {
451         bool up = false;
452
453         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
454                 up = !vars->p_drop;
455                 vars->p_drop += p->p_inc;
456                 if (vars->p_drop < p->p_inc)
457                         vars->p_drop = ~0;
458                 vars->blue_timer = now;
459         }
460         vars->dropping = true;
461         vars->drop_next = now;
462         if (!vars->count)
463                 vars->count = 1;
464
465         return up;
466 }
467
468 /* Call this when the queue was serviced but turned out to be empty.  Returns
469  * true if the BLUE state was active before but quiescent after this call.
470  */
471 static bool cobalt_queue_empty(struct cobalt_vars *vars,
472                                struct cobalt_params *p,
473                                ktime_t now)
474 {
475         bool down = false;
476
477         if (vars->p_drop &&
478             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
479                 if (vars->p_drop < p->p_dec)
480                         vars->p_drop = 0;
481                 else
482                         vars->p_drop -= p->p_dec;
483                 vars->blue_timer = now;
484                 down = !vars->p_drop;
485         }
486         vars->dropping = false;
487
488         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
489                 vars->count--;
490                 cobalt_invsqrt(vars);
491                 vars->drop_next = cobalt_control(vars->drop_next,
492                                                  p->interval,
493                                                  vars->rec_inv_sqrt);
494         }
495
496         return down;
497 }
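/* Editor's sketch, not part of the original sch_cake.c: a compressed view of
 * how the two helpers above move the BLUE drop probability, ignoring the
 * blue_timer rate limiting.  The step sizes are hypothetical; in 0.32 fixed
 * point, 1 << 24 is 1/256 of certainty and 1 << 20 is 1/4096.
 */
static void __maybe_unused blue_sketch_update(struct cobalt_vars *vars,
					      bool overflowed)
{
	const u32 p_inc = 1 << 24;	/* +1/256 on each queue-full event */
	const u32 p_dec = 1 << 20;	/* -1/4096 on each queue-empty event */

	if (overflowed) {
		vars->p_drop += p_inc;
		if (vars->p_drop < p_inc)	/* saturate instead of wrapping */
			vars->p_drop = ~0U;
	} else if (vars->p_drop > p_dec) {
		vars->p_drop -= p_dec;
	} else {
		vars->p_drop = 0;
	}
}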
498
499 /* Call this with a freshly dequeued packet for possible congestion marking.
500  * Returns true as an instruction to drop the packet, false for delivery.
501  */
502 static bool cobalt_should_drop(struct cobalt_vars *vars,
503                                struct cobalt_params *p,
504                                ktime_t now,
505                                struct sk_buff *skb,
506                                u32 bulk_flows)
507 {
508         bool next_due, over_target, drop = false;
509         ktime_t schedule;
510         u64 sojourn;
511
512 /* The 'schedule' variable records, in its sign, whether 'now' is before or
513  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
514  * scheduling decision is actually branched, without destroying that
515  * information.  Similarly, the first 'schedule' value calculated is preserved
516  * in the boolean 'next_due'.
517  *
518  * As for 'drop_next', we take advantage of the fact that 'interval' is both
519  * the delay between first exceeding 'target' and the first signalling event,
520  * *and* the scaling factor for the signalling frequency.  It's therefore very
521  * natural to use a single mechanism for both purposes, and eliminates a
522  * significant amount of reference Codel's spaghetti code.  To help with this,
523  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
524  * as possible to 1.0 in fixed-point.
525  */
526
527         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
528         schedule = ktime_sub(now, vars->drop_next);
529         over_target = sojourn > p->target &&
530                       sojourn > p->mtu_time * bulk_flows * 2 &&
531                       sojourn > p->mtu_time * 4;
532         next_due = vars->count && ktime_to_ns(schedule) >= 0;
533
534         vars->ecn_marked = false;
535
536         if (over_target) {
537                 if (!vars->dropping) {
538                         vars->dropping = true;
539                         vars->drop_next = cobalt_control(now,
540                                                          p->interval,
541                                                          vars->rec_inv_sqrt);
542                 }
543                 if (!vars->count)
544                         vars->count = 1;
545         } else if (vars->dropping) {
546                 vars->dropping = false;
547         }
548
549         if (next_due && vars->dropping) {
550                 /* Use ECN mark if possible, otherwise drop */
551                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
552
553                 vars->count++;
554                 if (!vars->count)
555                         vars->count--;
556                 cobalt_invsqrt(vars);
557                 vars->drop_next = cobalt_control(vars->drop_next,
558                                                  p->interval,
559                                                  vars->rec_inv_sqrt);
560                 schedule = ktime_sub(now, vars->drop_next);
561         } else {
562                 while (next_due) {
563                         vars->count--;
564                         cobalt_invsqrt(vars);
565                         vars->drop_next = cobalt_control(vars->drop_next,
566                                                          p->interval,
567                                                          vars->rec_inv_sqrt);
568                         schedule = ktime_sub(now, vars->drop_next);
569                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
570                 }
571         }
572
573         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
574         if (vars->p_drop)
575                 drop |= (prandom_u32() < vars->p_drop);
576
577         /* Overload the drop_next field as an activity timeout */
578         if (!vars->count)
579                 vars->drop_next = ktime_add_ns(now, p->interval);
580         else if (ktime_to_ns(schedule) > 0 && !drop)
581                 vars->drop_next = now;
582
583         return drop;
584 }
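/* Editor's worked example, not part of the original sch_cake.c: the
 * over_target test above only engages once the sojourn time exceeds target,
 * mtu_time * bulk_flows * 2 and mtu_time * 4, i.e. whichever of the three is
 * largest.  On a 10 Mbit/s link a 1500-byte MTU takes about 1.2 ms to
 * serialise, so with 8 bulk flows and a 5 ms target the queue is not
 * considered over target until the sojourn time exceeds roughly 19.2 ms;
 * this keeps Codel from over-signalling when many flows legitimately share
 * a slow link.
 */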
585
586 static void cake_update_flowkeys(struct flow_keys *keys,
587                                  const struct sk_buff *skb)
588 {
589 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
590         struct nf_conntrack_tuple tuple = {};
591         bool rev = !skb->_nfct;
592
593         if (tc_skb_protocol(skb) != htons(ETH_P_IP))
594                 return;
595
596         if (!nf_ct_get_tuple_skb(&tuple, skb))
597                 return;
598
599         keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
600         keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
601
602         if (keys->ports.ports) {
603                 keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
604                 keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
605         }
606 #endif
607 }
608
609 /* Cake has several flow modes whose bit patterns overlap; the checks
610  * below therefore deliberately also match triple-isolate mode.
611  */
612
613 static bool cake_dsrc(int flow_mode)
614 {
615         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
616 }
617
618 static bool cake_ddst(int flow_mode)
619 {
620         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
621 }
622
623 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
624                      int flow_mode, u16 flow_override, u16 host_override)
625 {
626         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
627         u16 reduced_hash, srchost_idx, dsthost_idx;
628         struct flow_keys keys, host_keys;
629
630         if (unlikely(flow_mode == CAKE_FLOW_NONE))
631                 return 0;
632
633         /* If both overrides are set we can skip packet dissection entirely */
634         if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
635             (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
636                 goto skip_hash;
637
638         skb_flow_dissect_flow_keys(skb, &keys,
639                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
640
641         if (flow_mode & CAKE_FLOW_NAT_FLAG)
642                 cake_update_flowkeys(&keys, skb);
643
644         /* flow_hash_from_keys() sorts the addresses by value, so we have
645          * to preserve their order in a separate data structure to treat
646          * src and dst host addresses as independently selectable.
647          */
648         host_keys = keys;
649         host_keys.ports.ports     = 0;
650         host_keys.basic.ip_proto  = 0;
651         host_keys.keyid.keyid     = 0;
652         host_keys.tags.flow_label = 0;
653
654         switch (host_keys.control.addr_type) {
655         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
656                 host_keys.addrs.v4addrs.src = 0;
657                 dsthost_hash = flow_hash_from_keys(&host_keys);
658                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
659                 host_keys.addrs.v4addrs.dst = 0;
660                 srchost_hash = flow_hash_from_keys(&host_keys);
661                 break;
662
663         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
664                 memset(&host_keys.addrs.v6addrs.src, 0,
665                        sizeof(host_keys.addrs.v6addrs.src));
666                 dsthost_hash = flow_hash_from_keys(&host_keys);
667                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
668                 memset(&host_keys.addrs.v6addrs.dst, 0,
669                        sizeof(host_keys.addrs.v6addrs.dst));
670                 srchost_hash = flow_hash_from_keys(&host_keys);
671                 break;
672
673         default:
674                 dsthost_hash = 0;
675                 srchost_hash = 0;
676         }
677
678         /* This *must* be after the above switch, since as a
679          * side-effect it sorts the src and dst addresses.
680          */
681         if (flow_mode & CAKE_FLOW_FLOWS)
682                 flow_hash = flow_hash_from_keys(&keys);
683
684 skip_hash:
685         if (flow_override)
686                 flow_hash = flow_override - 1;
687         if (host_override) {
688                 dsthost_hash = host_override - 1;
689                 srchost_hash = host_override - 1;
690         }
691
692         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
693                 if (flow_mode & CAKE_FLOW_SRC_IP)
694                         flow_hash ^= srchost_hash;
695
696                 if (flow_mode & CAKE_FLOW_DST_IP)
697                         flow_hash ^= dsthost_hash;
698         }
699
700         reduced_hash = flow_hash % CAKE_QUEUES;
701
702         /* set-associative hashing */
703         /* fast path if no hash collision (direct lookup succeeds) */
704         if (likely(q->tags[reduced_hash] == flow_hash &&
705                    q->flows[reduced_hash].set)) {
706                 q->way_directs++;
707         } else {
708                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
709                 u32 outer_hash = reduced_hash - inner_hash;
710                 bool allocate_src = false;
711                 bool allocate_dst = false;
712                 u32 i, k;
713
714                 /* check if any active queue in the set is reserved for
715                  * this flow.
716                  */
717                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
718                      i++, k = (k + 1) % CAKE_SET_WAYS) {
719                         if (q->tags[outer_hash + k] == flow_hash) {
720                                 if (i)
721                                         q->way_hits++;
722
723                                 if (!q->flows[outer_hash + k].set) {
724                                         /* need to increment host refcnts */
725                                         allocate_src = cake_dsrc(flow_mode);
726                                         allocate_dst = cake_ddst(flow_mode);
727                                 }
728
729                                 goto found;
730                         }
731                 }
732
733                 /* no queue is reserved for this flow, look for an
734                  * empty one.
735                  */
736                 for (i = 0; i < CAKE_SET_WAYS;
737                          i++, k = (k + 1) % CAKE_SET_WAYS) {
738                         if (!q->flows[outer_hash + k].set) {
739                                 q->way_misses++;
740                                 allocate_src = cake_dsrc(flow_mode);
741                                 allocate_dst = cake_ddst(flow_mode);
742                                 goto found;
743                         }
744                 }
745
746                 /* With no empty queues, default to the original
747                  * queue, accept the collision, update the host tags.
748                  */
749                 q->way_collisions++;
750                 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
751                         q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
752                         q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
753                 }
754                 allocate_src = cake_dsrc(flow_mode);
755                 allocate_dst = cake_ddst(flow_mode);
756 found:
757                 /* reserve queue for future packets in same flow */
758                 reduced_hash = outer_hash + k;
759                 q->tags[reduced_hash] = flow_hash;
760
761                 if (allocate_src) {
762                         srchost_idx = srchost_hash % CAKE_QUEUES;
763                         inner_hash = srchost_idx % CAKE_SET_WAYS;
764                         outer_hash = srchost_idx - inner_hash;
765                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
766                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
767                                 if (q->hosts[outer_hash + k].srchost_tag ==
768                                     srchost_hash)
769                                         goto found_src;
770                         }
771                         for (i = 0; i < CAKE_SET_WAYS;
772                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
773                                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
774                                         break;
775                         }
776                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
777 found_src:
778                         srchost_idx = outer_hash + k;
779                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
780                                 q->hosts[srchost_idx].srchost_bulk_flow_count++;
781                         q->flows[reduced_hash].srchost = srchost_idx;
782                 }
783
784                 if (allocate_dst) {
785                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
786                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
787                         outer_hash = dsthost_idx - inner_hash;
788                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
789                              i++, k = (k + 1) % CAKE_SET_WAYS) {
790                                 if (q->hosts[outer_hash + k].dsthost_tag ==
791                                     dsthost_hash)
792                                         goto found_dst;
793                         }
794                         for (i = 0; i < CAKE_SET_WAYS;
795                              i++, k = (k + 1) % CAKE_SET_WAYS) {
796                                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
797                                         break;
798                         }
799                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
800 found_dst:
801                         dsthost_idx = outer_hash + k;
802                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
803                                 q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
804                         q->flows[reduced_hash].dsthost = dsthost_idx;
805                 }
806         }
807
808         return reduced_hash;
809 }
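/* Editor's sketch, not part of the original sch_cake.c: the set-associative
 * probe above in miniature.  The flow hash selects a set of CAKE_SET_WAYS
 * adjacent buckets; we first look for a bucket already tagged with this
 * hash, then for a free one, and only then accept a collision on the
 * original bucket.  Names are hypothetical and the per-host bookkeeping is
 * omitted.
 */
static u32 __maybe_unused set_assoc_sketch_lookup(u32 *tags, bool *busy,
						  u32 flow_hash)
{
	u32 reduced = flow_hash % CAKE_QUEUES;
	u32 inner = reduced % CAKE_SET_WAYS;
	u32 outer = reduced - inner;		/* first bucket of the set */
	u32 i, k;

	for (i = 0, k = inner; i < CAKE_SET_WAYS;
	     i++, k = (k + 1) % CAKE_SET_WAYS)
		if (busy[outer + k] && tags[outer + k] == flow_hash)
			return outer + k;	/* existing queue for this flow */

	for (i = 0, k = inner; i < CAKE_SET_WAYS;
	     i++, k = (k + 1) % CAKE_SET_WAYS)
		if (!busy[outer + k])
			break;			/* free queue within the set */

	/* either the free queue found above or, after a full scan, the
	 * original bucket as a collision victim (k has wrapped back to inner)
	 */
	tags[outer + k] = flow_hash;
	busy[outer + k] = true;
	return outer + k;
}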
810
811 /* helper functions: might be changed when/if skbs use a standard list_head */
812 /* remove one skb from head of slot queue */
813
814 static struct sk_buff *dequeue_head(struct cake_flow *flow)
815 {
816         struct sk_buff *skb = flow->head;
817
818         if (skb) {
819                 flow->head = skb->next;
820                 skb_mark_not_on_list(skb);
821         }
822
823         return skb;
824 }
825
826 /* add skb to flow queue (tail add) */
827
828 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
829 {
830         if (!flow->head)
831                 flow->head = skb;
832         else
833                 flow->tail->next = skb;
834         flow->tail = skb;
835         skb->next = NULL;
836 }
837
838 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
839                                     struct ipv6hdr *buf)
840 {
841         unsigned int offset = skb_network_offset(skb);
842         struct iphdr *iph;
843
844         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
845
846         if (!iph)
847                 return NULL;
848
849         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
850                 return skb_header_pointer(skb, offset + iph->ihl * 4,
851                                           sizeof(struct ipv6hdr), buf);
852
853         else if (iph->version == 4)
854                 return iph;
855
856         else if (iph->version == 6)
857                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
858                                           buf);
859
860         return NULL;
861 }
862
863 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
864                                       void *buf, unsigned int bufsize)
865 {
866         unsigned int offset = skb_network_offset(skb);
867         const struct ipv6hdr *ipv6h;
868         const struct tcphdr *tcph;
869         const struct iphdr *iph;
870         struct ipv6hdr _ipv6h;
871         struct tcphdr _tcph;
872
873         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
874
875         if (!ipv6h)
876                 return NULL;
877
878         if (ipv6h->version == 4) {
879                 iph = (struct iphdr *)ipv6h;
880                 offset += iph->ihl * 4;
881
882                 /* special-case 6in4 tunnelling, as that is a common way to get
883                  * v6 connectivity in the home
884                  */
885                 if (iph->protocol == IPPROTO_IPV6) {
886                         ipv6h = skb_header_pointer(skb, offset,
887                                                    sizeof(_ipv6h), &_ipv6h);
888
889                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
890                                 return NULL;
891
892                         offset += sizeof(struct ipv6hdr);
893
894                 } else if (iph->protocol != IPPROTO_TCP) {
895                         return NULL;
896                 }
897
898         } else if (ipv6h->version == 6) {
899                 if (ipv6h->nexthdr != IPPROTO_TCP)
900                         return NULL;
901
902                 offset += sizeof(struct ipv6hdr);
903         } else {
904                 return NULL;
905         }
906
907         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
908         if (!tcph)
909                 return NULL;
910
911         return skb_header_pointer(skb, offset,
912                                   min(__tcp_hdrlen(tcph), bufsize), buf);
913 }
914
915 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
916                                    int code, int *oplen)
917 {
918         /* inspired by tcp_parse_options in tcp_input.c */
919         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
920         const u8 *ptr = (const u8 *)(tcph + 1);
921
922         while (length > 0) {
923                 int opcode = *ptr++;
924                 int opsize;
925
926                 if (opcode == TCPOPT_EOL)
927                         break;
928                 if (opcode == TCPOPT_NOP) {
929                         length--;
930                         continue;
931                 }
932                 opsize = *ptr++;
933                 if (opsize < 2 || opsize > length)
934                         break;
935
936                 if (opcode == code) {
937                         *oplen = opsize;
938                         return ptr;
939                 }
940
941                 ptr += opsize - 2;
942                 length -= opsize;
943         }
944
945         return NULL;
946 }
947
948 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
949  * bytes than the other. In the case where both sequences ACK bytes that the
950  * other doesn't, A is considered greater. DSACKs in A also make A be
951  * considered greater.
952  *
953  * @return -1, 0 or 1 as normal compare functions
954  */
955 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
956                                   const struct tcphdr *tcph_b)
957 {
958         const struct tcp_sack_block_wire *sack_a, *sack_b;
959         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
960         u32 bytes_a = 0, bytes_b = 0;
961         int oplen_a, oplen_b;
962         bool first = true;
963
964         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
965         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
966
967         /* pointers point to option contents */
968         oplen_a -= TCPOLEN_SACK_BASE;
969         oplen_b -= TCPOLEN_SACK_BASE;
970
971         if (sack_a && oplen_a >= sizeof(*sack_a) &&
972             (!sack_b || oplen_b < sizeof(*sack_b)))
973                 return -1;
974         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
975                  (!sack_a || oplen_a < sizeof(*sack_a)))
976                 return 1;
977         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
978                  (!sack_b || oplen_b < sizeof(*sack_b)))
979                 return 0;
980
981         while (oplen_a >= sizeof(*sack_a)) {
982                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
983                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
984                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
985                 int oplen_tmp = oplen_b;
986                 bool found = false;
987
988                 /* DSACK; always considered greater to prevent dropping */
989                 if (before(start_a, ack_seq_a))
990                         return -1;
991
992                 bytes_a += end_a - start_a;
993
994                 while (oplen_tmp >= sizeof(*sack_tmp)) {
995                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
996                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
997
998                         /* first time through we count the total size */
999                         if (first)
1000                                 bytes_b += end_b - start_b;
1001
1002                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1003                                 found = true;
1004                                 if (!first)
1005                                         break;
1006                         }
1007                         oplen_tmp -= sizeof(*sack_tmp);
1008                         sack_tmp++;
1009                 }
1010
1011                 if (!found)
1012                         return -1;
1013
1014                 oplen_a -= sizeof(*sack_a);
1015                 sack_a++;
1016                 first = false;
1017         }
1018
1019         /* If we made it this far, all ranges SACKed by A are covered by B, so
1020          * either the SACKs are equal, or B SACKs more bytes.
1021          */
1022         return bytes_b > bytes_a ? 1 : 0;
1023 }
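/* Editor's worked example, not part of the original sch_cake.c: if A SACKs
 * the ranges [100, 200) and [300, 500) (300 bytes) while B SACKs the single
 * range [100, 550) (450 bytes), every range in A is covered by a range in B
 * and B covers more bytes, so the function returns 1.  If A instead SACKed
 * [600, 700), that range would not be covered by B and the result would
 * be -1.
 */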
1024
1025 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1026                                  u32 *tsval, u32 *tsecr)
1027 {
1028         const u8 *ptr;
1029         int opsize;
1030
1031         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1032
1033         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1034                 *tsval = get_unaligned_be32(ptr);
1035                 *tsecr = get_unaligned_be32(ptr + 4);
1036         }
1037 }
1038
1039 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1040                                u32 tstamp_new, u32 tsecr_new)
1041 {
1042         /* inspired by tcp_parse_options in tcp_input.c */
1043         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1044         const u8 *ptr = (const u8 *)(tcph + 1);
1045         u32 tstamp, tsecr;
1046
1047         /* 3 reserved flags must be unset to avoid future breakage
1048          * ACK must be set
1049          * ECE/CWR are handled separately
1050          * All other flags URG/PSH/RST/SYN/FIN must be unset
1051          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1052          * 0x00C00000 = CWR/ECE (handled separately)
1053          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1054          */
1055         if (((tcp_flag_word(tcph) &
1056               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1057                 return false;
1058
1059         while (length > 0) {
1060                 int opcode = *ptr++;
1061                 int opsize;
1062
1063                 if (opcode == TCPOPT_EOL)
1064                         break;
1065                 if (opcode == TCPOPT_NOP) {
1066                         length--;
1067                         continue;
1068                 }
1069                 opsize = *ptr++;
1070                 if (opsize < 2 || opsize > length)
1071                         break;
1072
1073                 switch (opcode) {
1074                 case TCPOPT_MD5SIG: /* doesn't influence state */
1075                         break;
1076
1077                 case TCPOPT_SACK: /* stricter checking performed later */
1078                         if (opsize % 8 != 2)
1079                                 return false;
1080                         break;
1081
1082                 case TCPOPT_TIMESTAMP:
1083                         /* only drop timestamps lower than new */
1084                         if (opsize != TCPOLEN_TIMESTAMP)
1085                                 return false;
1086                         tstamp = get_unaligned_be32(ptr);
1087                         tsecr = get_unaligned_be32(ptr + 4);
1088                         if (after(tstamp, tstamp_new) ||
1089                             after(tsecr, tsecr_new))
1090                                 return false;
1091                         break;
1092
1093                 case TCPOPT_MSS:  /* these should only be set on SYN */
1094                 case TCPOPT_WINDOW:
1095                 case TCPOPT_SACK_PERM:
1096                 case TCPOPT_FASTOPEN:
1097                 case TCPOPT_EXP:
1098                 default: /* don't drop if any unknown options are present */
1099                         return false;
1100                 }
1101
1102                 ptr += opsize - 2;
1103                 length -= opsize;
1104         }
1105
1106         return true;
1107 }
1108
1109 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1110                                        struct cake_flow *flow)
1111 {
1112         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1113         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1114         struct sk_buff *skb_check, *skb_prev = NULL;
1115         const struct ipv6hdr *ipv6h, *ipv6h_check;
1116         unsigned char _tcph[64], _tcph_check[64];
1117         const struct tcphdr *tcph, *tcph_check;
1118         const struct iphdr *iph, *iph_check;
1119         struct ipv6hdr _iph, _iph_check;
1120         const struct sk_buff *skb;
1121         int seglen, num_found = 0;
1122         u32 tstamp = 0, tsecr = 0;
1123         __be32 elig_flags = 0;
1124         int sack_comp;
1125
1126         /* no other possible ACKs to filter */
1127         if (flow->head == flow->tail)
1128                 return NULL;
1129
1130         skb = flow->tail;
1131         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1132         iph = cake_get_iphdr(skb, &_iph);
1133         if (!tcph)
1134                 return NULL;
1135
1136         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1137
1138         /* the 'triggering' packet need only have the ACK flag set.
1139          * also check that SYN is not set, as there won't be any previous ACKs.
1140          */
1141         if ((tcp_flag_word(tcph) &
1142              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1143                 return NULL;
1144
1145         /* the 'triggering' ACK is at the tail of the queue, we have already
1146          * returned if it is the only packet in the flow. loop through the rest
1147          * of the queue looking for pure ACKs with the same 5-tuple as the
1148          * triggering one.
1149          */
1150         for (skb_check = flow->head;
1151              skb_check && skb_check != skb;
1152              skb_prev = skb_check, skb_check = skb_check->next) {
1153                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1154                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1155                                              sizeof(_tcph_check));
1156
1157                 /* only TCP packets with matching 5-tuple are eligible, and only
1158                  * drop safe headers
1159                  */
1160                 if (!tcph_check || iph->version != iph_check->version ||
1161                     tcph_check->source != tcph->source ||
1162                     tcph_check->dest != tcph->dest)
1163                         continue;
1164
1165                 if (iph_check->version == 4) {
1166                         if (iph_check->saddr != iph->saddr ||
1167                             iph_check->daddr != iph->daddr)
1168                                 continue;
1169
1170                         seglen = ntohs(iph_check->tot_len) -
1171                                        (4 * iph_check->ihl);
1172                 } else if (iph_check->version == 6) {
1173                         ipv6h = (struct ipv6hdr *)iph;
1174                         ipv6h_check = (struct ipv6hdr *)iph_check;
1175
1176                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1177                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1178                                 continue;
1179
1180                         seglen = ntohs(ipv6h_check->payload_len);
1181                 } else {
1182                         WARN_ON(1);  /* shouldn't happen */
1183                         continue;
1184                 }
1185
1186                 /* If the ECE/CWR flags changed from the previous eligible
1187                  * packet in the same flow, we should no longer be dropping that
1188                  * previous packet as this would lose information.
1189                  */
1190                 if (elig_ack && (tcp_flag_word(tcph_check) &
1191                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1192                         elig_ack = NULL;
1193                         elig_ack_prev = NULL;
1194                         num_found--;
1195                 }
1196
1197                 /* Check TCP options and flags, don't drop ACKs with segment
1198                  * data, and don't drop ACKs with a higher cumulative ACK
1199                  * counter than the triggering packet. Check ACK seqno here to
1200                  * avoid parsing SACK options of packets we are going to exclude
1201                  * anyway.
1202                  */
1203                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1204                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1205                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1206                         continue;
1207
1208                 /* Check SACK options. The triggering packet must SACK more data
1209                  * than the ACK under consideration, or SACK the same range but
1210                  * have a larger cumulative ACK counter. The latter is a
1211                  * pathological case, but is contained in the following check
1212                  * anyway, just to be safe.
1213                  */
1214                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1215
1216                 if (sack_comp < 0 ||
1217                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1218                      sack_comp == 0))
1219                         continue;
1220
1221                 /* At this point we have found an eligible pure ACK to drop; if
1222                  * we are in aggressive mode, we are done. Otherwise, keep
1223                  * searching unless this is the second eligible ACK we
1224                  * found.
1225                  *
1226                  * Since we want to drop ACK closest to the head of the queue,
1227                  * save the first eligible ACK we find, even if we need to loop
1228                  * again.
1229                  */
1230                 if (!elig_ack) {
1231                         elig_ack = skb_check;
1232                         elig_ack_prev = skb_prev;
1233                         elig_flags = (tcp_flag_word(tcph_check)
1234                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1235                 }
1236
1237                 if (num_found++ > 0)
1238                         goto found;
1239         }
1240
1241         /* We made it through the queue without finding two eligible ACKs. If
1242          * we found a single eligible ACK we can drop it in aggressive mode if
1243          * we can guarantee that this does not interfere with ECN flag
1244          * information. We ensure this by dropping it only if the enqueued
1245          * packet is consecutive with the eligible ACK, and their flags match.
1246          */
1247         if (elig_ack && aggressive && elig_ack->next == skb &&
1248             (elig_flags == (tcp_flag_word(tcph) &
1249                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1250                 goto found;
1251
1252         return NULL;
1253
1254 found:
1255         if (elig_ack_prev)
1256                 elig_ack_prev->next = elig_ack->next;
1257         else
1258                 flow->head = elig_ack->next;
1259
1260         skb_mark_not_on_list(elig_ack);
1261
1262         return elig_ack;
1263 }
1264
1265 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1266 {
1267         avg -= avg >> shift;
1268         avg += sample >> shift;
1269         return avg;
1270 }
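/* Editor's worked example, not part of the original sch_cake.c: cake_ewma()
 * is an exponentially weighted moving average with weight 2^-shift, roughly
 * avg + (sample - avg) / 2^shift.  With avg = 256000, sample = 512000 and
 * shift = 8 it returns 256000 - 1000 + 2000 = 257000.
 */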
1271
1272 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1273 {
1274         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1275                 len -= off;
1276
1277         if (q->max_netlen < len)
1278                 q->max_netlen = len;
1279         if (q->min_netlen > len)
1280                 q->min_netlen = len;
1281
1282         len += q->rate_overhead;
1283
1284         if (len < q->rate_mpu)
1285                 len = q->rate_mpu;
1286
1287         if (q->atm_mode == CAKE_ATM_ATM) {
1288                 len += 47;
1289                 len /= 48;
1290                 len *= 53;
1291         } else if (q->atm_mode == CAKE_ATM_PTM) {
1292                 /* Add one byte per 64 bytes or part thereof.
1293                  * This is conservative and easier to calculate than the
1294                  * precise value.
1295                  */
1296                 len += (len + 63) / 64;
1297         }
1298
1299         if (q->max_adjlen < len)
1300                 q->max_adjlen = len;
1301         if (q->min_adjlen > len)
1302                 q->min_adjlen = len;
1303
1304         return len;
1305 }
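/* Editor's worked example, not part of the original sch_cake.c: for a
 * 1500-byte packet with no extra overhead configured, the ATM branch above
 * gives ((1500 + 47) / 48) * 53 = 32 cells * 53 bytes = 1696 bytes on the
 * wire, while the PTM branch gives 1500 + (1500 + 63) / 64 = 1524 bytes,
 * i.e. 64b/65b framing rounded up to whole 64-byte blocks.
 */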
1306
1307 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1308 {
1309         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1310         unsigned int hdr_len, last_len = 0;
1311         u32 off = skb_network_offset(skb);
1312         u32 len = qdisc_pkt_len(skb);
1313         u16 segs = 1;
1314
1315         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1316
1317         if (!shinfo->gso_size)
1318                 return cake_calc_overhead(q, len, off);
1319
1320         /* borrowed from qdisc_pkt_len_init() */
1321         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1322
1323         /* + transport layer */
1324         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1325                                                 SKB_GSO_TCPV6))) {
1326                 const struct tcphdr *th;
1327                 struct tcphdr _tcphdr;
1328
1329                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1330                                         sizeof(_tcphdr), &_tcphdr);
1331                 if (likely(th))
1332                         hdr_len += __tcp_hdrlen(th);
1333         } else {
1334                 struct udphdr _udphdr;
1335
1336                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1337                                        sizeof(_udphdr), &_udphdr))
1338                         hdr_len += sizeof(struct udphdr);
1339         }
1340
1341         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1342                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1343                                     shinfo->gso_size);
1344         else
1345                 segs = shinfo->gso_segs;
1346
1347         len = shinfo->gso_size + hdr_len;
1348         last_len = skb->len - shinfo->gso_size * (segs - 1);
1349
1350         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1351                 cake_calc_overhead(q, last_len, off));
1352 }
1353
1354 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1355 {
1356         struct cake_heap_entry ii = q->overflow_heap[i];
1357         struct cake_heap_entry jj = q->overflow_heap[j];
1358
1359         q->overflow_heap[i] = jj;
1360         q->overflow_heap[j] = ii;
1361
1362         q->tins[ii.t].overflow_idx[ii.b] = j;
1363         q->tins[jj.t].overflow_idx[jj.b] = i;
1364 }
1365
1366 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1367 {
1368         struct cake_heap_entry ii = q->overflow_heap[i];
1369
1370         return q->tins[ii.t].backlogs[ii.b];
1371 }
1372
1373 static void cake_heapify(struct cake_sched_data *q, u16 i)
1374 {
1375         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1376         u32 mb = cake_heap_get_backlog(q, i);
1377         u32 m = i;
1378
1379         while (m < a) {
1380                 u32 l = m + m + 1;
1381                 u32 r = l + 1;
1382
1383                 if (l < a) {
1384                         u32 lb = cake_heap_get_backlog(q, l);
1385
1386                         if (lb > mb) {
1387                                 m  = l;
1388                                 mb = lb;
1389                         }
1390                 }
1391
1392                 if (r < a) {
1393                         u32 rb = cake_heap_get_backlog(q, r);
1394
1395                         if (rb > mb) {
1396                                 m  = r;
1397                                 mb = rb;
1398                         }
1399                 }
1400
1401                 if (m != i) {
1402                         cake_heap_swap(q, i, m);
1403                         i = m;
1404                 } else {
1405                         break;
1406                 }
1407         }
1408 }
1409
1410 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1411 {
1412         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1413                 u16 p = (i - 1) >> 1;
1414                 u32 ib = cake_heap_get_backlog(q, i);
1415                 u32 pb = cake_heap_get_backlog(q, p);
1416
1417                 if (ib > pb) {
1418                         cake_heap_swap(q, i, p);
1419                         i = p;
1420                 } else {
1421                         break;
1422                 }
1423         }
1424 }
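/* Editor's note, not part of the original sch_cake.c: overflow_heap[] is an
 * implicit binary max-heap keyed on per-queue backlog: the children of node
 * m live at 2m + 1 and 2m + 2, and the parent of node i at (i - 1) / 2, as
 * used by the two helpers above.  As a result overflow_heap[0] always names
 * the (tin, flow) pair with the largest backlog, which cake_drop() below
 * prunes first when the buffer limit is hit.
 */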
1425
1426 static int cake_advance_shaper(struct cake_sched_data *q,
1427                                struct cake_tin_data *b,
1428                                struct sk_buff *skb,
1429                                ktime_t now, bool drop)
1430 {
1431         u32 len = get_cobalt_cb(skb)->adjusted_len;
1432
1433         /* charge packet bandwidth to this tin
1434          * and to the global shaper.
1435          */
1436         if (q->rate_ns) {
1437                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1438                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1439                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1440
1441                 if (ktime_before(b->time_next_packet, now))
1442                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1443                                                            tin_dur);
1444
1445                 else if (ktime_before(b->time_next_packet,
1446                                       ktime_add_ns(now, tin_dur)))
1447                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1448
1449                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1450                                                    global_dur);
1451                 if (!drop)
1452                         q->failsafe_next_packet =
1453                                 ktime_add_ns(q->failsafe_next_packet,
1454                                              failsafe_dur);
1455         }
1456         return len;
1457 }
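
/* Illustrative sketch (not part of the original source): both the per-tin and
 * the global schedule above advance by (len * rate_ns) >> rate_shft, i.e. a
 * fixed-point nanoseconds-per-byte multiply, while the failsafe clock moves
 * half again as fast.  Since cake_dequeue() only stalls when the main *and*
 * the failsafe clocks are both in the future, a wedged main clock still lets
 * traffic out at roughly two thirds of the configured rate.
 */
static inline u64 cake_example_failsafe_dur(u32 len, u64 rate_ns, u8 rate_shft)
{
        u64 global_dur = ((u64)len * rate_ns) >> rate_shft;

        return global_dur + (global_dur >> 1);  /* 1.5x the nominal tx time */
}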
1458
1459 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1460 {
1461         struct cake_sched_data *q = qdisc_priv(sch);
1462         ktime_t now = ktime_get();
1463         u32 idx = 0, tin = 0, len;
1464         struct cake_heap_entry qq;
1465         struct cake_tin_data *b;
1466         struct cake_flow *flow;
1467         struct sk_buff *skb;
1468
1469         if (!q->overflow_timeout) {
1470                 int i;
1471                 /* Build fresh max-heap */
1472                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1473                         cake_heapify(q, i);
1474         }
1475         q->overflow_timeout = 65535;
1476
1477         /* select longest queue for pruning */
1478         qq  = q->overflow_heap[0];
1479         tin = qq.t;
1480         idx = qq.b;
1481
1482         b = &q->tins[tin];
1483         flow = &b->flows[idx];
1484         skb = dequeue_head(flow);
1485         if (unlikely(!skb)) {
1486                 /* heap has gone wrong, rebuild it next time */
1487                 q->overflow_timeout = 0;
1488                 return idx + (tin << 16);
1489         }
1490
1491         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1492                 b->unresponsive_flow_count++;
1493
1494         len = qdisc_pkt_len(skb);
1495         q->buffer_used      -= skb->truesize;
1496         b->backlogs[idx]    -= len;
1497         b->tin_backlog      -= len;
1498         sch->qstats.backlog -= len;
1499         qdisc_tree_reduce_backlog(sch, 1, len);
1500
1501         flow->dropped++;
1502         b->tin_dropped++;
1503         sch->qstats.drops++;
1504
1505         if (q->rate_flags & CAKE_FLAG_INGRESS)
1506                 cake_advance_shaper(q, b, skb, now, true);
1507
1508         __qdisc_drop(skb, to_free);
1509         sch->q.qlen--;
1510
1511         cake_heapify(q, 0);
1512
1513         return idx + (tin << 16);
1514 }
1515
1516 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1517 {
1518         u8 dscp;
1519
1520         switch (skb->protocol) {
1521         case htons(ETH_P_IP):
1522                 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1523                 if (wash && dscp)
1524                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1525                 return dscp;
1526
1527         case htons(ETH_P_IPV6):
1528                 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1529                 if (wash && dscp)
1530                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1531                 return dscp;
1532
1533         case htons(ETH_P_ARP):
1534                 return 0x38;  /* CS7 - Net Control */
1535
1536         default:
1537                 /* If there is no Diffserv field, treat as best-effort */
1538                 return 0;
1539         }
1540 }
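
/* Illustrative worked example (not part of the original source): the DSCP is
 * the top six bits of the IPv4/IPv6 traffic-class octet, hence the ">> 2"
 * above.  An EF-marked packet carries dsfield 0xb8, which yields DSCP 0x2e
 * (46); "wash" rewrites those six bits to zero, while the INET_ECN_MASK
 * argument preserves the two ECN bits.
 */
static inline u8 cake_example_dscp_from_dsfield(u8 dsfield)
{
        return dsfield >> 2;    /* e.g. 0xb8 (EF) -> 0x2e */
}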
1541
1542 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1543                                              struct sk_buff *skb)
1544 {
1545         struct cake_sched_data *q = qdisc_priv(sch);
1546         u32 tin;
1547         u8 dscp;
1548
1549         /* Tin selection: Default to diffserv-based selection, allow overriding
1550          * using firewall marks or skb->priority.
1551          */
1552         dscp = cake_handle_diffserv(skb,
1553                                     q->rate_flags & CAKE_FLAG_WASH);
1554
1555         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1556                 tin = 0;
1557
1558         else if (q->rate_flags & CAKE_FLAG_FWMARK && /* use fw mark */
1559                  skb->mark &&
1560                  skb->mark <= q->tin_cnt)
1561                 tin = q->tin_order[skb->mark - 1];
1562
1563         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1564                  TC_H_MIN(skb->priority) > 0 &&
1565                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1566                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1567
1568         else {
1569                 tin = q->tin_index[dscp];
1570
1571                 if (unlikely(tin >= q->tin_cnt))
1572                         tin = 0;
1573         }
1574
1575         return &q->tins[tin];
1576 }
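
/* Illustrative sketch (not part of the original source): the classid test in
 * cake_select_tin() lets a tc filter or an application steer a packet into
 * tin N (1-based, indexing q->tin_order) by setting skb->priority to the
 * cake qdisc's major handle with minor N:
 */
static inline u32 cake_example_priority_for_tin(u32 qdisc_handle, u16 tin)
{
        return TC_H_MAKE(TC_H_MAJ(qdisc_handle), tin); /* tin is 1-based */
}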
1577
1578 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1579                          struct sk_buff *skb, int flow_mode, int *qerr)
1580 {
1581         struct cake_sched_data *q = qdisc_priv(sch);
1582         struct tcf_proto *filter;
1583         struct tcf_result res;
1584         u16 flow = 0, host = 0;
1585         int result;
1586
1587         filter = rcu_dereference_bh(q->filter_list);
1588         if (!filter)
1589                 goto hash;
1590
1591         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1592         result = tcf_classify(skb, filter, &res, false);
1593
1594         if (result >= 0) {
1595 #ifdef CONFIG_NET_CLS_ACT
1596                 switch (result) {
1597                 case TC_ACT_STOLEN:
1598                 case TC_ACT_QUEUED:
1599                 case TC_ACT_TRAP:
1600                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1601                         /* fall through */
1602                 case TC_ACT_SHOT:
1603                         return 0;
1604                 }
1605 #endif
1606                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1607                         flow = TC_H_MIN(res.classid);
1608                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1609                         host = TC_H_MAJ(res.classid) >> 16;
1610         }
1611 hash:
1612         *t = cake_select_tin(sch, skb);
1613         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1614 }
1615
1616 static void cake_reconfigure(struct Qdisc *sch);
1617
1618 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1619                         struct sk_buff **to_free)
1620 {
1621         struct cake_sched_data *q = qdisc_priv(sch);
1622         int len = qdisc_pkt_len(skb);
1623         int uninitialized_var(ret);
1624         struct sk_buff *ack = NULL;
1625         ktime_t now = ktime_get();
1626         struct cake_tin_data *b;
1627         struct cake_flow *flow;
1628         u32 idx;
1629
1630         /* choose flow to insert into */
1631         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1632         if (idx == 0) {
1633                 if (ret & __NET_XMIT_BYPASS)
1634                         qdisc_qstats_drop(sch);
1635                 __qdisc_drop(skb, to_free);
1636                 return ret;
1637         }
1638         idx--;
1639         flow = &b->flows[idx];
1640
1641         /* ensure shaper state isn't stale */
1642         if (!b->tin_backlog) {
1643                 if (ktime_before(b->time_next_packet, now))
1644                         b->time_next_packet = now;
1645
1646                 if (!sch->q.qlen) {
1647                         if (ktime_before(q->time_next_packet, now)) {
1648                                 q->failsafe_next_packet = now;
1649                                 q->time_next_packet = now;
1650                         } else if (ktime_after(q->time_next_packet, now) &&
1651                                    ktime_after(q->failsafe_next_packet, now)) {
1652                                 u64 next =
1653                                         min(ktime_to_ns(q->time_next_packet),
1654                                             ktime_to_ns(
1655                                                    q->failsafe_next_packet));
1656                                 sch->qstats.overlimits++;
1657                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1658                         }
1659                 }
1660         }
1661
1662         if (unlikely(len > b->max_skblen))
1663                 b->max_skblen = len;
1664
1665         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1666                 struct sk_buff *segs, *nskb;
1667                 netdev_features_t features = netif_skb_features(skb);
1668                 unsigned int slen = 0, numsegs = 0;
1669
1670                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1671                 if (IS_ERR_OR_NULL(segs))
1672                         return qdisc_drop(skb, sch, to_free);
1673
1674                 while (segs) {
1675                         nskb = segs->next;
1676                         skb_mark_not_on_list(segs);
1677                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1678                         cobalt_set_enqueue_time(segs, now);
1679                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1680                                                                           segs);
1681                         flow_queue_add(flow, segs);
1682
1683                         sch->q.qlen++;
1684                         numsegs++;
1685                         slen += segs->len;
1686                         q->buffer_used += segs->truesize;
1687                         b->packets++;
1688                         segs = nskb;
1689                 }
1690
1691                 /* stats */
1692                 b->bytes            += slen;
1693                 b->backlogs[idx]    += slen;
1694                 b->tin_backlog      += slen;
1695                 sch->qstats.backlog += slen;
1696                 q->avg_window_bytes += slen;
1697
1698                 qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
1699                 consume_skb(skb);
1700         } else {
1701                 /* not splitting */
1702                 cobalt_set_enqueue_time(skb, now);
1703                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1704                 flow_queue_add(flow, skb);
1705
1706                 if (q->ack_filter)
1707                         ack = cake_ack_filter(q, flow);
1708
1709                 if (ack) {
1710                         b->ack_drops++;
1711                         sch->qstats.drops++;
1712                         b->bytes += qdisc_pkt_len(ack);
1713                         len -= qdisc_pkt_len(ack);
1714                         q->buffer_used += skb->truesize - ack->truesize;
1715                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1716                                 cake_advance_shaper(q, b, ack, now, true);
1717
1718                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1719                         consume_skb(ack);
1720                 } else {
1721                         sch->q.qlen++;
1722                         q->buffer_used      += skb->truesize;
1723                 }
1724
1725                 /* stats */
1726                 b->packets++;
1727                 b->bytes            += len;
1728                 b->backlogs[idx]    += len;
1729                 b->tin_backlog      += len;
1730                 sch->qstats.backlog += len;
1731                 q->avg_window_bytes += len;
1732         }
1733
1734         if (q->overflow_timeout)
1735                 cake_heapify_up(q, b->overflow_idx[idx]);
1736
1737         /* incoming bandwidth capacity estimate */
1738         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1739                 u64 packet_interval =
1740                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1741
1742                 if (packet_interval > NSEC_PER_SEC)
1743                         packet_interval = NSEC_PER_SEC;
1744
1745                 /* filter out short-term bursts, eg. wifi aggregation */
1746                 q->avg_packet_interval =
1747                         cake_ewma(q->avg_packet_interval,
1748                                   packet_interval,
1749                                   (packet_interval > q->avg_packet_interval ?
1750                                           2 : 8));
1751
1752                 q->last_packet_time = now;
1753
1754                 if (packet_interval > q->avg_packet_interval) {
1755                         u64 window_interval =
1756                                 ktime_to_ns(ktime_sub(now,
1757                                                       q->avg_window_begin));
1758                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1759
1760                         do_div(b, window_interval);
1761                         q->avg_peak_bandwidth =
1762                                 cake_ewma(q->avg_peak_bandwidth, b,
1763                                           b > q->avg_peak_bandwidth ? 2 : 8);
1764                         q->avg_window_bytes = 0;
1765                         q->avg_window_begin = now;
1766
1767                         if (ktime_after(now,
1768                                         ktime_add_ms(q->last_reconfig_time,
1769                                                      250))) {
1770                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1771                                 cake_reconfigure(sch);
1772                         }
1773                 }
1774         } else {
1775                 q->avg_window_bytes = 0;
1776                 q->last_packet_time = now;
1777         }
1778
1779         /* flowchain */
1780         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1781                 struct cake_host *srchost = &b->hosts[flow->srchost];
1782                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1783                 u16 host_load = 1;
1784
1785                 if (!flow->set) {
1786                         list_add_tail(&flow->flowchain, &b->new_flows);
1787                 } else {
1788                         b->decaying_flow_count--;
1789                         list_move_tail(&flow->flowchain, &b->new_flows);
1790                 }
1791                 flow->set = CAKE_SET_SPARSE;
1792                 b->sparse_flow_count++;
1793
1794                 if (cake_dsrc(q->flow_mode))
1795                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
1796
1797                 if (cake_ddst(q->flow_mode))
1798                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1799
1800                 flow->deficit = (b->flow_quantum *
1801                                  quantum_div[host_load]) >> 16;
1802         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1803                 struct cake_host *srchost = &b->hosts[flow->srchost];
1804                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1805
1806                 /* this flow was empty, accounted as a sparse flow, but actually
1807                  * in the bulk rotation.
1808                  */
1809                 flow->set = CAKE_SET_BULK;
1810                 b->sparse_flow_count--;
1811                 b->bulk_flow_count++;
1812
1813                 if (cake_dsrc(q->flow_mode))
1814                         srchost->srchost_bulk_flow_count++;
1815
1816                 if (cake_ddst(q->flow_mode))
1817                         dsthost->dsthost_bulk_flow_count++;
1818
1819         }
1820
1821         if (q->buffer_used > q->buffer_max_used)
1822                 q->buffer_max_used = q->buffer_used;
1823
1824         if (q->buffer_used > q->buffer_limit) {
1825                 u32 dropped = 0;
1826
1827                 while (q->buffer_used > q->buffer_limit) {
1828                         dropped++;
1829                         cake_drop(sch, to_free);
1830                 }
1831                 b->drop_overlimit += dropped;
1832         }
1833         return NET_XMIT_SUCCESS;
1834 }
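
/* Illustrative sketch (not part of the original source): the autorate-ingress
 * estimator above turns the bytes seen in one measurement window into a
 * bytes-per-second sample (the EWMA smoothing is omitted here) and then sets
 * the shaper to 15/16 of the peak, so the standing queue forms in cake rather
 * than in the device upstream.  Assumes window_ns is non-zero.
 */
static inline u64 cake_example_autorate_target(u64 window_bytes, u64 window_ns)
{
        u64 rate = div64_u64(window_bytes * NSEC_PER_SEC, window_ns);

        return (rate * 15) >> 4;        /* 15/16 of the raw estimate */
}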
1835
1836 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1837 {
1838         struct cake_sched_data *q = qdisc_priv(sch);
1839         struct cake_tin_data *b = &q->tins[q->cur_tin];
1840         struct cake_flow *flow = &b->flows[q->cur_flow];
1841         struct sk_buff *skb = NULL;
1842         u32 len;
1843
1844         if (flow->head) {
1845                 skb = dequeue_head(flow);
1846                 len = qdisc_pkt_len(skb);
1847                 b->backlogs[q->cur_flow] -= len;
1848                 b->tin_backlog           -= len;
1849                 sch->qstats.backlog      -= len;
1850                 q->buffer_used           -= skb->truesize;
1851                 sch->q.qlen--;
1852
1853                 if (q->overflow_timeout)
1854                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1855         }
1856         return skb;
1857 }
1858
1859 /* Discard leftover packets from a tin no longer in use. */
1860 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1861 {
1862         struct cake_sched_data *q = qdisc_priv(sch);
1863         struct sk_buff *skb;
1864
1865         q->cur_tin = tin;
1866         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1867                 while (!!(skb = cake_dequeue_one(sch)))
1868                         kfree_skb(skb);
1869 }
1870
1871 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1872 {
1873         struct cake_sched_data *q = qdisc_priv(sch);
1874         struct cake_tin_data *b = &q->tins[q->cur_tin];
1875         struct cake_host *srchost, *dsthost;
1876         ktime_t now = ktime_get();
1877         struct cake_flow *flow;
1878         struct list_head *head;
1879         bool first_flow = true;
1880         struct sk_buff *skb;
1881         u16 host_load;
1882         u64 delay;
1883         u32 len;
1884
1885 begin:
1886         if (!sch->q.qlen)
1887                 return NULL;
1888
1889         /* global hard shaper */
1890         if (ktime_after(q->time_next_packet, now) &&
1891             ktime_after(q->failsafe_next_packet, now)) {
1892                 u64 next = min(ktime_to_ns(q->time_next_packet),
1893                                ktime_to_ns(q->failsafe_next_packet));
1894
1895                 sch->qstats.overlimits++;
1896                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1897                 return NULL;
1898         }
1899
1900         /* Choose a class to work on. */
1901         if (!q->rate_ns) {
1902                 /* In unlimited mode, can't rely on shaper timings, just balance
1903                  * with DRR
1904                  */
1905                 bool wrapped = false, empty = true;
1906
1907                 while (b->tin_deficit < 0 ||
1908                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1909                         if (b->tin_deficit <= 0)
1910                                 b->tin_deficit += b->tin_quantum_band;
1911                         if (b->sparse_flow_count + b->bulk_flow_count)
1912                                 empty = false;
1913
1914                         q->cur_tin++;
1915                         b++;
1916                         if (q->cur_tin >= q->tin_cnt) {
1917                                 q->cur_tin = 0;
1918                                 b = q->tins;
1919
1920                                 if (wrapped) {
1921                                         /* It's possible for q->qlen to be
1922                                          * nonzero when we actually have no
1923                                          * packets anywhere.
1924                                          */
1925                                         if (empty)
1926                                                 return NULL;
1927                                 } else {
1928                                         wrapped = true;
1929                                 }
1930                         }
1931                 }
1932         } else {
1933                 /* In shaped mode, choose:
1934                  * - Highest-priority tin with queue and meeting schedule, or
1935                  * - The earliest-scheduled tin with queue.
1936                  */
1937                 ktime_t best_time = KTIME_MAX;
1938                 int tin, best_tin = 0;
1939
1940                 for (tin = 0; tin < q->tin_cnt; tin++) {
1941                         b = q->tins + tin;
1942                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1943                                 ktime_t time_to_pkt =
1944                                         ktime_sub(b->time_next_packet, now);
1945
1946                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
1947                                     ktime_compare(time_to_pkt,
1948                                                   best_time) <= 0) {
1949                                         best_time = time_to_pkt;
1950                                         best_tin = tin;
1951                                 }
1952                         }
1953                 }
1954
1955                 q->cur_tin = best_tin;
1956                 b = q->tins + best_tin;
1957
1958                 /* No point in going further if no packets to deliver. */
1959                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1960                         return NULL;
1961         }
1962
1963 retry:
1964         /* service this class */
1965         head = &b->decaying_flows;
1966         if (!first_flow || list_empty(head)) {
1967                 head = &b->new_flows;
1968                 if (list_empty(head)) {
1969                         head = &b->old_flows;
1970                         if (unlikely(list_empty(head))) {
1971                                 head = &b->decaying_flows;
1972                                 if (unlikely(list_empty(head)))
1973                                         goto begin;
1974                         }
1975                 }
1976         }
1977         flow = list_first_entry(head, struct cake_flow, flowchain);
1978         q->cur_flow = flow - b->flows;
1979         first_flow = false;
1980
1981         /* triple isolation (modified DRR++) */
1982         srchost = &b->hosts[flow->srchost];
1983         dsthost = &b->hosts[flow->dsthost];
1984         host_load = 1;
1985
1986         /* flow isolation (DRR++) */
1987         if (flow->deficit <= 0) {
1988                 /* Keep all flows with deficits out of the sparse and decaying
1989                  * rotations.  No non-empty flow can go into the decaying
1990                  * rotation, so they can't get deficits
1991                  */
1992                 if (flow->set == CAKE_SET_SPARSE) {
1993                         if (flow->head) {
1994                                 b->sparse_flow_count--;
1995                                 b->bulk_flow_count++;
1996
1997                                 if (cake_dsrc(q->flow_mode))
1998                                         srchost->srchost_bulk_flow_count++;
1999
2000                                 if (cake_ddst(q->flow_mode))
2001                                         dsthost->dsthost_bulk_flow_count++;
2002
2003                                 flow->set = CAKE_SET_BULK;
2004                         } else {
2005                                 /* we've moved it to the bulk rotation for
2006                                  * correct deficit accounting but we still want
2007                                  * to count it as a sparse flow, not a bulk one.
2008                                  */
2009                                 flow->set = CAKE_SET_SPARSE_WAIT;
2010                         }
2011                 }
2012
2013                 if (cake_dsrc(q->flow_mode))
2014                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
2015
2016                 if (cake_ddst(q->flow_mode))
2017                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2018
2019                 WARN_ON(host_load > CAKE_QUEUES);
2020
2021                 /* The shifted prandom_u32() is a way to apply dithering to
2022                  * avoid accumulating roundoff errors
2023                  */
2024                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2025                                   (prandom_u32() >> 16)) >> 16;
2026                 list_move_tail(&flow->flowchain, &b->old_flows);
2027
2028                 goto retry;
2029         }
2030
2031         /* Retrieve a packet via the AQM */
2032         while (1) {
2033                 skb = cake_dequeue_one(sch);
2034                 if (!skb) {
2035                         /* this queue was actually empty */
2036                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2037                                 b->unresponsive_flow_count--;
2038
2039                         if (flow->cvars.p_drop || flow->cvars.count ||
2040                             ktime_before(now, flow->cvars.drop_next)) {
2041                                 /* keep in the flowchain until the state has
2042                                  * decayed to rest
2043                                  */
2044                                 list_move_tail(&flow->flowchain,
2045                                                &b->decaying_flows);
2046                                 if (flow->set == CAKE_SET_BULK) {
2047                                         b->bulk_flow_count--;
2048
2049                                         if (cake_dsrc(q->flow_mode))
2050                                                 srchost->srchost_bulk_flow_count--;
2051
2052                                         if (cake_ddst(q->flow_mode))
2053                                                 dsthost->dsthost_bulk_flow_count--;
2054
2055                                         b->decaying_flow_count++;
2056                                 } else if (flow->set == CAKE_SET_SPARSE ||
2057                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2058                                         b->sparse_flow_count--;
2059                                         b->decaying_flow_count++;
2060                                 }
2061                                 flow->set = CAKE_SET_DECAYING;
2062                         } else {
2063                                 /* remove empty queue from the flowchain */
2064                                 list_del_init(&flow->flowchain);
2065                                 if (flow->set == CAKE_SET_SPARSE ||
2066                                     flow->set == CAKE_SET_SPARSE_WAIT)
2067                                         b->sparse_flow_count--;
2068                                 else if (flow->set == CAKE_SET_BULK) {
2069                                         b->bulk_flow_count--;
2070
2071                                         if (cake_dsrc(q->flow_mode))
2072                                                 srchost->srchost_bulk_flow_count--;
2073
2074                                         if (cake_ddst(q->flow_mode))
2075                                                 dsthost->dsthost_bulk_flow_count--;
2076
2077                                 } else
2078                                         b->decaying_flow_count--;
2079
2080                                 flow->set = CAKE_SET_NONE;
2081                         }
2082                         goto begin;
2083                 }
2084
2085                 /* Last packet in queue may be marked, shouldn't be dropped */
2086                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2087                                         (b->bulk_flow_count *
2088                                          !!(q->rate_flags &
2089                                             CAKE_FLAG_INGRESS))) ||
2090                     !flow->head)
2091                         break;
2092
2093                 /* drop this packet, get another one */
2094                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2095                         len = cake_advance_shaper(q, b, skb,
2096                                                   now, true);
2097                         flow->deficit -= len;
2098                         b->tin_deficit -= len;
2099                 }
2100                 flow->dropped++;
2101                 b->tin_dropped++;
2102                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2103                 qdisc_qstats_drop(sch);
2104                 kfree_skb(skb);
2105                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2106                         goto retry;
2107         }
2108
2109         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2110         qdisc_bstats_update(sch, skb);
2111
2112         /* collect delay stats */
2113         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2114         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2115         b->peak_delay = cake_ewma(b->peak_delay, delay,
2116                                   delay > b->peak_delay ? 2 : 8);
2117         b->base_delay = cake_ewma(b->base_delay, delay,
2118                                   delay < b->base_delay ? 2 : 8);
2119
2120         len = cake_advance_shaper(q, b, skb, now, false);
2121         flow->deficit -= len;
2122         b->tin_deficit -= len;
2123
2124         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2125                 u64 next = min(ktime_to_ns(q->time_next_packet),
2126                                ktime_to_ns(q->failsafe_next_packet));
2127
2128                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2129         } else if (!sch->q.qlen) {
2130                 int i;
2131
2132                 for (i = 0; i < q->tin_cnt; i++) {
2133                         if (q->tins[i].decaying_flow_count) {
2134                                 ktime_t next =
2135                                         ktime_add_ns(now,
2136                                                      q->tins[i].cparams.target);
2137
2138                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2139                                                            ktime_to_ns(next));
2140                                 break;
2141                         }
2142                 }
2143         }
2144
2145         if (q->overflow_timeout)
2146                 q->overflow_timeout--;
2147
2148         return skb;
2149 }
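
/* Illustrative sketch (not part of the original source): the deficit refill
 * in cake_dequeue() is flow_quantum divided by host_load (the bulk-flow count
 * of the busiest of the packet's two hosts), done as a multiply by the
 * precomputed reciprocal quantum_div[host_load] = 65535 / host_load followed
 * by a 16-bit shift.  Adding a random 16-bit value before the shift dithers
 * the rounding so the error averages out instead of accumulating.
 */
static inline u32 cake_example_deficit_refill(u32 flow_quantum, u16 host_load,
                                              u16 dither)
{
        u32 reciprocal = 65535 / host_load;     /* host_load is at least 1 */

        return (flow_quantum * reciprocal + dither) >> 16;
}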
2150
2151 static void cake_reset(struct Qdisc *sch)
2152 {
2153         u32 c;
2154
2155         for (c = 0; c < CAKE_MAX_TINS; c++)
2156                 cake_clear_tin(sch, c);
2157 }
2158
2159 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2160         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2161         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2162         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2163         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2164         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2165         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2166         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2167         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2168         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2169         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2170         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2171         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2172         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2173         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2174         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
             [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
             [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2175 };
2176
2177 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2178                           u64 target_ns, u64 rtt_est_ns)
2179 {
2180         /* convert byte-rate into time-per-byte
2181          * so it will always unwedge in reasonable time.
2182          */
2183         static const u64 MIN_RATE = 64;
2184         u32 byte_target = mtu;
2185         u64 byte_target_ns;
2186         u8  rate_shft = 0;
2187         u64 rate_ns = 0;
2188
2189         b->flow_quantum = 1514;
2190         if (rate) {
2191                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2192                 rate_shft = 34;
2193                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2194                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2195                 while (!!(rate_ns >> 34)) {
2196                         rate_ns >>= 1;
2197                         rate_shft--;
2198                 }
2199         } /* else unlimited, ie. zero delay */
2200
2201         b->tin_rate_bps  = rate;
2202         b->tin_rate_ns   = rate_ns;
2203         b->tin_rate_shft = rate_shft;
2204
2205         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2206
2207         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2208         b->cparams.interval = max(rtt_est_ns +
2209                                      b->cparams.target - target_ns,
2210                                      b->cparams.target * 2);
2211         b->cparams.mtu_time = byte_target_ns;
2212         b->cparams.p_inc = 1 << 24; /* 1/256 */
2213         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2214 }
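
/* Illustrative worked example (not part of the original source): with the
 * shaped rate expressed in bytes per second (the unit the autorate estimator
 * produces), cake_set_rate() stores nanoseconds-per-byte in fixed point.  At
 * 2,500,000 bytes/s (~20 Mbit/s) the normalisation loop settles on about
 * 400 ns/byte with rate_shft = 25, so a 1514-byte packet charges roughly
 * 606 us to the tin, and cparams.target is the larger of 1.5x that mtu_time
 * and the configured target (5 ms by default).
 */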
2215
2216 static int cake_config_besteffort(struct Qdisc *sch)
2217 {
2218         struct cake_sched_data *q = qdisc_priv(sch);
2219         struct cake_tin_data *b = &q->tins[0];
2220         u32 mtu = psched_mtu(qdisc_dev(sch));
2221         u64 rate = q->rate_bps;
2222
2223         q->tin_cnt = 1;
2224
2225         q->tin_index = besteffort;
2226         q->tin_order = normal_order;
2227
2228         cake_set_rate(b, rate, mtu,
2229                       us_to_ns(q->target), us_to_ns(q->interval));
2230         b->tin_quantum_band = 65535;
2231         b->tin_quantum_prio = 65535;
2232
2233         return 0;
2234 }
2235
2236 static int cake_config_precedence(struct Qdisc *sch)
2237 {
2238         /* convert high-level (user visible) parameters into internal format */
2239         struct cake_sched_data *q = qdisc_priv(sch);
2240         u32 mtu = psched_mtu(qdisc_dev(sch));
2241         u64 rate = q->rate_bps;
2242         u32 quantum1 = 256;
2243         u32 quantum2 = 256;
2244         u32 i;
2245
2246         q->tin_cnt = 8;
2247         q->tin_index = precedence;
2248         q->tin_order = normal_order;
2249
2250         for (i = 0; i < q->tin_cnt; i++) {
2251                 struct cake_tin_data *b = &q->tins[i];
2252
2253                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2254                               us_to_ns(q->interval));
2255
2256                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2257                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2258
2259                 /* calculate next class's parameters */
2260                 rate  *= 7;
2261                 rate >>= 3;
2262
2263                 quantum1  *= 3;
2264                 quantum1 >>= 1;
2265
2266                 quantum2  *= 7;
2267                 quantum2 >>= 3;
2268         }
2269
2270         return 0;
2271 }
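
/* Illustrative worked example (not part of the original source): the 7/8
 * progression above gives tin i a bandwidth threshold of rate * (7/8)^i
 * (roughly 100%, 87.5%, 76.6%, 67.0%, 58.6%, 51.3%, 44.9% and 39.3% of the
 * shaped rate across the eight tins), while the priority weights grow by
 * 3/2 per tin and the bandwidth-sharing weights shrink by 7/8 per tin.  The
 * same progression is reused by cake_config_diffserv8() below.
 */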
2272
2273 /*      List of known Diffserv codepoints:
2274  *
2275  *      Least Effort (CS1)
2276  *      Best Effort (CS0)
2277  *      Max Reliability & LLT "Lo" (TOS1)
2278  *      Max Throughput (TOS2)
2279  *      Min Delay (TOS4)
2280  *      LLT "La" (TOS5)
2281  *      Assured Forwarding 1 (AF1x) - x3
2282  *      Assured Forwarding 2 (AF2x) - x3
2283  *      Assured Forwarding 3 (AF3x) - x3
2284  *      Assured Forwarding 4 (AF4x) - x3
2285  *      Precedence Class 2 (CS2)
2286  *      Precedence Class 3 (CS3)
2287  *      Precedence Class 4 (CS4)
2288  *      Precedence Class 5 (CS5)
2289  *      Precedence Class 6 (CS6)
2290  *      Precedence Class 7 (CS7)
2291  *      Voice Admit (VA)
2292  *      Expedited Forwarding (EF)
2293  *
2294  *      Total 25 codepoints.
2295  */
2296
2297 /*      List of traffic classes in RFC 4594:
2298  *              (roughly descending order of contended priority)
2299  *              (roughly ascending order of uncontended throughput)
2300  *
2301  *      Network Control (CS6,CS7)      - routing traffic
2302  *      Telephony (EF,VA)         - aka. VoIP streams
2303  *      Signalling (CS5)               - VoIP setup
2304  *      Multimedia Conferencing (AF4x) - aka. video calls
2305  *      Realtime Interactive (CS4)     - eg. games
2306  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2307  *      Broadcast Video (CS3)
2308  *      Low Latency Data (AF2x,TOS4)      - eg. database
2309  *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2310  *      Standard Service (CS0 & unrecognised codepoints)
2311  *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2312  *      Low Priority Data (CS1)           - eg. BitTorrent
2313  *
2314  *      Total 12 traffic classes.
2315  */
2316
2317 static int cake_config_diffserv8(struct Qdisc *sch)
2318 {
2319 /*      Pruned list of traffic classes for typical applications:
2320  *
2321  *              Network Control          (CS6, CS7)
2322  *              Minimum Latency          (EF, VA, CS5, CS4)
2323  *              Interactive Shell        (CS2, TOS1)
2324  *              Low Latency Transactions (AF2x, TOS4)
2325  *              Video Streaming          (AF4x, AF3x, CS3)
2326  *              Bog Standard             (CS0 etc.)
2327  *              High Throughput          (AF1x, TOS2)
2328  *              Background Traffic       (CS1)
2329  *
2330  *              Total 8 traffic classes.
2331  */
2332
2333         struct cake_sched_data *q = qdisc_priv(sch);
2334         u32 mtu = psched_mtu(qdisc_dev(sch));
2335         u64 rate = q->rate_bps;
2336         u32 quantum1 = 256;
2337         u32 quantum2 = 256;
2338         u32 i;
2339
2340         q->tin_cnt = 8;
2341
2342         /* codepoint to class mapping */
2343         q->tin_index = diffserv8;
2344         q->tin_order = normal_order;
2345
2346         /* class characteristics */
2347         for (i = 0; i < q->tin_cnt; i++) {
2348                 struct cake_tin_data *b = &q->tins[i];
2349
2350                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2351                               us_to_ns(q->interval));
2352
2353                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2354                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2355
2356                 /* calculate next class's parameters */
2357                 rate  *= 7;
2358                 rate >>= 3;
2359
2360                 quantum1  *= 3;
2361                 quantum1 >>= 1;
2362
2363                 quantum2  *= 7;
2364                 quantum2 >>= 3;
2365         }
2366
2367         return 0;
2368 }
2369
2370 static int cake_config_diffserv4(struct Qdisc *sch)
2371 {
2372 /*  Further pruned list of traffic classes for four-class system:
2373  *
2374  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2375  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2376  *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2377  *          Background Traffic (CS1)
2378  *
2379  *              Total 4 traffic classes.
2380  */
2381
2382         struct cake_sched_data *q = qdisc_priv(sch);
2383         u32 mtu = psched_mtu(qdisc_dev(sch));
2384         u64 rate = q->rate_bps;
2385         u32 quantum = 1024;
2386
2387         q->tin_cnt = 4;
2388
2389         /* codepoint to class mapping */
2390         q->tin_index = diffserv4;
2391         q->tin_order = bulk_order;
2392
2393         /* class characteristics */
2394         cake_set_rate(&q->tins[0], rate, mtu,
2395                       us_to_ns(q->target), us_to_ns(q->interval));
2396         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2397                       us_to_ns(q->target), us_to_ns(q->interval));
2398         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2399                       us_to_ns(q->target), us_to_ns(q->interval));
2400         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2401                       us_to_ns(q->target), us_to_ns(q->interval));
2402
2403         /* priority weights */
2404         q->tins[0].tin_quantum_prio = quantum;
2405         q->tins[1].tin_quantum_prio = quantum >> 4;
2406         q->tins[2].tin_quantum_prio = quantum << 2;
2407         q->tins[3].tin_quantum_prio = quantum << 4;
2408
2409         /* bandwidth-sharing weights */
2410         q->tins[0].tin_quantum_band = quantum;
2411         q->tins[1].tin_quantum_band = quantum >> 4;
2412         q->tins[2].tin_quantum_band = quantum >> 1;
2413         q->tins[3].tin_quantum_band = quantum >> 2;
2414
2415         return 0;
2416 }
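
/* Illustrative worked example (not part of the original source): the shifts
 * above give the four tins bandwidth thresholds of 100%, 1/16 (6.25%), 1/2
 * (50%) and 1/4 (25%) of the base rate, priority weights of 1024, 64, 4096
 * and 16384, and bandwidth-sharing weights of 1024, 64, 512 and 256.  A tin
 * competes with its tin_quantum_prio weight while below its threshold and
 * falls back to its tin_quantum_band weight once it exceeds it.
 */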
2417
2418 static int cake_config_diffserv3(struct Qdisc *sch)
2419 {
2420 /*  Simplified Diffserv structure with 3 tins.
2421  *              Low Priority            (CS1)
2422  *              Best Effort
2423  *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2424  */
2425         struct cake_sched_data *q = qdisc_priv(sch);
2426         u32 mtu = psched_mtu(qdisc_dev(sch));
2427         u64 rate = q->rate_bps;
2428         u32 quantum = 1024;
2429
2430         q->tin_cnt = 3;
2431
2432         /* codepoint to class mapping */
2433         q->tin_index = diffserv3;
2434         q->tin_order = bulk_order;
2435
2436         /* class characteristics */
2437         cake_set_rate(&q->tins[0], rate, mtu,
2438                       us_to_ns(q->target), us_to_ns(q->interval));
2439         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2440                       us_to_ns(q->target), us_to_ns(q->interval));
2441         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2442                       us_to_ns(q->target), us_to_ns(q->interval));
2443
2444         /* priority weights */
2445         q->tins[0].tin_quantum_prio = quantum;
2446         q->tins[1].tin_quantum_prio = quantum >> 4;
2447         q->tins[2].tin_quantum_prio = quantum << 4;
2448
2449         /* bandwidth-sharing weights */
2450         q->tins[0].tin_quantum_band = quantum;
2451         q->tins[1].tin_quantum_band = quantum >> 4;
2452         q->tins[2].tin_quantum_band = quantum >> 2;
2453
2454         return 0;
2455 }
2456
2457 static void cake_reconfigure(struct Qdisc *sch)
2458 {
2459         struct cake_sched_data *q = qdisc_priv(sch);
2460         int c, ft;
2461
2462         switch (q->tin_mode) {
2463         case CAKE_DIFFSERV_BESTEFFORT:
2464                 ft = cake_config_besteffort(sch);
2465                 break;
2466
2467         case CAKE_DIFFSERV_PRECEDENCE:
2468                 ft = cake_config_precedence(sch);
2469                 break;
2470
2471         case CAKE_DIFFSERV_DIFFSERV8:
2472                 ft = cake_config_diffserv8(sch);
2473                 break;
2474
2475         case CAKE_DIFFSERV_DIFFSERV4:
2476                 ft = cake_config_diffserv4(sch);
2477                 break;
2478
2479         case CAKE_DIFFSERV_DIFFSERV3:
2480         default:
2481                 ft = cake_config_diffserv3(sch);
2482                 break;
2483         }
2484
2485         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2486                 cake_clear_tin(sch, c);
2487                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2488         }
2489
2490         q->rate_ns   = q->tins[ft].tin_rate_ns;
2491         q->rate_shft = q->tins[ft].tin_rate_shft;
2492
2493         if (q->buffer_config_limit) {
2494                 q->buffer_limit = q->buffer_config_limit;
2495         } else if (q->rate_bps) {
2496                 u64 t = q->rate_bps * q->interval;
2497
2498                 do_div(t, USEC_PER_SEC / 4);
2499                 q->buffer_limit = max_t(u32, t, 4U << 20);
2500         } else {
2501                 q->buffer_limit = ~0;
2502         }
2503
2504         sch->flags &= ~TCQ_F_CAN_BYPASS;
2505
2506         q->buffer_limit = min(q->buffer_limit,
2507                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2508                                   q->buffer_config_limit));
2509 }
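
/* Illustrative worked example (not part of the original source): with no
 * explicit memory limit the buffer is sized to four times the
 * bandwidth-delay product, with a 4 MB floor; rate_bps * interval divided by
 * (USEC_PER_SEC / 4) is 4 * BDP in bytes.  At 2,500,000 bytes/s and the
 * default 100 ms interval that is 1,000,000 bytes, so the 4 MB floor wins.
 * The final value is additionally capped at sch->limit packets' worth of
 * MTU-sized skbs, a cap that never reduces an explicitly configured limit.
 */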
2510
2511 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2512                        struct netlink_ext_ack *extack)
2513 {
2514         struct cake_sched_data *q = qdisc_priv(sch);
2515         struct nlattr *tb[TCA_CAKE_MAX + 1];
2516         int err;
2517
2518         if (!opt)
2519                 return -EINVAL;
2520
2521         err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2522         if (err < 0)
2523                 return err;
2524
2525         if (tb[TCA_CAKE_NAT]) {
2526 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2527                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2528                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2529                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2530 #else
2531                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2532                                     "No conntrack support in kernel");
2533                 return -EOPNOTSUPP;
2534 #endif
2535         }
2536
2537         if (tb[TCA_CAKE_BASE_RATE64])
2538                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2539
2540         if (tb[TCA_CAKE_DIFFSERV_MODE])
2541                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2542
2543         if (tb[TCA_CAKE_WASH]) {
2544                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2545                         q->rate_flags |= CAKE_FLAG_WASH;
2546                 else
2547                         q->rate_flags &= ~CAKE_FLAG_WASH;
2548         }
2549
2550         if (tb[TCA_CAKE_FLOW_MODE])
2551                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2552                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2553                                         CAKE_FLOW_MASK));
2554
2555         if (tb[TCA_CAKE_ATM])
2556                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2557
2558         if (tb[TCA_CAKE_OVERHEAD]) {
2559                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2560                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2561
2562                 q->max_netlen = 0;
2563                 q->max_adjlen = 0;
2564                 q->min_netlen = ~0;
2565                 q->min_adjlen = ~0;
2566         }
2567
2568         if (tb[TCA_CAKE_RAW]) {
2569                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2570
2571                 q->max_netlen = 0;
2572                 q->max_adjlen = 0;
2573                 q->min_netlen = ~0;
2574                 q->min_adjlen = ~0;
2575         }
2576
2577         if (tb[TCA_CAKE_MPU])
2578                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2579
2580         if (tb[TCA_CAKE_RTT]) {
2581                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2582
2583                 if (!q->interval)
2584                         q->interval = 1;
2585         }
2586
2587         if (tb[TCA_CAKE_TARGET]) {
2588                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2589
2590                 if (!q->target)
2591                         q->target = 1;
2592         }
2593
2594         if (tb[TCA_CAKE_AUTORATE]) {
2595                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2596                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2597                 else
2598                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2599         }
2600
2601         if (tb[TCA_CAKE_INGRESS]) {
2602                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2603                         q->rate_flags |= CAKE_FLAG_INGRESS;
2604                 else
2605                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2606         }
2607
2608         if (tb[TCA_CAKE_ACK_FILTER])
2609                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2610
2611         if (tb[TCA_CAKE_MEMORY])
2612                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2613
2614         if (tb[TCA_CAKE_SPLIT_GSO]) {
2615                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2616                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2617                 else
2618                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2619         }
2620
2621         if (tb[TCA_CAKE_FWMARK]) {
2622                 if (!!nla_get_u32(tb[TCA_CAKE_FWMARK]))
2623                         q->rate_flags |= CAKE_FLAG_FWMARK;
2624                 else
2625                         q->rate_flags &= ~CAKE_FLAG_FWMARK;
2626         }
2627
2628         if (q->tins) {
2629                 sch_tree_lock(sch);
2630                 cake_reconfigure(sch);
2631                 sch_tree_unlock(sch);
2632         }
2633
2634         return 0;
2635 }
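
/* Illustrative usage sketch (not part of the original source): the userspace
 * tc front-end fills in these attributes from command-line keywords, along
 * the lines of
 *
 *     tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which sets TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE and TCA_CAKE_NAT.
 * Omitting "bandwidth" leaves rate_bps at zero, i.e. the unlimited mode that
 * cake_dequeue() serves with plain DRR balancing instead of the shaper.
 */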
2636
2637 static void cake_destroy(struct Qdisc *sch)
2638 {
2639         struct cake_sched_data *q = qdisc_priv(sch);
2640
2641         qdisc_watchdog_cancel(&q->watchdog);
2642         tcf_block_put(q->block);
2643         kvfree(q->tins);
2644 }
2645
2646 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2647                      struct netlink_ext_ack *extack)
2648 {
2649         struct cake_sched_data *q = qdisc_priv(sch);
2650         int i, j, err;
2651
2652         sch->limit = 10240;
2653         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2654         q->flow_mode  = CAKE_FLOW_TRIPLE;
2655
2656         q->rate_bps = 0; /* unlimited by default */
2657
2658         q->interval = 100000; /* 100ms default */
2659         q->target   =   5000; /* 5ms: codel RFC argues
2660                                * for 5 to 10% of interval
2661                                */
2662         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2663         q->cur_tin = 0;
2664         q->cur_flow  = 0;
2665
2666         qdisc_watchdog_init(&q->watchdog, sch);
2667
2668         if (opt) {
2669                 int err = cake_change(sch, opt, extack);
2670
2671                 if (err)
2672                         return err;
2673         }
2674
2675         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2676         if (err)
2677                 return err;
2678
2679         quantum_div[0] = ~0;
2680         for (i = 1; i <= CAKE_QUEUES; i++)
2681                 quantum_div[i] = 65535 / i;
2682
2683         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2684                            GFP_KERNEL);
2685         if (!q->tins)
2686                 goto nomem;
2687
2688         for (i = 0; i < CAKE_MAX_TINS; i++) {
2689                 struct cake_tin_data *b = q->tins + i;
2690
2691                 INIT_LIST_HEAD(&b->new_flows);
2692                 INIT_LIST_HEAD(&b->old_flows);
2693                 INIT_LIST_HEAD(&b->decaying_flows);
2694                 b->sparse_flow_count = 0;
2695                 b->bulk_flow_count = 0;
2696                 b->decaying_flow_count = 0;
2697
2698                 for (j = 0; j < CAKE_QUEUES; j++) {
2699                         struct cake_flow *flow = b->flows + j;
2700                         u32 k = j * CAKE_MAX_TINS + i;
2701
2702                         INIT_LIST_HEAD(&flow->flowchain);
2703                         cobalt_vars_init(&flow->cvars);
2704
2705                         q->overflow_heap[k].t = i;
2706                         q->overflow_heap[k].b = j;
2707                         b->overflow_idx[j] = k;
2708                 }
2709         }
2710
2711         cake_reconfigure(sch);
2712         q->avg_peak_bandwidth = q->rate_bps;
2713         q->min_netlen = ~0;
2714         q->min_adjlen = ~0;
2715         return 0;
2716
2717 nomem:
2718         cake_destroy(sch);
2719         return -ENOMEM;
2720 }
2721
2722 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2723 {
2724         struct cake_sched_data *q = qdisc_priv(sch);
2725         struct nlattr *opts;
2726
2727         opts = nla_nest_start(skb, TCA_OPTIONS);
2728         if (!opts)
2729                 goto nla_put_failure;
2730
2731         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2732                               TCA_CAKE_PAD))
2733                 goto nla_put_failure;
2734
2735         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2736                         q->flow_mode & CAKE_FLOW_MASK))
2737                 goto nla_put_failure;
2738
2739         if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2740                 goto nla_put_failure;
2741
2742         if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2743                 goto nla_put_failure;
2744
2745         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2746                 goto nla_put_failure;
2747
2748         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2749                         !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2750                 goto nla_put_failure;
2751
2752         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2753                         !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2754                 goto nla_put_failure;
2755
2756         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2757                 goto nla_put_failure;
2758
2759         if (nla_put_u32(skb, TCA_CAKE_NAT,
2760                         !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2761                 goto nla_put_failure;
2762
2763         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2764                 goto nla_put_failure;
2765
2766         if (nla_put_u32(skb, TCA_CAKE_WASH,
2767                         !!(q->rate_flags & CAKE_FLAG_WASH)))
2768                 goto nla_put_failure;
2769
2770         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2771                 goto nla_put_failure;
2772
2773         if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2774                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2775                         goto nla_put_failure;
2776
2777         if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2778                 goto nla_put_failure;
2779
2780         if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2781                 goto nla_put_failure;
2782
2783         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2784                         !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2785                 goto nla_put_failure;
2786
2787         if (nla_put_u32(skb, TCA_CAKE_FWMARK,
2788                         !!(q->rate_flags & CAKE_FLAG_FWMARK)))
2789                 goto nla_put_failure;
2790
2791         return nla_nest_end(skb, opts);
2792
2793 nla_put_failure:
2794         return -1;
2795 }
2796
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
        struct cake_sched_data *q = qdisc_priv(sch);
        struct nlattr *tstats, *ts;
        int i;

        if (!stats)
                return -1;

#define PUT_STAT_U32(attr, data) do {                                  \
                if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
                        goto nla_put_failure;                          \
        } while (0)
#define PUT_STAT_U64(attr, data) do {                                  \
                if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
                                        data, TCA_CAKE_STATS_PAD)) \
                        goto nla_put_failure;                          \
        } while (0)

        PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
        PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
        PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
        PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
        PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
        PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
        PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
        PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);

#undef PUT_STAT_U32
#undef PUT_STAT_U64

        tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
        if (!tstats)
                goto nla_put_failure;

#define PUT_TSTAT_U32(attr, data) do {                                  \
                if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
                        goto nla_put_failure;                           \
        } while (0)
#define PUT_TSTAT_U64(attr, data) do {                                  \
                if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
                                        data, TCA_CAKE_TIN_STATS_PAD))  \
                        goto nla_put_failure;                           \
        } while (0)

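        /* Tins are dumped in the order given by q->tin_order[]; each
         * per-tin nest is numbered from 1.
         */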
        for (i = 0; i < q->tin_cnt; i++) {
                struct cake_tin_data *b = &q->tins[q->tin_order[i]];

                ts = nla_nest_start(d->skb, i + 1);
                if (!ts)
                        goto nla_put_failure;

                PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
                PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
                PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);

                PUT_TSTAT_U32(TARGET_US,
                              ktime_to_us(ns_to_ktime(b->cparams.target)));
                PUT_TSTAT_U32(INTERVAL_US,
                              ktime_to_us(ns_to_ktime(b->cparams.interval)));

                PUT_TSTAT_U32(SENT_PACKETS, b->packets);
                PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
                PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
                PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);

                PUT_TSTAT_U32(PEAK_DELAY_US,
                              ktime_to_us(ns_to_ktime(b->peak_delay)));
                PUT_TSTAT_U32(AVG_DELAY_US,
                              ktime_to_us(ns_to_ktime(b->avge_delay)));
                PUT_TSTAT_U32(BASE_DELAY_US,
                              ktime_to_us(ns_to_ktime(b->base_delay)));

                PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
                PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
                PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);

                PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
                                            b->decaying_flow_count);
                PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
                PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
                PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);

                PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
                nla_nest_end(d->skb, ts);
        }

#undef PUT_TSTAT_U32
#undef PUT_TSTAT_U64

        nla_nest_end(d->skb, tstats);
        return nla_nest_end(d->skb, stats);

nla_put_failure:
        nla_nest_cancel(d->skb, stats);
        return -1;
}

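/* CAKE has no configurable classes of its own; the class operations below
 * are mostly stubs that satisfy the class API so that per-flow queue state
 * can be inspected from userspace.
 */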
static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long cake_find(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
                               u32 classid)
{
        return 0;
}

static void cake_unbind(struct Qdisc *q, unsigned long cl)
{
}

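/* Return the qdisc's filter block for the root (cl == 0); the per-flow
 * pseudo-classes have no filter block of their own.
 */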
static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
                                        struct netlink_ext_ack *extack)
{
        struct cake_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return q->block;
}

static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
                           struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

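/* Dump per-flow queue statistics for one pseudo-class: the queue length is
 * counted by walking the flow's skb list under the qdisc tree lock, and
 * the flow's COBALT state is exported in a TCA_STATS_APP nest.
 */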
static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                 struct gnet_dump *d)
{
        struct cake_sched_data *q = qdisc_priv(sch);
        const struct cake_flow *flow = NULL;
        struct gnet_stats_queue qs = { 0 };
        struct nlattr *stats;
        u32 idx = cl - 1;

        if (idx < CAKE_QUEUES * q->tin_cnt) {
                const struct cake_tin_data *b =
                        &q->tins[q->tin_order[idx / CAKE_QUEUES]];
                const struct sk_buff *skb;

                flow = &b->flows[idx % CAKE_QUEUES];

                if (flow->head) {
                        sch_tree_lock(sch);
                        skb = flow->head;
                        while (skb) {
                                qs.qlen++;
                                skb = skb->next;
                        }
                        sch_tree_unlock(sch);
                }
                qs.backlog = b->backlogs[idx % CAKE_QUEUES];
                qs.drops = flow->dropped;
        }
        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
                return -1;
        if (flow) {
                ktime_t now = ktime_get();

                stats = nla_nest_start(d->skb, TCA_STATS_APP);
                if (!stats)
                        return -1;

#define PUT_STAT_U32(attr, data) do {                                  \
                if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
                        goto nla_put_failure;                          \
        } while (0)
#define PUT_STAT_S32(attr, data) do {                                  \
                if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
                        goto nla_put_failure;                          \
        } while (0)

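                /* Export the flow's DRR deficit and COBALT/BLUE state; the
                 * timer attributes are relative to the current time and may
                 * be negative if the event lies in the future.
                 */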
                PUT_STAT_S32(DEFICIT, flow->deficit);
                PUT_STAT_U32(DROPPING, flow->cvars.dropping);
                PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
                PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
                if (flow->cvars.p_drop) {
                        PUT_STAT_S32(BLUE_TIMER_US,
                                     ktime_to_us(
                                             ktime_sub(now,
                                                     flow->cvars.blue_timer)));
                }
                if (flow->cvars.dropping) {
                        PUT_STAT_S32(DROP_NEXT_US,
                                     ktime_to_us(
                                             ktime_sub(now,
                                                       flow->cvars.drop_next)));
                }

                if (nla_nest_end(d->skb, stats) < 0)
                        return -1;
        }

        return 0;

nla_put_failure:
        nla_nest_cancel(d->skb, stats);
        return -1;
}

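/* Walk all flow queues in all tins, presenting each active flow as a
 * pseudo-class numbered i * CAKE_QUEUES + j + 1.  Empty flows are skipped
 * but still counted for the walker's skip/count bookkeeping.
 */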
static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct cake_sched_data *q = qdisc_priv(sch);
        unsigned int i, j;

        if (arg->stop)
                return;

        for (i = 0; i < q->tin_cnt; i++) {
                struct cake_tin_data *b = &q->tins[q->tin_order[i]];

                for (j = 0; j < CAKE_QUEUES; j++) {
                        if (list_empty(&b->flows[j].flowchain) ||
                            arg->count < arg->skip) {
                                arg->count++;
                                continue;
                        }
                        if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
                                arg->stop = 1;
                                break;
                        }
                        arg->count++;
                }
        }
}

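/* Read-only class interface: CAKE's flows can be listed and their
 * statistics dumped, but classes cannot be created, changed or deleted.
 */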
static const struct Qdisc_class_ops cake_class_ops = {
        .leaf           =       cake_leaf,
        .find           =       cake_find,
        .tcf_block      =       cake_tcf_block,
        .bind_tcf       =       cake_bind,
        .unbind_tcf     =       cake_unbind,
        .dump           =       cake_dump_class,
        .dump_stats     =       cake_dump_class_stats,
        .walk           =       cake_walk,
};

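/* Qdisc operations table tying the CAKE implementation into the traffic
 * control core.
 */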
static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
        .cl_ops         =       &cake_class_ops,
        .id             =       "cake",
        .priv_size      =       sizeof(struct cake_sched_data),
        .enqueue        =       cake_enqueue,
        .dequeue        =       cake_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       cake_init,
        .reset          =       cake_reset,
        .destroy        =       cake_destroy,
        .change         =       cake_change,
        .dump           =       cake_dump,
        .dump_stats     =       cake_dump_stats,
        .owner          =       THIS_MODULE,
};

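/* Module entry points: register and unregister the "cake" qdisc with the
 * traffic control subsystem.
 */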
static int __init cake_module_init(void)
{
        return register_qdisc(&cake_qdisc_ops);
}

static void __exit cake_module_exit(void)
{
        unregister_qdisc(&cake_qdisc_ops);
}

module_init(cake_module_init)
module_exit(cake_module_exit)
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");