tcp: refine tcp_prune_ofo_queue() logic

author Eric Dumazet <edumazet@google.com>

Tue, 1 Nov 2022 03:52:34 +0000 (03:52 +0000)

committer Jakub Kicinski <kuba@kernel.org>

Wed, 2 Nov 2022 04:19:58 +0000 (21:19 -0700)
author Eric Dumazet <edumazet@google.com>
Tue, 1 Nov 2022 03:52:34 +0000 (03:52 +0000)
committer Jakub Kicinski <kuba@kernel.org>
Wed, 2 Nov 2022 04:19:58 +0000 (21:19 -0700)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0640453..d764b58 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
         }
  }
  
-static bool tcp_prune_ofo_queue(struct sock *sk);
-static int tcp_prune_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
  
  static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
                                  unsigned int size)
@@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
         if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
             !sk_rmem_schedule(sk, skb, size)) {
  
-               if (tcp_prune_queue(sk) < 0)
+               if (tcp_prune_queue(sk, skb) < 0)
                         return -1;
  
                 while (!sk_rmem_schedule(sk, skb, size)) {
-                       if (!tcp_prune_ofo_queue(sk))
+                       if (!tcp_prune_ofo_queue(sk, skb))
                                 return -1;
                 }
         }
@@ -5329,6 +5329,8 @@ new_range:
   * Clean the out-of-order queue to make room.
   * We drop high sequences packets to :
   * 1) Let a chance for holes to be filled.
+ *    This means we do not drop packets from ooo queue if their sequence
+ *    is before incoming packet sequence.
   * 2) not add too big latencies if thousands of packets sit there.
   *    (But if application shrinks SO_RCVBUF, we could still end up
   *     freeing whole queue here)
@@ -5336,24 +5338,31 @@ new_range:
   *
   * Return true if queue has shrunk.
   */
-static bool tcp_prune_ofo_queue(struct sock *sk)
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct rb_node *node, *prev;
+       bool pruned = false;
         int goal;
  
         if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                 return false;
  
-       NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
         goal = sk->sk_rcvbuf >> 3;
         node = &tp->ooo_last_skb->rbnode;
+
         do {
+               struct sk_buff *skb = rb_to_skb(node);
+
+               /* If incoming skb would land last in ofo queue, stop pruning. */
+               if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
+                       break;
+               pruned = true;
                 prev = rb_prev(node);
                 rb_erase(node, &tp->out_of_order_queue);
-               goal -= rb_to_skb(node)->truesize;
-               tcp_drop_reason(sk, rb_to_skb(node),
-                               SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+               goal -= skb->truesize;
+               tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+               tp->ooo_last_skb = rb_to_skb(prev);
                 if (!prev || goal <= 0) {
                         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
                             !tcp_under_memory_pressure(sk))
@@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
                 }
                 node = prev;
         } while (node);
-       tp->ooo_last_skb = rb_to_skb(prev);
  
-       /* Reset SACK state.  A conforming SACK implementation will
-        * do the same at a timeout based retransmit.  When a connection
-        * is in a sad state like this, we care only about integrity
-        * of the connection not performance.
-        */
-       if (tp->rx_opt.sack_ok)
-               tcp_sack_reset(&tp->rx_opt);
-       return true;
+       if (pruned) {
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+               /* Reset SACK state.  A conforming SACK implementation will
+                * do the same at a timeout based retransmit.  When a connection
+                * is in a sad state like this, we care only about integrity
+                * of the connection not performance.
+                */
+               if (tp->rx_opt.sack_ok)
+                       tcp_sack_reset(&tp->rx_opt);
+       }
+       return pruned;
  }
  
  /* Reduce allocated memory if we can, trying to get
@@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
   * until the socket owning process reads some of the data
   * to stabilize the situation.
   */
-static int tcp_prune_queue(struct sock *sk)
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
  {
         struct tcp_sock *tp = tcp_sk(sk);
  
@@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
         /* Collapsing did not help, destructive actions follow.
          * This must not ever occur. */
  
-       tcp_prune_ofo_queue(sk);
+       tcp_prune_ofo_queue(sk, in_skb);
  
         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
                 return 0;
author	Eric Dumazet <edumazet@google.com>
	Tue, 1 Nov 2022 03:52:34 +0000 (03:52 +0000)
committer	Jakub Kicinski <kuba@kernel.org>
	Wed, 2 Nov 2022 04:19:58 +0000 (21:19 -0700)