tcp: instrument how long TCP is busy sending

[uclinux-h8/linux.git] / net / ipv4 / tcp_output.c
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index d48d557..e8ea584 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk)
  {
         if ((1 << sk->sk_state) &
             (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
-            TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
-               tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+            TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
+               struct tcp_sock *tp = tcp_sk(sk);
+
+               if (tp->lost_out > tp->retrans_out &&
+                   tp->snd_cwnd > tcp_packets_in_flight(tp))
+                       tcp_xmit_retransmit_queue(sk);
+
+               tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
                                0, GFP_ATOMIC);
+       }
  }
  /*
   * One tasklet per cpu tries to send more skbs.
@@ -918,6 +925,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 skb_mstamp_get(&skb->skb_mstamp);
                 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
                         - tp->snd_una;
+               tcp_rate_skb_sent(sk, skb);
  
                 if (unlikely(skb_cloned(skb)))
                         skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1221,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         tcp_set_skb_tso_segs(skb, mss_now);
         tcp_set_skb_tso_segs(buff, mss_now);
  
+       /* Update delivered info for the new segment */
+       TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
         /* If this packet has been sent out already, we must
          * adjust the various packet counters.
          */
@@ -1358,6 +1369,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
         }
         return mtu;
  }
+EXPORT_SYMBOL(tcp_mss_to_mtu);
  
  /* MTU probing init per socket */
  void tcp_mtup_init(struct sock *sk)
@@ -1545,7 +1557,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
  /* Return how many segs we'd like on a TSO packet,
   * to send one TSO packet per ms
   */
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+                    int min_tso_segs)
  {
         u32 bytes, segs;
  
@@ -1557,10 +1570,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
          * This preserves ACK clocking and is consistent
          * with tcp_tso_should_defer() heuristic.
          */
-       segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+       segs = max_t(u32, bytes / mss_now, min_tso_segs);
  
         return min_t(u32, segs, sk->sk_gso_max_segs);
  }
+EXPORT_SYMBOL(tcp_tso_autosize);
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
+ *
+ * The congestion control module decides through its optional
+ * tso_segs_goal() hook. If the hook is not set, or if it returns 0,
+ * the result of tcp_tso_autosize() is returned instead, with
+ * sysctl_tcp_min_tso_segs passed as its min_tso_segs argument.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+       u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+
+       return tso_segs ? :     /* GNU "?:" - tso_segs itself when non-zero */
+               tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+}
  
  /* Returns the portion of skb which can be sent right away */
  static unsigned int tcp_mss_split_point(const struct sock *sk,
@@ -2022,6 +2048,80 @@ static int tcp_mtu_probe(struct sock *sk)
         return -1;
  }
  
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ *  - better RTT estimation and ACK scheduling
+ *  - faster recovery
+ *  - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ *
+ * The byte limit is max(2 * skb->truesize, sk->sk_pacing_rate >> 10),
+ * capped at sysctl_tcp_limit_output_bytes and then shifted left by
+ * @factor. The callers visible in this change pass 0 from
+ * tcp_write_xmit() and 1 from tcp_xmit_retransmit_queue().
+ *
+ * Returns true if sk->sk_wmem_alloc is above that limit on both reads
+ * below; TSQ_THROTTLED has then been set in tsq_flags and the callers
+ * stop sending. Returns false otherwise. Note that TSQ_THROTTLED is
+ * also left set when only the first read was above the limit and
+ * false is returned.
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+                                 unsigned int factor)
+{
+       unsigned int limit;
+
+       limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+       limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+       limit <<= factor;       /* factor == 1 doubles the limit */
+
+       if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+               set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+               /* It is possible TX completion already happened
+                * before we set TSQ_THROTTLED, so we must
+                * test again the condition.
+                */
+               smp_mb__after_atomic();
+               if (atomic_read(&sk->sk_wmem_alloc) > limit)
+                       return true;
+       }
+       return false;
+}
+
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{      /* Make @new the chrono being tracked. If the chrono tracked so far
+        * is above TCP_CHRONO_UNSPEC, the time elapsed since
+        * tp->chrono_start is first added to its chrono_stat[] slot.
+        */
+       const u32 now = tcp_time_stamp;
+
+       if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+               tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; /* NOTE(review): the "- 1" presumably skips TCP_CHRONO_UNSPEC == 0 - confirm against the enum */
+       tp->chrono_start = now; /* restarted on every switch, even to UNSPEC */
+       tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       /* If there are multiple conditions worthy of tracking in a
+        * chronograph then the highest priority enum takes precedence
+        * over the other conditions. So that if something "more interesting"
+        * starts happening, stop the previous chrono and start a new one.
+        *
+        * Priority is the numeric enum value: @type replaces the chrono
+        * being tracked only when it compares greater than
+        * tp->chrono_type. An equal or lower @type leaves the tracked
+        * chrono and its start time untouched.
+        */
+       if (type > tp->chrono_type)
+               tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+
+       /* There are multiple conditions worthy of tracking in a
+        * chronograph, so that the highest priority enum takes
+        * precedence over the other conditions (see tcp_chrono_start).
+        * If a condition stops, we only stop chrono tracking if
+        * it's the "most interesting" or current chrono we are
+        * tracking, and we start the busy chrono if we have pending data.
+        *
+        * Concretely: an empty write queue always resets tracking to
+        * TCP_CHRONO_UNSPEC, whatever @type is. Otherwise, if @type is
+        * the chrono being tracked, tracking falls back to
+        * TCP_CHRONO_BUSY; if it is not, nothing changes.
+        */
+       if (tcp_write_queue_empty(sk))
+               tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+       else if (type == tp->chrono_type)
+               tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
  /* This routine writes packets to the network.  It advances the
   * send_head.  This happens as incoming acks open up the remote
   * window for us.
@@ -2059,7 +2159,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 }
         }
  
-       max_segs = tcp_tso_autosize(sk, mss_now);
+       max_segs = tcp_tso_segs(sk, mss_now);
         while ((skb = tcp_send_head(sk))) {
                 unsigned int limit;
  
@@ -2108,29 +2208,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                     unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
                         break;
  
-               /* TCP Small Queues :
-                * Control number of packets in qdisc/devices to two packets / or ~1 ms.
-                * This allows for :
-                *  - better RTT estimation and ACK scheduling
-                *  - faster recovery
-                *  - high rates
-                * Alas, some drivers / subsystems require a fair amount
-                * of queued bytes to ensure line rate.
-                * One example is wifi aggregation (802.11 AMPDU)
-                */
-               limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
-               limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
-
-               if (atomic_read(&sk->sk_wmem_alloc) > limit) {
-                       set_bit(TSQ_THROTTLED, &tp->tsq_flags);
-                       /* It is possible TX completion already happened
-                        * before we set TSQ_THROTTLED, so we must
-                        * test again the condition.
-                        */
-                       smp_mb__after_atomic();
-                       if (atomic_read(&sk->sk_wmem_alloc) > limit)
-                               break;
-               }
+               if (tcp_small_queue_check(sk, skb, 0))
+                       break;
  
                 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
                         break;
@@ -2476,7 +2555,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
  }
  
  /* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2487,13 +2566,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
  
         BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
  
+       if (next_skb_size) {
+               if (next_skb_size <= skb_availroom(skb))
+                       skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+                                     next_skb_size);
+               else if (!skb_shift(skb, next_skb, next_skb_size))
+                       return false;
+       }
         tcp_highest_sack_combine(sk, next_skb, skb);
  
         tcp_unlink_write_queue(next_skb, sk);
  
-       skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
-                                 next_skb_size);
-
         if (next_skb->ip_summed == CHECKSUM_PARTIAL)
                 skb->ip_summed = CHECKSUM_PARTIAL;
  
@@ -2522,6 +2605,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
         tcp_skb_collapse_tstamp(skb, next_skb);
  
         sk_wmem_free_skb(sk, next_skb);
+       return true;
  }
  
  /* Check if coalescing SKBs is legal. */
@@ -2529,14 +2613,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
  {
         if (tcp_skb_pcount(skb) > 1)
                 return false;
-       /* TODO: SACK collapsing could be used to remove this condition */
-       if (skb_shinfo(skb)->nr_frags != 0)
-               return false;
         if (skb_cloned(skb))
                 return false;
         if (skb == tcp_send_head(sk))
                 return false;
-       /* Some heurestics for collapsing over SACK'd could be invented */
+       /* Some heuristics for collapsing over SACK'd could be invented */
         if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                 return false;
  
@@ -2574,16 +2655,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  
                 if (space < 0)
                         break;
-               /* Punt if not enough space exists in the first SKB for
-                * the data in the second
-                */
-               if (skb->len > skb_availroom(to))
-                       break;
  
                 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
                         break;
  
-               tcp_collapse_retrans(sk, to);
+               if (!tcp_collapse_retrans(sk, to))
+                       break;
         }
  }
  
@@ -2777,9 +2854,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 last_lost = tp->snd_una;
         }
  
-       max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
+       max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
         tcp_for_write_queue_from(skb, sk) {
-               __u8 sacked = TCP_SKB_CB(skb)->sacked;
+               __u8 sacked;
                 int segs;
  
                 if (skb == tcp_send_head(sk))
@@ -2791,6 +2868,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
                 if (segs <= 0)
                         return;
+               sacked = TCP_SKB_CB(skb)->sacked;
                 /* In case tcp_shift_skb_data() have aggregated large skbs,
                  * we need to make sure not sending too bigs TSO packets
                  */
@@ -2830,6 +2908,9 @@ begin_fwd:
                 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
                         continue;
  
+               if (tcp_small_queue_check(sk, skb, 1))
+                       return;
+
                 if (tcp_retransmit_skb(sk, skb, segs))
                         return;
  
@@ -3258,6 +3339,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         fo->copied = space;
  
         tcp_connect_queue_skb(sk, syn_data);
+       if (syn_data->len)
+               tcp_chrono_start(sk, TCP_CHRONO_BUSY);
  
         err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);