{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
- TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->lost_out > tp->retrans_out &&
+ tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tcp_xmit_retransmit_queue(sk);
+
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
+ }
}
/*
* One tasklet per cpu tries to send more skbs.
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
+ tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
+ /* Update delivered info for the new segment */
+ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
}
return mtu;
}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
{
u32 bytes, segs;
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
- segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+ segs = max_t(u32, bytes / mss_now, min_tso_segs);
return min_t(u32, segs, sk->sk_gso_max_segs);
}
+EXPORT_SYMBOL(tcp_tso_autosize);
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+
+ return tso_segs ? :
+ tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
return -1;
}
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+ unsigned int factor)
+{
+ unsigned int limit;
+
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+ limit <<= factor;
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ return true;
+ }
+ return false;
+}
+
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_time_stamp;
+
+ if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+ tp->chrono_start = now;
+ tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph then the highest priority enum takes precedence
+ * over the other conditions. So that if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+
+ /* There are multiple conditions worthy of tracking in a
+ * chronograph, so that the highest priority enum takes
+ * precedence over the other conditions (see tcp_chrono_start).
+ * If a condition stops, we only stop chrono tracking if
+ * it's the "most interesting" or current chrono we are
+ * tracking and starts busy chrono if we have pending data.
+ */
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+ else if (type == tp->chrono_type)
+ tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
}
}
- max_segs = tcp_tso_autosize(sk, mss_now);
+ max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- /* TCP Small Queues :
- * Control number of packets in qdisc/devices to two packets / or ~1 ms.
- * This allows for :
- * - better RTT estimation and ACK scheduling
- * - faster recovery
- * - high rates
- * Alas, some drivers / subsystems require a fair amount
- * of queued bytes to ensure line rate.
- * One example is wifi aggregation (802.11 AMPDU)
- */
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
- limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
-
- if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- /* It is possible TX completion already happened
- * before we set TSQ_THROTTLED, so we must
- * test again the condition.
- */
- smp_mb__after_atomic();
- if (atomic_read(&sk->sk_wmem_alloc) > limit)
- break;
- }
+ if (tcp_small_queue_check(sk, skb, 0))
+ break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
}
/* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+ if (next_skb_size) {
+ if (next_skb_size <= skb_availroom(skb))
+ skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+ next_skb_size);
+ else if (!skb_shift(skb, next_skb, next_skb_size))
+ return false;
+ }
tcp_highest_sack_combine(sk, next_skb, skb);
tcp_unlink_write_queue(next_skb, sk);
- skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
- next_skb_size);
-
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
tcp_skb_collapse_tstamp(skb, next_skb);
sk_wmem_free_skb(sk, next_skb);
+ return true;
}
/* Check if coalescing SKBs is legal. */
{
if (tcp_skb_pcount(skb) > 1)
return false;
- /* TODO: SACK collapsing could be used to remove this condition */
- if (skb_shinfo(skb)->nr_frags != 0)
- return false;
if (skb_cloned(skb))
return false;
if (skb == tcp_send_head(sk))
return false;
- /* Some heurestics for collapsing over SACK'd could be invented */
+ /* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
if (space < 0)
break;
- /* Punt if not enough space exists in the first SKB for
- * the data in the second
- */
- if (skb->len > skb_availroom(to))
- break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
- tcp_collapse_retrans(sk, to);
+ if (!tcp_collapse_retrans(sk, to))
+ break;
}
}
last_lost = tp->snd_una;
}
- max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
+ max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
- __u8 sacked = TCP_SKB_CB(skb)->sacked;
+ __u8 sacked;
int segs;
if (skb == tcp_send_head(sk))
segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
if (segs <= 0)
return;
+ sacked = TCP_SKB_CB(skb)->sacked;
/* In case tcp_shift_skb_data() have aggregated large skbs,
* we need to make sure not sending too bigs TSO packets
*/
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
+ if (tcp_small_queue_check(sk, skb, 1))
+ return;
+
if (tcp_retransmit_skb(sk, skb, segs))
return;
fo->copied = space;
tcp_connect_queue_skb(sk, syn_data);
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);