1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * to a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
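/* Note: both helpers above derive their values from a keyed hash of the
 * connection 4-tuple and a per-boot secret (see secure_tcp_seq() and
 * secure_tcp_ts_off()); the ISN additionally mixes in a clock component
 * in the spirit of RFC 6528, so it stays unpredictable to off-path
 * attackers while remaining roughly monotonic for a reused 4-tuple.
 */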
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
146 Actually, the idea is close to VJ's: the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as the state holder.
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-use of TIME-WAIT sockets we still
157 * want to be sure that it is safe as above, but honor the
158 * sequence numbers and time stamps set as part of the repair process.
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
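/* Worked example: if tw_snd_nxt was 1000, write_seq becomes 66537.
 * The +65535+2 offset jumps past the largest window the peer could
 * still be advertising for the old incarnation, so data from the new
 * connection cannot be mistaken for old in-window segments.
 */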
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent the BPF program called below from accessing bytes that are
189 * outside the bound specified by the user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_timewait_death_row *tcp_death_row;
204 struct inet_sock *inet = inet_sk(sk);
205 struct tcp_sock *tp = tcp_sk(sk);
206 struct ip_options_rcu *inet_opt;
207 struct net *net = sock_net(sk);
208 __be16 orig_sport, orig_dport;
209 __be32 daddr, nexthop;
214 if (addr_len < sizeof(struct sockaddr_in))
217 if (usin->sin_family != AF_INET)
218 return -EAFNOSUPPORT;
220 nexthop = daddr = usin->sin_addr.s_addr;
221 inet_opt = rcu_dereference_protected(inet->inet_opt,
222 lockdep_sock_is_held(sk));
223 if (inet_opt && inet_opt->opt.srr) {
226 nexthop = inet_opt->opt.faddr;
229 orig_sport = inet->inet_sport;
230 orig_dport = usin->sin_port;
231 fl4 = &inet->cork.fl.u.ip4;
232 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247 if (!inet_opt || !inet_opt->opt.srr)
250 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252 if (!inet->inet_saddr) {
253 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
259 sk_rcv_saddr_set(sk, inet->inet_saddr);
262 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263 /* Reset inherited state */
264 tp->rx_opt.ts_recent = 0;
265 tp->rx_opt.ts_recent_stamp = 0;
266 if (likely(!tp->repair))
267 WRITE_ONCE(tp->write_seq, 0);
270 inet->inet_dport = usin->sin_port;
271 sk_daddr_set(sk, daddr);
273 inet_csk(sk)->icsk_ext_hdr_len = 0;
275 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279 /* Socket identity is still unknown (sport may be zero).
280 * However we set the state to SYN-SENT and, without releasing the socket
281 * lock, select a source port, enter ourselves into the hash tables and
282 * complete initialization after this.
284 tcp_set_state(sk, TCP_SYN_SENT);
285 err = inet_hash_connect(tcp_death_row, sk);
291 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292 inet->inet_sport, inet->inet_dport, sk);
298 /* OK, now commit destination to socket. */
299 sk->sk_gso_type = SKB_GSO_TCPV4;
300 sk_setup_caps(sk, &rt->dst);
303 if (likely(!tp->repair)) {
305 WRITE_ONCE(tp->write_seq,
306 secure_tcp_seq(inet->inet_saddr,
310 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
314 inet->inet_id = get_random_u16();
316 if (tcp_fastopen_defer_connect(sk, &err))
321 err = tcp_connect(sk);
330 * This unhashes the socket and releases the local port, if necessary.
333 tcp_set_state(sk, TCP_CLOSE);
334 inet_bhash2_reset_saddr(sk);
336 sk->sk_route_caps = 0;
337 inet->inet_dport = 0;
340 EXPORT_SYMBOL(tcp_v4_connect);
343 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
344 * It can be called through tcp_release_cb() if the socket was owned by the user
345 * at the time tcp_v4_err() was called to handle the ICMP message.
347 void tcp_v4_mtu_reduced(struct sock *sk)
349 struct inet_sock *inet = inet_sk(sk);
350 struct dst_entry *dst;
353 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
355 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356 dst = inet_csk_update_pmtu(sk, mtu);
360 /* Something is about to go wrong... Remember the soft error
361 * in case this connection is not able to recover.
363 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
368 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369 ip_sk_accept_pmtu(sk) &&
370 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371 tcp_sync_mss(sk, mtu);
373 /* Resend the TCP packet because it's
374 * clear that the old packet has been
375 * dropped. This is the new "fast" path mtu discovery.
378 tcp_simple_retransmit(sk);
379 } /* else let the usual retransmit timer handle it */
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
385 struct dst_entry *dst = __sk_dst_check(sk, 0);
388 dst->ops->redirect(dst, sk, skb);
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
395 struct request_sock *req = inet_reqsk(sk);
396 struct net *net = sock_net(sk);
398 /* ICMPs are not backlogged, hence we cannot get
399 * an established socket here.
401 if (seq != tcp_rsk(req)->snt_isn) {
402 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405 * Still in SYN_RECV, just remove it silently.
406 * There is no good way to pass the error to the newly
407 * created socket, and POSIX does not want network
408 * errors returned from accept().
410 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411 tcp_listendrop(req->rsk_listener);
415 EXPORT_SYMBOL(tcp_req_err);
417 /* TCP-LD (RFC 6069) logic */
418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
420 struct inet_connection_sock *icsk = inet_csk(sk);
421 struct tcp_sock *tp = tcp_sk(sk);
426 if (sock_owned_by_user(sk))
429 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
433 skb = tcp_rtx_queue_head(sk);
434 if (WARN_ON_ONCE(!skb))
437 icsk->icsk_backoff--;
438 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
441 tcp_mstamp_refresh(tp);
442 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
446 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447 remaining, TCP_RTO_MAX);
449 /* RTO revert clocked out retransmission.
450 * Will retransmit now.
452 tcp_retransmit_timer(sk);
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
458 * This routine is called by the ICMP module when it gets some
459 * sort of error condition. If err < 0 then the socket should
460 * be closed and the error returned to the user. If err > 0
461 * it's just the icmp type << 8 | icmp code. After adjustment,
462 * the header points to the first 8 bytes of the tcp header. We need
463 * to find the appropriate port.
465 * The locking strategy used here is very "optimistic". When
466 * someone else accesses the socket the ICMP is just dropped
467 * and for some paths there is no check at all.
468 * A more general error queue to queue errors for later handling
469 * is probably better.
473 int tcp_v4_err(struct sk_buff *skb, u32 info)
475 const struct iphdr *iph = (const struct iphdr *)skb->data;
476 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
478 struct inet_sock *inet;
479 const int type = icmp_hdr(skb)->type;
480 const int code = icmp_hdr(skb)->code;
482 struct request_sock *fastopen;
485 struct net *net = dev_net(skb->dev);
487 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
488 iph->daddr, th->dest, iph->saddr,
489 ntohs(th->source), inet_iif(skb), 0);
491 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
494 if (sk->sk_state == TCP_TIME_WAIT) {
495 inet_twsk_put(inet_twsk(sk));
498 seq = ntohl(th->seq);
499 if (sk->sk_state == TCP_NEW_SYN_RECV) {
500 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501 type == ICMP_TIME_EXCEEDED ||
502 (type == ICMP_DEST_UNREACH &&
503 (code == ICMP_NET_UNREACH ||
504 code == ICMP_HOST_UNREACH)));
509 /* If too many ICMPs get dropped on busy
510 * servers this needs to be solved differently.
511 * We do take care of the PMTU discovery (RFC 1191) special case:
512 * we can receive locally generated ICMP messages while the socket is held.
514 if (sock_owned_by_user(sk)) {
515 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518 if (sk->sk_state == TCP_CLOSE)
521 if (static_branch_unlikely(&ip4_min_ttl)) {
522 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
523 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
524 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
530 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
531 fastopen = rcu_dereference(tp->fastopen_rsk);
532 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
533 if (sk->sk_state != TCP_LISTEN &&
534 !between(seq, snd_una, tp->snd_nxt)) {
535 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
541 if (!sock_owned_by_user(sk))
542 do_redirect(skb, sk);
544 case ICMP_SOURCE_QUENCH:
545 /* Just silently ignore these. */
547 case ICMP_PARAMETERPROB:
550 case ICMP_DEST_UNREACH:
551 if (code > NR_ICMP_UNREACH)
554 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
555 /* We are not interested in TCP_LISTEN and open_requests
556 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
557 * they should go through unfragmented).
559 if (sk->sk_state == TCP_LISTEN)
562 WRITE_ONCE(tp->mtu_info, info);
563 if (!sock_owned_by_user(sk)) {
564 tcp_v4_mtu_reduced(sk);
566 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
572 err = icmp_err_convert[code].errno;
573 /* Check whether this ICMP message allows reverting the backoff.
577 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
578 tcp_ld_RTO_revert(sk, seq);
580 case ICMP_TIME_EXCEEDED:
587 switch (sk->sk_state) {
590 /* Only in fast or simultaneous open. If a fast open socket is
591 * already accepted it is treated as a connected one below.
593 if (fastopen && !fastopen->sk)
596 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598 if (!sock_owned_by_user(sk)) {
599 WRITE_ONCE(sk->sk_err, err);
605 WRITE_ONCE(sk->sk_err_soft, err);
610 /* If we've already connected we will keep trying
611 * until we time out, or the user gives up.
613 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
614 * considered hard errors (well, FRAG_FAILED too,
615 * but it is obsoleted by pmtu discovery).
617 * Note that in the modern internet, where routing is unreliable
618 * and broken firewalls sit in every dark corner sending random
619 * errors ordered by their masters, even these two messages finally lose
620 * their original sense (even Linux sends invalid PORT_UNREACHs).
622 * Now we are in compliance with the RFCs.
627 if (!sock_owned_by_user(sk) && inet->recverr) {
628 WRITE_ONCE(sk->sk_err, err);
630 } else { /* Only an error on timeout */
631 WRITE_ONCE(sk->sk_err_soft, err);
640 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 struct tcphdr *th = tcp_hdr(skb);
644 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
645 skb->csum_start = skb_transport_header(skb) - skb->head;
646 skb->csum_offset = offsetof(struct tcphdr, check);
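/* The helper above only seeds th->check with the pseudo-header checksum
 * and records where the final checksum must be written (csum_start /
 * csum_offset); the device or the software fallback completes it later,
 * the usual CHECKSUM_PARTIAL arrangement.
 */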
649 /* This routine computes an IPv4 TCP checksum. */
650 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 const struct inet_sock *inet = inet_sk(sk);
654 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 EXPORT_SYMBOL(tcp_v4_send_check);
659 * This routine will send an RST to the other tcp.
661 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
663 * Answer: if a packet caused an RST, it is not for a socket
664 * existing in our system; if it is matched to a socket,
665 * it is just a duplicate segment or a bug in the other side's TCP.
666 * So we build the reply based only on parameters
667 * that arrived with the segment.
668 * Exception: precedence violation. We do not implement it in any case.
671 #ifdef CONFIG_TCP_MD5SIG
672 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #define OPTION_BYTES sizeof(__be32)
677 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 const struct tcphdr *th = tcp_hdr(skb);
682 __be32 opt[OPTION_BYTES / sizeof(__be32)];
684 struct ip_reply_arg arg;
685 #ifdef CONFIG_TCP_MD5SIG
686 struct tcp_md5sig_key *key = NULL;
687 const __u8 *hash_location = NULL;
688 unsigned char newhash[16];
690 struct sock *sk1 = NULL;
692 u64 transmit_time = 0;
697 /* Never send a reset in response to a reset. */
701 /* If sk is not NULL, it means we did a successful lookup and the incoming
702 * route had to be correct. prequeue might have dropped our dst.
704 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
707 /* Swap the send and the receive. */
708 memset(&rep, 0, sizeof(rep));
709 rep.th.dest = th->source;
710 rep.th.source = th->dest;
711 rep.th.doff = sizeof(struct tcphdr) / 4;
715 rep.th.seq = th->ack_seq;
718 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
719 skb->len - (th->doff << 2));
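/* The RST acknowledges everything the offending segment occupied in
 * sequence space: its payload length plus one for SYN and one for FIN,
 * since each of those flags consumes a sequence number.
 */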
722 memset(&arg, 0, sizeof(arg));
723 arg.iov[0].iov_base = (unsigned char *)&rep;
724 arg.iov[0].iov_len = sizeof(rep.th);
726 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
727 #ifdef CONFIG_TCP_MD5SIG
729 hash_location = tcp_parse_md5sig_option(th);
730 if (sk && sk_fullsock(sk)) {
731 const union tcp_md5_addr *addr;
734 /* sdif set means the packet ingressed via a device
735 * in an L3 domain and inet_iif is set to it.
737 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
738 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
739 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
740 } else if (hash_location) {
741 const union tcp_md5_addr *addr;
742 int sdif = tcp_v4_sdif(skb);
743 int dif = inet_iif(skb);
747 * active side is lost. Try to find the listening socket through the
748 * source port, and then find the md5 key through the listening socket.
749 * We are not losing security here:
750 * the incoming packet is checked with the md5 hash of the found key,
751 * and no RST is generated if the md5 hash doesn't match.
753 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
754 NULL, 0, ip_hdr(skb)->saddr,
755 th->source, ip_hdr(skb)->daddr,
756 ntohs(th->source), dif, sdif);
757 /* don't send an RST if we can't find the key */
761 /* sdif set means the packet ingressed via a device
762 * in an L3 domain and dif is set to it.
764 l3index = sdif ? dif : 0;
765 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
766 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
771 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
772 if (genhash || memcmp(hash_location, newhash, 16) != 0)
778 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780 (TCPOPT_MD5SIG << 8) |
782 /* Update length and the length the header thinks exists */
783 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
784 rep.th.doff = arg.iov[0].iov_len / 4;
786 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
787 key, ip_hdr(skb)->saddr,
788 ip_hdr(skb)->daddr, &rep.th);
791 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
792 if (rep.opt[0] == 0) {
793 __be32 mrst = mptcp_reset_option(skb);
797 arg.iov[0].iov_len += sizeof(mrst);
798 rep.th.doff = arg.iov[0].iov_len / 4;
802 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
803 ip_hdr(skb)->saddr, /* XXX */
804 arg.iov[0].iov_len, IPPROTO_TCP, 0);
805 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
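/* Only the pseudo-header sum is computed here; the reply path
 * (ip_send_unicast_reply()) uses arg.csumoffset to fold in the rest of
 * the TCP header and store the final checksum while building the reply.
 */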
806 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808 /* When the socket is gone, all binding information is lost and
809 * routing might fail in this case. No choice here: if we choose to force
810 * the input interface, we will misroute in the case of an asymmetric route.
813 arg.bound_dev_if = sk->sk_bound_dev_if;
815 trace_tcp_send_reset(sk, skb);
818 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
819 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821 arg.tos = ip_hdr(skb)->tos;
822 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824 ctl_sk = this_cpu_read(ipv4_tcp_sk);
825 sock_net_set(ctl_sk, net);
827 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
828 inet_twsk(sk)->tw_mark : sk->sk_mark;
829 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
830 inet_twsk(sk)->tw_priority : sk->sk_priority;
831 transmit_time = tcp_transmit_time(sk);
832 xfrm_sk_clone_policy(ctl_sk, sk);
833 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
834 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
837 ctl_sk->sk_priority = 0;
839 ip_send_unicast_reply(ctl_sk,
840 skb, &TCP_SKB_CB(skb)->header.h4.opt,
841 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
842 &arg, arg.iov[0].iov_len,
843 transmit_time, txhash);
845 xfrm_sk_free_policy(ctl_sk);
846 sock_net_set(ctl_sk, &init_net);
847 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
848 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
851 #ifdef CONFIG_TCP_MD5SIG
857 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
858 outside of socket context, is certainly ugly. What can I do?
861 static void tcp_v4_send_ack(const struct sock *sk,
862 struct sk_buff *skb, u32 seq, u32 ack,
863 u32 win, u32 tsval, u32 tsecr, int oif,
864 struct tcp_md5sig_key *key,
865 int reply_flags, u8 tos, u32 txhash)
867 const struct tcphdr *th = tcp_hdr(skb);
870 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
871 #ifdef CONFIG_TCP_MD5SIG
872 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
876 struct net *net = sock_net(sk);
877 struct ip_reply_arg arg;
881 memset(&rep.th, 0, sizeof(struct tcphdr));
882 memset(&arg, 0, sizeof(arg));
884 arg.iov[0].iov_base = (unsigned char *)&rep;
885 arg.iov[0].iov_len = sizeof(rep.th);
887 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
888 (TCPOPT_TIMESTAMP << 8) |
890 rep.opt[1] = htonl(tsval);
891 rep.opt[2] = htonl(tsecr);
892 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
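/* rep.opt[0] encodes NOP, NOP, kind TCPOPT_TIMESTAMP (8), length 10;
 * opt[1] and opt[2] carry TSval and TSecr, padding the option block to
 * the 12 bytes accounted for by TCPOLEN_TSTAMP_ALIGNED.
 */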
895 /* Swap the send and the receive. */
896 rep.th.dest = th->source;
897 rep.th.source = th->dest;
898 rep.th.doff = arg.iov[0].iov_len / 4;
899 rep.th.seq = htonl(seq);
900 rep.th.ack_seq = htonl(ack);
902 rep.th.window = htons(win);
904 #ifdef CONFIG_TCP_MD5SIG
906 int offset = (tsecr) ? 3 : 0;
908 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910 (TCPOPT_MD5SIG << 8) |
912 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
913 rep.th.doff = arg.iov[0].iov_len/4;
915 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
916 key, ip_hdr(skb)->saddr,
917 ip_hdr(skb)->daddr, &rep.th);
920 arg.flags = reply_flags;
921 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
922 ip_hdr(skb)->saddr, /* XXX */
923 arg.iov[0].iov_len, IPPROTO_TCP, 0);
924 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926 arg.bound_dev_if = oif;
928 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930 ctl_sk = this_cpu_read(ipv4_tcp_sk);
931 sock_net_set(ctl_sk, net);
932 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
933 inet_twsk(sk)->tw_mark : sk->sk_mark;
934 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
935 inet_twsk(sk)->tw_priority : sk->sk_priority;
936 transmit_time = tcp_transmit_time(sk);
937 ip_send_unicast_reply(ctl_sk,
938 skb, &TCP_SKB_CB(skb)->header.h4.opt,
939 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
940 &arg, arg.iov[0].iov_len,
941 transmit_time, txhash);
943 sock_net_set(ctl_sk, &init_net);
944 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
948 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 struct inet_timewait_sock *tw = inet_twsk(sk);
951 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953 tcp_v4_send_ack(sk, skb,
954 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
955 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
956 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
959 tcp_twsk_md5_key(tcptw),
960 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
968 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
969 struct request_sock *req)
971 const union tcp_md5_addr *addr;
974 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
975 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
981 * The window field (SEG.WND) of every outgoing segment, with the
982 * exception of <SYN> segments, MUST be right-shifted by
983 * Rcv.Wind.Shift bits:
985 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
986 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
987 tcp_v4_send_ack(sk, skb, seq,
988 tcp_rsk(req)->rcv_nxt,
989 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
990 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
991 READ_ONCE(req->ts_recent),
993 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
994 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996 READ_ONCE(tcp_rsk(req)->txhash));
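/* Example of the RFC 7323 rule quoted above: with rcv_wscale == 7 a
 * 65536-byte receive window is advertised as 512 in SEG.WND.
 */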
1000 * Send a SYN-ACK after having received a SYN.
1001 * This still operates on a request_sock only, not on a big socket.
1004 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006 struct request_sock *req,
1007 struct tcp_fastopen_cookie *foc,
1008 enum tcp_synack_type synack_type,
1009 struct sk_buff *syn_skb)
1011 const struct inet_request_sock *ireq = inet_rsk(req);
1014 struct sk_buff *skb;
1017 /* First, grab a route. */
1018 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1021 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1024 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1027 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1028 (inet_sk(sk)->tos & INET_ECN_MASK) :
1031 if (!INET_ECN_is_capable(tos) &&
1032 tcp_bpf_ca_needs_ecn((struct sock *)req))
1033 tos |= INET_ECN_ECT_0;
1036 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038 rcu_dereference(ireq->ireq_opt),
1041 err = net_xmit_eval(err);
1048 * IPv4 request_sock destructor.
1050 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1055 #ifdef CONFIG_TCP_MD5SIG
1057 * RFC2385 MD5 checksumming requires a mapping of
1058 * IP address->MD5 Key.
1059 * We need to maintain these in the sk structure.
1062 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1063 EXPORT_SYMBOL(tcp_md5_needed);
1065 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1070 /* l3index always overrides non-l3index */
1071 if (old->l3index && new->l3index == 0)
1073 if (old->l3index == 0 && new->l3index)
1076 return old->prefixlen < new->prefixlen;
1079 /* Find the Key structure for an address. */
1080 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1081 const union tcp_md5_addr *addr,
1084 const struct tcp_sock *tp = tcp_sk(sk);
1085 struct tcp_md5sig_key *key;
1086 const struct tcp_md5sig_info *md5sig;
1088 struct tcp_md5sig_key *best_match = NULL;
1091 /* caller either holds rcu_read_lock() or socket lock */
1092 md5sig = rcu_dereference_check(tp->md5sig_info,
1093 lockdep_sock_is_held(sk));
1097 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1098 lockdep_sock_is_held(sk)) {
1099 if (key->family != family)
1101 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103 if (family == AF_INET) {
1104 mask = inet_make_mask(key->prefixlen);
1105 match = (key->addr.a4.s_addr & mask) ==
1106 (addr->a4.s_addr & mask);
1107 #if IS_ENABLED(CONFIG_IPV6)
1108 } else if (family == AF_INET6) {
1109 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1116 if (match && better_md5_match(best_match, key))
1121 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1124 const union tcp_md5_addr *addr,
1125 int family, u8 prefixlen,
1126 int l3index, u8 flags)
1128 const struct tcp_sock *tp = tcp_sk(sk);
1129 struct tcp_md5sig_key *key;
1130 unsigned int size = sizeof(struct in_addr);
1131 const struct tcp_md5sig_info *md5sig;
1133 /* caller either holds rcu_read_lock() or socket lock */
1134 md5sig = rcu_dereference_check(tp->md5sig_info,
1135 lockdep_sock_is_held(sk));
1138 #if IS_ENABLED(CONFIG_IPV6)
1139 if (family == AF_INET6)
1140 size = sizeof(struct in6_addr);
1142 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1143 lockdep_sock_is_held(sk)) {
1144 if (key->family != family)
1146 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148 if (key->l3index != l3index)
1150 if (!memcmp(&key->addr, addr, size) &&
1151 key->prefixlen == prefixlen)
1157 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1158 const struct sock *addr_sk)
1160 const union tcp_md5_addr *addr;
1163 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1164 addr_sk->sk_bound_dev_if);
1165 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1166 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172 struct tcp_sock *tp = tcp_sk(sk);
1173 struct tcp_md5sig_info *md5sig;
1175 md5sig = kmalloc(sizeof(*md5sig), gfp);
1180 INIT_HLIST_HEAD(&md5sig->head);
1181 rcu_assign_pointer(tp->md5sig_info, md5sig);
1185 /* This can be called on a newly created socket, from other files */
1186 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1187 int family, u8 prefixlen, int l3index, u8 flags,
1188 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1190 /* Add Key to the list */
1191 struct tcp_md5sig_key *key;
1192 struct tcp_sock *tp = tcp_sk(sk);
1193 struct tcp_md5sig_info *md5sig;
1195 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1197 /* Pre-existing entry - just update that one.
1198 * Note that the key might be used concurrently.
1199 * data_race() is telling kcsan that we do not care about
1200 * key mismatches, since changing the MD5 key on live flows
1201 * can lead to packet drops.
1203 data_race(memcpy(key->key, newkey, newkeylen));
1205 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1206 * Also note that a reader could catch the new key->keylen value
1207 * but the old key->key[]; this is the reason we use __GFP_ZERO
1208 * at sock_kmalloc() time below these lines.
1210 WRITE_ONCE(key->keylen, newkeylen);
1215 md5sig = rcu_dereference_protected(tp->md5sig_info,
1216 lockdep_sock_is_held(sk));
1218 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1221 if (!tcp_alloc_md5sig_pool()) {
1222 sock_kfree_s(sk, key, sizeof(*key));
1226 memcpy(key->key, newkey, newkeylen);
1227 key->keylen = newkeylen;
1228 key->family = family;
1229 key->prefixlen = prefixlen;
1230 key->l3index = l3index;
1232 memcpy(&key->addr, addr,
1233 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1234 sizeof(struct in_addr));
1235 hlist_add_head_rcu(&key->node, &md5sig->head);
1239 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1240 int family, u8 prefixlen, int l3index, u8 flags,
1241 const u8 *newkey, u8 newkeylen)
1243 struct tcp_sock *tp = tcp_sk(sk);
1245 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1246 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1249 if (!static_branch_inc(&tcp_md5_needed.key)) {
1250 struct tcp_md5sig_info *md5sig;
1252 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1253 rcu_assign_pointer(tp->md5sig_info, NULL);
1254 kfree_rcu(md5sig, rcu);
1259 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1260 newkey, newkeylen, GFP_KERNEL);
1262 EXPORT_SYMBOL(tcp_md5_do_add);
1264 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1265 int family, u8 prefixlen, int l3index,
1266 struct tcp_md5sig_key *key)
1268 struct tcp_sock *tp = tcp_sk(sk);
1270 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1271 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1274 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1275 struct tcp_md5sig_info *md5sig;
1277 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1278 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1279 rcu_assign_pointer(tp->md5sig_info, NULL);
1280 kfree_rcu(md5sig, rcu);
1285 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1286 key->flags, key->key, key->keylen,
1287 sk_gfp_mask(sk, GFP_ATOMIC));
1289 EXPORT_SYMBOL(tcp_md5_key_copy);
1291 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1292 u8 prefixlen, int l3index, u8 flags)
1294 struct tcp_md5sig_key *key;
1296 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1299 hlist_del_rcu(&key->node);
1300 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1301 kfree_rcu(key, rcu);
1304 EXPORT_SYMBOL(tcp_md5_do_del);
1306 static void tcp_clear_md5_list(struct sock *sk)
1308 struct tcp_sock *tp = tcp_sk(sk);
1309 struct tcp_md5sig_key *key;
1310 struct hlist_node *n;
1311 struct tcp_md5sig_info *md5sig;
1313 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1316 hlist_del_rcu(&key->node);
1317 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1318 kfree_rcu(key, rcu);
1322 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1323 sockptr_t optval, int optlen)
1325 struct tcp_md5sig cmd;
1326 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1327 const union tcp_md5_addr *addr;
1332 if (optlen < sizeof(cmd))
1335 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1338 if (sin->sin_family != AF_INET)
1341 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343 if (optname == TCP_MD5SIG_EXT &&
1344 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1345 prefixlen = cmd.tcpm_prefixlen;
1350 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1351 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1352 struct net_device *dev;
1355 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1356 if (dev && netif_is_l3_master(dev))
1357 l3index = dev->ifindex;
1361 /* ok to reference set/not set outside of rcu;
1362 * right now device MUST be an L3 master
1364 if (!dev || !l3index)
1368 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370 if (!cmd.tcpm_keylen)
1371 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1376 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1377 cmd.tcpm_key, cmd.tcpm_keylen);
1380 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1381 __be32 daddr, __be32 saddr,
1382 const struct tcphdr *th, int nbytes)
1384 struct tcp4_pseudohdr *bp;
1385 struct scatterlist sg;
1392 bp->protocol = IPPROTO_TCP;
1393 bp->len = cpu_to_be16(nbytes);
1395 _th = (struct tcphdr *)(bp + 1);
1396 memcpy(_th, th, sizeof(*th));
1399 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1400 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1401 sizeof(*bp) + sizeof(*th));
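/* The digest input staged here is the IPv4 pseudo-header followed by a
 * copy of the TCP header; callers then mix in the payload (when hashing
 * a full skb) and finally the key itself via tcp_md5_hash_skb_data() /
 * tcp_md5_hash_key(), matching the RFC 2385 ordering.
 */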
1402 return crypto_ahash_update(hp->md5_req);
1405 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1406 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408 struct tcp_md5sig_pool *hp;
1409 struct ahash_request *req;
1411 hp = tcp_get_md5sig_pool();
1413 goto clear_hash_noput;
1416 if (crypto_ahash_init(req))
1418 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420 if (tcp_md5_hash_key(hp, key))
1422 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1423 if (crypto_ahash_final(req))
1426 tcp_put_md5sig_pool();
1430 tcp_put_md5sig_pool();
1432 memset(md5_hash, 0, 16);
1436 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1437 const struct sock *sk,
1438 const struct sk_buff *skb)
1440 struct tcp_md5sig_pool *hp;
1441 struct ahash_request *req;
1442 const struct tcphdr *th = tcp_hdr(skb);
1443 __be32 saddr, daddr;
1445 if (sk) { /* valid for establish/request sockets */
1446 saddr = sk->sk_rcv_saddr;
1447 daddr = sk->sk_daddr;
1449 const struct iphdr *iph = ip_hdr(skb);
1454 hp = tcp_get_md5sig_pool();
1456 goto clear_hash_noput;
1459 if (crypto_ahash_init(req))
1462 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466 if (tcp_md5_hash_key(hp, key))
1468 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1469 if (crypto_ahash_final(req))
1472 tcp_put_md5sig_pool();
1476 tcp_put_md5sig_pool();
1478 memset(md5_hash, 0, 16);
1481 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1485 static void tcp_v4_init_req(struct request_sock *req,
1486 const struct sock *sk_listener,
1487 struct sk_buff *skb)
1489 struct inet_request_sock *ireq = inet_rsk(req);
1490 struct net *net = sock_net(sk_listener);
1492 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1493 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1494 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1497 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1498 struct sk_buff *skb,
1500 struct request_sock *req)
1502 tcp_v4_init_req(req, sk, skb);
1504 if (security_inet_conn_request(sk, skb, req))
1507 return inet_csk_route_req(sk, &fl->u.ip4, req);
1510 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1512 .obj_size = sizeof(struct tcp_request_sock),
1513 .rtx_syn_ack = tcp_rtx_synack,
1514 .send_ack = tcp_v4_reqsk_send_ack,
1515 .destructor = tcp_v4_reqsk_destructor,
1516 .send_reset = tcp_v4_send_reset,
1517 .syn_ack_timeout = tcp_syn_ack_timeout,
1520 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1521 .mss_clamp = TCP_MSS_DEFAULT,
1522 #ifdef CONFIG_TCP_MD5SIG
1523 .req_md5_lookup = tcp_v4_md5_lookup,
1524 .calc_md5_hash = tcp_v4_md5_hash_skb,
1526 #ifdef CONFIG_SYN_COOKIES
1527 .cookie_init_seq = cookie_v4_init_sequence,
1529 .route_req = tcp_v4_route_req,
1530 .init_seq = tcp_v4_init_seq,
1531 .init_ts_off = tcp_v4_init_ts_off,
1532 .send_synack = tcp_v4_send_synack,
1535 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537 /* Never answer SYNs sent to broadcast or multicast */
1538 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1541 return tcp_conn_request(&tcp_request_sock_ops,
1542 &tcp_request_sock_ipv4_ops, sk, skb);
1548 EXPORT_SYMBOL(tcp_v4_conn_request);
1552 * The three-way handshake has completed - we got a valid synack -
1553 * now create the new socket.
1555 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1556 struct request_sock *req,
1557 struct dst_entry *dst,
1558 struct request_sock *req_unhash,
1561 struct inet_request_sock *ireq;
1562 bool found_dup_sk = false;
1563 struct inet_sock *newinet;
1564 struct tcp_sock *newtp;
1566 #ifdef CONFIG_TCP_MD5SIG
1567 const union tcp_md5_addr *addr;
1568 struct tcp_md5sig_key *key;
1571 struct ip_options_rcu *inet_opt;
1573 if (sk_acceptq_is_full(sk))
1576 newsk = tcp_create_openreq_child(sk, req, skb);
1580 newsk->sk_gso_type = SKB_GSO_TCPV4;
1581 inet_sk_rx_dst_set(newsk, skb);
1583 newtp = tcp_sk(newsk);
1584 newinet = inet_sk(newsk);
1585 ireq = inet_rsk(req);
1586 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1587 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1588 newsk->sk_bound_dev_if = ireq->ir_iif;
1589 newinet->inet_saddr = ireq->ir_loc_addr;
1590 inet_opt = rcu_dereference(ireq->ireq_opt);
1591 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1592 newinet->mc_index = inet_iif(skb);
1593 newinet->mc_ttl = ip_hdr(skb)->ttl;
1594 newinet->rcv_tos = ip_hdr(skb)->tos;
1595 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1597 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1598 newinet->inet_id = get_random_u16();
1600 /* Set ToS of the new socket based upon the value of incoming SYN.
1601 * ECT bits are set later in tcp_init_transfer().
1603 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1604 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1607 dst = inet_csk_route_child_sock(sk, newsk, req);
1611 /* syncookie case: see end of cookie_v4_check() */
1613 sk_setup_caps(newsk, dst);
1615 tcp_ca_openreq_child(newsk, dst);
1617 tcp_sync_mss(newsk, dst_mtu(dst));
1618 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1620 tcp_initialize_rcv_mss(newsk);
1622 #ifdef CONFIG_TCP_MD5SIG
1623 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1624 /* Copy over the MD5 key from the original socket */
1625 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1626 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1628 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1630 sk_gso_disable(newsk);
1634 if (__inet_inherit_port(sk, newsk) < 0)
1636 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638 if (likely(*own_req)) {
1639 tcp_move_syn(newtp, req);
1640 ireq->ireq_opt = NULL;
1642 newinet->inet_opt = NULL;
1644 if (!req_unhash && found_dup_sk) {
1645 /* This code path should only be executed in the
1646 * syncookie case
1648 bh_unlock_sock(newsk);
1656 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1663 newinet->inet_opt = NULL;
1664 inet_csk_prepare_forced_close(newsk);
1668 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672 #ifdef CONFIG_SYN_COOKIES
1673 const struct tcphdr *th = tcp_hdr(skb);
1676 sk = cookie_v4_check(sk, skb);
1681 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1682 struct tcphdr *th, u32 *cookie)
1685 #ifdef CONFIG_SYN_COOKIES
1686 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1687 &tcp_request_sock_ipv4_ops, sk, th);
1689 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1690 tcp_synq_overflow(sk);
1696 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698 /* The socket must have its spinlock held when we get
1699 * here, unless it is a TCP_LISTEN socket.
1701 * We have a potential double-lock case here, so even when
1702 * doing backlog processing we use the BH locking scheme.
1703 * This is because we cannot sleep with the original spinlock held.
1706 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708 enum skb_drop_reason reason;
1711 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1712 struct dst_entry *dst;
1714 dst = rcu_dereference_protected(sk->sk_rx_dst,
1715 lockdep_sock_is_held(sk));
1717 sock_rps_save_rxhash(sk, skb);
1718 sk_mark_napi_id(sk, skb);
1720 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1721 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1727 tcp_rcv_established(sk, skb);
1731 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1732 if (tcp_checksum_complete(skb))
1735 if (sk->sk_state == TCP_LISTEN) {
1736 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1741 if (tcp_child_process(sk, nsk, skb)) {
1748 sock_rps_save_rxhash(sk, skb);
1750 if (tcp_rcv_state_process(sk, skb)) {
1757 tcp_v4_send_reset(rsk, skb);
1759 kfree_skb_reason(skb, reason);
1760 /* Be careful here. If this function gets more complicated and
1761 * gcc suffers from register pressure on the x86, sk (in %ebx)
1762 * might be destroyed here. This current version compiles correctly,
1763 * but you have been warned.
1768 reason = SKB_DROP_REASON_TCP_CSUM;
1769 trace_tcp_bad_csum(skb);
1770 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1771 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1774 EXPORT_SYMBOL(tcp_v4_do_rcv);
1776 int tcp_v4_early_demux(struct sk_buff *skb)
1778 struct net *net = dev_net(skb->dev);
1779 const struct iphdr *iph;
1780 const struct tcphdr *th;
1783 if (skb->pkt_type != PACKET_HOST)
1786 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1792 if (th->doff < sizeof(struct tcphdr) / 4)
1795 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1796 iph->saddr, th->source,
1797 iph->daddr, ntohs(th->dest),
1798 skb->skb_iif, inet_sdif(skb));
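/* Early demux runs before routing: if the lookup above finds an
 * established socket, its cached rx dst can be reused below, skipping
 * the FIB lookup on the fast path.
 */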
1801 skb->destructor = sock_edemux;
1802 if (sk_fullsock(sk)) {
1803 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1806 dst = dst_check(dst, 0);
1808 sk->sk_rx_dst_ifindex == skb->skb_iif)
1809 skb_dst_set_noref(skb, dst);
1815 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1816 enum skb_drop_reason *reason)
1818 u32 limit, tail_gso_size, tail_gso_segs;
1819 struct skb_shared_info *shinfo;
1820 const struct tcphdr *th;
1821 struct tcphdr *thtail;
1822 struct sk_buff *tail;
1823 unsigned int hdrlen;
1829 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1830 * we can fix skb->truesize to its real value to avoid future drops.
1831 * This is valid because skb is not yet charged to the socket.
1832 * It has been noticed that pure SACK packets were sometimes dropped
1833 * (if cooked by drivers without copybreak feature).
1839 if (unlikely(tcp_checksum_complete(skb))) {
1841 trace_tcp_bad_csum(skb);
1842 *reason = SKB_DROP_REASON_TCP_CSUM;
1843 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1844 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1848 /* Attempt coalescing to the last skb in the backlog, even if we are
 * above the limits.
1850 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1852 th = (const struct tcphdr *)skb->data;
1853 hdrlen = th->doff * 4;
1855 tail = sk->sk_backlog.tail;
1858 thtail = (struct tcphdr *)tail->data;
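/* Coalescing with the backlog tail is only attempted when the new
 * segment directly follows it in sequence space, carries the same
 * DS field and compatible flags (no SYN/RST/URG, ACK on both,
 * matching ECE/CWR) and a byte-identical option block; anything else
 * falls through and is queued as a separate skb.
 */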
1860 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1861 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1862 ((TCP_SKB_CB(tail)->tcp_flags |
1863 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1864 !((TCP_SKB_CB(tail)->tcp_flags &
1865 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1866 ((TCP_SKB_CB(tail)->tcp_flags ^
1867 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1868 #ifdef CONFIG_TLS_DEVICE
1869 tail->decrypted != skb->decrypted ||
1871 thtail->doff != th->doff ||
1872 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1875 __skb_pull(skb, hdrlen);
1877 shinfo = skb_shinfo(skb);
1878 gso_size = shinfo->gso_size ?: skb->len;
1879 gso_segs = shinfo->gso_segs ?: 1;
1881 shinfo = skb_shinfo(tail);
1882 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1883 tail_gso_segs = shinfo->gso_segs ?: 1;
1885 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1886 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1888 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1889 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1890 thtail->window = th->window;
1893 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1894 * thtail->fin, so that the fast path in tcp_rcv_established()
1895 * is not entered if we append a packet with a FIN.
1896 * SYN, RST, URG are not present.
1897 * ACK is set on both packets.
1898 * PSH: we do not really care in the TCP stack,
1899 * at least for 'GRO' packets.
1901 thtail->fin |= th->fin;
1902 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1904 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1905 TCP_SKB_CB(tail)->has_rxtstamp = true;
1906 tail->tstamp = skb->tstamp;
1907 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1910 /* Not as strict as GRO. We only need to carry mss max value */
1911 shinfo->gso_size = max(gso_size, tail_gso_size);
1912 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1914 sk->sk_backlog.len += delta;
1915 __NET_INC_STATS(sock_net(sk),
1916 LINUX_MIB_TCPBACKLOGCOALESCE);
1917 kfree_skb_partial(skb, fragstolen);
1920 __skb_push(skb, hdrlen);
1923 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1925 /* Only the socket owner can try to collapse/prune the rx queues
1926 * to reduce memory overhead, so add a little headroom here.
1927 * Few socket backlogs are likely to be non-empty concurrently.
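 *
 * (A small fixed headroom, 64KB in current kernels, is then added to
 * the limit before the check below; treat the exact constant as
 * informational rather than guaranteed.)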
1931 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1933 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1934 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1939 EXPORT_SYMBOL(tcp_add_backlog);
1941 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1943 struct tcphdr *th = (struct tcphdr *)skb->data;
1945 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1947 EXPORT_SYMBOL(tcp_filter);
1949 static void tcp_v4_restore_cb(struct sk_buff *skb)
1951 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1952 sizeof(struct inet_skb_parm));
1955 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1956 const struct tcphdr *th)
1958 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1959 * barrier() makes sure the compiler won't play fool^Waliasing games.
1961 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1962 sizeof(struct inet_skb_parm));
1965 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1966 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1967 skb->len - th->doff * 4);
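/* end_seq covers the whole sequence-number footprint of the segment:
 * payload bytes plus one each for SYN and FIN, mirroring the RST
 * ack_seq computation in tcp_v4_send_reset().
 */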
1968 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1969 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1970 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1971 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1972 TCP_SKB_CB(skb)->sacked = 0;
1973 TCP_SKB_CB(skb)->has_rxtstamp =
1974 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1981 int tcp_v4_rcv(struct sk_buff *skb)
1983 struct net *net = dev_net(skb->dev);
1984 enum skb_drop_reason drop_reason;
1985 int sdif = inet_sdif(skb);
1986 int dif = inet_iif(skb);
1987 const struct iphdr *iph;
1988 const struct tcphdr *th;
1993 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1994 if (skb->pkt_type != PACKET_HOST)
1997 /* Count it even if it's bad */
1998 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2000 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2003 th = (const struct tcphdr *)skb->data;
2005 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2006 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2009 if (!pskb_may_pull(skb, th->doff * 4))
2012 /* An explanation is required here, I think.
2013 * Packet length and doff are validated by header prediction,
2014 * provided the case of th->doff == 0 is eliminated.
2015 * So, we defer the checks. */
2017 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2020 th = (const struct tcphdr *)skb->data;
2023 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2024 skb, __tcp_hdrlen(th), th->source,
2025 th->dest, sdif, &refcounted);
2030 if (sk->sk_state == TCP_TIME_WAIT)
2033 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2034 struct request_sock *req = inet_reqsk(sk);
2035 bool req_stolen = false;
2038 sk = req->rsk_listener;
2039 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2040 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2042 drop_reason = tcp_inbound_md5_hash(sk, skb,
2043 &iph->saddr, &iph->daddr,
2044 AF_INET, dif, sdif);
2045 if (unlikely(drop_reason)) {
2046 sk_drops_add(sk, skb);
2050 if (tcp_checksum_complete(skb)) {
2054 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2055 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2057 inet_csk_reqsk_queue_drop_and_put(sk, req);
2061 /* reuseport_migrate_sock() has already held one sk_refcnt before returning. */
2065 /* We own a reference on the listener, increase it again
2066 * as we might lose it too soon.
2072 if (!tcp_filter(sk, skb)) {
2073 th = (const struct tcphdr *)skb->data;
2075 tcp_v4_fill_cb(skb, iph, th);
2076 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2078 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2083 /* Another cpu got exclusive access to req
2084 * and created a full blown socket.
2085 * Try to feed this packet to this socket
2086 * instead of discarding it.
2088 tcp_v4_restore_cb(skb);
2092 goto discard_and_relse;
2097 tcp_v4_restore_cb(skb);
2098 } else if (tcp_child_process(sk, nsk, skb)) {
2099 tcp_v4_send_reset(nsk, skb);
2100 goto discard_and_relse;
2107 if (static_branch_unlikely(&ip4_min_ttl)) {
2108 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2109 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2110 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2111 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2112 goto discard_and_relse;
2116 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2117 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2118 goto discard_and_relse;
2121 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2122 &iph->daddr, AF_INET, dif, sdif);
2124 goto discard_and_relse;
2128 if (tcp_filter(sk, skb)) {
2129 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2130 goto discard_and_relse;
2132 th = (const struct tcphdr *)skb->data;
2134 tcp_v4_fill_cb(skb, iph, th);
2138 if (sk->sk_state == TCP_LISTEN) {
2139 ret = tcp_v4_do_rcv(sk, skb);
2140 goto put_and_return;
2143 sk_incoming_cpu_update(sk);
2145 bh_lock_sock_nested(sk);
2146 tcp_segs_in(tcp_sk(sk), skb);
2148 if (!sock_owned_by_user(sk)) {
2149 ret = tcp_v4_do_rcv(sk, skb);
2151 if (tcp_add_backlog(sk, skb, &drop_reason))
2152 goto discard_and_relse;
2163 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2164 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2167 tcp_v4_fill_cb(skb, iph, th);
2169 if (tcp_checksum_complete(skb)) {
2171 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2172 trace_tcp_bad_csum(skb);
2173 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2175 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2177 tcp_v4_send_reset(NULL, skb);
2181 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2182 /* Discard frame. */
2183 kfree_skb_reason(skb, drop_reason);
2187 sk_drops_add(sk, skb);
2193 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2194 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2195 inet_twsk_put(inet_twsk(sk));
2199 tcp_v4_fill_cb(skb, iph, th);
2201 if (tcp_checksum_complete(skb)) {
2202 inet_twsk_put(inet_twsk(sk));
2205 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2207 struct sock *sk2 = inet_lookup_listener(net,
2208 net->ipv4.tcp_death_row.hashinfo,
2209 skb, __tcp_hdrlen(th),
2210 iph->saddr, th->source,
2211 iph->daddr, th->dest,
2215 inet_twsk_deschedule_put(inet_twsk(sk));
2217 tcp_v4_restore_cb(skb);
2225 tcp_v4_timewait_ack(sk, skb);
2228 tcp_v4_send_reset(sk, skb);
2229 inet_twsk_deschedule_put(inet_twsk(sk));
2231 case TCP_TW_SUCCESS:;
2236 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2237 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2238 .twsk_unique = tcp_twsk_unique,
2239 .twsk_destructor= tcp_twsk_destructor,
2242 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2244 struct dst_entry *dst = skb_dst(skb);
2246 if (dst && dst_hold_safe(dst)) {
2247 rcu_assign_pointer(sk->sk_rx_dst, dst);
2248 sk->sk_rx_dst_ifindex = skb->skb_iif;
2251 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2253 const struct inet_connection_sock_af_ops ipv4_specific = {
2254 .queue_xmit = ip_queue_xmit,
2255 .send_check = tcp_v4_send_check,
2256 .rebuild_header = inet_sk_rebuild_header,
2257 .sk_rx_dst_set = inet_sk_rx_dst_set,
2258 .conn_request = tcp_v4_conn_request,
2259 .syn_recv_sock = tcp_v4_syn_recv_sock,
2260 .net_header_len = sizeof(struct iphdr),
2261 .setsockopt = ip_setsockopt,
2262 .getsockopt = ip_getsockopt,
2263 .addr2sockaddr = inet_csk_addr2sockaddr,
2264 .sockaddr_len = sizeof(struct sockaddr_in),
2265 .mtu_reduced = tcp_v4_mtu_reduced,
2267 EXPORT_SYMBOL(ipv4_specific);
2269 #ifdef CONFIG_TCP_MD5SIG
2270 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2271 .md5_lookup = tcp_v4_md5_lookup,
2272 .calc_md5_hash = tcp_v4_md5_hash_skb,
2273 .md5_parse = tcp_v4_parse_md5_keys,
2277 /* NOTE: A lot of things are set to zero explicitly by the call to
2278 * sk_alloc(), so they need not be done here.
2280 static int tcp_v4_init_sock(struct sock *sk)
2282 struct inet_connection_sock *icsk = inet_csk(sk);
2286 icsk->icsk_af_ops = &ipv4_specific;
2288 #ifdef CONFIG_TCP_MD5SIG
2289 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2295 void tcp_v4_destroy_sock(struct sock *sk)
2297 struct tcp_sock *tp = tcp_sk(sk);
2299 trace_tcp_destroy_sock(sk);
2301 tcp_clear_xmit_timers(sk);
2303 tcp_cleanup_congestion_control(sk);
2305 tcp_cleanup_ulp(sk);
2307 /* Clean up the write buffer. */
2308 tcp_write_queue_purge(sk);
2310 /* Check if we want to disable active TFO */
2311 tcp_fastopen_active_disable_ofo_check(sk);
2313 /* Cleans up our, hopefully empty, out_of_order_queue. */
2314 skb_rbtree_purge(&tp->out_of_order_queue);
2316 #ifdef CONFIG_TCP_MD5SIG
2317 /* Clean up the MD5 key list, if any */
2318 if (tp->md5sig_info) {
2319 tcp_clear_md5_list(sk);
2320 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2321 tp->md5sig_info = NULL;
2322 static_branch_slow_dec_deferred(&tcp_md5_needed);
2326 /* Clean up a referenced TCP bind bucket. */
2327 if (inet_csk(sk)->icsk_bind_hash)
2330 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2332 /* If socket is aborted during connect operation */
2333 tcp_free_fastopen_req(tp);
2334 tcp_fastopen_destroy_cipher(sk);
2335 tcp_saved_syn_free(tp);
2337 sk_sockets_allocated_dec(sk);
2339 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2341 #ifdef CONFIG_PROC_FS
2342 /* Proc filesystem TCP sock list dumping. */
2344 static unsigned short seq_file_family(const struct seq_file *seq);
2346 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2348 unsigned short family = seq_file_family(seq);
2350 /* AF_UNSPEC is used as a match-all */
2351 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2352 net_eq(sock_net(sk), seq_file_net(seq)));
2355 /* Find a non-empty bucket (starting from st->bucket)
2356 * and return the first sk from it.
2358 static void *listening_get_first(struct seq_file *seq)
2360 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2361 struct tcp_iter_state *st = seq->private;
2364 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2365 struct inet_listen_hashbucket *ilb2;
2366 struct hlist_nulls_node *node;
2369 ilb2 = &hinfo->lhash2[st->bucket];
2370 if (hlist_nulls_empty(&ilb2->nulls_head))
2373 spin_lock(&ilb2->lock);
2374 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2375 if (seq_sk_match(seq, sk))
2378 spin_unlock(&ilb2->lock);
2384 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2385 * If "cur" is the last one in the st->bucket,
2386 * call listening_get_first() to return the first sk of the next non-empty bucket.
2389 static void *listening_get_next(struct seq_file *seq, void *cur)
2391 struct tcp_iter_state *st = seq->private;
2392 struct inet_listen_hashbucket *ilb2;
2393 struct hlist_nulls_node *node;
2394 struct inet_hashinfo *hinfo;
2395 struct sock *sk = cur;
2400 sk = sk_nulls_next(sk);
2401 sk_nulls_for_each_from(sk, node) {
2402 if (seq_sk_match(seq, sk))
2406 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2407 ilb2 = &hinfo->lhash2[st->bucket];
2408 spin_unlock(&ilb2->lock);
2410 return listening_get_first(seq);
2413 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2415 struct tcp_iter_state *st = seq->private;
2420 rc = listening_get_first(seq);
2422 while (rc && *pos) {
2423 rc = listening_get_next(seq, rc);
2429 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2430 const struct tcp_iter_state *st)
2432 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2436 * Get first established socket starting from bucket given in st->bucket.
2437 * If st->bucket is zero, the very first socket in the hash is returned.
2439 static void *established_get_first(struct seq_file *seq)
2441 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2442 struct tcp_iter_state *st = seq->private;
2445 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2447 struct hlist_nulls_node *node;
2448 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2450 /* Lockless fast path for the common case of empty buckets */
2451 if (empty_bucket(hinfo, st))
2455 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2456 if (seq_sk_match(seq, sk))
2459 spin_unlock_bh(lock);
2465 static void *established_get_next(struct seq_file *seq, void *cur)
2467 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2468 struct tcp_iter_state *st = seq->private;
2469 struct hlist_nulls_node *node;
2470 struct sock *sk = cur;
2475 sk = sk_nulls_next(sk);
2477 sk_nulls_for_each_from(sk, node) {
2478 if (seq_sk_match(seq, sk))
2482 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2484 return established_get_first(seq);
2487 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2489 struct tcp_iter_state *st = seq->private;
2493 rc = established_get_first(seq);
2496 rc = established_get_next(seq, rc);
2502 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2505 struct tcp_iter_state *st = seq->private;
2507 st->state = TCP_SEQ_STATE_LISTENING;
2508 rc = listening_get_idx(seq, &pos);
2511 st->state = TCP_SEQ_STATE_ESTABLISHED;
2512 rc = established_get_idx(seq, pos);
2518 static void *tcp_seek_last_pos(struct seq_file *seq)
2520 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2521 struct tcp_iter_state *st = seq->private;
2522 int bucket = st->bucket;
2523 int offset = st->offset;
2524 int orig_num = st->num;
2527 switch (st->state) {
2528 case TCP_SEQ_STATE_LISTENING:
2529 if (st->bucket > hinfo->lhash2_mask)
2531 rc = listening_get_first(seq);
2532 while (offset-- && rc && bucket == st->bucket)
2533 rc = listening_get_next(seq, rc);
2537 st->state = TCP_SEQ_STATE_ESTABLISHED;
2539 case TCP_SEQ_STATE_ESTABLISHED:
2540 if (st->bucket > hinfo->ehash_mask)
2542 rc = established_get_first(seq);
2543 while (offset-- && rc && bucket == st->bucket)
2544 rc = established_get_next(seq, rc);
2552 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2554 struct tcp_iter_state *st = seq->private;
2557 if (*pos && *pos == st->last_pos) {
2558 rc = tcp_seek_last_pos(seq);
2563 st->state = TCP_SEQ_STATE_LISTENING;
2567 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2570 st->last_pos = *pos;
2573 EXPORT_SYMBOL(tcp_seq_start);
2575 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2577 struct tcp_iter_state *st = seq->private;
2580 if (v == SEQ_START_TOKEN) {
2581 rc = tcp_get_idx(seq, 0);
2585 switch (st->state) {
2586 case TCP_SEQ_STATE_LISTENING:
2587 rc = listening_get_next(seq, v);
2589 st->state = TCP_SEQ_STATE_ESTABLISHED;
2592 rc = established_get_first(seq);
2595 case TCP_SEQ_STATE_ESTABLISHED:
2596 rc = established_get_next(seq, v);
2601 st->last_pos = *pos;
2604 EXPORT_SYMBOL(tcp_seq_next);
2606 void tcp_seq_stop(struct seq_file *seq, void *v)
2608 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2609 struct tcp_iter_state *st = seq->private;
2611 switch (st->state) {
2612 case TCP_SEQ_STATE_LISTENING:
2613 if (v != SEQ_START_TOKEN)
2614 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2616 case TCP_SEQ_STATE_ESTABLISHED:
2618 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2622 EXPORT_SYMBOL(tcp_seq_stop);
2624 static void get_openreq4(const struct request_sock *req,
2625 struct seq_file *f, int i)
2627 const struct inet_request_sock *ireq = inet_rsk(req);
2628 long delta = req->rsk_timer.expires - jiffies;
2630 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2631 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2636 ntohs(ireq->ir_rmt_port),
2638 0, 0, /* could print option size, but that is af dependent. */
2639 1, /* timers active (only the expire timer) */
2640 jiffies_delta_to_clock_t(delta),
2642 from_kuid_munged(seq_user_ns(f),
2643 sock_i_uid(req->rsk_listener)),
2644 0, /* non standard timer */
2645 0, /* open_requests have no inode */
2650 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2653 unsigned long timer_expires;
2654 const struct tcp_sock *tp = tcp_sk(sk);
2655 const struct inet_connection_sock *icsk = inet_csk(sk);
2656 const struct inet_sock *inet = inet_sk(sk);
2657 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2658 __be32 dest = inet->inet_daddr;
2659 __be32 src = inet->inet_rcv_saddr;
2660 __u16 destp = ntohs(inet->inet_dport);
2661 __u16 srcp = ntohs(inet->inet_sport);
2665 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2666 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2667 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2669 timer_expires = icsk->icsk_timeout;
2670 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2672 timer_expires = icsk->icsk_timeout;
2673 } else if (timer_pending(&sk->sk_timer)) {
2675 timer_expires = sk->sk_timer.expires;
2678 timer_expires = jiffies;
2681 state = inet_sk_state_load(sk);
2682 if (state == TCP_LISTEN)
2683 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2685 /* Because we don't lock the socket,
2686 * we might find a transient negative value.
2688 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2689 READ_ONCE(tp->copied_seq), 0);
2691 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2692 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2693 i, src, srcp, dest, destp, state,
2694 READ_ONCE(tp->write_seq) - tp->snd_una,
2697 jiffies_delta_to_clock_t(timer_expires - jiffies),
2698 icsk->icsk_retransmits,
2699 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2700 icsk->icsk_probes_out,
2702 refcount_read(&sk->sk_refcnt), sk,
2703 jiffies_to_clock_t(icsk->icsk_rto),
2704 jiffies_to_clock_t(icsk->icsk_ack.ato),
2705 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2707 state == TCP_LISTEN ?
2708 fastopenq->max_qlen :
2709 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2712 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2713 struct seq_file *f, int i)
2715 long delta = tw->tw_timer.expires - jiffies;
2719 dest = tw->tw_daddr;
2720 src = tw->tw_rcv_saddr;
2721 destp = ntohs(tw->tw_dport);
2722 srcp = ntohs(tw->tw_sport);
2724 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2725 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2726 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2727 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2728 refcount_read(&tw->tw_refcnt), tw);
2733 static int tcp4_seq_show(struct seq_file *seq, void *v)
2735 struct tcp_iter_state *st;
2736 struct sock *sk = v;
2738 seq_setwidth(seq, TMPSZ - 1);
2739 if (v == SEQ_START_TOKEN) {
2740 seq_puts(seq, " sl local_address rem_address st tx_queue "
2741 "rx_queue tr tm->when retrnsmt uid timeout "
2747 if (sk->sk_state == TCP_TIME_WAIT)
2748 get_timewait4_sock(v, seq, st->num);
2749 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2750 get_openreq4(v, seq, st->num);
2752 get_tcp4_sock(v, seq, st->num);
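/* Illustrative userspace sketch (assumption, not kernel code): a line that
 * tcp4_seq_show() emits can be parsed back with the field widths used above.
 * The addresses are hex dumps of the __be32/__be16 values, so on a
 * little-endian host "0100007F:0016" means 127.0.0.1:22. parse_tcp4_line()
 * is a hypothetical helper shown only to document the format.
 */
#if 0	/* userspace example, not built as part of this file */
#include <stdio.h>

static int parse_tcp4_line(const char *line)
{
	unsigned int slot, laddr, lport, raddr, rport, state, txq, rxq;

	/* " sl local_address rem_address st tx_queue rx_queue ..." */
	if (sscanf(line, "%u: %8X:%4X %8X:%4X %2X %8X:%8X",
		   &slot, &laddr, &lport, &raddr, &rport, &state,
		   &txq, &rxq) != 8)
		return -1;

	printf("st %#x tx_queue %u rx_queue %u\n", state, txq, rxq);
	return 0;
}
#endif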
2758 #ifdef CONFIG_BPF_SYSCALL
2759 struct bpf_tcp_iter_state {
2760 struct tcp_iter_state state;
2761 unsigned int cur_sk;
2762 unsigned int end_sk;
2763 unsigned int max_sk;
2764 struct sock **batch;
2765 bool st_bucket_done;
2768 struct bpf_iter__tcp {
2769 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2770 __bpf_md_ptr(struct sock_common *, sk_common);
2771 uid_t uid __aligned(8);
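/* Illustrative BPF-side sketch (assumption, not part of this file): a program
 * attached to the "tcp" iterator receives the bpf_iter__tcp context defined
 * above. dump_tcp() is an arbitrary example name; sk_common may be NULL on
 * the final stop() pass and must be checked.
 */
#if 0	/* BPF program example, built separately with clang -target bpf */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!skc)
		return 0;

	/* One output line per socket; uid comes straight from the context. */
	BPF_SEQ_PRINTF(seq, "family %d uid %u\n", skc->skc_family, ctx->uid);
	return 0;
}
#endif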
2774 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2775 struct sock_common *sk_common, uid_t uid)
2777 struct bpf_iter__tcp ctx;
2779 meta->seq_num--; /* skip SEQ_START_TOKEN */
2781 ctx.sk_common = sk_common;
2783 return bpf_iter_run_prog(prog, &ctx);
2786 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2788 while (iter->cur_sk < iter->end_sk)
2789 sock_gen_put(iter->batch[iter->cur_sk++]);
2792 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2793 unsigned int new_batch_sz)
2795 struct sock **new_batch;
2797 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2798 GFP_USER | __GFP_NOWARN);
2802 bpf_iter_tcp_put_batch(iter);
2803 kvfree(iter->batch);
2804 iter->batch = new_batch;
2805 iter->max_sk = new_batch_sz;
2810 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2811 struct sock *start_sk)
2813 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2814 struct bpf_tcp_iter_state *iter = seq->private;
2815 struct tcp_iter_state *st = &iter->state;
2816 struct hlist_nulls_node *node;
2817 unsigned int expected = 1;
2820 sock_hold(start_sk);
2821 iter->batch[iter->end_sk++] = start_sk;
2823 sk = sk_nulls_next(start_sk);
2824 sk_nulls_for_each_from(sk, node) {
2825 if (seq_sk_match(seq, sk)) {
2826 if (iter->end_sk < iter->max_sk) {
2828 iter->batch[iter->end_sk++] = sk;
2833 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2838 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2839 struct sock *start_sk)
2841 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2842 struct bpf_tcp_iter_state *iter = seq->private;
2843 struct tcp_iter_state *st = &iter->state;
2844 struct hlist_nulls_node *node;
2845 unsigned int expected = 1;
2848 sock_hold(start_sk);
2849 iter->batch[iter->end_sk++] = start_sk;
2851 sk = sk_nulls_next(start_sk);
2852 sk_nulls_for_each_from(sk, node) {
2853 if (seq_sk_match(seq, sk)) {
2854 if (iter->end_sk < iter->max_sk) {
2856 iter->batch[iter->end_sk++] = sk;
2861 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2866 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2868 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2869 struct bpf_tcp_iter_state *iter = seq->private;
2870 struct tcp_iter_state *st = &iter->state;
2871 unsigned int expected;
2872 bool resized = false;
2875 /* The st->bucket is done. Directly advance to the next
2876 * bucket instead of having tcp_seek_last_pos() skip sockets
2877 * one by one in the current bucket only to find out
2878 * it has to advance to the next bucket.
2880 if (iter->st_bucket_done) {
2883 if (st->state == TCP_SEQ_STATE_LISTENING &&
2884 st->bucket > hinfo->lhash2_mask) {
2885 st->state = TCP_SEQ_STATE_ESTABLISHED;
2891 /* Get a new batch */
2894 iter->st_bucket_done = false;
2896 sk = tcp_seek_last_pos(seq);
2898 return NULL; /* Done */
2900 if (st->state == TCP_SEQ_STATE_LISTENING)
2901 expected = bpf_iter_tcp_listening_batch(seq, sk);
2903 expected = bpf_iter_tcp_established_batch(seq, sk);
2905 if (iter->end_sk == expected) {
2906 iter->st_bucket_done = true;
2910 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
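/* Worked example (illustrative numbers): if a bucket holds 20 matching
 * sockets but the batch array has room for only 16, the batching walk above
 * returns expected == 20 with end_sk == 16; the batch is then grown to
 * 20 * 3 / 2 = 30 entries and the same bucket is walked once more.
 */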
2918 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2920 /* bpf iter does not support lseek, so it always
2921 * continues from where it was stop()-ped.
2924 return bpf_iter_tcp_batch(seq);
2926 return SEQ_START_TOKEN;
2929 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2931 struct bpf_tcp_iter_state *iter = seq->private;
2932 struct tcp_iter_state *st = &iter->state;
2935 /* Whenever seq_next() is called, the iter->cur_sk is
2936 * done with seq_show(), so advance to the next sk in the batch.
2939 if (iter->cur_sk < iter->end_sk) {
2940 /* Keeping st->num consistent in tcp_iter_state.
2941 * bpf_iter_tcp does not use st->num.
2942 * meta.seq_num is used instead.
2945 /* Move st->offset to the next sk in the bucket such that
2946 * the future start() will resume at st->offset in
2947 * st->bucket. See tcp_seek_last_pos().
2950 sock_gen_put(iter->batch[iter->cur_sk++]);
2953 if (iter->cur_sk < iter->end_sk)
2954 sk = iter->batch[iter->cur_sk];
2956 sk = bpf_iter_tcp_batch(seq);
2959 /* Keeping st->last_pos consistent in tcp_iter_state.
2960 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2962 st->last_pos = *pos;
2966 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2968 struct bpf_iter_meta meta;
2969 struct bpf_prog *prog;
2970 struct sock *sk = v;
2974 if (v == SEQ_START_TOKEN)
2977 if (sk_fullsock(sk))
2980 if (unlikely(sk_unhashed(sk))) {
2985 if (sk->sk_state == TCP_TIME_WAIT) {
2987 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2988 const struct request_sock *req = v;
2990 uid = from_kuid_munged(seq_user_ns(seq),
2991 sock_i_uid(req->rsk_listener));
2993 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2997 prog = bpf_iter_get_info(&meta, false);
2998 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3001 if (sk_fullsock(sk))
3007 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3009 struct bpf_tcp_iter_state *iter = seq->private;
3010 struct bpf_iter_meta meta;
3011 struct bpf_prog *prog;
3015 prog = bpf_iter_get_info(&meta, true);
3017 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3020 if (iter->cur_sk < iter->end_sk) {
3021 bpf_iter_tcp_put_batch(iter);
3022 iter->st_bucket_done = false;
3026 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3027 .show = bpf_iter_tcp_seq_show,
3028 .start = bpf_iter_tcp_seq_start,
3029 .next = bpf_iter_tcp_seq_next,
3030 .stop = bpf_iter_tcp_seq_stop,
3033 static unsigned short seq_file_family(const struct seq_file *seq)
3035 const struct tcp_seq_afinfo *afinfo;
3037 #ifdef CONFIG_BPF_SYSCALL
3038 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3039 if (seq->op == &bpf_iter_tcp_seq_ops)
3043 /* Iterated from proc fs */
3044 afinfo = pde_data(file_inode(seq->file));
3045 return afinfo->family;
3048 static const struct seq_operations tcp4_seq_ops = {
3049 .show = tcp4_seq_show,
3050 .start = tcp_seq_start,
3051 .next = tcp_seq_next,
3052 .stop = tcp_seq_stop,
3055 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3059 static int __net_init tcp4_proc_init_net(struct net *net)
3061 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3062 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3067 static void __net_exit tcp4_proc_exit_net(struct net *net)
3069 remove_proc_entry("tcp", net->proc_net);
3072 static struct pernet_operations tcp4_net_ops = {
3073 .init = tcp4_proc_init_net,
3074 .exit = tcp4_proc_exit_net,
3077 int __init tcp4_proc_init(void)
3079 return register_pernet_subsys(&tcp4_net_ops);
3082 void tcp4_proc_exit(void)
3084 unregister_pernet_subsys(&tcp4_net_ops);
3086 #endif /* CONFIG_PROC_FS */
3088 /* @wake is one when sk_stream_write_space() calls us.
3089 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3090 * This mimics the strategy used in sock_def_write_space().
3092 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3094 const struct tcp_sock *tp = tcp_sk(sk);
3095 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3096 READ_ONCE(tp->snd_nxt);
3098 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3100 EXPORT_SYMBOL(tcp_stream_memory_free);
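/* Worked example (illustrative numbers): with TCP_NOTSENT_LOWAT set to
 * 131072 bytes and 100 KB queued but unsent, a plain poll sees
 * (102400 << 0) < 131072 and reports the socket writable, while the wake
 * path from sk_stream_write_space() tests (102400 << 1) < 131072, which is
 * false, so EPOLLOUT is only signalled once the unsent backlog drops below
 * 64 KB. A userspace sketch of opting in:
 */
#if 0	/* userspace example, not built as part of this file */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int limit_unsent(int fd)
{
	int lowat = 128 * 1024;	/* example threshold */

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &lowat, sizeof(lowat));
}
#endif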
3102 struct proto tcp_prot = {
3104 .owner = THIS_MODULE,
3106 .pre_connect = tcp_v4_pre_connect,
3107 .connect = tcp_v4_connect,
3108 .disconnect = tcp_disconnect,
3109 .accept = inet_csk_accept,
3111 .init = tcp_v4_init_sock,
3112 .destroy = tcp_v4_destroy_sock,
3113 .shutdown = tcp_shutdown,
3114 .setsockopt = tcp_setsockopt,
3115 .getsockopt = tcp_getsockopt,
3116 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3117 .keepalive = tcp_set_keepalive,
3118 .recvmsg = tcp_recvmsg,
3119 .sendmsg = tcp_sendmsg,
3120 .splice_eof = tcp_splice_eof,
3121 .backlog_rcv = tcp_v4_do_rcv,
3122 .release_cb = tcp_release_cb,
3124 .unhash = inet_unhash,
3125 .get_port = inet_csk_get_port,
3126 .put_port = inet_put_port,
3127 #ifdef CONFIG_BPF_SYSCALL
3128 .psock_update_sk_prot = tcp_bpf_update_proto,
3130 .enter_memory_pressure = tcp_enter_memory_pressure,
3131 .leave_memory_pressure = tcp_leave_memory_pressure,
3132 .stream_memory_free = tcp_stream_memory_free,
3133 .sockets_allocated = &tcp_sockets_allocated,
3134 .orphan_count = &tcp_orphan_count,
3136 .memory_allocated = &tcp_memory_allocated,
3137 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3139 .memory_pressure = &tcp_memory_pressure,
3140 .sysctl_mem = sysctl_tcp_mem,
3141 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3142 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3143 .max_header = MAX_TCP_HEADER,
3144 .obj_size = sizeof(struct tcp_sock),
3145 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3146 .twsk_prot = &tcp_timewait_sock_ops,
3147 .rsk_prot = &tcp_request_sock_ops,
3149 .no_autobind = true,
3150 .diag_destroy = tcp_abort,
3152 EXPORT_SYMBOL(tcp_prot);
3154 static void __net_exit tcp_sk_exit(struct net *net)
3156 if (net->ipv4.tcp_congestion_control)
3157 bpf_module_put(net->ipv4.tcp_congestion_control,
3158 net->ipv4.tcp_congestion_control->owner);
3161 static void __net_init tcp_set_hashinfo(struct net *net)
3163 struct inet_hashinfo *hinfo;
3164 unsigned int ehash_entries;
3165 struct net *old_net;
3167 if (net_eq(net, &init_net))
3170 old_net = current->nsproxy->net_ns;
3171 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3175 ehash_entries = roundup_pow_of_two(ehash_entries);
3176 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3178 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3179 "for a netns, fallback to the global one\n",
3182 hinfo = &tcp_hashinfo;
3183 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3186 net->ipv4.tcp_death_row.hashinfo = hinfo;
3187 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3188 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
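/* Worked example (illustrative numbers): if the parent netns has
 * net.ipv4.tcp_child_ehash_entries set to 1000, a new child netns gets
 * roundup_pow_of_two(1000) = 1024 ehash entries, max_tw_buckets = 512 and
 * max_syn_backlog = max(128, 1024 / 128) = 128; with the sysctl left at 0
 * the child keeps sharing the global tcp_hashinfo instead.
 */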
3191 static int __net_init tcp_sk_init(struct net *net)
3193 net->ipv4.sysctl_tcp_ecn = 2;
3194 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3196 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3197 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3198 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3199 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3200 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3202 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3203 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3204 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3206 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3207 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3208 net->ipv4.sysctl_tcp_syncookies = 1;
3209 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3210 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3211 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3212 net->ipv4.sysctl_tcp_orphan_retries = 0;
3213 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3214 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3215 net->ipv4.sysctl_tcp_tw_reuse = 2;
3216 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3218 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3219 tcp_set_hashinfo(net);
3221 net->ipv4.sysctl_tcp_sack = 1;
3222 net->ipv4.sysctl_tcp_window_scaling = 1;
3223 net->ipv4.sysctl_tcp_timestamps = 1;
3224 net->ipv4.sysctl_tcp_early_retrans = 3;
3225 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3226 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3227 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3228 net->ipv4.sysctl_tcp_max_reordering = 300;
3229 net->ipv4.sysctl_tcp_dsack = 1;
3230 net->ipv4.sysctl_tcp_app_win = 31;
3231 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3232 net->ipv4.sysctl_tcp_frto = 2;
3233 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3234 /* This limits the percentage of the congestion window which we
3235 * will allow a single TSO frame to consume. Building TSO frames
3236 * which are too large can cause TCP streams to be bursty.
3238 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
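/* Worked example (illustrative numbers): with tso_win_divisor = 3 and a
 * congestion window of 60 full-sized segments, a single TSO burst is
 * limited to roughly cwnd / 3 = 20 segments, trading a little batching
 * efficiency for smoother transmission.
 */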
3239 /* Default TSQ limit of 16 TSO segments */
3240 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3242 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3243 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3245 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3246 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3247 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3248 net->ipv4.sysctl_tcp_autocorking = 1;
3249 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3250 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3251 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3252 if (net != &init_net) {
3253 memcpy(net->ipv4.sysctl_tcp_rmem,
3254 init_net.ipv4.sysctl_tcp_rmem,
3255 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3256 memcpy(net->ipv4.sysctl_tcp_wmem,
3257 init_net.ipv4.sysctl_tcp_wmem,
3258 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3260 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3261 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3262 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3263 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3264 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3265 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3267 /* Set default values for PLB */
3268 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3269 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3270 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3271 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3272 /* Default congestion threshold for PLB to mark a round is 50% */
3273 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
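/* Worked example (assuming TCP_PLB_SCALE is 8): the default threshold is
 * (1 << 8) / 2 = 128, i.e. a round is treated as congested for PLB when at
 * least half of its packets were CE-marked.
 */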
3275 /* Reno is always built in */
3276 if (!net_eq(net, &init_net) &&
3277 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3278 init_net.ipv4.tcp_congestion_control->owner))
3279 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3281 net->ipv4.tcp_congestion_control = &tcp_reno;
3283 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3284 net->ipv4.sysctl_tcp_shrink_window = 0;
3289 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3293 tcp_twsk_purge(net_exit_list, AF_INET);
3295 list_for_each_entry(net, net_exit_list, exit_list) {
3296 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3297 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3298 tcp_fastopen_ctx_destroy(net);
3302 static struct pernet_operations __net_initdata tcp_sk_ops = {
3303 .init = tcp_sk_init,
3304 .exit = tcp_sk_exit,
3305 .exit_batch = tcp_sk_exit_batch,
3308 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3309 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3310 struct sock_common *sk_common, uid_t uid)
3312 #define INIT_BATCH_SZ 16
3314 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3316 struct bpf_tcp_iter_state *iter = priv_data;
3319 err = bpf_iter_init_seq_net(priv_data, aux);
3323 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3325 bpf_iter_fini_seq_net(priv_data);
3332 static void bpf_iter_fini_tcp(void *priv_data)
3334 struct bpf_tcp_iter_state *iter = priv_data;
3336 bpf_iter_fini_seq_net(priv_data);
3337 kvfree(iter->batch);
3340 static const struct bpf_iter_seq_info tcp_seq_info = {
3341 .seq_ops = &bpf_iter_tcp_seq_ops,
3342 .init_seq_private = bpf_iter_init_tcp,
3343 .fini_seq_private = bpf_iter_fini_tcp,
3344 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3347 static const struct bpf_func_proto *
3348 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3349 const struct bpf_prog *prog)
3352 case BPF_FUNC_setsockopt:
3353 return &bpf_sk_setsockopt_proto;
3354 case BPF_FUNC_getsockopt:
3355 return &bpf_sk_getsockopt_proto;
3361 static struct bpf_iter_reg tcp_reg_info = {
3363 .ctx_arg_info_size = 1,
3365 { offsetof(struct bpf_iter__tcp, sk_common),
3366 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3368 .get_func_proto = bpf_iter_tcp_get_func_proto,
3369 .seq_info = &tcp_seq_info,
3372 static void __init bpf_iter_register(void)
3374 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3375 if (bpf_iter_reg_target(&tcp_reg_info))
3376 pr_warn("Warning: could not register bpf iterator tcp\n");
3381 void __init tcp_v4_init(void)
3385 for_each_possible_cpu(cpu) {
3388 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3389 IPPROTO_TCP, &init_net);
3391 panic("Failed to create the TCP control socket.\n");
3392 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3394 /* Please enforce IP_DF and IPID==0 for RST and
3395 * ACK sent in SYN-RECV and TIME-WAIT state.
3397 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3399 per_cpu(ipv4_tcp_sk, cpu) = sk;
3401 if (register_pernet_subsys(&tcp_sk_ops))
3402 panic("Failed to create the TCP control socket.\n");
3404 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3405 bpf_iter_register();