tcp: annotate data-races around tcp_rsk(req)->ts_recent
net/ipv4/tcp_ipv4.c  [tomoyo/tomoyo-test1.git]
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
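/* Per-CPU control socket used to send RSTs and ACKs that are not tied to a
 * full socket (see tcp_v4_send_reset() and tcp_v4_send_ack() below).
 */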
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
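/* Derive an initial sequence number from the addresses and ports of the
 * received segment, via secure_tcp_seq().
 */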
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
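/* Compute the per-connection TCP timestamp offset from the address pair,
 * via secure_tcp_ts_off().
 */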
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's, only the timestamp cache is
147            held not per host but per port pair, and the TW bucket is used as
148            the state holder.
149
150            If the TW bucket has already been destroyed we fall back to VJ's scheme
151            and use initial timestamp retrieved from peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent the BPF program called below from accessing bytes that are
189          * outside the bounds specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_timewait_death_row *tcp_death_row;
204         struct inet_sock *inet = inet_sk(sk);
205         struct tcp_sock *tp = tcp_sk(sk);
206         struct ip_options_rcu *inet_opt;
207         struct net *net = sock_net(sk);
208         __be16 orig_sport, orig_dport;
209         __be32 daddr, nexthop;
210         struct flowi4 *fl4;
211         struct rtable *rt;
212         int err;
213
214         if (addr_len < sizeof(struct sockaddr_in))
215                 return -EINVAL;
216
217         if (usin->sin_family != AF_INET)
218                 return -EAFNOSUPPORT;
219
220         nexthop = daddr = usin->sin_addr.s_addr;
221         inet_opt = rcu_dereference_protected(inet->inet_opt,
222                                              lockdep_sock_is_held(sk));
223         if (inet_opt && inet_opt->opt.srr) {
224                 if (!daddr)
225                         return -EINVAL;
226                 nexthop = inet_opt->opt.faddr;
227         }
228
229         orig_sport = inet->inet_sport;
230         orig_dport = usin->sin_port;
231         fl4 = &inet->cork.fl.u.ip4;
232         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234                               orig_dport, sk);
235         if (IS_ERR(rt)) {
236                 err = PTR_ERR(rt);
237                 if (err == -ENETUNREACH)
238                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
239                 return err;
240         }
241
242         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243                 ip_rt_put(rt);
244                 return -ENETUNREACH;
245         }
246
247         if (!inet_opt || !inet_opt->opt.srr)
248                 daddr = fl4->daddr;
249
250         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
251
252         if (!inet->inet_saddr) {
253                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
254                 if (err) {
255                         ip_rt_put(rt);
256                         return err;
257                 }
258         } else {
259                 sk_rcv_saddr_set(sk, inet->inet_saddr);
260         }
261
262         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263                 /* Reset inherited state */
264                 tp->rx_opt.ts_recent       = 0;
265                 tp->rx_opt.ts_recent_stamp = 0;
266                 if (likely(!tp->repair))
267                         WRITE_ONCE(tp->write_seq, 0);
268         }
269
270         inet->inet_dport = usin->sin_port;
271         sk_daddr_set(sk, daddr);
272
273         inet_csk(sk)->icsk_ext_hdr_len = 0;
274         if (inet_opt)
275                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279         /* Socket identity is still unknown (sport may be zero).
280          * However we set state to SYN-SENT and, without releasing the
281          * socket lock, select a source port, enter ourselves into the hash
282          * tables and complete initialization after this.
283          */
284         tcp_set_state(sk, TCP_SYN_SENT);
285         err = inet_hash_connect(tcp_death_row, sk);
286         if (err)
287                 goto failure;
288
289         sk_set_txhash(sk);
290
291         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292                                inet->inet_sport, inet->inet_dport, sk);
293         if (IS_ERR(rt)) {
294                 err = PTR_ERR(rt);
295                 rt = NULL;
296                 goto failure;
297         }
298         /* OK, now commit destination to socket.  */
299         sk->sk_gso_type = SKB_GSO_TCPV4;
300         sk_setup_caps(sk, &rt->dst);
301         rt = NULL;
302
303         if (likely(!tp->repair)) {
304                 if (!tp->write_seq)
305                         WRITE_ONCE(tp->write_seq,
306                                    secure_tcp_seq(inet->inet_saddr,
307                                                   inet->inet_daddr,
308                                                   inet->inet_sport,
309                                                   usin->sin_port));
310                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
311                                                  inet->inet_daddr);
312         }
313
314         inet->inet_id = get_random_u16();
315
316         if (tcp_fastopen_defer_connect(sk, &err))
317                 return err;
318         if (err)
319                 goto failure;
320
321         err = tcp_connect(sk);
322
323         if (err)
324                 goto failure;
325
326         return 0;
327
328 failure:
329         /*
330          * This unhashes the socket and releases the local port,
331          * if necessary.
332          */
333         tcp_set_state(sk, TCP_CLOSE);
334         inet_bhash2_reset_saddr(sk);
335         ip_rt_put(rt);
336         sk->sk_route_caps = 0;
337         inet->inet_dport = 0;
338         return err;
339 }
340 EXPORT_SYMBOL(tcp_v4_connect);
341
342 /*
343  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
344  * It can be called through tcp_release_cb() if the socket was owned by the
345  * user at the time tcp_v4_err() was called to handle the ICMP message.
346  */
347 void tcp_v4_mtu_reduced(struct sock *sk)
348 {
349         struct inet_sock *inet = inet_sk(sk);
350         struct dst_entry *dst;
351         u32 mtu;
352
353         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
354                 return;
355         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356         dst = inet_csk_update_pmtu(sk, mtu);
357         if (!dst)
358                 return;
359
360         /* Something is about to go wrong... Remember the soft error
361          * in case this connection is not able to recover.
362          */
363         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
365
366         mtu = dst_mtu(dst);
367
368         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369             ip_sk_accept_pmtu(sk) &&
370             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371                 tcp_sync_mss(sk, mtu);
372
373                 /* Resend the TCP packet because it's
374                  * clear that the old packet has been
375                  * dropped. This is the new "fast" path mtu
376                  * discovery.
377                  */
378                 tcp_simple_retransmit(sk);
379         } /* else let the usual retransmit timer handle it */
380 }
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
382
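/* An ICMP redirect arrived for this socket: if we still hold a cached route,
 * let its dst_ops->redirect() handler update it.
 */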
383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
384 {
385         struct dst_entry *dst = __sk_dst_check(sk, 0);
386
387         if (dst)
388                 dst->ops->redirect(dst, sk, skb);
389 }
390
391
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
394 {
395         struct request_sock *req = inet_reqsk(sk);
396         struct net *net = sock_net(sk);
397
398         /* ICMPs are not backlogged, hence we cannot get
399          * an established socket here.
400          */
401         if (seq != tcp_rsk(req)->snt_isn) {
402                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
403         } else if (abort) {
404                 /*
405                  * Still in SYN_RECV, just remove it silently.
406                  * There is no good way to pass the error to the newly
407                  * created socket, and POSIX does not want network
408                  * errors returned from accept().
409                  */
410                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411                 tcp_listendrop(req->rsk_listener);
412         }
413         reqsk_put(req);
414 }
415 EXPORT_SYMBOL(tcp_req_err);
416
417 /* TCP-LD (RFC 6069) logic */
418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
419 {
420         struct inet_connection_sock *icsk = inet_csk(sk);
421         struct tcp_sock *tp = tcp_sk(sk);
422         struct sk_buff *skb;
423         s32 remaining;
424         u32 delta_us;
425
426         if (sock_owned_by_user(sk))
427                 return;
428
429         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
430             !icsk->icsk_backoff)
431                 return;
432
433         skb = tcp_rtx_queue_head(sk);
434         if (WARN_ON_ONCE(!skb))
435                 return;
436
437         icsk->icsk_backoff--;
438         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
440
441         tcp_mstamp_refresh(tp);
442         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
444
445         if (remaining > 0) {
446                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447                                           remaining, TCP_RTO_MAX);
448         } else {
449                 /* RTO revert clocked out retransmission.
450                  * Will retransmit now.
451                  */
452                 tcp_retransmit_timer(sk);
453         }
454 }
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
456
457 /*
458  * This routine is called by the ICMP module when it gets some
459  * sort of error condition.  If err < 0 then the socket should
460  * be closed and the error returned to the user.  If err > 0
461  * it's just the icmp type << 8 | icmp code.  After adjustment
462  * header points to the first 8 bytes of the tcp header.  We need
463  * to find the appropriate port.
464  *
465  * The locking strategy used here is very "optimistic". When
466  * someone else accesses the socket the ICMP is just dropped
467  * and for some paths there is no check at all.
468  * A more general error queue to queue errors for later handling
469  * is probably better.
470  *
471  */
472
473 int tcp_v4_err(struct sk_buff *skb, u32 info)
474 {
475         const struct iphdr *iph = (const struct iphdr *)skb->data;
476         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
477         struct tcp_sock *tp;
478         struct inet_sock *inet;
479         const int type = icmp_hdr(skb)->type;
480         const int code = icmp_hdr(skb)->code;
481         struct sock *sk;
482         struct request_sock *fastopen;
483         u32 seq, snd_una;
484         int err;
485         struct net *net = dev_net(skb->dev);
486
487         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
488                                        iph->daddr, th->dest, iph->saddr,
489                                        ntohs(th->source), inet_iif(skb), 0);
490         if (!sk) {
491                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
492                 return -ENOENT;
493         }
494         if (sk->sk_state == TCP_TIME_WAIT) {
495                 inet_twsk_put(inet_twsk(sk));
496                 return 0;
497         }
498         seq = ntohl(th->seq);
499         if (sk->sk_state == TCP_NEW_SYN_RECV) {
500                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501                                      type == ICMP_TIME_EXCEEDED ||
502                                      (type == ICMP_DEST_UNREACH &&
503                                       (code == ICMP_NET_UNREACH ||
504                                        code == ICMP_HOST_UNREACH)));
505                 return 0;
506         }
507
508         bh_lock_sock(sk);
509         /* If too many ICMPs get dropped on busy
510          * servers this needs to be solved differently.
511          * We do take care of the PMTU discovery (RFC1191) special case:
512          * we can receive locally generated ICMP messages while the socket is held.
513          */
514         if (sock_owned_by_user(sk)) {
515                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
517         }
518         if (sk->sk_state == TCP_CLOSE)
519                 goto out;
520
521         if (static_branch_unlikely(&ip4_min_ttl)) {
522                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
523                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
524                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
525                         goto out;
526                 }
527         }
528
529         tp = tcp_sk(sk);
530         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
531         fastopen = rcu_dereference(tp->fastopen_rsk);
532         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
533         if (sk->sk_state != TCP_LISTEN &&
534             !between(seq, snd_una, tp->snd_nxt)) {
535                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
536                 goto out;
537         }
538
539         switch (type) {
540         case ICMP_REDIRECT:
541                 if (!sock_owned_by_user(sk))
542                         do_redirect(skb, sk);
543                 goto out;
544         case ICMP_SOURCE_QUENCH:
545                 /* Just silently ignore these. */
546                 goto out;
547         case ICMP_PARAMETERPROB:
548                 err = EPROTO;
549                 break;
550         case ICMP_DEST_UNREACH:
551                 if (code > NR_ICMP_UNREACH)
552                         goto out;
553
554                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
555                         /* We are not interested in TCP_LISTEN and open_requests
556                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
557                          * they should go through unfragmented).
558                          */
559                         if (sk->sk_state == TCP_LISTEN)
560                                 goto out;
561
562                         WRITE_ONCE(tp->mtu_info, info);
563                         if (!sock_owned_by_user(sk)) {
564                                 tcp_v4_mtu_reduced(sk);
565                         } else {
566                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
567                                         sock_hold(sk);
568                         }
569                         goto out;
570                 }
571
572                 err = icmp_err_convert[code].errno;
573                 /* check if this ICMP message allows revert of backoff.
574                  * (see RFC 6069)
575                  */
576                 if (!fastopen &&
577                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
578                         tcp_ld_RTO_revert(sk, seq);
579                 break;
580         case ICMP_TIME_EXCEEDED:
581                 err = EHOSTUNREACH;
582                 break;
583         default:
584                 goto out;
585         }
586
587         switch (sk->sk_state) {
588         case TCP_SYN_SENT:
589         case TCP_SYN_RECV:
590                 /* Only in fast or simultaneous open. If a fast open socket is
591                  * already accepted it is treated as a connected one below.
592                  */
593                 if (fastopen && !fastopen->sk)
594                         break;
595
596                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
597
598                 if (!sock_owned_by_user(sk)) {
599                         WRITE_ONCE(sk->sk_err, err);
600
601                         sk_error_report(sk);
602
603                         tcp_done(sk);
604                 } else {
605                         WRITE_ONCE(sk->sk_err_soft, err);
606                 }
607                 goto out;
608         }
609
610         /* If we've already connected we will keep trying
611          * until we time out, or the user gives up.
612          *
613          * rfc1122 4.2.3.9 allows us to consider as hard errors
614          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
615          * but it is obsoleted by pmtu discovery).
616          *
617          * Note that in the modern internet, where routing is unreliable
618          * and broken firewalls sit in every dark corner sending random
619          * errors ordered by their masters, even these two messages finally
620          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
621          *
622          * Now we are in compliance with RFCs.
623          *                                                      --ANK (980905)
624          */
625
626         inet = inet_sk(sk);
627         if (!sock_owned_by_user(sk) && inet->recverr) {
628                 WRITE_ONCE(sk->sk_err, err);
629                 sk_error_report(sk);
630         } else  { /* Only an error on timeout */
631                 WRITE_ONCE(sk->sk_err_soft, err);
632         }
633
634 out:
635         bh_unlock_sock(sk);
636         sock_put(sk);
637         return 0;
638 }
639
640 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
641 {
642         struct tcphdr *th = tcp_hdr(skb);
643
644         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
645         skb->csum_start = skb_transport_header(skb) - skb->head;
646         skb->csum_offset = offsetof(struct tcphdr, check);
647 }
648
649 /* This routine computes an IPv4 TCP checksum. */
650 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
651 {
652         const struct inet_sock *inet = inet_sk(sk);
653
654         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
655 }
656 EXPORT_SYMBOL(tcp_v4_send_check);
657
658 /*
659  *      This routine will send an RST to the other tcp.
660  *
661  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
662  *                    for reset.
663  *      Answer: if a packet caused RST, it is not for a socket
664  *              existing in our system; if it is matched to a socket,
665  *              it is just a duplicate segment or a bug in the other side's TCP.
666  *              So we build the reply based only on parameters that
667  *              arrived with the segment.
668  *      Exception: precedence violation. We do not implement it in any case.
669  */
670
671 #ifdef CONFIG_TCP_MD5SIG
672 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
673 #else
674 #define OPTION_BYTES sizeof(__be32)
675 #endif
676
677 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
678 {
679         const struct tcphdr *th = tcp_hdr(skb);
680         struct {
681                 struct tcphdr th;
682                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
683         } rep;
684         struct ip_reply_arg arg;
685 #ifdef CONFIG_TCP_MD5SIG
686         struct tcp_md5sig_key *key = NULL;
687         const __u8 *hash_location = NULL;
688         unsigned char newhash[16];
689         int genhash;
690         struct sock *sk1 = NULL;
691 #endif
692         u64 transmit_time = 0;
693         struct sock *ctl_sk;
694         struct net *net;
695         u32 txhash = 0;
696
697         /* Never send a reset in response to a reset. */
698         if (th->rst)
699                 return;
700
701         /* If sk is not NULL, it means we did a successful lookup and the
702          * incoming route had to be correct. prequeue might have dropped our dst.
703          */
704         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
705                 return;
706
707         /* Swap the send and the receive. */
708         memset(&rep, 0, sizeof(rep));
709         rep.th.dest   = th->source;
710         rep.th.source = th->dest;
711         rep.th.doff   = sizeof(struct tcphdr) / 4;
712         rep.th.rst    = 1;
713
714         if (th->ack) {
715                 rep.th.seq = th->ack_seq;
716         } else {
717                 rep.th.ack = 1;
718                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
719                                        skb->len - (th->doff << 2));
720         }
721
722         memset(&arg, 0, sizeof(arg));
723         arg.iov[0].iov_base = (unsigned char *)&rep;
724         arg.iov[0].iov_len  = sizeof(rep.th);
725
726         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
727 #ifdef CONFIG_TCP_MD5SIG
728         rcu_read_lock();
729         hash_location = tcp_parse_md5sig_option(th);
730         if (sk && sk_fullsock(sk)) {
731                 const union tcp_md5_addr *addr;
732                 int l3index;
733
734                 /* sdif set, means packet ingressed via a device
735                  * in an L3 domain and inet_iif is set to it.
736                  */
737                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
738                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
739                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
740         } else if (hash_location) {
741                 const union tcp_md5_addr *addr;
742                 int sdif = tcp_v4_sdif(skb);
743                 int dif = inet_iif(skb);
744                 int l3index;
745
746                 /*
747                  * The active side is lost. Try to find the listening socket
748                  * through the source port, and then find the md5 key through it.
749                  * We are not losing security here:
750                  * the incoming packet is checked against the md5 hash of the
751                  * found key, and no RST is generated if the hash doesn't match.
752                  */
753                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
754                                              NULL, 0, ip_hdr(skb)->saddr,
755                                              th->source, ip_hdr(skb)->daddr,
756                                              ntohs(th->source), dif, sdif);
757                 /* don't send rst if it can't find key */
758                 if (!sk1)
759                         goto out;
760
761                 /* sdif set, means packet ingressed via a device
762                  * in an L3 domain and dif is set to it.
763                  */
764                 l3index = sdif ? dif : 0;
765                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
766                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
767                 if (!key)
768                         goto out;
769
770
771                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
772                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
773                         goto out;
774
775         }
776
777         if (key) {
778                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
779                                    (TCPOPT_NOP << 16) |
780                                    (TCPOPT_MD5SIG << 8) |
781                                    TCPOLEN_MD5SIG);
782                 /* Update length and the length the header thinks exists */
783                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
784                 rep.th.doff = arg.iov[0].iov_len / 4;
785
786                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
787                                      key, ip_hdr(skb)->saddr,
788                                      ip_hdr(skb)->daddr, &rep.th);
789         }
790 #endif
791         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
792         if (rep.opt[0] == 0) {
793                 __be32 mrst = mptcp_reset_option(skb);
794
795                 if (mrst) {
796                         rep.opt[0] = mrst;
797                         arg.iov[0].iov_len += sizeof(mrst);
798                         rep.th.doff = arg.iov[0].iov_len / 4;
799                 }
800         }
801
802         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
803                                       ip_hdr(skb)->saddr, /* XXX */
804                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
805         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
806         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
807
808         /* When the socket is gone, all binding information is lost, and
809          * routing might fail in this case. No choice here: if we force the
810          * input interface, we will misroute in case of an asymmetric route.
811          */
812         if (sk) {
813                 arg.bound_dev_if = sk->sk_bound_dev_if;
814                 if (sk_fullsock(sk))
815                         trace_tcp_send_reset(sk, skb);
816         }
817
818         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
819                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
820
821         arg.tos = ip_hdr(skb)->tos;
822         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
823         local_bh_disable();
824         ctl_sk = this_cpu_read(ipv4_tcp_sk);
825         sock_net_set(ctl_sk, net);
826         if (sk) {
827                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
828                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
829                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
830                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
831                 transmit_time = tcp_transmit_time(sk);
832                 xfrm_sk_clone_policy(ctl_sk, sk);
833                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
834                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
835         } else {
836                 ctl_sk->sk_mark = 0;
837                 ctl_sk->sk_priority = 0;
838         }
839         ip_send_unicast_reply(ctl_sk,
840                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
841                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
842                               &arg, arg.iov[0].iov_len,
843                               transmit_time, txhash);
844
845         xfrm_sk_free_policy(ctl_sk);
846         sock_net_set(ctl_sk, &init_net);
847         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
848         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
849         local_bh_enable();
850
851 #ifdef CONFIG_TCP_MD5SIG
852 out:
853         rcu_read_unlock();
854 #endif
855 }
856
857 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
858    outside socket context, is certainly ugly. What can I do?
859  */
860
861 static void tcp_v4_send_ack(const struct sock *sk,
862                             struct sk_buff *skb, u32 seq, u32 ack,
863                             u32 win, u32 tsval, u32 tsecr, int oif,
864                             struct tcp_md5sig_key *key,
865                             int reply_flags, u8 tos, u32 txhash)
866 {
867         const struct tcphdr *th = tcp_hdr(skb);
868         struct {
869                 struct tcphdr th;
870                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
871 #ifdef CONFIG_TCP_MD5SIG
872                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
873 #endif
874                         ];
875         } rep;
876         struct net *net = sock_net(sk);
877         struct ip_reply_arg arg;
878         struct sock *ctl_sk;
879         u64 transmit_time;
880
881         memset(&rep.th, 0, sizeof(struct tcphdr));
882         memset(&arg, 0, sizeof(arg));
883
884         arg.iov[0].iov_base = (unsigned char *)&rep;
885         arg.iov[0].iov_len  = sizeof(rep.th);
886         if (tsecr) {
887                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
888                                    (TCPOPT_TIMESTAMP << 8) |
889                                    TCPOLEN_TIMESTAMP);
890                 rep.opt[1] = htonl(tsval);
891                 rep.opt[2] = htonl(tsecr);
892                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
893         }
894
895         /* Swap the send and the receive. */
896         rep.th.dest    = th->source;
897         rep.th.source  = th->dest;
898         rep.th.doff    = arg.iov[0].iov_len / 4;
899         rep.th.seq     = htonl(seq);
900         rep.th.ack_seq = htonl(ack);
901         rep.th.ack     = 1;
902         rep.th.window  = htons(win);
903
904 #ifdef CONFIG_TCP_MD5SIG
905         if (key) {
906                 int offset = (tsecr) ? 3 : 0;
907
908                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
909                                           (TCPOPT_NOP << 16) |
910                                           (TCPOPT_MD5SIG << 8) |
911                                           TCPOLEN_MD5SIG);
912                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
913                 rep.th.doff = arg.iov[0].iov_len/4;
914
915                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
916                                     key, ip_hdr(skb)->saddr,
917                                     ip_hdr(skb)->daddr, &rep.th);
918         }
919 #endif
920         arg.flags = reply_flags;
921         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
922                                       ip_hdr(skb)->saddr, /* XXX */
923                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
924         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
925         if (oif)
926                 arg.bound_dev_if = oif;
927         arg.tos = tos;
928         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
929         local_bh_disable();
930         ctl_sk = this_cpu_read(ipv4_tcp_sk);
931         sock_net_set(ctl_sk, net);
932         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
933                            inet_twsk(sk)->tw_mark : sk->sk_mark;
934         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
935                            inet_twsk(sk)->tw_priority : sk->sk_priority;
936         transmit_time = tcp_transmit_time(sk);
937         ip_send_unicast_reply(ctl_sk,
938                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
939                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
940                               &arg, arg.iov[0].iov_len,
941                               transmit_time, txhash);
942
943         sock_net_set(ctl_sk, &init_net);
944         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
945         local_bh_enable();
946 }
947
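/* Send the ACK for a segment that landed on a TIME-WAIT socket, echoing the
 * timestamp state cached in the timewait bucket.
 */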
948 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
949 {
950         struct inet_timewait_sock *tw = inet_twsk(sk);
951         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
952
953         tcp_v4_send_ack(sk, skb,
954                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
955                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
956                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
957                         tcptw->tw_ts_recent,
958                         tw->tw_bound_dev_if,
959                         tcp_twsk_md5_key(tcptw),
960                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
961                         tw->tw_tos,
962                         tw->tw_txhash
963                         );
964
965         inet_twsk_put(tw);
966 }
967
968 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
969                                   struct request_sock *req)
970 {
971         const union tcp_md5_addr *addr;
972         int l3index;
973
974         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
975          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
976          */
977         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
978                                              tcp_sk(sk)->snd_nxt;
979
980         /* RFC 7323 2.3
981          * The window field (SEG.WND) of every outgoing segment, with the
982          * exception of <SYN> segments, MUST be right-shifted by
983          * Rcv.Wind.Shift bits:
984          */
985         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
986         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
987         tcp_v4_send_ack(sk, skb, seq,
988                         tcp_rsk(req)->rcv_nxt,
989                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
990                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
991                         READ_ONCE(req->ts_recent),
992                         0,
993                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
994                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
995                         ip_hdr(skb)->tos,
996                         READ_ONCE(tcp_rsk(req)->txhash));
997 }
998
999 /*
1000  *      Send a SYN-ACK after having received a SYN.
1001  *      This still operates on a request_sock only, not on a big
1002  *      socket.
1003  */
1004 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1005                               struct flowi *fl,
1006                               struct request_sock *req,
1007                               struct tcp_fastopen_cookie *foc,
1008                               enum tcp_synack_type synack_type,
1009                               struct sk_buff *syn_skb)
1010 {
1011         const struct inet_request_sock *ireq = inet_rsk(req);
1012         struct flowi4 fl4;
1013         int err = -1;
1014         struct sk_buff *skb;
1015         u8 tos;
1016
1017         /* First, grab a route. */
1018         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1019                 return -1;
1020
1021         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1022
1023         if (skb) {
1024                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1025
1026                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1027                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1028                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1029                                 inet_sk(sk)->tos;
1030
1031                 if (!INET_ECN_is_capable(tos) &&
1032                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1033                         tos |= INET_ECN_ECT_0;
1034
1035                 rcu_read_lock();
1036                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1037                                             ireq->ir_rmt_addr,
1038                                             rcu_dereference(ireq->ireq_opt),
1039                                             tos);
1040                 rcu_read_unlock();
1041                 err = net_xmit_eval(err);
1042         }
1043
1044         return err;
1045 }
1046
1047 /*
1048  *      IPv4 request_sock destructor.
1049  */
1050 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1051 {
1052         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1053 }
1054
1055 #ifdef CONFIG_TCP_MD5SIG
1056 /*
1057  * RFC2385 MD5 checksumming requires a mapping of
1058  * IP address->MD5 Key.
1059  * We need to maintain these in the sk structure.
1060  */
1061
1062 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1063 EXPORT_SYMBOL(tcp_md5_needed);
1064
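/* Rank two matching MD5 keys: a key bound to an L3 domain beats an unbound
 * one, otherwise the longer address prefix wins.
 */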
1065 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1066 {
1067         if (!old)
1068                 return true;
1069
1070         /* l3index always overrides non-l3index */
1071         if (old->l3index && new->l3index == 0)
1072                 return false;
1073         if (old->l3index == 0 && new->l3index)
1074                 return true;
1075
1076         return old->prefixlen < new->prefixlen;
1077 }
1078
1079 /* Find the Key structure for an address.  */
1080 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1081                                            const union tcp_md5_addr *addr,
1082                                            int family)
1083 {
1084         const struct tcp_sock *tp = tcp_sk(sk);
1085         struct tcp_md5sig_key *key;
1086         const struct tcp_md5sig_info *md5sig;
1087         __be32 mask;
1088         struct tcp_md5sig_key *best_match = NULL;
1089         bool match;
1090
1091         /* caller either holds rcu_read_lock() or socket lock */
1092         md5sig = rcu_dereference_check(tp->md5sig_info,
1093                                        lockdep_sock_is_held(sk));
1094         if (!md5sig)
1095                 return NULL;
1096
1097         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1098                                  lockdep_sock_is_held(sk)) {
1099                 if (key->family != family)
1100                         continue;
1101                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1102                         continue;
1103                 if (family == AF_INET) {
1104                         mask = inet_make_mask(key->prefixlen);
1105                         match = (key->addr.a4.s_addr & mask) ==
1106                                 (addr->a4.s_addr & mask);
1107 #if IS_ENABLED(CONFIG_IPV6)
1108                 } else if (family == AF_INET6) {
1109                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1110                                                   key->prefixlen);
1111 #endif
1112                 } else {
1113                         match = false;
1114                 }
1115
1116                 if (match && better_md5_match(best_match, key))
1117                         best_match = key;
1118         }
1119         return best_match;
1120 }
1121 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1122
1123 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1124                                                       const union tcp_md5_addr *addr,
1125                                                       int family, u8 prefixlen,
1126                                                       int l3index, u8 flags)
1127 {
1128         const struct tcp_sock *tp = tcp_sk(sk);
1129         struct tcp_md5sig_key *key;
1130         unsigned int size = sizeof(struct in_addr);
1131         const struct tcp_md5sig_info *md5sig;
1132
1133         /* caller either holds rcu_read_lock() or socket lock */
1134         md5sig = rcu_dereference_check(tp->md5sig_info,
1135                                        lockdep_sock_is_held(sk));
1136         if (!md5sig)
1137                 return NULL;
1138 #if IS_ENABLED(CONFIG_IPV6)
1139         if (family == AF_INET6)
1140                 size = sizeof(struct in6_addr);
1141 #endif
1142         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1143                                  lockdep_sock_is_held(sk)) {
1144                 if (key->family != family)
1145                         continue;
1146                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1147                         continue;
1148                 if (key->l3index != l3index)
1149                         continue;
1150                 if (!memcmp(&key->addr, addr, size) &&
1151                     key->prefixlen == prefixlen)
1152                         return key;
1153         }
1154         return NULL;
1155 }
1156
1157 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1158                                          const struct sock *addr_sk)
1159 {
1160         const union tcp_md5_addr *addr;
1161         int l3index;
1162
1163         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1164                                                  addr_sk->sk_bound_dev_if);
1165         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1166         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1167 }
1168 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1169
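/* Allocate the per-socket MD5 key list on first use. GSO is disabled since
 * the MD5 signature has to be computed for each segment individually.
 */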
1170 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1171 {
1172         struct tcp_sock *tp = tcp_sk(sk);
1173         struct tcp_md5sig_info *md5sig;
1174
1175         md5sig = kmalloc(sizeof(*md5sig), gfp);
1176         if (!md5sig)
1177                 return -ENOMEM;
1178
1179         sk_gso_disable(sk);
1180         INIT_HLIST_HEAD(&md5sig->head);
1181         rcu_assign_pointer(tp->md5sig_info, md5sig);
1182         return 0;
1183 }
1184
1185 /* This can be called on a newly created socket, from other files */
1186 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1187                             int family, u8 prefixlen, int l3index, u8 flags,
1188                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1189 {
1190         /* Add Key to the list */
1191         struct tcp_md5sig_key *key;
1192         struct tcp_sock *tp = tcp_sk(sk);
1193         struct tcp_md5sig_info *md5sig;
1194
1195         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1196         if (key) {
1197                 /* Pre-existing entry - just update that one.
1198                  * Note that the key might be used concurrently.
1199                  * data_race() is telling kcsan that we do not care of
1200                  * key mismatches, since changing MD5 key on live flows
1201                  * can lead to packet drops.
1202                  */
1203                 data_race(memcpy(key->key, newkey, newkeylen));
1204
1205                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1206                  * Also note that a reader could catch new key->keylen value
1207                  * but old key->key[], this is the reason we use __GFP_ZERO
1208                  * at sock_kmalloc() time below these lines.
1209                  */
1210                 WRITE_ONCE(key->keylen, newkeylen);
1211
1212                 return 0;
1213         }
1214
1215         md5sig = rcu_dereference_protected(tp->md5sig_info,
1216                                            lockdep_sock_is_held(sk));
1217
1218         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1219         if (!key)
1220                 return -ENOMEM;
1221         if (!tcp_alloc_md5sig_pool()) {
1222                 sock_kfree_s(sk, key, sizeof(*key));
1223                 return -ENOMEM;
1224         }
1225
1226         memcpy(key->key, newkey, newkeylen);
1227         key->keylen = newkeylen;
1228         key->family = family;
1229         key->prefixlen = prefixlen;
1230         key->l3index = l3index;
1231         key->flags = flags;
1232         memcpy(&key->addr, addr,
1233                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1234                                                                  sizeof(struct in_addr));
1235         hlist_add_head_rcu(&key->node, &md5sig->head);
1236         return 0;
1237 }
1238
1239 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1240                    int family, u8 prefixlen, int l3index, u8 flags,
1241                    const u8 *newkey, u8 newkeylen)
1242 {
1243         struct tcp_sock *tp = tcp_sk(sk);
1244
1245         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1246                 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1247                         return -ENOMEM;
1248
1249                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1250                         struct tcp_md5sig_info *md5sig;
1251
1252                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1253                         rcu_assign_pointer(tp->md5sig_info, NULL);
1254                         kfree_rcu(md5sig, rcu);
1255                         return -EUSERS;
1256                 }
1257         }
1258
1259         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1260                                 newkey, newkeylen, GFP_KERNEL);
1261 }
1262 EXPORT_SYMBOL(tcp_md5_do_add);
1263
1264 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1265                      int family, u8 prefixlen, int l3index,
1266                      struct tcp_md5sig_key *key)
1267 {
1268         struct tcp_sock *tp = tcp_sk(sk);
1269
1270         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1271                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1272                         return -ENOMEM;
1273
1274                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1275                         struct tcp_md5sig_info *md5sig;
1276
1277                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1278                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1279                         rcu_assign_pointer(tp->md5sig_info, NULL);
1280                         kfree_rcu(md5sig, rcu);
1281                         return -EUSERS;
1282                 }
1283         }
1284
1285         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1286                                 key->flags, key->key, key->keylen,
1287                                 sk_gfp_mask(sk, GFP_ATOMIC));
1288 }
1289 EXPORT_SYMBOL(tcp_md5_key_copy);
1290
1291 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1292                    u8 prefixlen, int l3index, u8 flags)
1293 {
1294         struct tcp_md5sig_key *key;
1295
1296         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1297         if (!key)
1298                 return -ENOENT;
1299         hlist_del_rcu(&key->node);
1300         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1301         kfree_rcu(key, rcu);
1302         return 0;
1303 }
1304 EXPORT_SYMBOL(tcp_md5_do_del);
1305
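/* Unlink and free every MD5 key attached to the socket. */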
1306 static void tcp_clear_md5_list(struct sock *sk)
1307 {
1308         struct tcp_sock *tp = tcp_sk(sk);
1309         struct tcp_md5sig_key *key;
1310         struct hlist_node *n;
1311         struct tcp_md5sig_info *md5sig;
1312
1313         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1314
1315         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1316                 hlist_del_rcu(&key->node);
1317                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1318                 kfree_rcu(key, rcu);
1319         }
1320 }
1321
1322 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1323                                  sockptr_t optval, int optlen)
1324 {
1325         struct tcp_md5sig cmd;
1326         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1327         const union tcp_md5_addr *addr;
1328         u8 prefixlen = 32;
1329         int l3index = 0;
1330         u8 flags;
1331
1332         if (optlen < sizeof(cmd))
1333                 return -EINVAL;
1334
1335         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1336                 return -EFAULT;
1337
1338         if (sin->sin_family != AF_INET)
1339                 return -EINVAL;
1340
1341         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1342
1343         if (optname == TCP_MD5SIG_EXT &&
1344             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1345                 prefixlen = cmd.tcpm_prefixlen;
1346                 if (prefixlen > 32)
1347                         return -EINVAL;
1348         }
1349
1350         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1351             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1352                 struct net_device *dev;
1353
1354                 rcu_read_lock();
1355                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1356                 if (dev && netif_is_l3_master(dev))
1357                         l3index = dev->ifindex;
1358
1359                 rcu_read_unlock();
1360
1361                 /* ok to reference set/not set outside of rcu;
1362                  * right now device MUST be an L3 master
1363                  */
1364                 if (!dev || !l3index)
1365                         return -EINVAL;
1366         }
1367
1368         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1369
1370         if (!cmd.tcpm_keylen)
1371                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1372
1373         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1374                 return -EINVAL;
1375
1376         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1377                               cmd.tcpm_key, cmd.tcpm_keylen);
1378 }
1379
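/*
 * Editor's note: tcp_v4_parse_md5_keys() above is the kernel side of the
 * TCP_MD5SIG/TCP_MD5SIG_EXT socket options.  The block below is a minimal,
 * illustrative userspace sketch (not part of this file, hence #if 0); the
 * helper name, peer address and key are placeholders, and it assumes the
 * UAPI definitions from <linux/tcp.h>.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/tcp.h>

static int set_tcp_md5_key(int fd, const char *peer_ip, const char *key)
{
	struct tcp_md5sig md5 = {};
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	sin->sin_family = AF_INET;
	if (inet_pton(AF_INET, peer_ip, &sin->sin_addr) != 1)
		return -1;

	if (strlen(key) > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	/* keylen == 0 would instead delete the key (tcp_md5_do_del()). */
	md5.tcpm_keylen = strlen(key);
	memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

	/* Apply to the socket before connect()/listen(). */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif
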
1380 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1381                                    __be32 daddr, __be32 saddr,
1382                                    const struct tcphdr *th, int nbytes)
1383 {
1384         struct tcp4_pseudohdr *bp;
1385         struct scatterlist sg;
1386         struct tcphdr *_th;
1387
1388         bp = hp->scratch;
1389         bp->saddr = saddr;
1390         bp->daddr = daddr;
1391         bp->pad = 0;
1392         bp->protocol = IPPROTO_TCP;
1393         bp->len = cpu_to_be16(nbytes);
1394
1395         _th = (struct tcphdr *)(bp + 1);
1396         memcpy(_th, th, sizeof(*th));
1397         _th->check = 0;
1398
1399         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1400         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1401                                 sizeof(*bp) + sizeof(*th));
1402         return crypto_ahash_update(hp->md5_req);
1403 }
1404
1405 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1406                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1407 {
1408         struct tcp_md5sig_pool *hp;
1409         struct ahash_request *req;
1410
1411         hp = tcp_get_md5sig_pool();
1412         if (!hp)
1413                 goto clear_hash_noput;
1414         req = hp->md5_req;
1415
1416         if (crypto_ahash_init(req))
1417                 goto clear_hash;
1418         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1419                 goto clear_hash;
1420         if (tcp_md5_hash_key(hp, key))
1421                 goto clear_hash;
1422         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1423         if (crypto_ahash_final(req))
1424                 goto clear_hash;
1425
1426         tcp_put_md5sig_pool();
1427         return 0;
1428
1429 clear_hash:
1430         tcp_put_md5sig_pool();
1431 clear_hash_noput:
1432         memset(md5_hash, 0, 16);
1433         return 1;
1434 }
1435
1436 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1437                         const struct sock *sk,
1438                         const struct sk_buff *skb)
1439 {
1440         struct tcp_md5sig_pool *hp;
1441         struct ahash_request *req;
1442         const struct tcphdr *th = tcp_hdr(skb);
1443         __be32 saddr, daddr;
1444
1445         if (sk) { /* valid for established/request sockets */
1446                 saddr = sk->sk_rcv_saddr;
1447                 daddr = sk->sk_daddr;
1448         } else {
1449                 const struct iphdr *iph = ip_hdr(skb);
1450                 saddr = iph->saddr;
1451                 daddr = iph->daddr;
1452         }
1453
1454         hp = tcp_get_md5sig_pool();
1455         if (!hp)
1456                 goto clear_hash_noput;
1457         req = hp->md5_req;
1458
1459         if (crypto_ahash_init(req))
1460                 goto clear_hash;
1461
1462         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1463                 goto clear_hash;
1464         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1465                 goto clear_hash;
1466         if (tcp_md5_hash_key(hp, key))
1467                 goto clear_hash;
1468         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1469         if (crypto_ahash_final(req))
1470                 goto clear_hash;
1471
1472         tcp_put_md5sig_pool();
1473         return 0;
1474
1475 clear_hash:
1476         tcp_put_md5sig_pool();
1477 clear_hash_noput:
1478         memset(md5_hash, 0, 16);
1479         return 1;
1480 }
1481 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
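
/*
 * Editor's worked example for the hashing helpers above: per RFC 2385 the
 * digest covers, in order, the IPv4 pseudo-header, the TCP header with its
 * checksum zeroed, the segment payload and finally the key, which matches
 * the tcp_v4_md5_hash_headers() / tcp_md5_hash_skb_data() / tcp_md5_hash_key()
 * sequence in tcp_v4_md5_hash_skb().  For a segment with a 20-byte TCP
 * header (doff = 5) and 100 bytes of payload, nbytes = skb->len = 120, so
 * bp->len in tcp_v4_md5_hash_headers() becomes htons(120).
 */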
1482
1483 #endif
1484
1485 static void tcp_v4_init_req(struct request_sock *req,
1486                             const struct sock *sk_listener,
1487                             struct sk_buff *skb)
1488 {
1489         struct inet_request_sock *ireq = inet_rsk(req);
1490         struct net *net = sock_net(sk_listener);
1491
1492         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1493         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1494         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1495 }
1496
1497 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1498                                           struct sk_buff *skb,
1499                                           struct flowi *fl,
1500                                           struct request_sock *req)
1501 {
1502         tcp_v4_init_req(req, sk, skb);
1503
1504         if (security_inet_conn_request(sk, skb, req))
1505                 return NULL;
1506
1507         return inet_csk_route_req(sk, &fl->u.ip4, req);
1508 }
1509
1510 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1511         .family         =       PF_INET,
1512         .obj_size       =       sizeof(struct tcp_request_sock),
1513         .rtx_syn_ack    =       tcp_rtx_synack,
1514         .send_ack       =       tcp_v4_reqsk_send_ack,
1515         .destructor     =       tcp_v4_reqsk_destructor,
1516         .send_reset     =       tcp_v4_send_reset,
1517         .syn_ack_timeout =      tcp_syn_ack_timeout,
1518 };
1519
1520 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1521         .mss_clamp      =       TCP_MSS_DEFAULT,
1522 #ifdef CONFIG_TCP_MD5SIG
1523         .req_md5_lookup =       tcp_v4_md5_lookup,
1524         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1525 #endif
1526 #ifdef CONFIG_SYN_COOKIES
1527         .cookie_init_seq =      cookie_v4_init_sequence,
1528 #endif
1529         .route_req      =       tcp_v4_route_req,
1530         .init_seq       =       tcp_v4_init_seq,
1531         .init_ts_off    =       tcp_v4_init_ts_off,
1532         .send_synack    =       tcp_v4_send_synack,
1533 };
1534
1535 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1536 {
1537         /* Never answer SYNs sent to broadcast or multicast addresses */
1538         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1539                 goto drop;
1540
1541         return tcp_conn_request(&tcp_request_sock_ops,
1542                                 &tcp_request_sock_ipv4_ops, sk, skb);
1543
1544 drop:
1545         tcp_listendrop(sk);
1546         return 0;
1547 }
1548 EXPORT_SYMBOL(tcp_v4_conn_request);
1549
1550
1551 /*
1552  * The three-way handshake has completed - we received a valid ACK of
1553  * our SYN-ACK - now create the new socket.
1554  */
1555 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1556                                   struct request_sock *req,
1557                                   struct dst_entry *dst,
1558                                   struct request_sock *req_unhash,
1559                                   bool *own_req)
1560 {
1561         struct inet_request_sock *ireq;
1562         bool found_dup_sk = false;
1563         struct inet_sock *newinet;
1564         struct tcp_sock *newtp;
1565         struct sock *newsk;
1566 #ifdef CONFIG_TCP_MD5SIG
1567         const union tcp_md5_addr *addr;
1568         struct tcp_md5sig_key *key;
1569         int l3index;
1570 #endif
1571         struct ip_options_rcu *inet_opt;
1572
1573         if (sk_acceptq_is_full(sk))
1574                 goto exit_overflow;
1575
1576         newsk = tcp_create_openreq_child(sk, req, skb);
1577         if (!newsk)
1578                 goto exit_nonewsk;
1579
1580         newsk->sk_gso_type = SKB_GSO_TCPV4;
1581         inet_sk_rx_dst_set(newsk, skb);
1582
1583         newtp                 = tcp_sk(newsk);
1584         newinet               = inet_sk(newsk);
1585         ireq                  = inet_rsk(req);
1586         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1587         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1588         newsk->sk_bound_dev_if = ireq->ir_iif;
1589         newinet->inet_saddr   = ireq->ir_loc_addr;
1590         inet_opt              = rcu_dereference(ireq->ireq_opt);
1591         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1592         newinet->mc_index     = inet_iif(skb);
1593         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1594         newinet->rcv_tos      = ip_hdr(skb)->tos;
1595         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1596         if (inet_opt)
1597                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1598         newinet->inet_id = get_random_u16();
1599
1600         /* Set ToS of the new socket based upon the value of the incoming SYN.
1601          * ECT bits are set later in tcp_init_transfer().
1602          */
1603         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1604                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1605
1606         if (!dst) {
1607                 dst = inet_csk_route_child_sock(sk, newsk, req);
1608                 if (!dst)
1609                         goto put_and_exit;
1610         } else {
1611                 /* syncookie case : see end of cookie_v4_check() */
1612         }
1613         sk_setup_caps(newsk, dst);
1614
1615         tcp_ca_openreq_child(newsk, dst);
1616
1617         tcp_sync_mss(newsk, dst_mtu(dst));
1618         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1619
1620         tcp_initialize_rcv_mss(newsk);
1621
1622 #ifdef CONFIG_TCP_MD5SIG
1623         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1624         /* Copy over the MD5 key from the original socket */
1625         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1626         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1627         if (key) {
1628                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1629                         goto put_and_exit;
1630                 sk_gso_disable(newsk);
1631         }
1632 #endif
1633
1634         if (__inet_inherit_port(sk, newsk) < 0)
1635                 goto put_and_exit;
1636         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1637                                        &found_dup_sk);
1638         if (likely(*own_req)) {
1639                 tcp_move_syn(newtp, req);
1640                 ireq->ireq_opt = NULL;
1641         } else {
1642                 newinet->inet_opt = NULL;
1643
1644                 if (!req_unhash && found_dup_sk) {
1645                         /* This code path should be executed only in the
1646                          * syncookie case.
1647                          */
1648                         bh_unlock_sock(newsk);
1649                         sock_put(newsk);
1650                         newsk = NULL;
1651                 }
1652         }
1653         return newsk;
1654
1655 exit_overflow:
1656         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1657 exit_nonewsk:
1658         dst_release(dst);
1659 exit:
1660         tcp_listendrop(sk);
1661         return NULL;
1662 put_and_exit:
1663         newinet->inet_opt = NULL;
1664         inet_csk_prepare_forced_close(newsk);
1665         tcp_done(newsk);
1666         goto exit;
1667 }
1668 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1669
1670 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1671 {
1672 #ifdef CONFIG_SYN_COOKIES
1673         const struct tcphdr *th = tcp_hdr(skb);
1674
1675         if (!th->syn)
1676                 sk = cookie_v4_check(sk, skb);
1677 #endif
1678         return sk;
1679 }
1680
1681 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1682                          struct tcphdr *th, u32 *cookie)
1683 {
1684         u16 mss = 0;
1685 #ifdef CONFIG_SYN_COOKIES
1686         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1687                                     &tcp_request_sock_ipv4_ops, sk, th);
1688         if (mss) {
1689                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1690                 tcp_synq_overflow(sk);
1691         }
1692 #endif
1693         return mss;
1694 }
1695
1696 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1697                                                            u32));
1698 /* The socket must have its spinlock held when we get
1699  * here, unless it is a TCP_LISTEN socket.
1700  *
1701  * We have a potential double-lock case here, so even when
1702  * doing backlog processing we use the BH locking scheme.
1703  * This is because we cannot sleep with the original spinlock
1704  * held.
1705  */
1706 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1707 {
1708         enum skb_drop_reason reason;
1709         struct sock *rsk;
1710
1711         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1712                 struct dst_entry *dst;
1713
1714                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1715                                                 lockdep_sock_is_held(sk));
1716
1717                 sock_rps_save_rxhash(sk, skb);
1718                 sk_mark_napi_id(sk, skb);
1719                 if (dst) {
1720                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1721                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1722                                              dst, 0)) {
1723                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1724                                 dst_release(dst);
1725                         }
1726                 }
1727                 tcp_rcv_established(sk, skb);
1728                 return 0;
1729         }
1730
1731         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1732         if (tcp_checksum_complete(skb))
1733                 goto csum_err;
1734
1735         if (sk->sk_state == TCP_LISTEN) {
1736                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1737
1738                 if (!nsk)
1739                         goto discard;
1740                 if (nsk != sk) {
1741                         if (tcp_child_process(sk, nsk, skb)) {
1742                                 rsk = nsk;
1743                                 goto reset;
1744                         }
1745                         return 0;
1746                 }
1747         } else
1748                 sock_rps_save_rxhash(sk, skb);
1749
1750         if (tcp_rcv_state_process(sk, skb)) {
1751                 rsk = sk;
1752                 goto reset;
1753         }
1754         return 0;
1755
1756 reset:
1757         tcp_v4_send_reset(rsk, skb);
1758 discard:
1759         kfree_skb_reason(skb, reason);
1760         /* Be careful here. If this function gets more complicated and
1761          * gcc suffers from register pressure on the x86, sk (in %ebx)
1762          * might be destroyed here. This current version compiles correctly,
1763          * but you have been warned.
1764          */
1765         return 0;
1766
1767 csum_err:
1768         reason = SKB_DROP_REASON_TCP_CSUM;
1769         trace_tcp_bad_csum(skb);
1770         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1771         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1772         goto discard;
1773 }
1774 EXPORT_SYMBOL(tcp_v4_do_rcv);
1775
1776 int tcp_v4_early_demux(struct sk_buff *skb)
1777 {
1778         struct net *net = dev_net(skb->dev);
1779         const struct iphdr *iph;
1780         const struct tcphdr *th;
1781         struct sock *sk;
1782
1783         if (skb->pkt_type != PACKET_HOST)
1784                 return 0;
1785
1786         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1787                 return 0;
1788
1789         iph = ip_hdr(skb);
1790         th = tcp_hdr(skb);
1791
1792         if (th->doff < sizeof(struct tcphdr) / 4)
1793                 return 0;
1794
1795         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1796                                        iph->saddr, th->source,
1797                                        iph->daddr, ntohs(th->dest),
1798                                        skb->skb_iif, inet_sdif(skb));
1799         if (sk) {
1800                 skb->sk = sk;
1801                 skb->destructor = sock_edemux;
1802                 if (sk_fullsock(sk)) {
1803                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1804
1805                         if (dst)
1806                                 dst = dst_check(dst, 0);
1807                         if (dst &&
1808                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1809                                 skb_dst_set_noref(skb, dst);
1810                 }
1811         }
1812         return 0;
1813 }
1814
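/*
 * Editor's note (hedged): whether this early-demux fast path runs at all is
 * controlled from userspace via sysctls, e.g.
 *
 *	sysctl -w net.ipv4.ip_early_demux=1
 *	sysctl -w net.ipv4.tcp_early_demux=1
 *
 * Both are enabled by default on common configurations.
 */
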
1815 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1816                      enum skb_drop_reason *reason)
1817 {
1818         u32 limit, tail_gso_size, tail_gso_segs;
1819         struct skb_shared_info *shinfo;
1820         const struct tcphdr *th;
1821         struct tcphdr *thtail;
1822         struct sk_buff *tail;
1823         unsigned int hdrlen;
1824         bool fragstolen;
1825         u32 gso_segs;
1826         u32 gso_size;
1827         int delta;
1828
1829         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1830          * we can fix skb->truesize to its real value to avoid future drops.
1831          * This is valid because skb is not yet charged to the socket.
1832          * It has been noticed that pure SACK packets were sometimes dropped
1833          * (if cooked by drivers without the copybreak feature).
1834          */
1835         skb_condense(skb);
1836
1837         skb_dst_drop(skb);
1838
1839         if (unlikely(tcp_checksum_complete(skb))) {
1840                 bh_unlock_sock(sk);
1841                 trace_tcp_bad_csum(skb);
1842                 *reason = SKB_DROP_REASON_TCP_CSUM;
1843                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1844                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1845                 return true;
1846         }
1847
1848         /* Attempt coalescing to last skb in backlog, even if we are
1849          * above the limits.
1850          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1851          */
1852         th = (const struct tcphdr *)skb->data;
1853         hdrlen = th->doff * 4;
1854
1855         tail = sk->sk_backlog.tail;
1856         if (!tail)
1857                 goto no_coalesce;
1858         thtail = (struct tcphdr *)tail->data;
1859
1860         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1861             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1862             ((TCP_SKB_CB(tail)->tcp_flags |
1863               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1864             !((TCP_SKB_CB(tail)->tcp_flags &
1865               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1866             ((TCP_SKB_CB(tail)->tcp_flags ^
1867               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1868 #ifdef CONFIG_TLS_DEVICE
1869             tail->decrypted != skb->decrypted ||
1870 #endif
1871             thtail->doff != th->doff ||
1872             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1873                 goto no_coalesce;
1874
1875         __skb_pull(skb, hdrlen);
1876
1877         shinfo = skb_shinfo(skb);
1878         gso_size = shinfo->gso_size ?: skb->len;
1879         gso_segs = shinfo->gso_segs ?: 1;
1880
1881         shinfo = skb_shinfo(tail);
1882         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1883         tail_gso_segs = shinfo->gso_segs ?: 1;
1884
1885         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1886                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1887
1888                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1889                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1890                         thtail->window = th->window;
1891                 }
1892
1893                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1894                  * thtail->fin, so that the fast path in tcp_rcv_established()
1895                  * is not entered if we append a packet with a FIN.
1896                  * SYN, RST, URG are not present.
1897                  * ACK is set on both packets.
1898                  * PSH: we do not really care in the TCP stack,
1899                  *       at least for 'GRO' packets.
1900                  */
1901                 thtail->fin |= th->fin;
1902                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1903
1904                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1905                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1906                         tail->tstamp = skb->tstamp;
1907                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1908                 }
1909
1910                 /* Not as strict as GRO. We only need to carry mss max value */
1911                 shinfo->gso_size = max(gso_size, tail_gso_size);
1912                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1913
1914                 sk->sk_backlog.len += delta;
1915                 __NET_INC_STATS(sock_net(sk),
1916                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1917                 kfree_skb_partial(skb, fragstolen);
1918                 return false;
1919         }
1920         __skb_push(skb, hdrlen);
1921
1922 no_coalesce:
1923         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1924
1925         /* Only socket owner can try to collapse/prune rx queues
1926          * to reduce memory overhead, so add a little headroom here.
1927          * Only a few socket backlogs are likely to be non-empty concurrently.
1928          */
1929         limit += 64 * 1024;
1930
1931         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1932                 bh_unlock_sock(sk);
1933                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1934                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1935                 return true;
1936         }
1937         return false;
1938 }
1939 EXPORT_SYMBOL(tcp_add_backlog);
1940
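/*
 * Editor's worked example for the backlog limit above, with illustrative
 * values sk_rcvbuf = 131072 and sk_sndbuf = 16384:
 *
 *	limit = 131072 + (16384 >> 1) + 64 * 1024
 *	      = 131072 + 8192 + 65536
 *	      = 204800 bytes
 *
 * i.e. roughly 200 KB of skb truesize may accumulate in the backlog before
 * sk_add_backlog() starts rejecting packets for this socket.
 */
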
1941 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1942 {
1943         struct tcphdr *th = (struct tcphdr *)skb->data;
1944
1945         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1946 }
1947 EXPORT_SYMBOL(tcp_filter);
1948
1949 static void tcp_v4_restore_cb(struct sk_buff *skb)
1950 {
1951         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1952                 sizeof(struct inet_skb_parm));
1953 }
1954
1955 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1956                            const struct tcphdr *th)
1957 {
1958         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1959          * barrier() makes sure the compiler won't play aliasing games.
1960          */
1961         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1962                 sizeof(struct inet_skb_parm));
1963         barrier();
1964
1965         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1966         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1967                                     skb->len - th->doff * 4);
1968         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1969         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1970         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1971         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1972         TCP_SKB_CB(skb)->sacked  = 0;
1973         TCP_SKB_CB(skb)->has_rxtstamp =
1974                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1975 }
1976
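/*
 * Editor's worked example for the end_seq computation above: SYN and FIN
 * each consume one unit of sequence space.  For a segment with seq = 1000,
 * a 20-byte TCP header (doff = 5), 100 bytes of payload (skb->len = 120)
 * and the FIN flag set:
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + (120 - 5 * 4) = 1101
 */
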
1977 /*
1978  *      From tcp_input.c
1979  */
1980
1981 int tcp_v4_rcv(struct sk_buff *skb)
1982 {
1983         struct net *net = dev_net(skb->dev);
1984         enum skb_drop_reason drop_reason;
1985         int sdif = inet_sdif(skb);
1986         int dif = inet_iif(skb);
1987         const struct iphdr *iph;
1988         const struct tcphdr *th;
1989         bool refcounted;
1990         struct sock *sk;
1991         int ret;
1992
1993         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1994         if (skb->pkt_type != PACKET_HOST)
1995                 goto discard_it;
1996
1997         /* Count it even if it's bad */
1998         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1999
2000         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2001                 goto discard_it;
2002
2003         th = (const struct tcphdr *)skb->data;
2004
2005         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2006                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2007                 goto bad_packet;
2008         }
2009         if (!pskb_may_pull(skb, th->doff * 4))
2010                 goto discard_it;
2011
2012         /* An explanation is required here, I think.
2013          * Packet length and doff are validated by header prediction,
2014          * provided the case of th->doff == 0 is eliminated.
2015          * So, we defer the checks. */
2016
2017         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2018                 goto csum_error;
2019
2020         th = (const struct tcphdr *)skb->data;
2021         iph = ip_hdr(skb);
2022 lookup:
2023         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2024                                skb, __tcp_hdrlen(th), th->source,
2025                                th->dest, sdif, &refcounted);
2026         if (!sk)
2027                 goto no_tcp_socket;
2028
2029 process:
2030         if (sk->sk_state == TCP_TIME_WAIT)
2031                 goto do_time_wait;
2032
2033         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2034                 struct request_sock *req = inet_reqsk(sk);
2035                 bool req_stolen = false;
2036                 struct sock *nsk;
2037
2038                 sk = req->rsk_listener;
2039                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2040                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2041                 else
2042                         drop_reason = tcp_inbound_md5_hash(sk, skb,
2043                                                    &iph->saddr, &iph->daddr,
2044                                                    AF_INET, dif, sdif);
2045                 if (unlikely(drop_reason)) {
2046                         sk_drops_add(sk, skb);
2047                         reqsk_put(req);
2048                         goto discard_it;
2049                 }
2050                 if (tcp_checksum_complete(skb)) {
2051                         reqsk_put(req);
2052                         goto csum_error;
2053                 }
2054                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2055                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2056                         if (!nsk) {
2057                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2058                                 goto lookup;
2059                         }
2060                         sk = nsk;
2061                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2062                          * before returning.
2063                          */
2064                 } else {
2065                         /* We own a reference on the listener, increase it again
2066                          * as we might lose it too soon.
2067                          */
2068                         sock_hold(sk);
2069                 }
2070                 refcounted = true;
2071                 nsk = NULL;
2072                 if (!tcp_filter(sk, skb)) {
2073                         th = (const struct tcphdr *)skb->data;
2074                         iph = ip_hdr(skb);
2075                         tcp_v4_fill_cb(skb, iph, th);
2076                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2077                 } else {
2078                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2079                 }
2080                 if (!nsk) {
2081                         reqsk_put(req);
2082                         if (req_stolen) {
2083                                 /* Another cpu got exclusive access to req
2084                                  * and created a full blown socket.
2085                                  * Try to feed this packet to this socket
2086                                  * instead of discarding it.
2087                                  */
2088                                 tcp_v4_restore_cb(skb);
2089                                 sock_put(sk);
2090                                 goto lookup;
2091                         }
2092                         goto discard_and_relse;
2093                 }
2094                 nf_reset_ct(skb);
2095                 if (nsk == sk) {
2096                         reqsk_put(req);
2097                         tcp_v4_restore_cb(skb);
2098                 } else if (tcp_child_process(sk, nsk, skb)) {
2099                         tcp_v4_send_reset(nsk, skb);
2100                         goto discard_and_relse;
2101                 } else {
2102                         sock_put(sk);
2103                         return 0;
2104                 }
2105         }
2106
2107         if (static_branch_unlikely(&ip4_min_ttl)) {
2108                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2109                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2110                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2111                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2112                         goto discard_and_relse;
2113                 }
2114         }
2115
2116         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2117                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2118                 goto discard_and_relse;
2119         }
2120
2121         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2122                                            &iph->daddr, AF_INET, dif, sdif);
2123         if (drop_reason)
2124                 goto discard_and_relse;
2125
2126         nf_reset_ct(skb);
2127
2128         if (tcp_filter(sk, skb)) {
2129                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2130                 goto discard_and_relse;
2131         }
2132         th = (const struct tcphdr *)skb->data;
2133         iph = ip_hdr(skb);
2134         tcp_v4_fill_cb(skb, iph, th);
2135
2136         skb->dev = NULL;
2137
2138         if (sk->sk_state == TCP_LISTEN) {
2139                 ret = tcp_v4_do_rcv(sk, skb);
2140                 goto put_and_return;
2141         }
2142
2143         sk_incoming_cpu_update(sk);
2144
2145         bh_lock_sock_nested(sk);
2146         tcp_segs_in(tcp_sk(sk), skb);
2147         ret = 0;
2148         if (!sock_owned_by_user(sk)) {
2149                 ret = tcp_v4_do_rcv(sk, skb);
2150         } else {
2151                 if (tcp_add_backlog(sk, skb, &drop_reason))
2152                         goto discard_and_relse;
2153         }
2154         bh_unlock_sock(sk);
2155
2156 put_and_return:
2157         if (refcounted)
2158                 sock_put(sk);
2159
2160         return ret;
2161
2162 no_tcp_socket:
2163         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2164         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2165                 goto discard_it;
2166
2167         tcp_v4_fill_cb(skb, iph, th);
2168
2169         if (tcp_checksum_complete(skb)) {
2170 csum_error:
2171                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2172                 trace_tcp_bad_csum(skb);
2173                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2174 bad_packet:
2175                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2176         } else {
2177                 tcp_v4_send_reset(NULL, skb);
2178         }
2179
2180 discard_it:
2181         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2182         /* Discard frame. */
2183         kfree_skb_reason(skb, drop_reason);
2184         return 0;
2185
2186 discard_and_relse:
2187         sk_drops_add(sk, skb);
2188         if (refcounted)
2189                 sock_put(sk);
2190         goto discard_it;
2191
2192 do_time_wait:
2193         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2194                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2195                 inet_twsk_put(inet_twsk(sk));
2196                 goto discard_it;
2197         }
2198
2199         tcp_v4_fill_cb(skb, iph, th);
2200
2201         if (tcp_checksum_complete(skb)) {
2202                 inet_twsk_put(inet_twsk(sk));
2203                 goto csum_error;
2204         }
2205         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2206         case TCP_TW_SYN: {
2207                 struct sock *sk2 = inet_lookup_listener(net,
2208                                                         net->ipv4.tcp_death_row.hashinfo,
2209                                                         skb, __tcp_hdrlen(th),
2210                                                         iph->saddr, th->source,
2211                                                         iph->daddr, th->dest,
2212                                                         inet_iif(skb),
2213                                                         sdif);
2214                 if (sk2) {
2215                         inet_twsk_deschedule_put(inet_twsk(sk));
2216                         sk = sk2;
2217                         tcp_v4_restore_cb(skb);
2218                         refcounted = false;
2219                         goto process;
2220                 }
2221         }
2222                 /* to ACK */
2223                 fallthrough;
2224         case TCP_TW_ACK:
2225                 tcp_v4_timewait_ack(sk, skb);
2226                 break;
2227         case TCP_TW_RST:
2228                 tcp_v4_send_reset(sk, skb);
2229                 inet_twsk_deschedule_put(inet_twsk(sk));
2230                 goto discard_it;
2231         case TCP_TW_SUCCESS:;
2232         }
2233         goto discard_it;
2234 }
2235
2236 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2237         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2238         .twsk_unique    = tcp_twsk_unique,
2239         .twsk_destructor= tcp_twsk_destructor,
2240 };
2241
2242 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2243 {
2244         struct dst_entry *dst = skb_dst(skb);
2245
2246         if (dst && dst_hold_safe(dst)) {
2247                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2248                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2249         }
2250 }
2251 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2252
2253 const struct inet_connection_sock_af_ops ipv4_specific = {
2254         .queue_xmit        = ip_queue_xmit,
2255         .send_check        = tcp_v4_send_check,
2256         .rebuild_header    = inet_sk_rebuild_header,
2257         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2258         .conn_request      = tcp_v4_conn_request,
2259         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2260         .net_header_len    = sizeof(struct iphdr),
2261         .setsockopt        = ip_setsockopt,
2262         .getsockopt        = ip_getsockopt,
2263         .addr2sockaddr     = inet_csk_addr2sockaddr,
2264         .sockaddr_len      = sizeof(struct sockaddr_in),
2265         .mtu_reduced       = tcp_v4_mtu_reduced,
2266 };
2267 EXPORT_SYMBOL(ipv4_specific);
2268
2269 #ifdef CONFIG_TCP_MD5SIG
2270 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2271         .md5_lookup             = tcp_v4_md5_lookup,
2272         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2273         .md5_parse              = tcp_v4_parse_md5_keys,
2274 };
2275 #endif
2276
2277 /* NOTE: A lot of things are set to zero explicitly by the call to
2278  *       sk_alloc(), so they need not be done here.
2279  */
2280 static int tcp_v4_init_sock(struct sock *sk)
2281 {
2282         struct inet_connection_sock *icsk = inet_csk(sk);
2283
2284         tcp_init_sock(sk);
2285
2286         icsk->icsk_af_ops = &ipv4_specific;
2287
2288 #ifdef CONFIG_TCP_MD5SIG
2289         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2290 #endif
2291
2292         return 0;
2293 }
2294
2295 void tcp_v4_destroy_sock(struct sock *sk)
2296 {
2297         struct tcp_sock *tp = tcp_sk(sk);
2298
2299         trace_tcp_destroy_sock(sk);
2300
2301         tcp_clear_xmit_timers(sk);
2302
2303         tcp_cleanup_congestion_control(sk);
2304
2305         tcp_cleanup_ulp(sk);
2306
2307         /* Clean up the write buffer. */
2308         tcp_write_queue_purge(sk);
2309
2310         /* Check if we want to disable active TFO */
2311         tcp_fastopen_active_disable_ofo_check(sk);
2312
2313         /* Cleans up our, hopefully empty, out_of_order_queue. */
2314         skb_rbtree_purge(&tp->out_of_order_queue);
2315
2316 #ifdef CONFIG_TCP_MD5SIG
2317         /* Clean up the MD5 key list, if any */
2318         if (tp->md5sig_info) {
2319                 tcp_clear_md5_list(sk);
2320                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2321                 tp->md5sig_info = NULL;
2322                 static_branch_slow_dec_deferred(&tcp_md5_needed);
2323         }
2324 #endif
2325
2326         /* Clean up a referenced TCP bind bucket. */
2327         if (inet_csk(sk)->icsk_bind_hash)
2328                 inet_put_port(sk);
2329
2330         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2331
2332         /* If socket is aborted during connect operation */
2333         tcp_free_fastopen_req(tp);
2334         tcp_fastopen_destroy_cipher(sk);
2335         tcp_saved_syn_free(tp);
2336
2337         sk_sockets_allocated_dec(sk);
2338 }
2339 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2340
2341 #ifdef CONFIG_PROC_FS
2342 /* Proc filesystem TCP sock list dumping. */
2343
2344 static unsigned short seq_file_family(const struct seq_file *seq);
2345
2346 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2347 {
2348         unsigned short family = seq_file_family(seq);
2349
2350         /* AF_UNSPEC is used as a match-all */
2351         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2352                 net_eq(sock_net(sk), seq_file_net(seq)));
2353 }
2354
2355 /* Find a non-empty bucket (starting from st->bucket)
2356  * and return the first sk from it.
2357  */
2358 static void *listening_get_first(struct seq_file *seq)
2359 {
2360         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2361         struct tcp_iter_state *st = seq->private;
2362
2363         st->offset = 0;
2364         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2365                 struct inet_listen_hashbucket *ilb2;
2366                 struct hlist_nulls_node *node;
2367                 struct sock *sk;
2368
2369                 ilb2 = &hinfo->lhash2[st->bucket];
2370                 if (hlist_nulls_empty(&ilb2->nulls_head))
2371                         continue;
2372
2373                 spin_lock(&ilb2->lock);
2374                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2375                         if (seq_sk_match(seq, sk))
2376                                 return sk;
2377                 }
2378                 spin_unlock(&ilb2->lock);
2379         }
2380
2381         return NULL;
2382 }
2383
2384 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2385  * If "cur" is the last one in the st->bucket,
2386  * call listening_get_first() to return the first sk of the next
2387  * non-empty bucket.
2388  */
2389 static void *listening_get_next(struct seq_file *seq, void *cur)
2390 {
2391         struct tcp_iter_state *st = seq->private;
2392         struct inet_listen_hashbucket *ilb2;
2393         struct hlist_nulls_node *node;
2394         struct inet_hashinfo *hinfo;
2395         struct sock *sk = cur;
2396
2397         ++st->num;
2398         ++st->offset;
2399
2400         sk = sk_nulls_next(sk);
2401         sk_nulls_for_each_from(sk, node) {
2402                 if (seq_sk_match(seq, sk))
2403                         return sk;
2404         }
2405
2406         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2407         ilb2 = &hinfo->lhash2[st->bucket];
2408         spin_unlock(&ilb2->lock);
2409         ++st->bucket;
2410         return listening_get_first(seq);
2411 }
2412
2413 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2414 {
2415         struct tcp_iter_state *st = seq->private;
2416         void *rc;
2417
2418         st->bucket = 0;
2419         st->offset = 0;
2420         rc = listening_get_first(seq);
2421
2422         while (rc && *pos) {
2423                 rc = listening_get_next(seq, rc);
2424                 --*pos;
2425         }
2426         return rc;
2427 }
2428
2429 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2430                                 const struct tcp_iter_state *st)
2431 {
2432         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2433 }
2434
2435 /*
2436  * Get first established socket starting from bucket given in st->bucket.
2437  * If st->bucket is zero, the very first socket in the hash is returned.
2438  */
2439 static void *established_get_first(struct seq_file *seq)
2440 {
2441         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2442         struct tcp_iter_state *st = seq->private;
2443
2444         st->offset = 0;
2445         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2446                 struct sock *sk;
2447                 struct hlist_nulls_node *node;
2448                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2449
2450                 /* Lockless fast path for the common case of empty buckets */
2451                 if (empty_bucket(hinfo, st))
2452                         continue;
2453
2454                 spin_lock_bh(lock);
2455                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2456                         if (seq_sk_match(seq, sk))
2457                                 return sk;
2458                 }
2459                 spin_unlock_bh(lock);
2460         }
2461
2462         return NULL;
2463 }
2464
2465 static void *established_get_next(struct seq_file *seq, void *cur)
2466 {
2467         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2468         struct tcp_iter_state *st = seq->private;
2469         struct hlist_nulls_node *node;
2470         struct sock *sk = cur;
2471
2472         ++st->num;
2473         ++st->offset;
2474
2475         sk = sk_nulls_next(sk);
2476
2477         sk_nulls_for_each_from(sk, node) {
2478                 if (seq_sk_match(seq, sk))
2479                         return sk;
2480         }
2481
2482         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2483         ++st->bucket;
2484         return established_get_first(seq);
2485 }
2486
2487 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2488 {
2489         struct tcp_iter_state *st = seq->private;
2490         void *rc;
2491
2492         st->bucket = 0;
2493         rc = established_get_first(seq);
2494
2495         while (rc && pos) {
2496                 rc = established_get_next(seq, rc);
2497                 --pos;
2498         }
2499         return rc;
2500 }
2501
2502 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2503 {
2504         void *rc;
2505         struct tcp_iter_state *st = seq->private;
2506
2507         st->state = TCP_SEQ_STATE_LISTENING;
2508         rc        = listening_get_idx(seq, &pos);
2509
2510         if (!rc) {
2511                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2512                 rc        = established_get_idx(seq, pos);
2513         }
2514
2515         return rc;
2516 }
2517
2518 static void *tcp_seek_last_pos(struct seq_file *seq)
2519 {
2520         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2521         struct tcp_iter_state *st = seq->private;
2522         int bucket = st->bucket;
2523         int offset = st->offset;
2524         int orig_num = st->num;
2525         void *rc = NULL;
2526
2527         switch (st->state) {
2528         case TCP_SEQ_STATE_LISTENING:
2529                 if (st->bucket > hinfo->lhash2_mask)
2530                         break;
2531                 rc = listening_get_first(seq);
2532                 while (offset-- && rc && bucket == st->bucket)
2533                         rc = listening_get_next(seq, rc);
2534                 if (rc)
2535                         break;
2536                 st->bucket = 0;
2537                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2538                 fallthrough;
2539         case TCP_SEQ_STATE_ESTABLISHED:
2540                 if (st->bucket > hinfo->ehash_mask)
2541                         break;
2542                 rc = established_get_first(seq);
2543                 while (offset-- && rc && bucket == st->bucket)
2544                         rc = established_get_next(seq, rc);
2545         }
2546
2547         st->num = orig_num;
2548
2549         return rc;
2550 }
2551
2552 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2553 {
2554         struct tcp_iter_state *st = seq->private;
2555         void *rc;
2556
2557         if (*pos && *pos == st->last_pos) {
2558                 rc = tcp_seek_last_pos(seq);
2559                 if (rc)
2560                         goto out;
2561         }
2562
2563         st->state = TCP_SEQ_STATE_LISTENING;
2564         st->num = 0;
2565         st->bucket = 0;
2566         st->offset = 0;
2567         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2568
2569 out:
2570         st->last_pos = *pos;
2571         return rc;
2572 }
2573 EXPORT_SYMBOL(tcp_seq_start);
2574
2575 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2576 {
2577         struct tcp_iter_state *st = seq->private;
2578         void *rc = NULL;
2579
2580         if (v == SEQ_START_TOKEN) {
2581                 rc = tcp_get_idx(seq, 0);
2582                 goto out;
2583         }
2584
2585         switch (st->state) {
2586         case TCP_SEQ_STATE_LISTENING:
2587                 rc = listening_get_next(seq, v);
2588                 if (!rc) {
2589                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2590                         st->bucket = 0;
2591                         st->offset = 0;
2592                         rc        = established_get_first(seq);
2593                 }
2594                 break;
2595         case TCP_SEQ_STATE_ESTABLISHED:
2596                 rc = established_get_next(seq, v);
2597                 break;
2598         }
2599 out:
2600         ++*pos;
2601         st->last_pos = *pos;
2602         return rc;
2603 }
2604 EXPORT_SYMBOL(tcp_seq_next);
2605
2606 void tcp_seq_stop(struct seq_file *seq, void *v)
2607 {
2608         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2609         struct tcp_iter_state *st = seq->private;
2610
2611         switch (st->state) {
2612         case TCP_SEQ_STATE_LISTENING:
2613                 if (v != SEQ_START_TOKEN)
2614                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2615                 break;
2616         case TCP_SEQ_STATE_ESTABLISHED:
2617                 if (v)
2618                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2619                 break;
2620         }
2621 }
2622 EXPORT_SYMBOL(tcp_seq_stop);
2623
2624 static void get_openreq4(const struct request_sock *req,
2625                          struct seq_file *f, int i)
2626 {
2627         const struct inet_request_sock *ireq = inet_rsk(req);
2628         long delta = req->rsk_timer.expires - jiffies;
2629
2630         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2631                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2632                 i,
2633                 ireq->ir_loc_addr,
2634                 ireq->ir_num,
2635                 ireq->ir_rmt_addr,
2636                 ntohs(ireq->ir_rmt_port),
2637                 TCP_SYN_RECV,
2638                 0, 0, /* could print option size, but that is af dependent. */
2639                 1,    /* timers active (only the expire timer) */
2640                 jiffies_delta_to_clock_t(delta),
2641                 req->num_timeout,
2642                 from_kuid_munged(seq_user_ns(f),
2643                                  sock_i_uid(req->rsk_listener)),
2644                 0,  /* non standard timer */
2645                 0, /* open_requests have no inode */
2646                 0,
2647                 req);
2648 }
2649
2650 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2651 {
2652         int timer_active;
2653         unsigned long timer_expires;
2654         const struct tcp_sock *tp = tcp_sk(sk);
2655         const struct inet_connection_sock *icsk = inet_csk(sk);
2656         const struct inet_sock *inet = inet_sk(sk);
2657         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2658         __be32 dest = inet->inet_daddr;
2659         __be32 src = inet->inet_rcv_saddr;
2660         __u16 destp = ntohs(inet->inet_dport);
2661         __u16 srcp = ntohs(inet->inet_sport);
2662         int rx_queue;
2663         int state;
2664
2665         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2666             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2667             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2668                 timer_active    = 1;
2669                 timer_expires   = icsk->icsk_timeout;
2670         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2671                 timer_active    = 4;
2672                 timer_expires   = icsk->icsk_timeout;
2673         } else if (timer_pending(&sk->sk_timer)) {
2674                 timer_active    = 2;
2675                 timer_expires   = sk->sk_timer.expires;
2676         } else {
2677                 timer_active    = 0;
2678                 timer_expires = jiffies;
2679         }
2680
2681         state = inet_sk_state_load(sk);
2682         if (state == TCP_LISTEN)
2683                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2684         else
2685                 /* Because we don't lock the socket,
2686                  * we might find a transient negative value.
2687                  */
2688                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2689                                       READ_ONCE(tp->copied_seq), 0);
2690
2691         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2692                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2693                 i, src, srcp, dest, destp, state,
2694                 READ_ONCE(tp->write_seq) - tp->snd_una,
2695                 rx_queue,
2696                 timer_active,
2697                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2698                 icsk->icsk_retransmits,
2699                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2700                 icsk->icsk_probes_out,
2701                 sock_i_ino(sk),
2702                 refcount_read(&sk->sk_refcnt), sk,
2703                 jiffies_to_clock_t(icsk->icsk_rto),
2704                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2705                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2706                 tcp_snd_cwnd(tp),
2707                 state == TCP_LISTEN ?
2708                     fastopenq->max_qlen :
2709                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2710 }
2711
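/*
 * Editor's example (hedged, spacing approximate) of a line produced by
 * get_tcp4_sock() as seen in /proc/net/tcp:
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0 0 12345 1 ...
 *
 * Addresses and ports are hexadecimal: on a little-endian machine 0100007F
 * is 127.0.0.1, 0016 is port 22, and state 0A is TCP_LISTEN (10).
 */
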
2712 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2713                                struct seq_file *f, int i)
2714 {
2715         long delta = tw->tw_timer.expires - jiffies;
2716         __be32 dest, src;
2717         __u16 destp, srcp;
2718
2719         dest  = tw->tw_daddr;
2720         src   = tw->tw_rcv_saddr;
2721         destp = ntohs(tw->tw_dport);
2722         srcp  = ntohs(tw->tw_sport);
2723
2724         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2725                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2726                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2727                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2728                 refcount_read(&tw->tw_refcnt), tw);
2729 }
2730
2731 #define TMPSZ 150
2732
2733 static int tcp4_seq_show(struct seq_file *seq, void *v)
2734 {
2735         struct tcp_iter_state *st;
2736         struct sock *sk = v;
2737
2738         seq_setwidth(seq, TMPSZ - 1);
2739         if (v == SEQ_START_TOKEN) {
2740                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2741                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2742                            "inode");
2743                 goto out;
2744         }
2745         st = seq->private;
2746
2747         if (sk->sk_state == TCP_TIME_WAIT)
2748                 get_timewait4_sock(v, seq, st->num);
2749         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2750                 get_openreq4(v, seq, st->num);
2751         else
2752                 get_tcp4_sock(v, seq, st->num);
2753 out:
2754         seq_pad(seq, '\n');
2755         return 0;
2756 }
2757
2758 #ifdef CONFIG_BPF_SYSCALL
2759 struct bpf_tcp_iter_state {
2760         struct tcp_iter_state state;
2761         unsigned int cur_sk;
2762         unsigned int end_sk;
2763         unsigned int max_sk;
2764         struct sock **batch;
2765         bool st_bucket_done;
2766 };
2767
2768 struct bpf_iter__tcp {
2769         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2770         __bpf_md_ptr(struct sock_common *, sk_common);
2771         uid_t uid __aligned(8);
2772 };
2773
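/*
 * Editor's note: the context struct above is what a BPF "iter/tcp" program
 * receives for every socket visited.  The block below is a minimal,
 * illustrative BPF-side sketch (not part of this file, hence #if 0); it
 * assumes a generated vmlinux.h plus libbpf's bpf_helpers.h/bpf_tracing.h,
 * and omits the userspace code that attaches and reads the iterator.
 */
#if 0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!skc)
		return 0;

	/* One line per socket: address family, TCP state, owning uid. */
	BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
		       skc->skc_family, skc->skc_state, ctx->uid);
	return 0;
}
#endif
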
2774 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2775                              struct sock_common *sk_common, uid_t uid)
2776 {
2777         struct bpf_iter__tcp ctx;
2778
2779         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2780         ctx.meta = meta;
2781         ctx.sk_common = sk_common;
2782         ctx.uid = uid;
2783         return bpf_iter_run_prog(prog, &ctx);
2784 }
2785
2786 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2787 {
2788         while (iter->cur_sk < iter->end_sk)
2789                 sock_gen_put(iter->batch[iter->cur_sk++]);
2790 }
2791
2792 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2793                                       unsigned int new_batch_sz)
2794 {
2795         struct sock **new_batch;
2796
2797         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2798                              GFP_USER | __GFP_NOWARN);
2799         if (!new_batch)
2800                 return -ENOMEM;
2801
2802         bpf_iter_tcp_put_batch(iter);
2803         kvfree(iter->batch);
2804         iter->batch = new_batch;
2805         iter->max_sk = new_batch_sz;
2806
2807         return 0;
2808 }
2809
2810 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2811                                                  struct sock *start_sk)
2812 {
2813         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2814         struct bpf_tcp_iter_state *iter = seq->private;
2815         struct tcp_iter_state *st = &iter->state;
2816         struct hlist_nulls_node *node;
2817         unsigned int expected = 1;
2818         struct sock *sk;
2819
2820         sock_hold(start_sk);
2821         iter->batch[iter->end_sk++] = start_sk;
2822
2823         sk = sk_nulls_next(start_sk);
2824         sk_nulls_for_each_from(sk, node) {
2825                 if (seq_sk_match(seq, sk)) {
2826                         if (iter->end_sk < iter->max_sk) {
2827                                 sock_hold(sk);
2828                                 iter->batch[iter->end_sk++] = sk;
2829                         }
2830                         expected++;
2831                 }
2832         }
2833         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2834
2835         return expected;
2836 }
2837
2838 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2839                                                    struct sock *start_sk)
2840 {
2841         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2842         struct bpf_tcp_iter_state *iter = seq->private;
2843         struct tcp_iter_state *st = &iter->state;
2844         struct hlist_nulls_node *node;
2845         unsigned int expected = 1;
2846         struct sock *sk;
2847
2848         sock_hold(start_sk);
2849         iter->batch[iter->end_sk++] = start_sk;
2850
2851         sk = sk_nulls_next(start_sk);
2852         sk_nulls_for_each_from(sk, node) {
2853                 if (seq_sk_match(seq, sk)) {
2854                         if (iter->end_sk < iter->max_sk) {
2855                                 sock_hold(sk);
2856                                 iter->batch[iter->end_sk++] = sk;
2857                         }
2858                         expected++;
2859                 }
2860         }
2861         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2862
2863         return expected;
2864 }
2865
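/* Batching summary: the two helpers above grab a reference on every matching
 * socket in the current bucket while the bucket lock is held, stash them in
 * iter->batch[], then drop the lock.  seq_show() can therefore call
 * lock_sock() and run the BPF program without any bucket lock held.  If the
 * batch array turns out to be too small, bpf_iter_tcp_batch() below grows it
 * to expected * 3 / 2 and re-reads the bucket.
 */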
2866 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2867 {
2868         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2869         struct bpf_tcp_iter_state *iter = seq->private;
2870         struct tcp_iter_state *st = &iter->state;
2871         unsigned int expected;
2872         bool resized = false;
2873         struct sock *sk;
2874
2875         /* The st->bucket is done.  Directly advance to the next
2876          * bucket instead of having tcp_seek_last_pos() skip through
2877          * the current bucket one entry at a time only to discover
2878          * that it has to advance to the next bucket anyway.
2879          */
2880         if (iter->st_bucket_done) {
2881                 st->offset = 0;
2882                 st->bucket++;
2883                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2884                     st->bucket > hinfo->lhash2_mask) {
2885                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2886                         st->bucket = 0;
2887                 }
2888         }
2889
2890 again:
2891         /* Get a new batch */
2892         iter->cur_sk = 0;
2893         iter->end_sk = 0;
2894         iter->st_bucket_done = false;
2895
2896         sk = tcp_seek_last_pos(seq);
2897         if (!sk)
2898                 return NULL; /* Done */
2899
2900         if (st->state == TCP_SEQ_STATE_LISTENING)
2901                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2902         else
2903                 expected = bpf_iter_tcp_established_batch(seq, sk);
2904
2905         if (iter->end_sk == expected) {
2906                 iter->st_bucket_done = true;
2907                 return sk;
2908         }
2909
2910         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2911                 resized = true;
2912                 goto again;
2913         }
2914
2915         return sk;
2916 }
2917
2918 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2919 {
2920         /* bpf iter does not support lseek, so it always
2921          * continues from where it was stop()-ped.
2922          */
2923         if (*pos)
2924                 return bpf_iter_tcp_batch(seq);
2925
2926         return SEQ_START_TOKEN;
2927 }
2928
2929 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2930 {
2931         struct bpf_tcp_iter_state *iter = seq->private;
2932         struct tcp_iter_state *st = &iter->state;
2933         struct sock *sk;
2934
2935         /* Whenever seq_next() is called, the sk at iter->cur_sk has
2936          * already been shown by seq_show(), so advance to the next
2937          * sk in the batch.
2938          */
2939         if (iter->cur_sk < iter->end_sk) {
2940                 /* Keep st->num in tcp_iter_state consistent.
2941                  * bpf_iter_tcp does not use st->num;
2942                  * meta.seq_num is used instead.
2943                  */
2944                 st->num++;
2945                 /* Move st->offset to the next sk in the bucket such that
2946                  * the future start() will resume at st->offset in
2947                  * st->bucket.  See tcp_seek_last_pos().
2948                  */
2949                 st->offset++;
2950                 sock_gen_put(iter->batch[iter->cur_sk++]);
2951         }
2952
2953         if (iter->cur_sk < iter->end_sk)
2954                 sk = iter->batch[iter->cur_sk];
2955         else
2956                 sk = bpf_iter_tcp_batch(seq);
2957
2958         ++*pos;
2959         /* Keep st->last_pos in tcp_iter_state consistent.
2960          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2961          */
2962         st->last_pos = *pos;
2963         return sk;
2964 }
2965
2966 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2967 {
2968         struct bpf_iter_meta meta;
2969         struct bpf_prog *prog;
2970         struct sock *sk = v;
2971         uid_t uid;
2972         int ret;
2973
2974         if (v == SEQ_START_TOKEN)
2975                 return 0;
2976
2977         if (sk_fullsock(sk))
2978                 lock_sock(sk);
2979
2980         if (unlikely(sk_unhashed(sk))) {
2981                 ret = SEQ_SKIP;
2982                 goto unlock;
2983         }
2984
2985         if (sk->sk_state == TCP_TIME_WAIT) {
2986                 uid = 0;
2987         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2988                 const struct request_sock *req = v;
2989
2990                 uid = from_kuid_munged(seq_user_ns(seq),
2991                                        sock_i_uid(req->rsk_listener));
2992         } else {
2993                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2994         }
2995
2996         meta.seq = seq;
2997         prog = bpf_iter_get_info(&meta, false);
2998         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2999
3000 unlock:
3001         if (sk_fullsock(sk))
3002                 release_sock(sk);
3003         return ret;
3004
3005 }
3006
3007 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3008 {
3009         struct bpf_tcp_iter_state *iter = seq->private;
3010         struct bpf_iter_meta meta;
3011         struct bpf_prog *prog;
3012
3013         if (!v) {
3014                 meta.seq = seq;
3015                 prog = bpf_iter_get_info(&meta, true);
3016                 if (prog)
3017                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3018         }
3019
3020         if (iter->cur_sk < iter->end_sk) {
3021                 bpf_iter_tcp_put_batch(iter);
3022                 iter->st_bucket_done = false;
3023         }
3024 }
3025
3026 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3027         .show           = bpf_iter_tcp_seq_show,
3028         .start          = bpf_iter_tcp_seq_start,
3029         .next           = bpf_iter_tcp_seq_next,
3030         .stop           = bpf_iter_tcp_seq_stop,
3031 };
3032 #endif
3033 static unsigned short seq_file_family(const struct seq_file *seq)
3034 {
3035         const struct tcp_seq_afinfo *afinfo;
3036
3037 #ifdef CONFIG_BPF_SYSCALL
3038         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3039         if (seq->op == &bpf_iter_tcp_seq_ops)
3040                 return AF_UNSPEC;
3041 #endif
3042
3043         /* Iterated from proc fs */
3044         afinfo = pde_data(file_inode(seq->file));
3045         return afinfo->family;
3046 }
3047
3048 static const struct seq_operations tcp4_seq_ops = {
3049         .show           = tcp4_seq_show,
3050         .start          = tcp_seq_start,
3051         .next           = tcp_seq_next,
3052         .stop           = tcp_seq_stop,
3053 };
3054
3055 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3056         .family         = AF_INET,
3057 };
3058
3059 static int __net_init tcp4_proc_init_net(struct net *net)
3060 {
3061         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3062                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3063                 return -ENOMEM;
3064         return 0;
3065 }
3066
3067 static void __net_exit tcp4_proc_exit_net(struct net *net)
3068 {
3069         remove_proc_entry("tcp", net->proc_net);
3070 }
3071
3072 static struct pernet_operations tcp4_net_ops = {
3073         .init = tcp4_proc_init_net,
3074         .exit = tcp4_proc_exit_net,
3075 };
3076
3077 int __init tcp4_proc_init(void)
3078 {
3079         return register_pernet_subsys(&tcp4_net_ops);
3080 }
3081
3082 void tcp4_proc_exit(void)
3083 {
3084         unregister_pernet_subsys(&tcp4_net_ops);
3085 }
3086 #endif /* CONFIG_PROC_FS */
3087
3088 /* @wake is one when sk_stream_write_space() calls us.
3089  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3090  * This mimics the strategy used in sock_def_write_space().
3091  */
3092 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3093 {
3094         const struct tcp_sock *tp = tcp_sk(sk);
3095         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3096                             READ_ONCE(tp->snd_nxt);
3097
3098         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3099 }
3100 EXPORT_SYMBOL(tcp_stream_memory_free);
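
/* Worked example with illustrative numbers: if tcp_notsent_lowat(tp) is
 * 128 KB and 70 KB of queued data has not been sent yet, a plain check
 * (wake == 0) reports the stream writable (70 KB < 128 KB), but the wakeup
 * path (wake == 1) does not, because 70 KB << 1 == 140 KB is not below the
 * limit; EPOLLOUT is only raised once the unsent backlog drops under half
 * of tcp_notsent_lowat().
 */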
3101
3102 struct proto tcp_prot = {
3103         .name                   = "TCP",
3104         .owner                  = THIS_MODULE,
3105         .close                  = tcp_close,
3106         .pre_connect            = tcp_v4_pre_connect,
3107         .connect                = tcp_v4_connect,
3108         .disconnect             = tcp_disconnect,
3109         .accept                 = inet_csk_accept,
3110         .ioctl                  = tcp_ioctl,
3111         .init                   = tcp_v4_init_sock,
3112         .destroy                = tcp_v4_destroy_sock,
3113         .shutdown               = tcp_shutdown,
3114         .setsockopt             = tcp_setsockopt,
3115         .getsockopt             = tcp_getsockopt,
3116         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3117         .keepalive              = tcp_set_keepalive,
3118         .recvmsg                = tcp_recvmsg,
3119         .sendmsg                = tcp_sendmsg,
3120         .splice_eof             = tcp_splice_eof,
3121         .backlog_rcv            = tcp_v4_do_rcv,
3122         .release_cb             = tcp_release_cb,
3123         .hash                   = inet_hash,
3124         .unhash                 = inet_unhash,
3125         .get_port               = inet_csk_get_port,
3126         .put_port               = inet_put_port,
3127 #ifdef CONFIG_BPF_SYSCALL
3128         .psock_update_sk_prot   = tcp_bpf_update_proto,
3129 #endif
3130         .enter_memory_pressure  = tcp_enter_memory_pressure,
3131         .leave_memory_pressure  = tcp_leave_memory_pressure,
3132         .stream_memory_free     = tcp_stream_memory_free,
3133         .sockets_allocated      = &tcp_sockets_allocated,
3134         .orphan_count           = &tcp_orphan_count,
3135
3136         .memory_allocated       = &tcp_memory_allocated,
3137         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3138
3139         .memory_pressure        = &tcp_memory_pressure,
3140         .sysctl_mem             = sysctl_tcp_mem,
3141         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3142         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3143         .max_header             = MAX_TCP_HEADER,
3144         .obj_size               = sizeof(struct tcp_sock),
3145         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3146         .twsk_prot              = &tcp_timewait_sock_ops,
3147         .rsk_prot               = &tcp_request_sock_ops,
3148         .h.hashinfo             = NULL,
3149         .no_autobind            = true,
3150         .diag_destroy           = tcp_abort,
3151 };
3152 EXPORT_SYMBOL(tcp_prot);
3153
3154 static void __net_exit tcp_sk_exit(struct net *net)
3155 {
3156         if (net->ipv4.tcp_congestion_control)
3157                 bpf_module_put(net->ipv4.tcp_congestion_control,
3158                                net->ipv4.tcp_congestion_control->owner);
3159 }
3160
3161 static void __net_init tcp_set_hashinfo(struct net *net)
3162 {
3163         struct inet_hashinfo *hinfo;
3164         unsigned int ehash_entries;
3165         struct net *old_net;
3166
3167         if (net_eq(net, &init_net))
3168                 goto fallback;
3169
3170         old_net = current->nsproxy->net_ns;
3171         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3172         if (!ehash_entries)
3173                 goto fallback;
3174
3175         ehash_entries = roundup_pow_of_two(ehash_entries);
3176         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3177         if (!hinfo) {
3178                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3179                         "for a netns, fallback to the global one\n",
3180                         ehash_entries);
3181 fallback:
3182                 hinfo = &tcp_hashinfo;
3183                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3184         }
3185
3186         net->ipv4.tcp_death_row.hashinfo = hinfo;
3187         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3188         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3189 }
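
/* Illustrative usage (the commands are assumptions about the admin tooling,
 * not part of this file): a child netns only gets a private ehash when the
 * *creating* netns has set the sysctl beforehand, e.g.
 *
 *	sysctl -w net.ipv4.tcp_child_ehash_entries=16384
 *	unshare -n   # new netns allocates its own 16384-entry ehash
 *	             # (rounded up to a power of two) instead of sharing
 *	             # the global tcp_hashinfo
 */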
3190
3191 static int __net_init tcp_sk_init(struct net *net)
3192 {
3193         net->ipv4.sysctl_tcp_ecn = 2;
3194         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3195
3196         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3197         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3198         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3199         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3200         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3201
3202         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3203         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3204         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3205
3206         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3207         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3208         net->ipv4.sysctl_tcp_syncookies = 1;
3209         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3210         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3211         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3212         net->ipv4.sysctl_tcp_orphan_retries = 0;
3213         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3214         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3215         net->ipv4.sysctl_tcp_tw_reuse = 2;
3216         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3217
3218         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3219         tcp_set_hashinfo(net);
3220
3221         net->ipv4.sysctl_tcp_sack = 1;
3222         net->ipv4.sysctl_tcp_window_scaling = 1;
3223         net->ipv4.sysctl_tcp_timestamps = 1;
3224         net->ipv4.sysctl_tcp_early_retrans = 3;
3225         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3226         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3227         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3228         net->ipv4.sysctl_tcp_max_reordering = 300;
3229         net->ipv4.sysctl_tcp_dsack = 1;
3230         net->ipv4.sysctl_tcp_app_win = 31;
3231         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3232         net->ipv4.sysctl_tcp_frto = 2;
3233         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3234         /* This limits the percentage of the congestion window which we
3235          * will allow a single TSO frame to consume.  Building TSO frames
3236          * which are too large can cause TCP streams to be bursty.
3237          */
3238         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3239         /* Default TSQ limit of 16 TSO segments */
3240         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3241
3242         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3243         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3244
3245         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3246         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3247         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3248         net->ipv4.sysctl_tcp_autocorking = 1;
3249         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3250         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3251         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3252         if (net != &init_net) {
3253                 memcpy(net->ipv4.sysctl_tcp_rmem,
3254                        init_net.ipv4.sysctl_tcp_rmem,
3255                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3256                 memcpy(net->ipv4.sysctl_tcp_wmem,
3257                        init_net.ipv4.sysctl_tcp_wmem,
3258                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3259         }
3260         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3261         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3262         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3263         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3264         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3265         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3266
3267         /* Set default values for PLB */
3268         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3269         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3270         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3271         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3272         /* Default congestion threshold for PLB to mark a round is 50% */
3273         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3274
3275         /* Reno is always built in */
3276         if (!net_eq(net, &init_net) &&
3277             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3278                                init_net.ipv4.tcp_congestion_control->owner))
3279                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3280         else
3281                 net->ipv4.tcp_congestion_control = &tcp_reno;
3282
3283         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3284         net->ipv4.sysctl_tcp_shrink_window = 0;
3285
3286         return 0;
3287 }
3288
3289 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3290 {
3291         struct net *net;
3292
3293         tcp_twsk_purge(net_exit_list, AF_INET);
3294
3295         list_for_each_entry(net, net_exit_list, exit_list) {
3296                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3297                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3298                 tcp_fastopen_ctx_destroy(net);
3299         }
3300 }
3301
3302 static struct pernet_operations __net_initdata tcp_sk_ops = {
3303        .init       = tcp_sk_init,
3304        .exit       = tcp_sk_exit,
3305        .exit_batch = tcp_sk_exit_batch,
3306 };
3307
3308 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3309 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3310                      struct sock_common *sk_common, uid_t uid)
3311
3312 #define INIT_BATCH_SZ 16
3313
3314 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3315 {
3316         struct bpf_tcp_iter_state *iter = priv_data;
3317         int err;
3318
3319         err = bpf_iter_init_seq_net(priv_data, aux);
3320         if (err)
3321                 return err;
3322
3323         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3324         if (err) {
3325                 bpf_iter_fini_seq_net(priv_data);
3326                 return err;
3327         }
3328
3329         return 0;
3330 }
3331
3332 static void bpf_iter_fini_tcp(void *priv_data)
3333 {
3334         struct bpf_tcp_iter_state *iter = priv_data;
3335
3336         bpf_iter_fini_seq_net(priv_data);
3337         kvfree(iter->batch);
3338 }
3339
3340 static const struct bpf_iter_seq_info tcp_seq_info = {
3341         .seq_ops                = &bpf_iter_tcp_seq_ops,
3342         .init_seq_private       = bpf_iter_init_tcp,
3343         .fini_seq_private       = bpf_iter_fini_tcp,
3344         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3345 };
3346
3347 static const struct bpf_func_proto *
3348 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3349                             const struct bpf_prog *prog)
3350 {
3351         switch (func_id) {
3352         case BPF_FUNC_setsockopt:
3353                 return &bpf_sk_setsockopt_proto;
3354         case BPF_FUNC_getsockopt:
3355                 return &bpf_sk_getsockopt_proto;
3356         default:
3357                 return NULL;
3358         }
3359 }
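
/* Illustrative sketch (not part of this file): because bpf_setsockopt() is
 * exposed above, an iter/tcp program can batch-update the sockets it visits,
 * e.g. switching their congestion control.  This is safe because
 * bpf_iter_tcp_seq_show() runs the program under lock_sock().  The program
 * below mirrors the selftest pattern; names other than the helpers are
 * assumptions.
 *
 *	SEC("iter/tcp")
 *	int set_cc(struct bpf_iter__tcp *ctx)
 *	{
 *		struct tcp_sock *tp = bpf_skc_to_tcp_sock(ctx->sk_common);
 *		char cc[] = "cubic";
 *
 *		if (!tp)
 *			return 0;
 *		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
 *		return 0;
 *	}
 */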
3360
3361 static struct bpf_iter_reg tcp_reg_info = {
3362         .target                 = "tcp",
3363         .ctx_arg_info_size      = 1,
3364         .ctx_arg_info           = {
3365                 { offsetof(struct bpf_iter__tcp, sk_common),
3366                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3367         },
3368         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3369         .seq_info               = &tcp_seq_info,
3370 };
3371
3372 static void __init bpf_iter_register(void)
3373 {
3374         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3375         if (bpf_iter_reg_target(&tcp_reg_info))
3376                 pr_warn("Warning: could not register bpf iterator tcp\n");
3377 }
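
/* Illustrative usage (bpftool commands, assumptions about the environment):
 * once the "tcp" iterator target is registered, a loaded iter/tcp program
 * can be pinned and read like a file:
 *
 *	bpftool iter pin ./tcp_iter.bpf.o /sys/fs/bpf/tcp_iter
 *	cat /sys/fs/bpf/tcp_iter
 */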
3378
3379 #endif
3380
3381 void __init tcp_v4_init(void)
3382 {
3383         int cpu, res;
3384
3385         for_each_possible_cpu(cpu) {
3386                 struct sock *sk;
3387
3388                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3389                                            IPPROTO_TCP, &init_net);
3390                 if (res)
3391                         panic("Failed to create the TCP control socket.\n");
3392                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3393
3394                 /* Enforce IP_DF and IPID==0 for RST and ACK packets
3395                  * sent in SYN-RECV and TIME-WAIT states.
3396                  */
3397                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3398
3399                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3400         }
3401         if (register_pernet_subsys(&tcp_sk_ops))
3402                 panic("Failed to create the TCP control socket.\n");
3403
3404 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3405         bpf_iter_register();
3406 #endif
3407 }