[sagit-ice-cold/kernel_xiaomi_msm8998.git] net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103                                           ip_hdr(skb)->saddr,
104                                           tcp_hdr(skb)->dest,
105                                           tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112
113         /* With PAWS, it is safe from the viewpoint
114            of data integrity. Even without PAWS it is safe provided sequence
115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116
117            Actually, the idea is close to VJ's one, only timestamp cache is
118            held not per host, but per port pair and TW bucket is used as state
119            holder.
120
121            If TW bucket has been already destroyed we fall back to VJ's scheme
122            and use initial timestamp retrieved from peer table.
123          */
124         if (tcptw->tw_ts_recent_stamp &&
125             (!twp || (sysctl_tcp_tw_reuse &&
126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
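                /* Start the new connection's sequence space well past anything
                 * the old TIME_WAIT incarnation could have sent (its snd_nxt
                 * plus a maximum window), so stray old segments cannot be
                 * mistaken for new data.
                 */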
127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128                 if (tp->write_seq == 0)
129                         tp->write_seq = 1;
130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132                 sock_hold(sktw);
133                 return 1;
134         }
135
136         return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144         struct inet_sock *inet = inet_sk(sk);
145         struct tcp_sock *tp = tcp_sk(sk);
146         __be16 orig_sport, orig_dport;
147         __be32 daddr, nexthop;
148         struct flowi4 *fl4;
149         struct rtable *rt;
150         int err;
151         struct ip_options_rcu *inet_opt;
152
153         if (addr_len < sizeof(struct sockaddr_in))
154                 return -EINVAL;
155
156         if (usin->sin_family != AF_INET)
157                 return -EAFNOSUPPORT;
158
159         nexthop = daddr = usin->sin_addr.s_addr;
160         inet_opt = rcu_dereference_protected(inet->inet_opt,
161                                              sock_owned_by_user(sk));
162         if (inet_opt && inet_opt->opt.srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet_opt->opt.faddr;
166         }
167
168         orig_sport = inet->inet_sport;
169         orig_dport = usin->sin_port;
170         fl4 = &inet->cork.fl.u.ip4;
171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                               IPPROTO_TCP,
174                               orig_sport, orig_dport, sk);
175         if (IS_ERR(rt)) {
176                 err = PTR_ERR(rt);
177                 if (err == -ENETUNREACH)
178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return err;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet_opt || !inet_opt->opt.srr)
188                 daddr = fl4->daddr;
189
190         if (!inet->inet_saddr)
191                 inet->inet_saddr = fl4->saddr;
192         sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 if (likely(!tp->repair))
199                         tp->write_seq      = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215         /* Socket identity is still unknown (sport may be zero).
216          * However, we set the state to SYN-SENT and, without releasing the
217          * socket lock, select a source port, enter ourselves into the hash
218          * tables and complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(&tcp_death_row, sk);
222         if (err)
223                 goto failure;
224
225         sk_set_txhash(sk);
226
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237
238         if (!tp->write_seq && likely(!tp->repair))
239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240                                                            inet->inet_daddr,
241                                                            inet->inet_sport,
242                                                            usin->sin_port);
243
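        /* Seed the IP ID counter from the initial sequence number, mixed with
         * jiffies, so it does not start at a trivially predictable value.
         */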
244         inet->inet_id = tp->write_seq ^ jiffies;
245
246         err = tcp_connect(sk);
247
248         rt = NULL;
249         if (err)
250                 goto failure;
251
252         return 0;
253
254 failure:
255         /*
256          * This unhashes the socket and releases the local port,
257          * if necessary.
258          */
259         tcp_set_state(sk, TCP_CLOSE);
260         ip_rt_put(rt);
261         sk->sk_route_caps = 0;
262         inet->inet_dport = 0;
263         return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274         struct inet_sock *inet = inet_sk(sk);
275         struct dst_entry *dst;
276         u32 mtu;
277
278         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
279                 return;
280         mtu = tcp_sk(sk)->mtu_info;
281         dst = inet_csk_update_pmtu(sk, mtu);
282         if (!dst)
283                 return;
284
285         /* Something is about to go wrong... Remember the soft error
286          * in case this connection is not able to recover.
287          */
288         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
289                 sk->sk_err_soft = EMSGSIZE;
290
291         mtu = dst_mtu(dst);
292
293         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
294             ip_sk_accept_pmtu(sk) &&
295             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
296                 tcp_sync_mss(sk, mtu);
297
298                 /* Resend the TCP packet because it's
299                  * clear that the old packet has been
300                  * dropped. This is the new "fast" path mtu
301                  * discovery.
302                  */
303                 tcp_simple_retransmit(sk);
304         } /* else let the usual retransmit timer handle it */
305 }
306 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
307
308 static void do_redirect(struct sk_buff *skb, struct sock *sk)
309 {
310         struct dst_entry *dst = __sk_dst_check(sk, 0);
311
312         if (dst)
313                 dst->ops->redirect(dst, sk, skb);
314 }
315
316
317 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
318 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
319 {
320         struct request_sock *req = inet_reqsk(sk);
321         struct net *net = sock_net(sk);
322
323         /* ICMPs are not backlogged, hence we cannot get
324          * an established socket here.
325          */
326         if (seq != tcp_rsk(req)->snt_isn) {
327                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
328         } else if (abort) {
329                 /*
330                  * Still in SYN_RECV, just remove it silently.
331                  * There is no good way to pass the error to the newly
332                  * created socket, and POSIX does not want network
333                  * errors returned from accept().
334                  */
335                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
336                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
337         }
338         reqsk_put(req);
339 }
340 EXPORT_SYMBOL(tcp_req_err);
341
342 /*
343  * This routine is called by the ICMP module when it gets some
344  * sort of error condition.  If err < 0 then the socket should
345  * be closed and the error returned to the user.  If err > 0
346  * it's just the icmp type << 8 | icmp code.  After adjustment
347  * header points to the first 8 bytes of the tcp header.  We need
348  * to find the appropriate port.
349  *
350  * The locking strategy used here is very "optimistic". When
351  * someone else accesses the socket the ICMP is just dropped
352  * and for some paths there is no check at all.
353  * A more general error queue to queue errors for later handling
354  * is probably better.
355  *
356  */
357
358 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
359 {
360         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
361         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
362         struct inet_connection_sock *icsk;
363         struct tcp_sock *tp;
364         struct inet_sock *inet;
365         const int type = icmp_hdr(icmp_skb)->type;
366         const int code = icmp_hdr(icmp_skb)->code;
367         struct sock *sk;
368         struct sk_buff *skb;
369         struct request_sock *fastopen;
370         __u32 seq, snd_una;
371         __u32 remaining;
372         int err;
373         struct net *net = dev_net(icmp_skb->dev);
374
375         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
376                                        th->dest, iph->saddr, ntohs(th->source),
377                                        inet_iif(icmp_skb));
378         if (!sk) {
379                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
380                 return;
381         }
382         if (sk->sk_state == TCP_TIME_WAIT) {
383                 inet_twsk_put(inet_twsk(sk));
384                 return;
385         }
386         seq = ntohl(th->seq);
387         if (sk->sk_state == TCP_NEW_SYN_RECV)
388                 return tcp_req_err(sk, seq,
389                                   type == ICMP_PARAMETERPROB ||
390                                   type == ICMP_TIME_EXCEEDED ||
391                                   (type == ICMP_DEST_UNREACH &&
392                                    (code == ICMP_NET_UNREACH ||
393                                     code == ICMP_HOST_UNREACH)));
394
395         bh_lock_sock(sk);
396         /* If too many ICMPs get dropped on busy
397          * servers this needs to be solved differently.
398          * We do take care of PMTU discovery (RFC1191) special case :
399          * we can receive locally generated ICMP messages while socket is held.
400          */
401         if (sock_owned_by_user(sk)) {
402                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
403                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
404         }
405         if (sk->sk_state == TCP_CLOSE)
406                 goto out;
407
408         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
409                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
410                 goto out;
411         }
412
413         icsk = inet_csk(sk);
414         tp = tcp_sk(sk);
415         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
416         fastopen = tp->fastopen_rsk;
417         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
418         if (sk->sk_state != TCP_LISTEN &&
419             !between(seq, snd_una, tp->snd_nxt)) {
420                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
421                 goto out;
422         }
423
424         switch (type) {
425         case ICMP_REDIRECT:
426                 if (!sock_owned_by_user(sk))
427                         do_redirect(icmp_skb, sk);
428                 goto out;
429         case ICMP_SOURCE_QUENCH:
430                 /* Just silently ignore these. */
431                 goto out;
432         case ICMP_PARAMETERPROB:
433                 err = EPROTO;
434                 break;
435         case ICMP_DEST_UNREACH:
436                 if (code > NR_ICMP_UNREACH)
437                         goto out;
438
439                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
440                         /* We are not interested in TCP_LISTEN and open_requests
441                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
442                          * they should go through unfragmented).
443                          */
444                         if (sk->sk_state == TCP_LISTEN)
445                                 goto out;
446
447                         tp->mtu_info = info;
448                         if (!sock_owned_by_user(sk)) {
449                                 tcp_v4_mtu_reduced(sk);
450                         } else {
451                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
452                                         sock_hold(sk);
453                         }
454                         goto out;
455                 }
456
457                 err = icmp_err_convert[code].errno;
458                 /* check if icmp_skb allows revert of backoff
459                  * (see draft-zimmermann-tcp-lcd) */
460                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
461                         break;
462                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
463                     !icsk->icsk_backoff || fastopen)
464                         break;
465
466                 if (sock_owned_by_user(sk))
467                         break;
468
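                /* Undo one step of exponential backoff, recompute the RTO and
                 * re-arm the retransmit timer with whatever time remains; if
                 * that time has already elapsed, retransmit immediately.
                 */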
469                 icsk->icsk_backoff--;
470                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
471                                                TCP_TIMEOUT_INIT;
472                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
473
474                 skb = tcp_write_queue_head(sk);
475                 BUG_ON(!skb);
476
477                 remaining = icsk->icsk_rto -
478                             min(icsk->icsk_rto,
479                                 tcp_time_stamp - tcp_skb_timestamp(skb));
480
481                 if (remaining) {
482                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
483                                                   remaining, TCP_RTO_MAX);
484                 } else {
485                         /* RTO revert clocked out retransmission.
486                          * Will retransmit now */
487                         tcp_retransmit_timer(sk);
488                 }
489
490                 break;
491         case ICMP_TIME_EXCEEDED:
492                 err = EHOSTUNREACH;
493                 break;
494         default:
495                 goto out;
496         }
497
498         switch (sk->sk_state) {
499         case TCP_SYN_SENT:
500         case TCP_SYN_RECV:
501                 /* Only in fast or simultaneous open. If a fast open socket
502                  * is already accepted it is treated as a connected one below.
503                  */
504                 if (fastopen && !fastopen->sk)
505                         break;
506
507                 if (!sock_owned_by_user(sk)) {
508                         sk->sk_err = err;
509
510                         sk->sk_error_report(sk);
511
512                         tcp_done(sk);
513                 } else {
514                         sk->sk_err_soft = err;
515                 }
516                 goto out;
517         }
518
519         /* If we've already connected we will keep trying
520          * until we time out, or the user gives up.
521          *
522          * rfc1122 4.2.3.9 allows us to consider as hard errors
523          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
524          * but it is obsoleted by pmtu discovery).
525          *
526          * Note that in the modern internet, where routing is unreliable
527          * and broken firewalls sit in every dark corner, sending random
528          * errors as ordered by their masters, even these two messages
529          * finally lose their original sense (even Linux sends invalid PORT_UNREACHs)
530          *
531          * Now we are in compliance with RFCs.
532          *                                                      --ANK (980905)
533          */
534
535         inet = inet_sk(sk);
536         if (!sock_owned_by_user(sk) && inet->recverr) {
537                 sk->sk_err = err;
538                 sk->sk_error_report(sk);
539         } else  { /* Only an error on timeout */
540                 sk->sk_err_soft = err;
541         }
542
543 out:
544         bh_unlock_sock(sk);
545         sock_put(sk);
546 }
547
548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
549 {
550         struct tcphdr *th = tcp_hdr(skb);
551
552         if (skb->ip_summed == CHECKSUM_PARTIAL) {
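                /* Hardware will finish the checksum: store only the
                 * pseudo-header sum here and record where the device should
                 * write the final result.
                 */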
553                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
554                 skb->csum_start = skb_transport_header(skb) - skb->head;
555                 skb->csum_offset = offsetof(struct tcphdr, check);
556         } else {
557                 th->check = tcp_v4_check(skb->len, saddr, daddr,
558                                          csum_partial(th,
559                                                       th->doff << 2,
560                                                       skb->csum));
561         }
562 }
563
564 /* This routine computes an IPv4 TCP checksum. */
565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
566 {
567         const struct inet_sock *inet = inet_sk(sk);
568
569         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
570 }
571 EXPORT_SYMBOL(tcp_v4_send_check);
572
573 /*
574  *      This routine will send an RST to the other tcp.
575  *
576  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
577  *                    for reset.
578  *      Answer: if a packet caused an RST, it is not for a socket
579  *              existing in our system; if it matches a socket,
580  *              it is just a duplicate segment or a bug in the other side's TCP.
581  *              So we build the reply based only on the parameters
582  *              that arrived with the segment.
583  *      Exception: precedence violation. We do not implement it in any case.
584  */
585
586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
587 {
588         const struct tcphdr *th = tcp_hdr(skb);
589         struct {
590                 struct tcphdr th;
591 #ifdef CONFIG_TCP_MD5SIG
592                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
593 #endif
594         } rep;
595         struct ip_reply_arg arg;
596 #ifdef CONFIG_TCP_MD5SIG
597         struct tcp_md5sig_key *key;
598         const __u8 *hash_location = NULL;
599         unsigned char newhash[16];
600         int genhash;
601         struct sock *sk1 = NULL;
602 #endif
603         struct net *net;
604
605         /* Never send a reset in response to a reset. */
606         if (th->rst)
607                 return;
608
609         /* If sk is not NULL, it means we did a successful lookup and the
610          * incoming route had to be correct. prequeue might have dropped our dst.
611          */
612         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
613                 return;
614
615         /* Swap the send and the receive. */
616         memset(&rep, 0, sizeof(rep));
617         rep.th.dest   = th->source;
618         rep.th.source = th->dest;
619         rep.th.doff   = sizeof(struct tcphdr) / 4;
620         rep.th.rst    = 1;
621
622         if (th->ack) {
623                 rep.th.seq = th->ack_seq;
624         } else {
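                /* The offending segment carried no ACK: acknowledge exactly
                 * the sequence space it consumed (SYN and FIN each count as
                 * one sequence number).
                 */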
625                 rep.th.ack = 1;
626                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
627                                        skb->len - (th->doff << 2));
628         }
629
630         memset(&arg, 0, sizeof(arg));
631         arg.iov[0].iov_base = (unsigned char *)&rep;
632         arg.iov[0].iov_len  = sizeof(rep.th);
633
634         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
635 #ifdef CONFIG_TCP_MD5SIG
636         hash_location = tcp_parse_md5sig_option(th);
637         if (!sk && hash_location) {
638                 /*
639                  * The active side is lost. Try to find the listening socket
640                  * through the source port, then find the md5 key through it.
641                  * We do not lose security here:
642                  * the incoming packet is checked against the md5 hash of the
643                  * found key, and no RST is generated if the hash doesn't match.
644                  */
645                 sk1 = __inet_lookup_listener(net,
646                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
647                                              th->source, ip_hdr(skb)->daddr,
648                                              ntohs(th->source), inet_iif(skb));
649                 /* don't send rst if it can't find key */
650                 if (!sk1)
651                         return;
652                 rcu_read_lock();
653                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
654                                         &ip_hdr(skb)->saddr, AF_INET);
655                 if (!key)
656                         goto release_sk1;
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto release_sk1;
661         } else {
662                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
663                                              &ip_hdr(skb)->saddr,
664                                              AF_INET) : NULL;
665         }
666
667         if (key) {
668                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
669                                    (TCPOPT_NOP << 16) |
670                                    (TCPOPT_MD5SIG << 8) |
671                                    TCPOLEN_MD5SIG);
672                 /* Update length and the length the header thinks exists */
673                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
674                 rep.th.doff = arg.iov[0].iov_len / 4;
675
676                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
677                                      key, ip_hdr(skb)->saddr,
678                                      ip_hdr(skb)->daddr, &rep.th);
679         }
680 #endif
681         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
682                                       ip_hdr(skb)->saddr, /* XXX */
683                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
684         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
685         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
686         /* When the socket is gone, all binding information is lost and
687          * routing might fail in this case. No choice here: if we force the
688          * input interface, we will misroute in case of an asymmetric route.
689          */
690         if (sk)
691                 arg.bound_dev_if = sk->sk_bound_dev_if;
692
693         arg.tos = ip_hdr(skb)->tos;
694         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
695         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
696                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
697                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
698                               &arg, arg.iov[0].iov_len);
699
700         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
701         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
702
703 #ifdef CONFIG_TCP_MD5SIG
704 release_sk1:
705         if (sk1) {
706                 rcu_read_unlock();
707                 sock_put(sk1);
708         }
709 #endif
710 }
711
712 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
713    outside socket context, is certainly ugly. What can I do?
714  */
715
716 static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
717                             u32 seq, u32 ack,
718                             u32 win, u32 tsval, u32 tsecr, int oif,
719                             struct tcp_md5sig_key *key,
720                             int reply_flags, u8 tos)
721 {
722         const struct tcphdr *th = tcp_hdr(skb);
723         struct {
724                 struct tcphdr th;
725                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
726 #ifdef CONFIG_TCP_MD5SIG
727                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
728 #endif
729                         ];
730         } rep;
731         struct ip_reply_arg arg;
732         struct net *net = sock_net(sk);
733
734         memset(&rep.th, 0, sizeof(struct tcphdr));
735         memset(&arg, 0, sizeof(arg));
736
737         arg.iov[0].iov_base = (unsigned char *)&rep;
738         arg.iov[0].iov_len  = sizeof(rep.th);
739         if (tsecr) {
740                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
741                                    (TCPOPT_TIMESTAMP << 8) |
742                                    TCPOLEN_TIMESTAMP);
743                 rep.opt[1] = htonl(tsval);
744                 rep.opt[2] = htonl(tsecr);
745                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
746         }
747
748         /* Swap the send and the receive. */
749         rep.th.dest    = th->source;
750         rep.th.source  = th->dest;
751         rep.th.doff    = arg.iov[0].iov_len / 4;
752         rep.th.seq     = htonl(seq);
753         rep.th.ack_seq = htonl(ack);
754         rep.th.ack     = 1;
755         rep.th.window  = htons(win);
756
757 #ifdef CONFIG_TCP_MD5SIG
758         if (key) {
759                 int offset = (tsecr) ? 3 : 0;
760
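                /* If a timestamp option was written above it occupies
                 * rep.opt[0..2]; place the MD5 signature option after it.
                 */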
761                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
762                                           (TCPOPT_NOP << 16) |
763                                           (TCPOPT_MD5SIG << 8) |
764                                           TCPOLEN_MD5SIG);
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len/4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
769                                     key, ip_hdr(skb)->saddr,
770                                     ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.flags = reply_flags;
774         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
775                                       ip_hdr(skb)->saddr, /* XXX */
776                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
777         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
778         if (oif)
779                 arg.bound_dev_if = oif;
780         arg.tos = tos;
781         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
782         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
783                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
784                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
785                               &arg, arg.iov[0].iov_len);
786
787         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788 }
789
790 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
791 {
792         struct inet_timewait_sock *tw = inet_twsk(sk);
793         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
794
795         tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797                         tcp_time_stamp + tcptw->tw_ts_offset,
798                         tcptw->tw_ts_recent,
799                         tw->tw_bound_dev_if,
800                         tcp_twsk_md5_key(tcptw),
801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802                         tw->tw_tos
803                         );
804
805         inet_twsk_put(tw);
806 }
807
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809                                   struct request_sock *req)
810 {
811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813          */
814         tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
815                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
816                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
817                         tcp_time_stamp,
818                         req->ts_recent,
819                         0,
820                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
821                                           AF_INET),
822                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823                         ip_hdr(skb)->tos);
824 }
825
826 /*
827  *      Send a SYN-ACK after having received a SYN.
828  *      This still operates on a request_sock only, not on a big
829  *      socket.
830  */
831 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
832                               struct flowi *fl,
833                               struct request_sock *req,
834                               struct tcp_fastopen_cookie *foc,
835                                   bool attach_req)
836 {
837         const struct inet_request_sock *ireq = inet_rsk(req);
838         struct flowi4 fl4;
839         int err = -1;
840         struct sk_buff *skb;
841
842         /* First, grab a route. */
843         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
844                 return -1;
845
846         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
847
848         if (skb) {
849                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
850
851                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
852                                             ireq->ir_rmt_addr,
853                                             ireq->opt);
854                 err = net_xmit_eval(err);
855         }
856
857         return err;
858 }
859
860 /*
861  *      IPv4 request_sock destructor.
862  */
863 static void tcp_v4_reqsk_destructor(struct request_sock *req)
864 {
865         kfree(inet_rsk(req)->opt);
866 }
867
868
869 #ifdef CONFIG_TCP_MD5SIG
870 /*
871  * RFC2385 MD5 checksumming requires a mapping of
872  * IP address->MD5 Key.
873  * We need to maintain these in the sk structure.
874  */
875
876 /* Find the Key structure for an address.  */
877 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
878                                          const union tcp_md5_addr *addr,
879                                          int family)
880 {
881         const struct tcp_sock *tp = tcp_sk(sk);
882         struct tcp_md5sig_key *key;
883         unsigned int size = sizeof(struct in_addr);
884         const struct tcp_md5sig_info *md5sig;
885
886         /* caller either holds rcu_read_lock() or socket lock */
887         md5sig = rcu_dereference_check(tp->md5sig_info,
888                                        sock_owned_by_user(sk) ||
889                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
890         if (!md5sig)
891                 return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893         if (family == AF_INET6)
894                 size = sizeof(struct in6_addr);
895 #endif
896         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897                 if (key->family != family)
898                         continue;
899                 if (!memcmp(&key->addr, addr, size))
900                         return key;
901         }
902         return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907                                          const struct sock *addr_sk)
908 {
909         const union tcp_md5_addr *addr;
910
911         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912         return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920         /* Add Key to the list */
921         struct tcp_md5sig_key *key;
922         struct tcp_sock *tp = tcp_sk(sk);
923         struct tcp_md5sig_info *md5sig;
924
925         key = tcp_md5_do_lookup(sk, addr, family);
926         if (key) {
927                 /* Pre-existing entry - just update that one. */
928                 memcpy(key->key, newkey, newkeylen);
929                 key->keylen = newkeylen;
930                 return 0;
931         }
932
933         md5sig = rcu_dereference_protected(tp->md5sig_info,
934                                            sock_owned_by_user(sk) ||
935                                            lockdep_is_held(&sk->sk_lock.slock));
936         if (!md5sig) {
937                 md5sig = kmalloc(sizeof(*md5sig), gfp);
938                 if (!md5sig)
939                         return -ENOMEM;
940
941                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
942                 INIT_HLIST_HEAD(&md5sig->head);
943                 rcu_assign_pointer(tp->md5sig_info, md5sig);
944         }
945
946         key = sock_kmalloc(sk, sizeof(*key), gfp);
947         if (!key)
948                 return -ENOMEM;
949         if (!tcp_alloc_md5sig_pool()) {
950                 sock_kfree_s(sk, key, sizeof(*key));
951                 return -ENOMEM;
952         }
953
954         memcpy(key->key, newkey, newkeylen);
955         key->keylen = newkeylen;
956         key->family = family;
957         memcpy(&key->addr, addr,
958                (family == AF_INET6) ? sizeof(struct in6_addr) :
959                                       sizeof(struct in_addr));
960         hlist_add_head_rcu(&key->node, &md5sig->head);
961         return 0;
962 }
963 EXPORT_SYMBOL(tcp_md5_do_add);
964
965 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
966 {
967         struct tcp_md5sig_key *key;
968
969         key = tcp_md5_do_lookup(sk, addr, family);
970         if (!key)
971                 return -ENOENT;
972         hlist_del_rcu(&key->node);
973         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
974         kfree_rcu(key, rcu);
975         return 0;
976 }
977 EXPORT_SYMBOL(tcp_md5_do_del);
978
979 static void tcp_clear_md5_list(struct sock *sk)
980 {
981         struct tcp_sock *tp = tcp_sk(sk);
982         struct tcp_md5sig_key *key;
983         struct hlist_node *n;
984         struct tcp_md5sig_info *md5sig;
985
986         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
987
988         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
989                 hlist_del_rcu(&key->node);
990                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991                 kfree_rcu(key, rcu);
992         }
993 }
994
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996                                  int optlen)
997 {
998         struct tcp_md5sig cmd;
999         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000
1001         if (optlen < sizeof(cmd))
1002                 return -EINVAL;
1003
1004         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1005                 return -EFAULT;
1006
1007         if (sin->sin_family != AF_INET)
1008                 return -EINVAL;
1009
1010         if (!cmd.tcpm_keylen)
1011                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1012                                       AF_INET);
1013
1014         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015                 return -EINVAL;
1016
1017         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1019                               GFP_KERNEL);
1020 }
1021
1022 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1023                                         __be32 daddr, __be32 saddr, int nbytes)
1024 {
1025         struct tcp4_pseudohdr *bp;
1026         struct scatterlist sg;
1027
1028         bp = &hp->md5_blk.ip4;
1029
1030         /*
1031          * 1. the TCP pseudo-header (in the order: source IP address,
1032          * destination IP address, zero-padded protocol number, and
1033          * segment length)
1034          */
1035         bp->saddr = saddr;
1036         bp->daddr = daddr;
1037         bp->pad = 0;
1038         bp->protocol = IPPROTO_TCP;
1039         bp->len = cpu_to_be16(nbytes);
1040
1041         sg_init_one(&sg, bp, sizeof(*bp));
1042         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1043 }
1044
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048         struct tcp_md5sig_pool *hp;
1049         struct hash_desc *desc;
1050
1051         hp = tcp_get_md5sig_pool();
1052         if (!hp)
1053                 goto clear_hash_noput;
1054         desc = &hp->md5_desc;
1055
1056         if (crypto_hash_init(desc))
1057                 goto clear_hash;
1058         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059                 goto clear_hash;
1060         if (tcp_md5_hash_header(hp, th))
1061                 goto clear_hash;
1062         if (tcp_md5_hash_key(hp, key))
1063                 goto clear_hash;
1064         if (crypto_hash_final(desc, md5_hash))
1065                 goto clear_hash;
1066
1067         tcp_put_md5sig_pool();
1068         return 0;
1069
1070 clear_hash:
1071         tcp_put_md5sig_pool();
1072 clear_hash_noput:
1073         memset(md5_hash, 0, 16);
1074         return 1;
1075 }
1076
1077 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1078                         const struct sock *sk,
1079                         const struct sk_buff *skb)
1080 {
1081         struct tcp_md5sig_pool *hp;
1082         struct hash_desc *desc;
1083         const struct tcphdr *th = tcp_hdr(skb);
1084         __be32 saddr, daddr;
1085
1086         if (sk) { /* valid for establish/request sockets */
1087                 saddr = sk->sk_rcv_saddr;
1088                 daddr = sk->sk_daddr;
1089         } else {
1090                 const struct iphdr *iph = ip_hdr(skb);
1091                 saddr = iph->saddr;
1092                 daddr = iph->daddr;
1093         }
1094
1095         hp = tcp_get_md5sig_pool();
1096         if (!hp)
1097                 goto clear_hash_noput;
1098         desc = &hp->md5_desc;
1099
1100         if (crypto_hash_init(desc))
1101                 goto clear_hash;
1102
1103         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1104                 goto clear_hash;
1105         if (tcp_md5_hash_header(hp, th))
1106                 goto clear_hash;
1107         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1108                 goto clear_hash;
1109         if (tcp_md5_hash_key(hp, key))
1110                 goto clear_hash;
1111         if (crypto_hash_final(desc, md5_hash))
1112                 goto clear_hash;
1113
1114         tcp_put_md5sig_pool();
1115         return 0;
1116
1117 clear_hash:
1118         tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120         memset(md5_hash, 0, 16);
1121         return 1;
1122 }
1123 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1124
1125 #endif
1126
1127 /* Called with rcu_read_lock() */
1128 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1129                                     const struct sk_buff *skb)
1130 {
1131 #ifdef CONFIG_TCP_MD5SIG
1132         /*
1133          * This gets called for each TCP segment that arrives
1134          * so we want to be efficient.
1135          * We have 3 drop cases:
1136          * o No MD5 hash and one expected.
1137          * o MD5 hash and we're not expecting one.
1138          * o MD5 hash and it's wrong.
1139          */
1140         const __u8 *hash_location = NULL;
1141         struct tcp_md5sig_key *hash_expected;
1142         const struct iphdr *iph = ip_hdr(skb);
1143         const struct tcphdr *th = tcp_hdr(skb);
1144         int genhash;
1145         unsigned char newhash[16];
1146
1147         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1148                                           AF_INET);
1149         hash_location = tcp_parse_md5sig_option(th);
1150
1151         /* We've parsed the options - do we have a hash? */
1152         if (!hash_expected && !hash_location)
1153                 return false;
1154
1155         if (hash_expected && !hash_location) {
1156                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1157                 return true;
1158         }
1159
1160         if (!hash_expected && hash_location) {
1161                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1162                 return true;
1163         }
1164
1165         /* Okay, so we have both hash_expected and hash_location -
1166          * we need to calculate the checksum.
1167          */
1168         genhash = tcp_v4_md5_hash_skb(newhash,
1169                                       hash_expected,
1170                                       NULL, skb);
1171
1172         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1174                                      &iph->saddr, ntohs(th->source),
1175                                      &iph->daddr, ntohs(th->dest),
1176                                      genhash ? " tcp_v4_calc_md5_hash failed"
1177                                      : "");
1178                 return true;
1179         }
1180         return false;
1181 #endif
1182         return false;
1183 }
1184
1185 static void tcp_v4_init_req(struct request_sock *req,
1186                             const struct sock *sk_listener,
1187                             struct sk_buff *skb)
1188 {
1189         struct inet_request_sock *ireq = inet_rsk(req);
1190
1191         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1192         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1193         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1194         ireq->opt = tcp_v4_save_options(skb);
1195 }
1196
1197 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1198                                           struct flowi *fl,
1199                                           const struct request_sock *req,
1200                                           bool *strict)
1201 {
1202         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1203
1204         if (strict) {
1205                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1206                         *strict = true;
1207                 else
1208                         *strict = false;
1209         }
1210
1211         return dst;
1212 }
1213
1214 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1215         .family         =       PF_INET,
1216         .obj_size       =       sizeof(struct tcp_request_sock),
1217         .rtx_syn_ack    =       tcp_rtx_synack,
1218         .send_ack       =       tcp_v4_reqsk_send_ack,
1219         .destructor     =       tcp_v4_reqsk_destructor,
1220         .send_reset     =       tcp_v4_send_reset,
1221         .syn_ack_timeout =      tcp_syn_ack_timeout,
1222 };
1223
1224 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1225         .mss_clamp      =       TCP_MSS_DEFAULT,
1226 #ifdef CONFIG_TCP_MD5SIG
1227         .req_md5_lookup =       tcp_v4_md5_lookup,
1228         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1229 #endif
1230         .init_req       =       tcp_v4_init_req,
1231 #ifdef CONFIG_SYN_COOKIES
1232         .cookie_init_seq =      cookie_v4_init_sequence,
1233 #endif
1234         .route_req      =       tcp_v4_route_req,
1235         .init_seq       =       tcp_v4_init_sequence,
1236         .send_synack    =       tcp_v4_send_synack,
1237 };
1238
1239 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1240 {
1241         /* Never answer SYNs sent to broadcast or multicast */
1242         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243                 goto drop;
1244
1245         return tcp_conn_request(&tcp_request_sock_ops,
1246                                 &tcp_request_sock_ipv4_ops, sk, skb);
1247
1248 drop:
1249         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1250         return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_v4_conn_request);
1253
1254
1255 /*
1256  * The three way handshake has completed - we got a valid synack -
1257  * now create the new socket.
1258  */
1259 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1260                                   struct request_sock *req,
1261                                   struct dst_entry *dst,
1262                                   struct request_sock *req_unhash,
1263                                   bool *own_req)
1264 {
1265         struct inet_request_sock *ireq;
1266         struct inet_sock *newinet;
1267         struct tcp_sock *newtp;
1268         struct sock *newsk;
1269 #ifdef CONFIG_TCP_MD5SIG
1270         struct tcp_md5sig_key *key;
1271 #endif
1272         struct ip_options_rcu *inet_opt;
1273
1274         if (sk_acceptq_is_full(sk))
1275                 goto exit_overflow;
1276
1277         newsk = tcp_create_openreq_child(sk, req, skb);
1278         if (!newsk)
1279                 goto exit_nonewsk;
1280
1281         newsk->sk_gso_type = SKB_GSO_TCPV4;
1282         inet_sk_rx_dst_set(newsk, skb);
1283
1284         newtp                 = tcp_sk(newsk);
1285         newinet               = inet_sk(newsk);
1286         ireq                  = inet_rsk(req);
1287         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1288         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1289         newinet->inet_saddr           = ireq->ir_loc_addr;
1290         inet_opt              = ireq->opt;
1291         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1292         ireq->opt             = NULL;
1293         newinet->mc_index     = inet_iif(skb);
1294         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1295         newinet->rcv_tos      = ip_hdr(skb)->tos;
1296         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1297         if (inet_opt)
1298                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1299         newinet->inet_id = newtp->write_seq ^ jiffies;
1300
1301         if (!dst) {
1302                 dst = inet_csk_route_child_sock(sk, newsk, req);
1303                 if (!dst)
1304                         goto put_and_exit;
1305         } else {
1306                 /* syncookie case : see end of cookie_v4_check() */
1307         }
1308         sk_setup_caps(newsk, dst);
1309
1310         tcp_ca_openreq_child(newsk, dst);
1311
1312         tcp_sync_mss(newsk, dst_mtu(dst));
1313         newtp->advmss = dst_metric_advmss(dst);
1314         if (tcp_sk(sk)->rx_opt.user_mss &&
1315             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1316                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1317
1318         tcp_initialize_rcv_mss(newsk);
1319
1320 #ifdef CONFIG_TCP_MD5SIG
1321         /* Copy over the MD5 key from the original socket */
1322         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1323                                 AF_INET);
1324         if (key) {
1325                 /*
1326                  * We're using one, so create a matching key
1327                  * on the newsk structure. If we fail to get
1328                  * memory, then we end up not copying the key
1329                  * across. Shucks.
1330                  */
1331                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1332                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1333                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1334         }
1335 #endif
1336
1337         if (__inet_inherit_port(sk, newsk) < 0)
1338                 goto put_and_exit;
1339         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1340         if (*own_req)
1341                 tcp_move_syn(newtp, req);
1342
1343         return newsk;
1344
1345 exit_overflow:
1346         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1347 exit_nonewsk:
1348         dst_release(dst);
1349 exit:
1350         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1351         return NULL;
1352 put_and_exit:
1353         inet_csk_prepare_forced_close(newsk);
1354         tcp_done(newsk);
1355         goto exit;
1356 }
1357 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1358
1359 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1360 {
1361 #ifdef CONFIG_SYN_COOKIES
1362         const struct tcphdr *th = tcp_hdr(skb);
1363
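        /* On a listener, a segment without SYN may be the final ACK of a
         * syncookie handshake; cookie_v4_check() validates the cookie and,
         * if it checks out, returns the newly created child socket.
         */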
1364         if (!th->syn)
1365                 sk = cookie_v4_check(sk, skb);
1366 #endif
1367         return sk;
1368 }
1369
1370 /* The socket must have its spinlock held when we get
1371  * here, unless it is a TCP_LISTEN socket.
1372  *
1373  * We have a potential double-lock case here, so even when
1374  * doing backlog processing we use the BH locking scheme.
1375  * This is because we cannot sleep with the original spinlock
1376  * held.
1377  */
1378 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1379 {
1380         struct sock *rsk;
1381
1382         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1383                 struct dst_entry *dst = sk->sk_rx_dst;
1384
1385                 sock_rps_save_rxhash(sk, skb);
1386                 sk_mark_napi_id(sk, skb);
1387                 if (dst) {
1388                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1389                             !dst->ops->check(dst, 0)) {
1390                                 dst_release(dst);
1391                                 sk->sk_rx_dst = NULL;
1392                         }
1393                 }
1394                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1395                 return 0;
1396         }
1397
1398         if (tcp_checksum_complete(skb))
1399                 goto csum_err;
1400
1401         if (sk->sk_state == TCP_LISTEN) {
1402                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1403
1404                 if (!nsk)
1405                         goto discard;
1406                 if (nsk != sk) {
1407                         sock_rps_save_rxhash(nsk, skb);
1408                         sk_mark_napi_id(nsk, skb);
1409                         if (tcp_child_process(sk, nsk, skb)) {
1410                                 rsk = nsk;
1411                                 goto reset;
1412                         }
1413                         return 0;
1414                 }
1415         } else
1416                 sock_rps_save_rxhash(sk, skb);
1417
1418         if (tcp_rcv_state_process(sk, skb)) {
1419                 rsk = sk;
1420                 goto reset;
1421         }
1422         return 0;
1423
1424 reset:
1425         tcp_v4_send_reset(rsk, skb);
1426 discard:
1427         kfree_skb(skb);
1428         /* Be careful here. If this function gets more complicated and
1429          * gcc suffers from register pressure on the x86, sk (in %ebx)
1430          * might be destroyed here. This current version compiles correctly,
1431          * but you have been warned.
1432          */
1433         return 0;
1434
1435 csum_err:
1436         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1437         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1438         goto discard;
1439 }
1440 EXPORT_SYMBOL(tcp_v4_do_rcv);
1441
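/* Early demux: at IP receive time, try to match the segment to an
 * established socket so that skb->sk is set before the routing
 * decision is made.  If the socket has a cached rx dst for the
 * incoming interface, attach it to the skb (without taking a
 * reference) so the per-packet route lookup can be skipped.
 */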
1442 void tcp_v4_early_demux(struct sk_buff *skb)
1443 {
1444         const struct iphdr *iph;
1445         const struct tcphdr *th;
1446         struct sock *sk;
1447
1448         if (skb->pkt_type != PACKET_HOST)
1449                 return;
1450
1451         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1452                 return;
1453
1454         iph = ip_hdr(skb);
1455         th = tcp_hdr(skb);
1456
1457         if (th->doff < sizeof(struct tcphdr) / 4)
1458                 return;
1459
1460         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1461                                        iph->saddr, th->source,
1462                                        iph->daddr, ntohs(th->dest),
1463                                        skb->skb_iif);
1464         if (sk) {
1465                 skb->sk = sk;
1466                 skb->destructor = sock_edemux;
1467                 if (sk_fullsock(sk)) {
1468                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1469
1470                         if (dst)
1471                                 dst = dst_check(dst, 0);
1472                         if (dst &&
1473                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1474                                 skb_dst_set_noref(skb, dst);
1475                 }
1476         }
1477 }
1478
1479 /* Packet is added to VJ-style prequeue for processing in process
1480  * context, if a reader task is waiting. Apparently, this exciting
1481  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1482  * failed somewhere. Latency? Burstiness? Well, at least now we will
1483  * see why it failed. 8)8)                               --ANK
1484  *
1485  */
1486 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1487 {
1488         struct tcp_sock *tp = tcp_sk(sk);
1489
1490         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1491                 return false;
1492
1493         if (skb->len <= tcp_hdrlen(skb) &&
1494             skb_queue_len(&tp->ucopy.prequeue) == 0)
1495                 return false;
1496
1497         /* Before escaping the RCU-protected region, we need to take care of skb
1498          * dst. The prequeue is only enabled for established sockets.
1499          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1500          * Instead of doing a full sk_rx_dst validation here, let's perform
1501          * an optimistic check.
1502          */
1503         if (likely(sk->sk_rx_dst))
1504                 skb_dst_drop(skb);
1505         else
1506                 skb_dst_force_safe(skb);
1507
1508         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1509         tp->ucopy.memory += skb->truesize;
1510         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1511                 struct sk_buff *skb1;
1512
1513                 BUG_ON(sock_owned_by_user(sk));
1514
1515                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1516                         sk_backlog_rcv(sk, skb1);
1517                         NET_INC_STATS_BH(sock_net(sk),
1518                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1519                 }
1520
1521                 tp->ucopy.memory = 0;
1522         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1523                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1524                                            POLLIN | POLLRDNORM | POLLRDBAND);
1525                 if (!inet_csk_ack_scheduled(sk))
1526                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1527                                                   (3 * tcp_rto_min(sk)) / 4,
1528                                                   TCP_RTO_MAX);
1529         }
1530         return true;
1531 }
1532 EXPORT_SYMBOL(tcp_prequeue);
1533
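/* Run the socket filter on an incoming segment.  The filter may trim
 * the packet but never below the TCP header (th->doff * 4 bytes); if
 * payload was trimmed, end_seq is adjusted so that sequence accounting
 * matches the bytes that remain.
 */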
1534 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1535 {
1536         struct tcphdr *th = (struct tcphdr *)skb->data;
1537         unsigned int eaten = skb->len;
1538         int err;
1539
1540         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1541         if (!err) {
1542                 eaten -= skb->len;
1543                 TCP_SKB_CB(skb)->end_seq -= eaten;
1544         }
1545         return err;
1546 }
1547 EXPORT_SYMBOL(tcp_filter);
1548
1549 /*
1550  *      From tcp_input.c
1551  */
1552
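/* Main IPv4 receive path: validate the header and checksum, stash the
 * parsed TCP fields in TCP_SKB_CB(), look the segment up in the socket
 * hash tables, and hand it to the owning socket either directly, via
 * the prequeue, or via the backlog when the socket is owned by user
 * context.
 */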
1553 int tcp_v4_rcv(struct sk_buff *skb)
1554 {
1555         const struct iphdr *iph;
1556         const struct tcphdr *th;
1557         struct sock *sk;
1558         int ret;
1559         struct net *net = dev_net(skb->dev);
1560
1561         if (skb->pkt_type != PACKET_HOST)
1562                 goto discard_it;
1563
1564         /* Count it even if it's bad */
1565         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1566
1567         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1568                 goto discard_it;
1569
1570         th = tcp_hdr(skb);
1571
1572         if (th->doff < sizeof(struct tcphdr) / 4)
1573                 goto bad_packet;
1574         if (!pskb_may_pull(skb, th->doff * 4))
1575                 goto discard_it;
1576
1577         /* An explanation is required here, I think.
1578          * Packet length and doff are validated by header prediction,
1579          * provided the case of th->doff == 0 is eliminated.
1580          * So, we defer the checks. */
1581
1582         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1583                 goto csum_error;
1584
1585         th = tcp_hdr(skb);
1586         iph = ip_hdr(skb);
1587         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1588          * barrier() makes sure the compiler won't play fool^Waliasing games.
1589          */
1590         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1591                 sizeof(struct inet_skb_parm));
1592         barrier();
1593
1594         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1595         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1596                                     skb->len - th->doff * 4);
1597         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1598         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1599         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1600         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1601         TCP_SKB_CB(skb)->sacked  = 0;
1602
1603 lookup:
1604         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1605         if (!sk)
1606                 goto no_tcp_socket;
1607
1608 process:
1609         if (sk->sk_state == TCP_TIME_WAIT)
1610                 goto do_time_wait;
1611
1612         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1613                 struct request_sock *req = inet_reqsk(sk);
1614                 struct sock *nsk;
1615
1616                 sk = req->rsk_listener;
1617                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1618                         reqsk_put(req);
1619                         goto discard_it;
1620                 }
1621                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1622                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1623                         goto lookup;
1624                 }
1625                 sock_hold(sk);
1626                 nsk = tcp_check_req(sk, skb, req, false);
1627                 if (!nsk) {
1628                         reqsk_put(req);
1629                         goto discard_and_relse;
1630                 }
1631                 if (nsk == sk) {
1632                         reqsk_put(req);
1633                 } else if (tcp_child_process(sk, nsk, skb)) {
1634                         tcp_v4_send_reset(nsk, skb);
1635                         goto discard_and_relse;
1636                 } else {
1637                         sock_put(sk);
1638                         return 0;
1639                 }
1640         }
1641         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1642                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1643                 goto discard_and_relse;
1644         }
1645
1646         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1647                 goto discard_and_relse;
1648
1649         if (tcp_v4_inbound_md5_hash(sk, skb))
1650                 goto discard_and_relse;
1651
1652         nf_reset(skb);
1653
1654         if (tcp_filter(sk, skb))
1655                 goto discard_and_relse;
1656         th = (const struct tcphdr *)skb->data;
1657         iph = ip_hdr(skb);
1658
1659         skb->dev = NULL;
1660
1661         if (sk->sk_state == TCP_LISTEN) {
1662                 ret = tcp_v4_do_rcv(sk, skb);
1663                 goto put_and_return;
1664         }
1665
1666         sk_incoming_cpu_update(sk);
1667
1668         bh_lock_sock_nested(sk);
1669         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1670         ret = 0;
1671         if (!sock_owned_by_user(sk)) {
1672                 if (!tcp_prequeue(sk, skb))
1673                         ret = tcp_v4_do_rcv(sk, skb);
1674         } else if (unlikely(sk_add_backlog(sk, skb,
1675                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1676                 bh_unlock_sock(sk);
1677                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1678                 goto discard_and_relse;
1679         }
1680         bh_unlock_sock(sk);
1681
1682 put_and_return:
1683         sock_put(sk);
1684
1685         return ret;
1686
1687 no_tcp_socket:
1688         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1689                 goto discard_it;
1690
1691         if (tcp_checksum_complete(skb)) {
1692 csum_error:
1693                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1694 bad_packet:
1695                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1696         } else {
1697                 tcp_v4_send_reset(NULL, skb);
1698         }
1699
1700 discard_it:
1701         /* Discard frame. */
1702         kfree_skb(skb);
1703         return 0;
1704
1705 discard_and_relse:
1706         sock_put(sk);
1707         goto discard_it;
1708
1709 do_time_wait:
1710         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1711                 inet_twsk_put(inet_twsk(sk));
1712                 goto discard_it;
1713         }
1714
1715         if (tcp_checksum_complete(skb)) {
1716                 inet_twsk_put(inet_twsk(sk));
1717                 goto csum_error;
1718         }
1719         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1720         case TCP_TW_SYN: {
1721                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1722                                                         &tcp_hashinfo,
1723                                                         iph->saddr, th->source,
1724                                                         iph->daddr, th->dest,
1725                                                         inet_iif(skb));
1726                 if (sk2) {
1727                         inet_twsk_deschedule_put(inet_twsk(sk));
1728                         sk = sk2;
1729                         goto process;
1730                 }
1731                 /* Fall through to ACK */
1732         }
1733         case TCP_TW_ACK:
1734                 tcp_v4_timewait_ack(sk, skb);
1735                 break;
1736         case TCP_TW_RST:
1737                 goto no_tcp_socket;
1738         case TCP_TW_SUCCESS:;
1739         }
1740         goto discard_it;
1741 }
1742
1743 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1744         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1745         .twsk_unique    = tcp_twsk_unique,
1746         .twsk_destructor= tcp_twsk_destructor,
1747 };
1748
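/* Cache the incoming route on the socket for the established fast
 * path, but only if a reference on the dst can still be taken.  The
 * incoming interface index is recorded as well so the cached dst can
 * be invalidated when a later segment arrives on a different device.
 */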
1749 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1750 {
1751         struct dst_entry *dst = skb_dst(skb);
1752
1753         if (dst && dst_hold_safe(dst)) {
1754                 sk->sk_rx_dst = dst;
1755                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1756         }
1757 }
1758 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1759
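/* Address-family specific hooks used by the protocol-independent TCP
 * code when the connection runs over plain IPv4.
 */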
1760 const struct inet_connection_sock_af_ops ipv4_specific = {
1761         .queue_xmit        = ip_queue_xmit,
1762         .send_check        = tcp_v4_send_check,
1763         .rebuild_header    = inet_sk_rebuild_header,
1764         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1765         .conn_request      = tcp_v4_conn_request,
1766         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1767         .net_header_len    = sizeof(struct iphdr),
1768         .setsockopt        = ip_setsockopt,
1769         .getsockopt        = ip_getsockopt,
1770         .addr2sockaddr     = inet_csk_addr2sockaddr,
1771         .sockaddr_len      = sizeof(struct sockaddr_in),
1772         .bind_conflict     = inet_csk_bind_conflict,
1773 #ifdef CONFIG_COMPAT
1774         .compat_setsockopt = compat_ip_setsockopt,
1775         .compat_getsockopt = compat_ip_getsockopt,
1776 #endif
1777         .mtu_reduced       = tcp_v4_mtu_reduced,
1778 };
1779 EXPORT_SYMBOL(ipv4_specific);
1780
1781 #ifdef CONFIG_TCP_MD5SIG
1782 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1783         .md5_lookup             = tcp_v4_md5_lookup,
1784         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1785         .md5_parse              = tcp_v4_parse_md5_keys,
1786 };
1787 #endif
1788
1789 /* NOTE: A lot of things are set to zero explicitly by the call to
1790  *       sk_alloc(), so they need not be done here.
1791  */
1792 static int tcp_v4_init_sock(struct sock *sk)
1793 {
1794         struct inet_connection_sock *icsk = inet_csk(sk);
1795
1796         tcp_init_sock(sk);
1797
1798         icsk->icsk_af_ops = &ipv4_specific;
1799
1800 #ifdef CONFIG_TCP_MD5SIG
1801         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1802 #endif
1803
1804         return 0;
1805 }
1806
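/* Final per-socket cleanup: stop the transmit timers, release the
 * congestion control state, purge the write, out-of-order and prequeue
 * queues, drop any MD5 keys and the bind bucket reference, and release
 * the socket's memory accounting.
 */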
1807 void tcp_v4_destroy_sock(struct sock *sk)
1808 {
1809         struct tcp_sock *tp = tcp_sk(sk);
1810
1811         tcp_clear_xmit_timers(sk);
1812
1813         tcp_cleanup_congestion_control(sk);
1814
1815         /* Clean up the write buffer. */
1816         tcp_write_queue_purge(sk);
1817
1818         /* Clean up our, hopefully empty, out_of_order_queue. */
1819         __skb_queue_purge(&tp->out_of_order_queue);
1820
1821 #ifdef CONFIG_TCP_MD5SIG
1822         /* Clean up the MD5 key list, if any */
1823         if (tp->md5sig_info) {
1824                 tcp_clear_md5_list(sk);
1825                 kfree_rcu(tp->md5sig_info, rcu);
1826                 tp->md5sig_info = NULL;
1827         }
1828 #endif
1829
1830         /* Clean up the prequeue; it really should be empty. */
1831         __skb_queue_purge(&tp->ucopy.prequeue);
1832
1833         /* Clean up a referenced TCP bind bucket. */
1834         if (inet_csk(sk)->icsk_bind_hash)
1835                 inet_put_port(sk);
1836
1837         BUG_ON(tp->fastopen_rsk);
1838
1839         /* If the socket is aborted during a connect operation */
1840         tcp_free_fastopen_req(tp);
1841         tcp_saved_syn_free(tp);
1842
1843         sk_sockets_allocated_dec(sk);
1844         sock_release_memcg(sk);
1845 }
1846 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1847
1848 #ifdef CONFIG_PROC_FS
1849 /* Proc filesystem TCP sock list dumping. */
1850
1851 /*
1852  * Get the next listener socket following cur.  If cur is NULL, get the first
1853  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1854  * very first socket in the hash table is returned.
1855  */
1856 static void *listening_get_next(struct seq_file *seq, void *cur)
1857 {
1858         struct inet_connection_sock *icsk;
1859         struct hlist_nulls_node *node;
1860         struct sock *sk = cur;
1861         struct inet_listen_hashbucket *ilb;
1862         struct tcp_iter_state *st = seq->private;
1863         struct net *net = seq_file_net(seq);
1864
1865         if (!sk) {
1866                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1867                 spin_lock_bh(&ilb->lock);
1868                 sk = sk_nulls_head(&ilb->head);
1869                 st->offset = 0;
1870                 goto get_sk;
1871         }
1872         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1873         ++st->num;
1874         ++st->offset;
1875
1876         sk = sk_nulls_next(sk);
1877 get_sk:
1878         sk_nulls_for_each_from(sk, node) {
1879                 if (!net_eq(sock_net(sk), net))
1880                         continue;
1881                 if (sk->sk_family == st->family) {
1882                         cur = sk;
1883                         goto out;
1884                 }
1885                 icsk = inet_csk(sk);
1886         }
1887         spin_unlock_bh(&ilb->lock);
1888         st->offset = 0;
1889         if (++st->bucket < INET_LHTABLE_SIZE) {
1890                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1891                 spin_lock_bh(&ilb->lock);
1892                 sk = sk_nulls_head(&ilb->head);
1893                 goto get_sk;
1894         }
1895         cur = NULL;
1896 out:
1897         return cur;
1898 }
1899
1900 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1901 {
1902         struct tcp_iter_state *st = seq->private;
1903         void *rc;
1904
1905         st->bucket = 0;
1906         st->offset = 0;
1907         rc = listening_get_next(seq, NULL);
1908
1909         while (rc && *pos) {
1910                 rc = listening_get_next(seq, rc);
1911                 --*pos;
1912         }
1913         return rc;
1914 }
1915
1916 static inline bool empty_bucket(const struct tcp_iter_state *st)
1917 {
1918         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1919 }
1920
1921 /*
1922  * Get first established socket starting from bucket given in st->bucket.
1923  * If st->bucket is zero, the very first socket in the hash is returned.
1924  */
1925 static void *established_get_first(struct seq_file *seq)
1926 {
1927         struct tcp_iter_state *st = seq->private;
1928         struct net *net = seq_file_net(seq);
1929         void *rc = NULL;
1930
1931         st->offset = 0;
1932         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1933                 struct sock *sk;
1934                 struct hlist_nulls_node *node;
1935                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1936
1937                 /* Lockless fast path for the common case of empty buckets */
1938                 if (empty_bucket(st))
1939                         continue;
1940
1941                 spin_lock_bh(lock);
1942                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1943                         if (sk->sk_family != st->family ||
1944                             !net_eq(sock_net(sk), net)) {
1945                                 continue;
1946                         }
1947                         rc = sk;
1948                         goto out;
1949                 }
1950                 spin_unlock_bh(lock);
1951         }
1952 out:
1953         return rc;
1954 }
1955
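/* Advance to the next socket in the current ehash chain.  When the
 * chain is exhausted, release the bucket lock and continue from the
 * next non-empty bucket via established_get_first().
 */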
1956 static void *established_get_next(struct seq_file *seq, void *cur)
1957 {
1958         struct sock *sk = cur;
1959         struct hlist_nulls_node *node;
1960         struct tcp_iter_state *st = seq->private;
1961         struct net *net = seq_file_net(seq);
1962
1963         ++st->num;
1964         ++st->offset;
1965
1966         sk = sk_nulls_next(sk);
1967
1968         sk_nulls_for_each_from(sk, node) {
1969                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1970                         return sk;
1971         }
1972
1973         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1974         ++st->bucket;
1975         return established_get_first(seq);
1976 }
1977
1978 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1979 {
1980         struct tcp_iter_state *st = seq->private;
1981         void *rc;
1982
1983         st->bucket = 0;
1984         rc = established_get_first(seq);
1985
1986         while (rc && pos) {
1987                 rc = established_get_next(seq, rc);
1988                 --pos;
1989         }
1990         return rc;
1991 }
1992
1993 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1994 {
1995         void *rc;
1996         struct tcp_iter_state *st = seq->private;
1997
1998         st->state = TCP_SEQ_STATE_LISTENING;
1999         rc        = listening_get_idx(seq, &pos);
2000
2001         if (!rc) {
2002                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2003                 rc        = established_get_idx(seq, pos);
2004         }
2005
2006         return rc;
2007 }
2008
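/* Resume iteration at the bucket and in-bucket offset remembered from
 * the previous read of the seq_file, so that a large /proc dump does
 * not have to rescan the hash tables from the start for every chunk.
 */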
2009 static void *tcp_seek_last_pos(struct seq_file *seq)
2010 {
2011         struct tcp_iter_state *st = seq->private;
2012         int offset = st->offset;
2013         int orig_num = st->num;
2014         void *rc = NULL;
2015
2016         switch (st->state) {
2017         case TCP_SEQ_STATE_LISTENING:
2018                 if (st->bucket >= INET_LHTABLE_SIZE)
2019                         break;
2020                 st->state = TCP_SEQ_STATE_LISTENING;
2021                 rc = listening_get_next(seq, NULL);
2022                 while (offset-- && rc)
2023                         rc = listening_get_next(seq, rc);
2024                 if (rc)
2025                         break;
2026                 st->bucket = 0;
2027                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2028                 /* Fallthrough */
2029         case TCP_SEQ_STATE_ESTABLISHED:
2030                 if (st->bucket > tcp_hashinfo.ehash_mask)
2031                         break;
2032                 rc = established_get_first(seq);
2033                 while (offset-- && rc)
2034                         rc = established_get_next(seq, rc);
2035         }
2036
2037         st->num = orig_num;
2038
2039         return rc;
2040 }
2041
2042 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2043 {
2044         struct tcp_iter_state *st = seq->private;
2045         void *rc;
2046
2047         if (*pos && *pos == st->last_pos) {
2048                 rc = tcp_seek_last_pos(seq);
2049                 if (rc)
2050                         goto out;
2051         }
2052
2053         st->state = TCP_SEQ_STATE_LISTENING;
2054         st->num = 0;
2055         st->bucket = 0;
2056         st->offset = 0;
2057         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2058
2059 out:
2060         st->last_pos = *pos;
2061         return rc;
2062 }
2063
2064 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2065 {
2066         struct tcp_iter_state *st = seq->private;
2067         void *rc = NULL;
2068
2069         if (v == SEQ_START_TOKEN) {
2070                 rc = tcp_get_idx(seq, 0);
2071                 goto out;
2072         }
2073
2074         switch (st->state) {
2075         case TCP_SEQ_STATE_LISTENING:
2076                 rc = listening_get_next(seq, v);
2077                 if (!rc) {
2078                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2079                         st->bucket = 0;
2080                         st->offset = 0;
2081                         rc        = established_get_first(seq);
2082                 }
2083                 break;
2084         case TCP_SEQ_STATE_ESTABLISHED:
2085                 rc = established_get_next(seq, v);
2086                 break;
2087         }
2088 out:
2089         ++*pos;
2090         st->last_pos = *pos;
2091         return rc;
2092 }
2093
2094 static void tcp_seq_stop(struct seq_file *seq, void *v)
2095 {
2096         struct tcp_iter_state *st = seq->private;
2097
2098         switch (st->state) {
2099         case TCP_SEQ_STATE_LISTENING:
2100                 if (v != SEQ_START_TOKEN)
2101                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2102                 break;
2103         case TCP_SEQ_STATE_ESTABLISHED:
2104                 if (v)
2105                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2106                 break;
2107         }
2108 }
2109
2110 int tcp_seq_open(struct inode *inode, struct file *file)
2111 {
2112         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2113         struct tcp_iter_state *s;
2114         int err;
2115
2116         err = seq_open_net(inode, file, &afinfo->seq_ops,
2117                           sizeof(struct tcp_iter_state));
2118         if (err < 0)
2119                 return err;
2120
2121         s = ((struct seq_file *)file->private_data)->private;
2122         s->family               = afinfo->family;
2123         s->last_pos             = 0;
2124         return 0;
2125 }
2126 EXPORT_SYMBOL(tcp_seq_open);
2127
2128 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2129 {
2130         int rc = 0;
2131         struct proc_dir_entry *p;
2132
2133         afinfo->seq_ops.start           = tcp_seq_start;
2134         afinfo->seq_ops.next            = tcp_seq_next;
2135         afinfo->seq_ops.stop            = tcp_seq_stop;
2136
2137         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2138                              afinfo->seq_fops, afinfo);
2139         if (!p)
2140                 rc = -ENOMEM;
2141         return rc;
2142 }
2143 EXPORT_SYMBOL(tcp_proc_register);
2144
2145 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2146 {
2147         remove_proc_entry(afinfo->name, net->proc_net);
2148 }
2149 EXPORT_SYMBOL(tcp_proc_unregister);
2150
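/* Emit one /proc/net/tcp line for a request socket still in SYN_RECV.
 * Several columns (queue sizes, inode) are printed as zero because an
 * open request has no full socket behind it yet.
 */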
2151 static void get_openreq4(const struct request_sock *req,
2152                          struct seq_file *f, int i)
2153 {
2154         const struct inet_request_sock *ireq = inet_rsk(req);
2155         long delta = req->rsk_timer.expires - jiffies;
2156
2157         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2158                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2159                 i,
2160                 ireq->ir_loc_addr,
2161                 ireq->ir_num,
2162                 ireq->ir_rmt_addr,
2163                 ntohs(ireq->ir_rmt_port),
2164                 TCP_SYN_RECV,
2165                 0, 0, /* could print option size, but that is af dependent. */
2166                 1,    /* timers active (only the expire timer) */
2167                 jiffies_delta_to_clock_t(delta),
2168                 req->num_timeout,
2169                 from_kuid_munged(seq_user_ns(f),
2170                                  sock_i_uid(req->rsk_listener)),
2171                 0,  /* non standard timer */
2172                 0, /* open_requests have no inode */
2173                 0,
2174                 req);
2175 }
2176
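/* Emit one /proc/net/tcp line for a full socket.  The "tr" column
 * encodes which timer is pending (1 retransmit/probe, 2 keepalive,
 * 4 zero-window probe, 0 none) and "tm->when" is its remaining time
 * converted to clock ticks.
 */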
2177 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2178 {
2179         int timer_active;
2180         unsigned long timer_expires;
2181         const struct tcp_sock *tp = tcp_sk(sk);
2182         const struct inet_connection_sock *icsk = inet_csk(sk);
2183         const struct inet_sock *inet = inet_sk(sk);
2184         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2185         __be32 dest = inet->inet_daddr;
2186         __be32 src = inet->inet_rcv_saddr;
2187         __u16 destp = ntohs(inet->inet_dport);
2188         __u16 srcp = ntohs(inet->inet_sport);
2189         int rx_queue;
2190         int state;
2191
2192         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2193             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2194             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2195                 timer_active    = 1;
2196                 timer_expires   = icsk->icsk_timeout;
2197         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2198                 timer_active    = 4;
2199                 timer_expires   = icsk->icsk_timeout;
2200         } else if (timer_pending(&sk->sk_timer)) {
2201                 timer_active    = 2;
2202                 timer_expires   = sk->sk_timer.expires;
2203         } else {
2204                 timer_active    = 0;
2205                 timer_expires = jiffies;
2206         }
2207
2208         state = sk_state_load(sk);
2209         if (state == TCP_LISTEN)
2210                 rx_queue = sk->sk_ack_backlog;
2211         else
2212                 /* Because we don't lock the socket,
2213                  * we might find a transient negative value.
2214                  */
2215                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2216
2217         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2218                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2219                 i, src, srcp, dest, destp, state,
2220                 tp->write_seq - tp->snd_una,
2221                 rx_queue,
2222                 timer_active,
2223                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2224                 icsk->icsk_retransmits,
2225                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2226                 icsk->icsk_probes_out,
2227                 sock_i_ino(sk),
2228                 atomic_read(&sk->sk_refcnt), sk,
2229                 jiffies_to_clock_t(icsk->icsk_rto),
2230                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2231                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2232                 tp->snd_cwnd,
2233                 state == TCP_LISTEN ?
2234                     fastopenq->max_qlen :
2235                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2236 }
2237
2238 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2239                                struct seq_file *f, int i)
2240 {
2241         long delta = tw->tw_timer.expires - jiffies;
2242         __be32 dest, src;
2243         __u16 destp, srcp;
2244
2245         dest  = tw->tw_daddr;
2246         src   = tw->tw_rcv_saddr;
2247         destp = ntohs(tw->tw_dport);
2248         srcp  = ntohs(tw->tw_sport);
2249
2250         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2251                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2252                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2253                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2254                 atomic_read(&tw->tw_refcnt), tw);
2255 }
2256
2257 #define TMPSZ 150
2258
2259 static int tcp4_seq_show(struct seq_file *seq, void *v)
2260 {
2261         struct tcp_iter_state *st;
2262         struct sock *sk = v;
2263
2264         seq_setwidth(seq, TMPSZ - 1);
2265         if (v == SEQ_START_TOKEN) {
2266                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2267                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2268                            "inode");
2269                 goto out;
2270         }
2271         st = seq->private;
2272
2273         if (sk->sk_state == TCP_TIME_WAIT)
2274                 get_timewait4_sock(v, seq, st->num);
2275         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2276                 get_openreq4(v, seq, st->num);
2277         else
2278                 get_tcp4_sock(v, seq, st->num);
2279 out:
2280         seq_pad(seq, '\n');
2281         return 0;
2282 }
2283
2284 static const struct file_operations tcp_afinfo_seq_fops = {
2285         .owner   = THIS_MODULE,
2286         .open    = tcp_seq_open,
2287         .read    = seq_read,
2288         .llseek  = seq_lseek,
2289         .release = seq_release_net
2290 };
2291
2292 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2293         .name           = "tcp",
2294         .family         = AF_INET,
2295         .seq_fops       = &tcp_afinfo_seq_fops,
2296         .seq_ops        = {
2297                 .show           = tcp4_seq_show,
2298         },
2299 };
2300
2301 static int __net_init tcp4_proc_init_net(struct net *net)
2302 {
2303         return tcp_proc_register(net, &tcp4_seq_afinfo);
2304 }
2305
2306 static void __net_exit tcp4_proc_exit_net(struct net *net)
2307 {
2308         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2309 }
2310
2311 static struct pernet_operations tcp4_net_ops = {
2312         .init = tcp4_proc_init_net,
2313         .exit = tcp4_proc_exit_net,
2314 };
2315
2316 int __init tcp4_proc_init(void)
2317 {
2318         return register_pernet_subsys(&tcp4_net_ops);
2319 }
2320
2321 void tcp4_proc_exit(void)
2322 {
2323         unregister_pernet_subsys(&tcp4_net_ops);
2324 }
2325 #endif /* CONFIG_PROC_FS */
2326
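/* The AF_INET stream protocol descriptor: wires the TCP entry points
 * into the generic socket layer and exposes the shared memory limits
 * and hash tables.
 */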
2327 struct proto tcp_prot = {
2328         .name                   = "TCP",
2329         .owner                  = THIS_MODULE,
2330         .close                  = tcp_close,
2331         .connect                = tcp_v4_connect,
2332         .disconnect             = tcp_disconnect,
2333         .accept                 = inet_csk_accept,
2334         .ioctl                  = tcp_ioctl,
2335         .init                   = tcp_v4_init_sock,
2336         .destroy                = tcp_v4_destroy_sock,
2337         .shutdown               = tcp_shutdown,
2338         .setsockopt             = tcp_setsockopt,
2339         .getsockopt             = tcp_getsockopt,
2340         .recvmsg                = tcp_recvmsg,
2341         .sendmsg                = tcp_sendmsg,
2342         .sendpage               = tcp_sendpage,
2343         .backlog_rcv            = tcp_v4_do_rcv,
2344         .release_cb             = tcp_release_cb,
2345         .hash                   = inet_hash,
2346         .unhash                 = inet_unhash,
2347         .get_port               = inet_csk_get_port,
2348         .enter_memory_pressure  = tcp_enter_memory_pressure,
2349         .stream_memory_free     = tcp_stream_memory_free,
2350         .sockets_allocated      = &tcp_sockets_allocated,
2351         .orphan_count           = &tcp_orphan_count,
2352         .memory_allocated       = &tcp_memory_allocated,
2353         .memory_pressure        = &tcp_memory_pressure,
2354         .sysctl_mem             = sysctl_tcp_mem,
2355         .sysctl_wmem            = sysctl_tcp_wmem,
2356         .sysctl_rmem            = sysctl_tcp_rmem,
2357         .max_header             = MAX_TCP_HEADER,
2358         .obj_size               = sizeof(struct tcp_sock),
2359         .slab_flags             = SLAB_DESTROY_BY_RCU,
2360         .twsk_prot              = &tcp_timewait_sock_ops,
2361         .rsk_prot               = &tcp_request_sock_ops,
2362         .h.hashinfo             = &tcp_hashinfo,
2363         .no_autobind            = true,
2364 #ifdef CONFIG_COMPAT
2365         .compat_setsockopt      = compat_tcp_setsockopt,
2366         .compat_getsockopt      = compat_tcp_getsockopt,
2367 #endif
2368 #ifdef CONFIG_MEMCG_KMEM
2369         .init_cgroup            = tcp_init_cgroup,
2370         .destroy_cgroup         = tcp_destroy_cgroup,
2371         .proto_cgroup           = tcp_proto_cgroup,
2372 #endif
2373         .diag_destroy           = tcp_abort,
2374 };
2375 EXPORT_SYMBOL(tcp_prot);
2376
2377 static void __net_exit tcp_sk_exit(struct net *net)
2378 {
2379         int cpu;
2380
2381         for_each_possible_cpu(cpu)
2382                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2383         free_percpu(net->ipv4.tcp_sk);
2384 }
2385
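/* Per-namespace setup: create one control socket per possible CPU,
 * used for transmitting resets and ACKs that are not sent through a
 * full socket, and initialise the namespace's TCP sysctl defaults.
 */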
2386 static int __net_init tcp_sk_init(struct net *net)
2387 {
2388         int res, cpu;
2389
2390         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2391         if (!net->ipv4.tcp_sk)
2392                 return -ENOMEM;
2393
2394         for_each_possible_cpu(cpu) {
2395                 struct sock *sk;
2396
2397                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2398                                            IPPROTO_TCP, net);
2399                 if (res)
2400                         goto fail;
2401                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2402         }
2403
2404         net->ipv4.sysctl_tcp_ecn = 2;
2405         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2406
2407         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2408         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2409         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2410
2411         return 0;
2412 fail:
2413         tcp_sk_exit(net);
2414
2415         return res;
2416 }
2417
2418 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2419 {
2420         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2421 }
2422
2423 static struct pernet_operations __net_initdata tcp_sk_ops = {
2424        .init       = tcp_sk_init,
2425        .exit       = tcp_sk_exit,
2426        .exit_batch = tcp_sk_exit_batch,
2427 };
2428
2429 void __init tcp_v4_init(void)
2430 {
2431         inet_hashinfo_init(&tcp_hashinfo);
2432         if (register_pernet_subsys(&tcp_sk_ops))
2433                 panic("Failed to create the TCP control socket.\n");
2434 }